# 1.数据概览

In [2]:
#忽略警告
import warnings
warnings.filterwarnings("ignore")

In [175]:
#合并数据集：

import numpy as np
import pandas as pd

train=pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [176]:
print("train.shape:",train.shape,"test.csv",test.shape)

train.shape: (891, 12) test.csv (418, 12)


In [177]:
#合并数据集
full = train.append(test,ignore_index=True)
print("full.shape:",full.shape)

full.shape: (1309, 12)


In [178]:
#查看数据：
full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [179]:
#查看每一列：发现age,Cabin,Embarked,Fare存在缺失
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1046 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1308 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


# 2.数据清洗

## 2.1 数据预处理

In [180]:
#缺失值处理：
#age/Fare用其平均值进行填充
full["Age"] = full["Age"].fillna(full["Age"].mean())
full["Fare"] = full["Age"].fillna(full["Fare"].mean())

In [181]:
full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


In [182]:
full["Embarked"].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [183]:
#对缺失值进行最多值填补
full["Embarked"] = full["Embarked"].fillna("S")

In [184]:
# Cabin的填充，使用U进行填充，表示未知：
full['Cabin'].head()
full['Cabin'] = full['Cabin'].fillna( 'U')

In [185]:
full.info()#查看最终填充情况

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
PassengerId    1309 non-null int64
Survived       1309 non-null int64
Pclass         1309 non-null int64
Name           1309 non-null object
Sex            1309 non-null object
Age            1309 non-null float64
SibSp          1309 non-null int64
Parch          1309 non-null int64
Ticket         1309 non-null object
Fare           1309 non-null float64
Cabin          1309 non-null object
Embarked       1309 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


## 2.2 特征提取

数值类型可直接使用，时间序列类型可转换为单独的年月日，分类数据类型可用数值代替类别，一般采用One-hot编码。这里有下面几种：

In [186]:
#: 分类数据：sex
sex_mapD ={'male':1, 'female':0}
full['Sex'] = full['Sex'].map(sex_mapD)
full.head()
full['Sex']

0       1
1       0
2       0
3       0
4       1
5       1
6       1
7       1
8       0
9       0
10      0
11      0
12      1
13      1
14      0
15      0
16      1
17      1
18      0
19      0
20      1
21      1
22      0
23      1
24      0
25      0
26      1
27      1
28      0
29      1
       ..
1279    1
1280    1
1281    1
1282    0
1283    1
1284    1
1285    1
1286    0
1287    1
1288    0
1289    1
1290    1
1291    0
1292    1
1293    0
1294    1
1295    1
1296    1
1297    1
1298    1
1299    0
1300    0
1301    0
1302    0
1303    0
1304    1
1305    0
1306    1
1307    1
1308    1
Name: Sex, Length: 1309, dtype: int64

In [187]:
#one-hot编码：get_dumies，对单列数据进行分类编码，并加上前缀
embarkedDF = pd.DataFrame()
embarkedDF = pd.get_dummies(full['Embarked'],prefix = 'Embarked')
embarkedDF.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [188]:
#将one-hot产生的数据集添加至原数据集并将原数据集Embarked特征列去除
full = pd.concat([full,embarkedDF],axis=1)#按列在后方进行连接
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html
full.drop('Embarked',axis =1, inplace = True)#将Embarked列原数据进行剔除
full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,22.0,U,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,38.0,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,26.0,U,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,35.0,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,35.0,U,0,0,1


In [189]:
#使用get_dumies对Pclass进行one-hot编码
PclassDF = pd.DataFrame()
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html
PclassDF = pd.get_dummies(full['Pclass'],prefix = 'Pclass')
PclassDF.head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [190]:
full = pd.concat([full,PclassDF],axis=1)
full.drop('Pclass',axis = 1,inplace = True)
full.head()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,1,0,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,22.0,U,0,0,1,0,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,38.0,C85,1,0,0,1,0,0
2,3,1,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,26.0,U,0,0,1,0,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,35.0,C123,0,0,1,1,0,0
4,5,0,"Allen, Mr. William Henry",1,35.0,0,0,373450,35.0,U,0,0,1,0,0,1


In [208]:
#自定义头衔与名称的对应关系
def getTitle(name):
    str1 = name.split(',')[1]#对name分割取后者
    str2 = str1.split('.')[0]#对str1分割取前者
    str3 = str2.strip()#对str2进行首位去空格
    return str3
titleDf = pd.DataFrame()
titleDf['Title'] = full['Name'].map(getTitle)
title_mD = {
        'Capt':            'Officer',
        'Col':             'Officer',
        'Major':           'Officer',
        'Jonkheer':        'Royalty',
        'Don':             'Royalty',
        'Sir':             'Royalty',
        'Dr':              'Officer',
        'Rev':             'Officer',
        'the Countess':    'Royalty',
        'Dona':            'Royalty',
        'Mme':              'Mrs',
        'Mlle':             'Miss',
        'Ms':               'Mrs',
        'Mr':               'Mr',
        'Mrs':             'Mrs',
        'Miss':            'Miss',
        'Master':          'Master',
        'Lady':            'Royalty'
}
Counter(titleDf['Title'])

Counter({'Mr': 757,
         'Mrs': 197,
         'Miss': 260,
         'Master': 61,
         'Don': 1,
         'Rev': 8,
         'Dr': 8,
         'Mme': 1,
         'Ms': 2,
         'Major': 2,
         'Lady': 1,
         'Sir': 1,
         'Mlle': 2,
         'Col': 4,
         'Capt': 1,
         'the Countess': 1,
         'Jonkheer': 1,
         'Dona': 1})

In [209]:
#对称呼和自定义的五类名称进行映射one-pot
titleDf['Title'] = titleDf['Title'].map(title_mD)
titleDf = pd.get_dummies(titleDf['Title'])
titleDf.head()

Unnamed: 0,Master,Miss,Mr,Mrs,Officer,Royalty
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,0,1,0,0,0


In [210]:
full = pd.concat([full,titleDf],axis = 1)

In [212]:
full.drop('Name',axis=1,inplace = True)

In [213]:
full.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,...,Embarked_S,Pclass_1,Pclass_2,Pclass_3,Master,Miss,Mr,Mrs,Officer,Royalty
0,1,0,1,22.0,1,0,A/5 21171,22.0,U,0,...,1,0,0,1,0,0,1,0,0,0
1,2,1,0,38.0,1,0,PC 17599,38.0,C85,1,...,0,1,0,0,0,0,0,1,0,0
2,3,1,0,26.0,0,0,STON/O2. 3101282,26.0,U,0,...,1,0,0,1,0,1,0,0,0,0
3,4,1,0,35.0,1,0,113803,35.0,C123,0,...,1,1,0,0,0,0,0,1,0,0
4,5,0,1,35.0,0,0,373450,35.0,U,0,...,1,0,0,1,0,0,1,0,0,0


In [215]:
#船舱的位置：使用one-hot
#lambda(匿名函数)
# add = lambda x, y : x+y
# add(1,2)  # 结果为3
CabinDF = pd.DataFrame()
full['Cabin'] = full['Cabin'].map(lambda x:x[0])#取船舱的第一位进行映射
CabinDF = pd.get_dummies(full['Cabin'],prefix = 'Cabin')
CabinDF.head()

Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1


In [216]:
#添加至full中，并删除原有列
full = pd.concat([full,CabinDF],axis = 1)
full.drop('Cabin',axis = 1,inplace = True)
full.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_C,Embarked_Q,...,Royalty,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,1,0,1,22.0,1,0,A/5 21171,22.0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,0,38.0,1,0,PC 17599,38.0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,3,1,0,26.0,0,0,STON/O2. 3101282,26.0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,0,35.0,1,0,113803,35.0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,5,0,1,35.0,0,0,373450,35.0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [220]:
#对家庭进行分类
#家庭成员人数f_size、根据家庭成员人数而分的f_S、f_s,f_L(小1，中2-4，大>5),工四个向量

FamilyDF =pd.DataFrame()
FamilyDF['FamilySize'] = full['Parch']+full['SibSp']+1#父母兄弟和自己
#lambda: if x==1 single;[2,4]->small;>5->large
# if x==1:
#     x=1
# else:
#     x=0
FamilyDF['Family_Single'] = FamilyDF['FamilySize'].map(lambda x : 1 if x==1 else 0)
FamilyDF['Family_Small'] = FamilyDF['FamilySize'].map(lambda x : 1 if 4>=x>=2 else 0)
FamilyDF['Family_Large'] = FamilyDF['FamilySize'].map(lambda x:1 if x>5 else 0)
FamilyDF.head()


Unnamed: 0,FamilySize,Family_Single,Family_Small,Family_Large
0,2,0,1,0
1,2,0,1,0
2,1,1,0,0
3,2,0,1,0
4,1,1,0,0


In [221]:
#将家庭分类添加至full，并删除原有列
full = pd.concat([full,FamilyDF],axis=1)
full.head() #最终的full表格

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_C,Embarked_Q,...,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,FamilySize,Family_Single,Family_Small,Family_Large
0,1,0,1,22.0,1,0,A/5 21171,22.0,0,0,...,0,0,0,0,0,1,2,0,1,0
1,2,1,0,38.0,1,0,PC 17599,38.0,1,0,...,0,0,0,0,0,0,2,0,1,0
2,3,1,0,26.0,0,0,STON/O2. 3101282,26.0,0,0,...,0,0,0,0,0,1,1,1,0,0
3,4,1,0,35.0,1,0,113803,35.0,0,0,...,0,0,0,0,0,0,2,0,1,0
4,5,0,1,35.0,0,0,373450,35.0,0,0,...,0,0,0,0,0,1,1,1,0,0


# 相关性分析来提取特征值

In [223]:
#相关性分析的方法有很多，这里考虑使用相关系数分析：
corrDF = full.corr()
corrDF['Survived'].sort_values(ascending = False)

Survived         1.000000
Mrs              0.431571
Miss             0.425890
Family_Small     0.267628
Pclass_1         0.234338
Cabin_B          0.141251
Embarked_C       0.121156
Cabin_E          0.113344
Cabin_C          0.113088
Parch            0.108919
Cabin_D          0.108186
FamilySize       0.061090
Pclass_2         0.048091
Embarked_Q       0.040950
Royalty          0.040497
Cabin_F          0.038571
Cabin_G          0.028438
Cabin_A          0.020811
SibSp            0.002370
Master          -0.000154
PassengerId     -0.020370
Cabin_T         -0.021527
Officer         -0.044142
Fare            -0.048483
Age             -0.048483
Family_Large    -0.072671
Embarked_S      -0.133020
Family_Single   -0.216299
Pclass_3        -0.242183
Cabin_U         -0.259023
Mr              -0.653221
Sex             -0.688371
Name: Survived, dtype: float64

In [235]:
#根据相关系数，求出新的数据集：
full_X=pd.concat([titleDf,
                  PclassDF,
                  FamilyDF,
                  CabinDF,
                  embarkedDF,
                  full['Sex'],
                  full['Survived']
                      
],axis=1)


In [236]:
full_X

Unnamed: 0,Master,Miss,Mr,Mrs,Officer,Royalty,Pclass_1,Pclass_2,Pclass_3,FamilySize,...,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,Embarked_C,Embarked_Q,Embarked_S,Sex,Survived
0,0,0,1,0,0,0,0,0,1,2,...,0,0,0,0,1,0,0,1,1,0
1,0,0,0,1,0,0,1,0,0,2,...,0,0,0,0,0,1,0,0,0,1
2,0,1,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,1,0,1
3,0,0,0,1,0,0,1,0,0,2,...,0,0,0,0,0,0,0,1,0,1
4,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,1,1,0
5,0,0,1,0,0,0,0,0,1,1,...,0,0,0,0,1,0,1,0,1,0
6,0,0,1,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,1,0
7,1,0,0,0,0,0,0,0,1,5,...,0,0,0,0,1,0,0,1,1,0
8,0,0,0,1,0,0,0,0,1,3,...,0,0,0,0,1,0,0,1,0,1
9,0,0,0,1,0,0,0,1,0,2,...,0,0,0,0,1,1,0,0,0,1


# 数据建模

## 1.随机森林

In [227]:
train.shape,test.shape

((891, 12), (418, 12))

In [228]:
full_X.shape

(1309, 26)

In [240]:
#数据分类：
train_new=full_X.iloc[:891,:]

test_new = full_X.iloc[891:,:]

train_X=train_new.iloc[:,:-1]

train_Y=train_new.iloc[:,-1]

test_X=test_new.iloc[:,:-1]

test_Y=test_new.iloc[:,-1]

In [249]:
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(n_estimators= 1000,n_jobs=1)
model1.fit(train_X,train_Y)
model1.score(test_X,test_Y)

0.84688995215311

## 2.KNN：

In [251]:
from sklearn.neighbors import KNeighborsClassifier
model2 = KNeighborsClassifier()
model2.fit(train_X,train_Y)
model2.score(test_X,test_Y)

0.8564593301435407

## 3.SVC：

In [252]:
from sklearn.svm import SVC,LinearSVC
model3 = SVC()
model3.fit(train_X,train_Y)
model3.score(test_X,test_Y)

0.9449760765550239

## 4.多层感知机：

In [255]:
from sklearn.neural_network import MLPClassifier
mpl =MLPClassifier(solver = 'lbfgs')
mpl.fit(train_X,train_Y)
pre = mpl.predict(test_X)
from sklearn.metrics import accuracy_score
score = accuracy_score(test_Y,pre)
score

0.8492822966507177