---
title: 泰坦尼克之灾（六大模型baseline）
date: 2019-08-15
categories: [人工智能, 项目]
mathjax: false
---

## 数据集载入

In [88]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("2019-08-15_泰坦尼克之灾_train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [90]:
typedic = {} # 类型字典
for name in df.columns:
    typedic[str(df[name].dtype)] = typedic.get(str(df[name].dtype),[])+[name]

for key,value in typedic.items():
    print("{}格式共有{}个: {}".format(key,len(value),value))
    print("")

int64格式共有5个: ['PassengerId', 'Survived', 'Pclass', 'SibSp', 'Parch']

object格式共有5个: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

float64格式共有2个: ['Age', 'Fare']



至此可以看到数据不全的有：['Age','Cabin']

需要调整类型或删除无用资讯的包含：['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

In [91]:
df.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

## 数据探索

In [94]:
f, ax = plt.subplots(2, 3, figsize=(20, 10))

# 年龄以20-40为主，5岁以下密度有局部高峰
x = df['Age'].dropna()
sns.distplot(x, ax = ax[0][0])

# 存活率大约为3分之2
sns.countplot(x = 'Survived', data = df, ax = ax[0][1])

# 年龄跟舱等的分布
df['Age'][df.Pclass == 1].plot(kind='kde', ax = ax[0][2])
df['Age'][df.Pclass == 2].plot(kind='kde', ax = ax[0][2])
df['Age'][df.Pclass == 3].plot(kind='kde', ax = ax[0][2])

# 以三舱等为主
sns.countplot(x = 'Pclass', data = df, ax = ax[1][0])

# 登船口岸以S为大宗
sns.countplot(x = 'Embarked', data = df, ax = ax[1][1])

# 性别男生为主
sns.countplot(x = 'Sex', data = df, ax = ax[1][2])

plt.savefig("../img/2019-08-15_泰坦尼克之灾_1.png")
plt.close()

![](/img/2019-08-15_泰坦尼克之灾_1.png)

In [95]:
f, ax = plt.subplots(2, 2, figsize=(20, 10))

# 按年龄看获救情况，老人没获救比例高
sns.stripplot(x = 'Survived', y ='Age', data = df, ax = ax[0][0])

# 按舱等看获救情况，3舱等大都没获救
sns.countplot(x = 'Survived', hue ='Pclass', data = df, ax = ax[0][1])

# 按登船口岸看获救情况，S登船的大都没被获救
sns.countplot(x = 'Survived', hue = 'Embarked', data = df, ax = ax[1][0])

# 按登船口岸看获救情况，男生大都没被获救
sns.countplot(x = 'Survived', hue = 'Sex', data = df, ax = ax[1][1])

plt.savefig("../img/2019-08-15_泰坦尼克之灾_2.png")
plt.close()

![](/img/2019-08-15_泰坦尼克之灾_2.png)

## 特征预处理

In [96]:
df_tuned = df.copy(deep=True)
df_tuned.sample(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
434,435,0,1,"Silvey, Mr. William Baird",male,50.0,1,0,13507,55.9,E44,S
728,729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25.0,1,0,236853,26.0,,S
197,198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42.0,0,1,4579,8.4042,,S
272,273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41.0,0,1,250644,19.5,,S
176,177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S
324,325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
607,608,1,1,"Daniel, Mr. Robert Williams",male,27.0,0,0,113804,30.5,,S
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
229,230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S
91,92,0,3,"Andreasson, Mr. Paul Edvin",male,20.0,0,0,347466,7.8542,,S


### 删除无用

#### Name

In [97]:
df_tuned = df_tuned.drop(['Name'],axis=1)

### 填充空值

#### Age

In [98]:
# 有177个缺失,将近2成
print(df_tuned['Age'].isnull().sum())
print(df_tuned['Age'].isnull().sum() * 100 / df_tuned.shape[0])

177
19.865319865319865


In [None]:
from sklearn.preprocessing import Imputer

feature = df_tuned['Age'].values.reshape(-1,1)
imp_mode = Imputer(strategy = 'most_frequent')
df_tuned['Age'] = np.mean(imp_mode.fit_transform(feature))

# 现在无缺失了
print(df_tuned['Age'].isnull().sum())

0


#### Cabin

In [78]:
df_tuned['Cabin'][df_tuned['Cabin'].isnull()] = 0
df_tuned['Cabin'][df_tuned['Cabin'].notnull()] = 1

### 处理分类特征

#### Cabin, Embarked, Sex, Pclass

In [79]:
dummies_Cabin = pd.get_dummies(df_tuned['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(df_tuned['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(df_tuned['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(df_tuned['Pclass'], prefix= 'Pclass')

df_tuned = pd.concat([df_tuned, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_tuned.drop(['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

In [80]:
# 最后检查
df_tuned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Cabin_1        891 non-null uint8
Embarked_C     891 non-null uint8
Embarked_Q     891 non-null uint8
Embarked_S     891 non-null uint8
Sex_female     891 non-null uint8
Sex_male       891 non-null uint8
Pclass_1       891 non-null uint8
Pclass_2       891 non-null uint8
Pclass_3       891 non-null uint8
dtypes: float64(2), int64(4), uint8(9)
memory usage: 49.7 KB


### 数据划分

In [81]:
from sklearn.model_selection import train_test_split

X = df_tuned[:].drop("Survived",axis=1)
y = df_tuned["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = False)

### 数据标准化

In [82]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data = ss.fit_transform(X_train)
X_train = pd.DataFrame(data,columns = X.columns)

data = ss.fit_transform(X_test)
X_test = pd.DataFrame(data,columns = X.columns)

## 数据建模

### 逻辑回归

In [83]:
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

LR = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
LR = LR.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

svc = SVC(kernel='linear', probability = True)
svc = svc.fit(X_train, y_train)
# svc = cross_val_score(svc,X_train,y_train, cv = 5)

DT = DecisionTreeClassifier(max_depth = 6)
DT = DT.fit(X_train, y_train)
# DT = cross_val_score(DT, X_train, y_train, cv = 5)

RF = RandomForestClassifier()
RF = RF.fit(X_train, y_train)
# RF = cross_val_score(RF, X_train, y_train, cv = 5)

KNN = KNeighborsClassifier()
KNN = KNN.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

GBDT = GradientBoostingClassifier()
GBDT = GBDT.fit(X_train, y_train)
# LR = cross_val_score(LR, X_train, y_train, cv = 5)

names = ["LR", "SVC", 'DT', "RF", "KNN", "GBDT"]
models = [LR, svc, DT, RF, KNN, GBDT]
evaluates = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

In [86]:
df_list = []
for name,model in zip(names,models):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    #accuracy
    train_accuracy = model.score(X_train,y_train)
    test_accuracy = model.score(X_test,y_test)
    
    
    #precision   
    train_precision = precision_score(y_train,y_train_pred)
    test_precision = precision_score(y_test,y_test_pred)   
    
    #recall
    train_recall = recall_score(y_train,y_train_pred)
    test_recall = recall_score(y_test,y_test_pred)   
    
    #f1
    train_f1 = f1_score(y_train,y_train_pred)
    test_f1 = f1_score(y_test,y_test_pred)   
    
    #auc
    y_train_pred = model.predict_proba(X_train)[:,1]
    y_test_pred = model.predict_proba(X_test)[:,1]
    
    train_auc = roc_auc_score(y_train,y_train_pred)
    test_auc = roc_auc_score(y_test,y_test_pred)
    
#     print('{} 训练集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,train_accuracy,train_precision,train_recall,train_f1,train_auc))
#     print('{} 测试集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,test_accuracy,test_precision,test_recall,test_f1,test_auc))
#     print('\n')
    df = pd.DataFrame(np.array([train_accuracy,train_precision,train_recall,train_f1,train_auc,test_accuracy,test_precision,test_recall,test_f1,test_auc]).reshape(2,-1),
                  index = ['train','test'],
                  columns = ['Accuracy','Precision','Recall','F1-Score','AUC-Score'])
    df_list.append(df)

pd.concat(df_list,axis=0,keys=names)

Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-Score,AUC-Score
LR,train,0.800562,0.783333,0.676259,0.725869,0.838942
LR,test,0.826816,0.779661,0.71875,0.747967,0.844158
SVC,train,0.783708,0.742188,0.683453,0.71161,0.819083
SVC,test,0.798883,0.741379,0.671875,0.704918,0.804755
DT,train,0.856742,0.84375,0.776978,0.808989,0.909753
DT,test,0.837989,0.807018,0.71875,0.760331,0.836141
RF,train,0.984551,1.0,0.960432,0.979817,0.999631
RF,test,0.804469,0.773585,0.640625,0.700855,0.874117
KNN,train,0.832865,0.841202,0.705036,0.767123,0.910006
KNN,test,0.826816,0.823529,0.65625,0.730435,0.85659


### 数据预测

In [85]:
df_test = pd.read_csv("2019-08-15_泰坦尼克之灾_test.csv")
df_tuned_test = df_test.copy(deep=True)

df_tuned_test = df_tuned_test.drop(['Name'],axis=1)

feature = df_tuned_test['Age'].values.reshape(-1,1)
imp_mode = Imputer(strategy = 'most_frequent')
df_tuned_test['Age'] = np.mean(imp_mode.fit_transform(feature))

feature = df_tuned_test['Fare'].values.reshape(-1,1)
imp_mode = Imputer(strategy = 'most_frequent')
df_tuned_test['Fare'] = np.mean(imp_mode.fit_transform(feature))

df_tuned_test['Cabin'][df_tuned_test['Cabin'].isnull()] = 0
df_tuned_test['Cabin'][df_tuned_test['Cabin'].notnull()] = 1

dummies_Cabin = pd.get_dummies(df_tuned_test['Cabin'], prefix= 'Cabin')
dummies_Embarked = pd.get_dummies(df_tuned_test['Embarked'], prefix= 'Embarked')
dummies_Sex = pd.get_dummies(df_tuned_test['Sex'], prefix= 'Sex')
dummies_Pclass = pd.get_dummies(df_tuned_test['Pclass'], prefix= 'Pclass')

df_tuned_test = pd.concat([df_tuned_test, dummies_Cabin, dummies_Embarked, dummies_Sex, dummies_Pclass], axis=1)
df_tuned_test.drop(['Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data = ss.fit_transform(df_tuned_test)
df_tuned_test = pd.DataFrame(data,columns = df_tuned_test.columns)

In [67]:
predictions = GBDT.predict(df_tuned_test)
result = pd.DataFrame({'PassengerId':df_test['PassengerId'].as_matrix(), 'Survived':predictions.astype(np.int32)})
result.to_csv("2019-08-15_泰坦尼克之灾_predictions.csv", index=False)