In [1]:
"""
    使用SVM对泰坦尼克号的的训练集进行模型训练，再对测试集进行survived的预测
    步骤：数据清洗、构建特征（编码）、选择特征构建分类器、训练模型、预测survived
"""
import sys
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder #编码的转换

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import svm
from sklearn.model_selection import train_test_split

In [2]:
#读取数据
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

In [3]:
data_test.head() #显示前5个数据

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
#将文件当中的列名称转换为小写格式
data_train.columns = data_train.columns.str.lower()  #转换为小写
data_test.columns = data_test.columns.str.lower()

In [5]:
data_train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 清洗数据

In [6]:
#合并两个数据集，进行统一清洗
data_All = [data_train,data_test]

In [7]:
data_train.isnull().sum() #查看训练集中的空值

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

In [8]:
data_test.isnull().sum() #查看测试集中的空值

passengerid      0
pclass           0
name             0
sex              0
age             86
sibsp            0
parch            0
ticket           0
fare             1
cabin          327
embarked         0
dtype: int64

In [9]:
#对训练集进行描述
data_train.describe(include='all')

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Doharr, Mr. Tannous",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [10]:
#对原始数据集（训练集+测试集）进行清洗
for dataset in data_All:
    #填补空缺值
    dataset['age'].fillna(dataset['age'].median(), inplace=True) #中位数，True表示保存在原始
    dataset['fare'].fillna(dataset['fare'].median(), inplace=True)
    dataset['embarked'].fillna(dataset['embarked'].mode()[0], inplace=True)#港口没有中位数

In [11]:
#删除一些字段：舱位号、乘客ID、票根号
drop_columns = ['cabin', 'passengerid', 'ticket']
data_train.drop(drop_columns, axis=1, inplace=True)
data_test.drop(drop_columns, axis=1, inplace=True)

In [12]:
data_train.isnull().sum()#训练集中各个特征：0表示无空值

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

In [13]:
data_test.isnull().sum()#测试集中各个特征：0表示无空值

pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

# 构建特征

In [14]:
#进行特征构建
for dataset in data_All:
    # 构建新的字段：
    # (1)family_size家庭规模：sibsp + parch
    dataset['family_size'] = dataset['sibsp'] + dataset['parch'] + 1
    # (2)单身single, 1:单身， 0：非单身
    dataset['single'] = 1
    dataset['single'].loc[dataset['family_size'] > 1] = 0 #非单身
    # (3)身份title，就是乘客名字中的称谓，如：Mr、Miss、Mrs、Master、Dr....等
    dataset['title'] = dataset['name'].str.split(', ', expand=True)[1].str.split('.',expand=True)[0] #True返回表格形式
    #dataset['title] = dataset['name'].apply(lambda x : x.split(', ')[1]).apply(lambda x : x.split('.')[0])
    # (4)票价fare_bin
    dataset['fare_bin'] = pd.qcut(dataset['fare'], 4) #根据票价，分成4组（每组元素个数一样）
    # （5）年龄age_bin
    dataset['age_bin'] = pd.cut(dataset['age'].astype(int), 5) #根据年龄分组，分为5组（每组元素个数一样）
    
###运行之后出现的反馈若不是error，直接忽视

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
#查看构建特征后的情况，只显示前五行
dataset.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,fare,embarked,family_size,single,title,fare_bin,age_bin
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,1,1,Mr,"(-0.001, 7.896]","(30.4, 45.6]"
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,2,0,Mrs,"(-0.001, 7.896]","(45.6, 60.8]"
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,1,1,Mr,"(7.896, 14.454]","(60.8, 76.0]"
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,1,1,Mr,"(7.896, 14.454]","(15.2, 30.4]"
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,3,0,Mrs,"(7.896, 14.454]","(15.2, 30.4]"


# 名字中的称谓是个很关键的因素

In [16]:
# 根据训练集中的title（称谓）统计人数
data_train['title'].value_counts()

Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
Jonkheer          1
Don               1
Mme               1
the Countess      1
Sir               1
Capt              1
Ms                1
Lady              1
Name: title, dtype: int64

In [17]:
# 根据测试集中的title（称谓）统计人数
data_test['title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: title, dtype: int64

In [18]:
# 训练集 各位乘客名称处理
title_names = (data_train['title'].value_counts()<10)  #如果数量小于10，就为Ture
title_names  

Mr              False
Miss            False
Mrs             False
Master          False
Dr               True
Rev              True
Mlle             True
Major            True
Col              True
Jonkheer         True
Don              True
Mme              True
the Countess     True
Sir              True
Capt             True
Ms               True
Lady             True
Name: title, dtype: bool

In [19]:
# 测试集 各位乘客名称处理
title_names_test = (data_test['title'].value_counts()<10)  #如果数量小于10，就为Ture
title_names_test

Mr        False
Miss      False
Mrs       False
Master    False
Col        True
Rev        True
Ms         True
Dr         True
Dona       True
Name: title, dtype: bool

In [20]:
# train数据的title: 将那些称谓所在的人数小于10的数据，全部归为一类other
data_train['title'] = data_train['title'].apply(lambda x : 'other' if title_names[x] else x)
data_train['title'].value_counts()

Mr        517
Miss      182
Mrs       125
Master     40
other      27
Name: title, dtype: int64

In [21]:
# test数据的title: 将那些称谓所在的人数小于10的数据，全部归为一类other
data_test['title'] = data_test['title'].apply(lambda x : 'other' if title_names_test[x] else x)
data_test['title'].value_counts()

Mr        240
Miss       78
Mrs        72
Master     21
other       7
Name: title, dtype: int64

# 构建新的字段，将英文转换为数字编码

In [22]:
#构建新的字段，基于scikit-learn中的LabelEncoder()
label = LabelEncoder()
for dataset in data_All:
    # (1)新字段： sex_code
    dataset['sex_code'] = label.fit_transform(dataset['sex'])
    # (2)新字段： embarked_code
    dataset['embarked_code'] = label.fit_transform(dataset['embarked'])
    # (3)新字段： title_code
    dataset['title_code'] = label.fit_transform(dataset['title'])
    # (4)新字段： age_bin_code
    dataset['age_bin_code'] = label.fit_transform(dataset['age_bin'])
    # (5)新字段： fare_bin_code
    dataset['fare_bin_code'] = label.fit_transform(dataset['fare_bin'])
   

In [23]:
data_train.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family_size,single,title,fare_bin,age_bin,sex_code,embarked_code,title_code,age_bin_code,fare_bin_code
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,2,0,Mr,"(-0.001, 7.91]","(16.0, 32.0]",1,2,2,1,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,2,0,Mrs,"(31.0, 512.329]","(32.0, 48.0]",0,0,3,2,3
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,1,Miss,"(7.91, 14.454]","(16.0, 32.0]",0,2,1,1,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,2,0,Mrs,"(31.0, 512.329]","(32.0, 48.0]",0,2,3,2,3
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1,1,Mr,"(7.91, 14.454]","(32.0, 48.0]",1,2,2,2,1


In [24]:
# 训练集的列的名称
data_train.columns.tolist()

['survived',
 'pclass',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'embarked',
 'family_size',
 'single',
 'title',
 'fare_bin',
 'age_bin',
 'sex_code',
 'embarked_code',
 'title_code',
 'age_bin_code',
 'fare_bin_code']

In [25]:
# 测试集的列的名称
data_test.columns.tolist()

['pclass',
 'name',
 'sex',
 'age',
 'sibsp',
 'parch',
 'fare',
 'embarked',
 'family_size',
 'single',
 'title',
 'fare_bin',
 'age_bin',
 'sex_code',
 'embarked_code',
 'title_code',
 'age_bin_code',
 'fare_bin_code']

# 将处理之后的训练集保存csv，以便用于在另一个程序中可视化

In [26]:
data_train.to_csv("data_train_new.csv", index=False)

# 特征选择(参考另一个程序中可视化结果选特征)，用于训练

In [27]:
#不同的特征选择会产生不同的结果
Target = ['survived'] #标签
data_columns_one = ['sex', 'pclass', 'embarked', 'title', 'sibsp', 'parch', 'age', 'fare', 'family_size',
                   'single']

In [28]:
#通过Pandas中的get_dummies()进行编码
data_one_dummy_train = pd.get_dummies(data_train[data_columns_one])

data_one_dummy_test = pd.get_dummies(data_test[data_columns_one])

In [29]:
data_one_dummy_train.head()

Unnamed: 0,pclass,sibsp,parch,age,fare,family_size,single,sex_female,sex_male,embarked_C,embarked_Q,embarked_S,title_Master,title_Miss,title_Mr,title_Mrs,title_other
0,3,1,0,22.0,7.25,2,0,0,1,0,0,1,0,0,1,0,0
1,1,1,0,38.0,71.2833,2,0,1,0,1,0,0,0,0,0,1,0
2,3,0,0,26.0,7.925,1,1,1,0,0,0,1,0,1,0,0,0
3,1,1,0,35.0,53.1,2,0,1,0,0,0,1,0,0,0,1,0
4,3,0,0,35.0,8.05,1,1,0,1,0,0,1,0,0,1,0,0


In [30]:
#提取训练集和测试集的列名称
data_one_dummy_list_train = data_one_dummy_train.columns.tolist()

data_one_dummy_list_test = data_one_dummy_test.columns.tolist()

In [31]:
data_one_dummy_list_train

['pclass',
 'sibsp',
 'parch',
 'age',
 'fare',
 'family_size',
 'single',
 'sex_female',
 'sex_male',
 'embarked_C',
 'embarked_Q',
 'embarked_S',
 'title_Master',
 'title_Miss',
 'title_Mr',
 'title_Mrs',
 'title_other']

In [32]:
data_one_dummy_list_test

['pclass',
 'sibsp',
 'parch',
 'age',
 'fare',
 'family_size',
 'single',
 'sex_female',
 'sex_male',
 'embarked_C',
 'embarked_Q',
 'embarked_S',
 'title_Master',
 'title_Miss',
 'title_Mr',
 'title_Mrs',
 'title_other']

#    使用支持向量机SVM

In [33]:
#划分训练集与测试集
train_x, train_y, x_label, y_label = train_test_split(data_one_dummy_train[data_one_dummy_list_train], \
                                                      data_train[Target], random_state=1, \
                                                      train_size=0.75, test_size=0.25)

In [34]:
print("训练集大小", train_x.shape)
print("测试集大小", train_y.shape)

训练集大小 (668, 17)
测试集大小 (223, 17)


# svm方法一：使用rbf核函数

In [35]:
#训练SVM分类器
classifier_one = svm.SVC(C=10, kernel='rbf', gamma=10, decision_function_shape="ovr")
classifier_one.fit(train_x, x_label)

  y = column_or_1d(y, warn=True)


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [36]:
#计算分类的准确率
print("使用rbf核函数建模的结果")
print("训练集： ", classifier_one.score(train_x, x_label))
print("测试集： ", classifier_one.score(train_y, y_label))

使用rbf核函数建模的结果
训练集：  0.9865269461077845
测试集：  0.5964125560538116


# svm方法二：使用linear核函数

In [51]:
#训练svm分类器
classifier_two = svm.SVC(C=50, kernel='linear', gamma=10, decision_function_shape='ovr')
classifier_two.fit(train_x, x_label)

  y = column_or_1d(y, warn=True)


SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=10, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [52]:
#计算分类的准确率
print("使用linear核函数建模的结果")
print("训练集： ", classifier_two.score(train_x, x_label))
print("测试集： ", classifier_two.score(train_y, y_label))

使用linear核函数建模的结果
训练集：  0.8383233532934131
测试集：  0.8026905829596412


# 明显linear核函数的分类更准确

# 选择第二个分类器进行预测并输出结果到submission.csv文件中

In [39]:
#进行survived的预测
pred = classifier_two.predict(data_one_dummy_test[data_one_dummy_list_test])

In [40]:
#将结果存放在submission.csv文件中
submission = 'submission.csv'
data_test = pd.read_csv('test.csv')
output = pd.DataFrame({'PassengerId': data_test.PassengerId, 'Survived': pred})
output.to_csv(submission, index=False)

In [41]:
#打印出预测出的结果的存活数量
dead_num = 0
live_num = 0
for i in pred:
    if i == 0:
        dead_num += 1
    else:
        live_num += 1
print('dead_num', dead_num)
print('live_num', live_num)

dead_num 256
live_num 162


In [42]:
#submission.csv文件中的前5个数据
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


# 加分项（随机森林）

In [43]:
from sklearn import ensemble
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [44]:
x_train_one,x_test_one,y_train_one,y_test_one = model_selection.train_test_split(
                                                        data_one_dummy_train[data_one_dummy_list_train],
                                                                data_train[Target],
                                                                random_state = 0)

In [45]:
x_train_one.shape

(668, 17)

In [46]:
rf = RandomForestClassifier(max_features='auto', #自动搜寻最大特征
                           random_state=1,  #对比
                           n_jobs=-1) #-1多线程进行训练

In [47]:
#网格参数
param_gird = { 
    'criterion': ['gini', 'entropy'], #两种方法
    'min_samples_leaf': [1, 5, 10],   #最小的叶子结点
    'min_samples_split': [2,4,10,12,16],   #最少的分割
    'n_estimators': [50,100,400,700,1000]   #决策树的数量
}

In [48]:
gs = GridSearchCV(estimator=rf,
                 param_grid=param_gird,
                 scoring='accuracy',
                 cv=3,
                 n_jobs=-1)

In [49]:
gs = gs.fit(x_train_one, y_train_one) #对特征进行训练

  self.best_estimator_.fit(X, y, **fit_params)


In [50]:
print(gs.best_score_)

0.8323353293413174


In [51]:
# 在test上进行预测
pred = gs.predict(x_test_one)

In [52]:
pred_df = pd.DataFrame(pred, columns=['survived'])

In [53]:
pred_df.head()

Unnamed: 0,survived
0,0
1,0
2,0
3,1
4,1
