## 泰坦尼克号预测存活
决策树和随机森林对比实验，比较效果

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
import graphviz
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('../data/taitanic_data/data.csv')
df.info()
#数据预处理
df.drop(['PassengerId','Name','Ticket', 'Fare', 'Cabin'], axis=1, inplace=True)
df['Age'] = SimpleImputer(strategy='mean').fit_transform(df[['Age']]) #均值填充age
df.dropna(axis=0, inplace=True)     #删除有缺失值的行
df.reset_index(drop=True, inplace=True)
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.000000,1,0,S
1,1,1,female,38.000000,1,0,C
2,1,3,female,26.000000,0,0,S
3,1,1,female,35.000000,1,0,S
4,0,3,male,35.000000,0,0,S
...,...,...,...,...,...,...,...
884,0,2,male,27.000000,0,0,S
885,1,1,female,19.000000,0,0,S
886,0,3,female,29.699118,1,2,S
887,1,1,male,26.000000,0,0,C


In [4]:
# 分割出标签
y_data = df['Survived']
x_data = df.drop(['Survived'], axis=1)
# 编码
enc = OneHotEncoder(dtype=int)
result = enc.fit_transform( x_data[['Sex','Embarked']]).toarray()
print(enc.categories_)
print(enc.feature_names_in_) #参与one-hot编码的特征名字
code_df = pd.DataFrame(result)
print(code_df)
column_names = enc.get_feature_names_out()
column_names

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]
['Sex' 'Embarked']
     0  1  2  3  4
0    0  1  0  0  1
1    1  0  1  0  0
2    1  0  0  0  1
3    1  0  0  0  1
4    0  1  0  0  1
..  .. .. .. .. ..
884  0  1  0  0  1
885  1  0  0  0  1
886  1  0  0  0  1
887  0  1  1  0  0
888  0  1  0  1  0

[889 rows x 5 columns]


array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype=object)

In [5]:
x_data = pd.concat([x_data, code_df], axis=1)
x_data.drop(columns=enc.feature_names_in_,inplace=True)
keys = range(column_names.shape[0])
x_data.rename(columns=dict(zip(keys, column_names)), inplace=True)
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.3)
x_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
232,3,5.000000,4,2,1,0,0,0,1
797,3,30.000000,0,0,0,1,1,0,0
579,2,25.000000,1,1,1,0,0,0,1
880,3,22.000000,0,0,1,0,0,0,1
69,2,32.000000,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
660,3,40.000000,0,0,0,1,1,0,0
688,1,15.000000,0,1,1,0,0,0,1
457,2,50.000000,0,0,1,0,0,0,1
830,3,29.699118,0,0,0,1,1,0,0


In [40]:
# 网格搜索，确定最佳参数
param_grid = {
    'max_depth':[*range(5,20)],
    'criterion':['gini', 'entropy'],
    'min_samples_leaf': [*range(1,30,2)],
    'splitter':['best','random']
}
clf = DecisionTreeClassifier(random_state=2)
GS = GridSearchCV(clf, param_grid)
GS.fit(x_train, y_train)

print(GS.best_params_)
print(GS.best_score_)

{'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 3, 'splitter': 'best'}
0.824709677419355


In [6]:
clf = DecisionTreeClassifier(random_state=2, criterion='gini',max_depth=8, 
                             min_samples_leaf=3, splitter='best' ) 
            # 网格搜索得到的参数不一定最优，很大程度上依赖训练集和测试集
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.8164794007490637

In [10]:
clf = RandomForestClassifier(n_estimators=30, random_state=1024, criterion='gini',max_depth=8, 
                             min_samples_leaf=3 ) 

clf.fit(x_train,y_train)
clf.score(x_test,y_test)

0.8052434456928839