In [1]:
'''
数据分析
'''
import pandas as pd


# Load the Titanic training and test splits from their CSV files.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/titanic/train.csv',
        '../datasets/titanic/test.csv',
    )
)

In [2]:
# Summarize the training frame: column names, non-null counts and dtypes.
# In the recorded output Age, Cabin and Embarked have fewer non-null entries
# than the 891 rows, i.e. they contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# Summarize the test frame: column names, non-null counts and dtypes.
# In the recorded output Age, Fare and Cabin have fewer non-null entries
# than the 418 rows, and there is no Survived column.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [4]:
'''
数据预处理
'''
def data_preprocess(df, fill_source=None):
    '''
    Drop the Cabin, Ticket and Name features and fill missing Age, Fare and
    Embarked values.

    Age is filled with the median, Fare with the mean and Embarked with the
    most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified; a new frame is returned.
    fill_source : pandas.DataFrame, optional
        Frame whose Age/Fare/Embarked columns provide the fill statistics.
        Defaults to ``df`` itself (after dropping). Pass the training frame
        when cleaning the test frame to fill it with training statistics.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the dropped columns and with the three
        features filled.
    '''
    # errors='ignore' keeps the function usable on a frame that has already
    # been cleaned (e.g. when the notebook cell is executed a second time);
    # without it the drop raises KeyError because the columns are gone.
    df = df.drop(columns=['Cabin', 'Ticket', 'Name'], errors='ignore')

    stats = df if fill_source is None else fill_source
    fill_values = {
        'Age': stats['Age'].median(),
        'Fare': stats['Fare'].mean(),
        'Embarked': stats['Embarked'].value_counts().idxmax(),
    }
    return df.fillna(fill_values)


# Clean both splits with the same routine and rebind the names to the results.
# NOTE(review): called this way, each split is filled using statistics computed
# from that split itself, not from the training data.
train_data, test_data = data_preprocess(train_data), data_preprocess(test_data)

In [5]:
# Split off the Survived label; PassengerId is removed from the model inputs
# of both splits.
y_train = train_data['Survived']
X_train = train_data.drop(columns=['Survived', 'PassengerId'])
X_test = test_data.drop(columns=['PassengerId'])

In [6]:
# Extract the numeric features of the training and test sets as arrays,
# using the same column order for both.
num_X_train, num_X_test = (
    features[['Age', 'Fare', 'SibSp', 'Parch']].values
    for features in (X_train, X_test)
)

In [7]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

# One-hot encode the categorical features of the training and test sets.
# The encoder is fitted on the training split and reused for the test split,
# so both end up with the same encoded columns.
# `.toarray()` yields a plain ndarray. `.todense()` would return np.matrix,
# which survives the later np.concatenate and is rejected as estimator input
# by current scikit-learn releases.
cate_X_train = ohe.fit_transform(X_train[['Pclass', 'Sex', 'Embarked']]).toarray()
cate_X_test = ohe.transform(X_test[['Pclass', 'Sex', 'Embarked']]).toarray()

In [8]:
import numpy as np


# Join the numeric features and the one-hot encoded categorical features
# column-wise.
# NOTE: this rebinds X_train / X_test from DataFrames to arrays, so the
# earlier column-selection cells need the feature/label split cell re-run
# before they can be executed again.
X_train, X_test = (
    np.concatenate(parts, axis=1)
    for parts in ((num_X_train, cate_X_train), (num_X_test, cate_X_test))
)

In [9]:
'''
采用随机森林分类器，并且交叉验证、超参数寻优。
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Fixed seed: without it every run grows different forests, so the selected
# hyper-parameters and the reported accuracy change between runs.
RANDOM_SEED = 42

# Hyper-parameter grid: 3 forest sizes x 2 split criteria = 6 candidates.
parameters = {'n_estimators': [10, 50, 100], 'criterion': ['gini', 'entropy']}

rfc = RandomForestClassifier(random_state=RANDOM_SEED)

# Exhaustive search over the grid, scored by accuracy with GridSearchCV's
# default cross-validation (no `cv` is passed), using up to 4 parallel jobs.
clf = GridSearchCV(rfc, parameters, scoring='accuracy', n_jobs=4)

clf.fit(X_train, y_train)

print('最优超参数设定为：%s' %clf.best_params_)

print('交叉验证得到的最佳准确率为：%f' %clf.best_score_)

最优超参数设定为：{'criterion': 'gini', 'n_estimators': 50}
交叉验证得到的最佳准确率为：0.815969


In [10]:
'''
使用最优的模型，依据测试数据的特征进行类别预测。
'''
# Predict the test-set labels with the fitted grid search.
y_predict = clf.predict(X_test)

# Two-column submission table: the passenger id and its predicted label.
submission = test_data[['PassengerId']].copy()
submission['Survived'] = y_predict

# Write without the index so the file contains only the two columns above.
submission.to_csv('../Kaggle_submissions/titanic_submission.csv', index=False)