In [165]:
# 导库
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# 数据文件地址 + 数据读取
file_path = './input/train.csv'
home_data = pd.read_csv(file_path)

# 删除有空缺信息的列
home_data = home_data.fillna(30)

# 对sex信息进行处理
home_data['Sex'] = home_data['Sex'].map({'female': 0, 'male': 1})

# 查看列的信息
print(home_data.columns.to_list())

# 找 target
y = home_data.Survived

# 找 features 和 对应数据集 X
features = ['Pclass', 'Sex', 'SibSp', 'Age', 'Parch']
X = home_data[features]

# 划分数据 训练集 + 验证集
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 1)

# 定义随机森林模型
rf_model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
rf_model.fit(train_X, train_y)

# 通过验证集求解 MAE平均绝对误差
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print(rf_val_predictions)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))


['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
[1 0 1 1 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 0 0
 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 0
 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1
 0 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0
 0]
Validation MAE for Random Forest Model: 0


In [167]:
# 全训练数据上的模型

rf_model_on_full_data = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
rf_model_on_full_data.fit(X, y)

In [169]:
# 测试数据路径
test_data_path = './input/test.csv'

# 测试数据
test_data = pd.read_csv(test_data_path)

# 删除有空缺信息的列
test_data = test_data.fillna(30)

# 对sex信息进行处理
test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# 预测的输入集
test_X = test_data[features]

#预测
test_preds = rf_model_on_full_data.predict(test_X)

In [171]:
# 生成测试集的预测数据
output = pd.DataFrame({'PassengerId': test_data.PassengerId,
                       'Survived': test_preds})
output.to_csv('submission.csv', index = False)

In [173]:
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,0.636364,30.216507,0.447368,0.392344,35.613726
std,120.810458,0.841838,0.481622,12.635016,0.89676,0.981429,55.841179
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958
50%,1100.5,3.0,1.0,30.0,0.0,0.0,14.4542
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.471875
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292
