In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

# 读取数据集
train_data = pd.read_csv('Titanic_dataset/train.csv')
test_data = pd.read_csv('Titanic_dataset/test.csv')

# 选择用于训练的特征
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
x_train = train_data[features]
x_test = test_data[features]

y_train = train_data['Survived']

In [2]:
# 检查缺失值
print ('训练数据信息：')
x_train.info()
print ('-'*30)
print ('测试数据信息：')
x_test.info()

训练数据信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB
------------------------------
测试数据信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 

In [3]:
# 使用登录最多的港口来填充登录港口的nan值
print ('\n\n\n登录港口信息：')
print (x_train['Embarked'].value_counts())
x_train['Embarked'].fillna('S', inplace=True)
x_test['Embarked'].fillna('S', inplace=True)




登录港口信息：
S    644
C    168
Q     77
Name: Embarked, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [4]:
# 使用平均年龄来填充年龄中的nan值
x_train['Age'].fillna(x_train['Age'].mean(), inplace=True)
x_test['Age'].fillna(x_test['Age'].mean(), inplace=True)

# 使用票价的均值填充票价中的nan值
x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)

In [5]:
# 将特征值转换成特征向量
dvec = DictVectorizer(sparse=False)

x_train = dvec.fit_transform(x_train.to_dict(orient='record'))
x_test = dvec.transform(x_test.to_dict(orient='record'))

# 打印特征向量格式
print ('\n\n\n特征向量格式')
print (dvec.feature_names_)




特征向量格式
['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']




In [6]:
# 支持向量机
svc = SVC()
# 决策树
dtc = DecisionTreeClassifier()
# 随机森林
rfc = RandomForestClassifier()
# 逻辑回归
lr = LogisticRegression()
# 贝叶斯
nb = MultinomialNB()
# K邻近
knn = KNeighborsClassifier()
# AdaBoost
boost = AdaBoostClassifier()

# 對模型的驗證使用十倍交叉驗證。
print ('\n\n\n模型验证:')
print ('SVM acc is', np.mean(cross_val_score(svc, x_train, y_train, cv=10)))
print ('DecisionTree acc is', np.mean(cross_val_score(dtc, x_train, y_train, cv=10)))
print ('RandomForest acc is', np.mean(cross_val_score(rfc, x_train, y_train, cv=10)))
print ('LogisticRegression acc is', np.mean(cross_val_score(lr, x_train, y_train, cv=10)))
print ('NaiveBayes acc is', np.mean(cross_val_score(nb, x_train, y_train, cv=10)))
print ('KNN acc is', np.mean(cross_val_score(knn, x_train, y_train, cv=10)))
print ('AdaBoost acc is', np.mean(cross_val_score(boost, x_train, y_train, cv=10)))




模型验证:
SVM acc is 0.6813233458177278
DecisionTree acc is 0.7767041198501874
RandomForest acc is 0.8059176029962547


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression acc is 0.7968664169787765
NaiveBayes acc is 0.6925967540574283
KNN acc is 0.7093757802746566
AdaBoost acc is 0.8070037453183521


In [7]:
# 使用AdaBoost分類器來進行生存預測，並儲存預測結果。
# 训练boost
boost.fit(x_train, y_train)
# 预测
y_predict = boost.predict(x_test)

# 保存结果
# result = {'PassengerId': test_data['PassengerId'],
#           'Survived': y_predict}
# result = pd.DataFrame(result)
# result.to_csv('submission.csv',index=False)