In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
import torch
import torch.optim as optim
from torch import nn
from d2l import torch as d2l
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# Load the raw training and test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Work on copies so the frames read from disk stay untouched.
data0 = train_df.copy()
data1 = test_df.copy()

# Stack the training features (label column removed) on top of the test rows.
# Built directly from data0 rather than from `train_x`, which is not assigned
# until a later cell and therefore raised NameError on a fresh kernel.
data = pd.concat([data0.drop(columns='Survived'), data1], axis=0, ignore_index=True)

In [12]:
# Remove the Ticket, Cabin and Name columns from data0 in place.
data0.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)

In [20]:
# Remove the PassengerId column from data0 in place.
data0.drop(columns='PassengerId', inplace=True)

In [13]:
# Mean of the Age column in data0 (pandas skips missing values by default).
data0.loc[:, 'Age'].mean()

29.69911764705882

In [14]:
# Missing-value handling: fill absent ages with 30 (the column mean is about 29.7).
# The result is assigned back instead of calling fillna(inplace=True) on the
# data0['Age'] selection: that chained in-place form is deprecated in pandas
# and leaves data0 unchanged once Copy-on-Write is active.
data0['Age'] = data0['Age'].fillna(30)

In [15]:
# Turn the string columns Sex and Embarked of data0 into integer codes,
# using a separate LabelEncoder for each column.
f_names = ['Sex', 'Embarked']
for col_name in f_names:
    data0[col_name] = preprocessing.LabelEncoder().fit_transform(data0[col_name])

In [21]:
# Separate the feature columns from the Survived label.
train_x = data0.drop(columns=['Survived'])
train_y = data0.Survived

In [3]:
# Feature engineering: partition the columns of `data` by dtype.
# Columns with dtype 'object' are treated as categorical, everything else as numeric.
dtypes = data.dtypes
cate_cols = [name for name, kind in dtypes.items() if kind == 'object']
num_cols = [name for name, kind in dtypes.items() if not (kind == 'object')]

print(cate_cols)
print(num_cols)

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [4]:
# Remove the Ticket, Cabin and Name columns from the combined frame in place.
data.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)

In [5]:
# Mean of the Age column in the combined frame (missing values are skipped by default).
data.loc[:, 'Age'].mean()

29.881137667304014

In [6]:
# Missing-value handling: fill absent ages in the combined frame with 30
# (the column mean is about 29.9).
# The result is assigned back instead of calling fillna(inplace=True) on the
# data['Age'] selection: that chained in-place form is deprecated in pandas
# and leaves `data` unchanged once Copy-on-Write is active.
data['Age'] = data['Age'].fillna(30)

In [7]:
# Turn the string columns Sex and Embarked of the combined frame into integer
# codes, using a separate LabelEncoder for each column.
f_names = ['Sex', 'Embarked']
for col_name in f_names:
    data[col_name] = preprocessing.LabelEncoder().fit_transform(data[col_name])

In [9]:
# Cut the combined frame back into its training and test parts by row position:
# the first n_train rows are the ones that came from the training data.
n_train = len(data0)
train_features = data.iloc[:n_train]
test_features = data.iloc[n_train:]
train_labels = train_y

# Report the shape of each piece.
for frame in (train_features, test_features, train_x, train_y):
    print(frame.shape)

(891, 8)
(418, 8)
(891, 11)
(891,)


In [148]:
# AdaBoost ensemble over scikit-learn's default weak learner.
# `base_estimator=None` and `algorithm='SAMME.R'` are no longer passed: both
# equal the defaults on the releases that accept them, while newer
# scikit-learn releases reject them (`base_estimator` was renamed to
# `estimator`, and the 'SAMME.R' option was removed).
# NOTE: on releases without SAMME.R the fit uses SAMME, so scores may differ
# from the outputs recorded in this notebook.
clf = AdaBoostClassifier(
    n_estimators=1000,
    learning_rate=0.7,
    random_state=0,  # fixed seed so repeated fits give the same model
)

In [149]:
# Fit the ensemble on the training features and labels.
clf.fit(X=train_x, y=train_y)

AdaBoostClassifier(learning_rate=0.7, n_estimators=1000, random_state=0)

In [74]:
# Preview the first five rows of the training features.
train_x.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,2
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,2
3,1,0,35.0,1,0,53.1,2
4,3,1,35.0,0,0,8.05,2


In [150]:
# Importance the fitted ensemble assigns to each feature, one value per column of train_x.
clf.feature_importances_ 

array([0.009, 0.014, 0.177, 0.01 , 0.007, 0.77 , 0.013])

In [151]:
# Accuracy on the same rows the model was fitted on (training accuracy, not a held-out estimate).
clf.score(X=train_x, y=train_y)

0.8630751964085297

In [129]:
# Load the sample submission file; it serves as the template for the output table.
sub_data = pd.read_csv(filepath_or_buffer='gender_submission.csv')

In [130]:
# Preview the first five rows of the test frame.
data1.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [131]:
# Remove the Ticket, Cabin and Name columns from the test frame in place.
data1.drop(columns=['Ticket', 'Cabin', 'Name'], inplace=True)

In [132]:
# Mean of the Age column in the test frame (missing values are skipped by default).
data1.loc[:, 'Age'].mean()

30.272590361445783

In [133]:
# Missing-value handling: fill absent ages in the test frame with 30, the same
# value used for the training frame.
# The result is assigned back instead of calling fillna(inplace=True) on the
# data1['Age'] selection: that chained in-place form is deprecated in pandas
# and leaves data1 unchanged once Copy-on-Write is active.
data1['Age'] = data1['Age'].fillna(30)

In [134]:
# Encode the string columns Sex and Embarked of the test frame as integers.
# Each encoder is fitted on the untouched training frame (train_df), so the
# test codes come from the training label set rather than from whichever
# labels happen to occur in the test file. A test label that never appears in
# training makes transform() raise ValueError instead of being silently
# assigned a code.
f_names = ['Sex','Embarked']
for x in f_names:
    label = preprocessing.LabelEncoder()
    label.fit(train_df[x])
    data1[x] = label.transform(data1[x])

In [135]:
# Count the missing values remaining in each column of the test frame.
data1.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           1
Embarked       0
dtype: int64

In [136]:
# Replace missing Fare values in the test frame with 0 so the classifier receives no NaN.
# The result is assigned back instead of calling fillna(inplace=True) on the
# data1['Fare'] selection: that chained in-place form is deprecated in pandas
# and leaves data1 unchanged once Copy-on-Write is active.
data1['Fare'] = data1['Fare'].fillna(0)

In [137]:
# Re-count missing values per column to confirm none are left.
data1.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [138]:
# Preview the first five rows of the test frame after cleaning and encoding.
data1.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,1
1,893,3,0,47.0,1,0,7.0,2
2,894,2,1,62.0,0,0,9.6875,1
3,895,3,1,27.0,0,0,8.6625,2
4,896,3,0,22.0,1,1,12.2875,2


In [139]:
# Remove the PassengerId column so the test frame has the same columns as train_x.
data1.drop(columns='PassengerId', inplace=True)

In [152]:
# Preview the first five rows of the test frame as it will be passed to the classifier.
data1.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


In [153]:
# Predict a class label for every test row and flatten the result to a 1-D array.
preds = np.ravel(clf.predict(data1))

In [154]:
# Display the predicted labels for the test rows.
preds

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [155]:
# Overwrite the template's Survived column with the model's predictions.
# The flattened array is assigned directly rather than wrapped in a pd.Series:
# Series assignment aligns on the index and silently writes NaN for any row
# without a match, whereas an array of the wrong length raises ValueError.
sub_data['Survived'] = np.ravel(preds)

In [156]:
# Display the submission table (PassengerId alongside the predicted Survived value).
sub_data

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [157]:
# Write the submission table to disk, leaving out the DataFrame index.
submission = sub_data
submission.to_csv(path_or_buf='submission_v7.csv', index=False)