In [104]:
# self baseline 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 集成学习
from sklearn import ensemble

from sklearn import model_selection
from sklearn import preprocessing

# 线性回归和逻辑回归
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression



In [11]:
# 读取训练集和测试并进行连接
train_data = pd.read_csv('./dataset/train.csv')
test_data = pd.read_csv('./dataset/test.csv')

train_lable = train_data['Survived']

all_data = pd.concat([train_data,test_data], ignore_index = True)
# 特征工程前删掉标签
all_data = all_data.drop(['Survived', 'PassengerId'], axis=1)

In [12]:
all_data.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Ticket      1309 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 102.3+ KB


### 2. 特征工程

In [13]:
# ========================= 1. Embarked 登船口 1307 =========================

# all_data['Embarked'].value_counts()
# S    914
# C    270
# Q    123

# 获取众数， mode()返回的是一个DataFrame, 因为可能有多个众数
# iloc按行索引获得众数的值
embarked_mode = all_data['Embarked'].mode().iloc[0]
# 填充缺失值， inplace = True直接修改原数据
all_data['Embarked'].fillna(embarked_mode, inplace = True)

In [15]:
# ========================= 2. Name =========================

# x.split(',')[1].split('.')[0].strip() 
# 先按','分割取第二个，再按‘.’分割取第一个
# 如McGowan, Miss. Anna "Annie" 分割得到Miss
all_data['Title'] = all_data['Name'].apply(lambda x:x.split(',')[1].split('.')[0].strip())
all_data['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Ms                2
Major             2
Mlle              2
Sir               1
Dona              1
Jonkheer          1
Don               1
the Countess      1
Capt              1
Mme               1
Lady              1
Name: Title, dtype: int64

In [16]:
# 将原始的Title进行分类
Title_Dict = {}
Title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
Title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
Title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
Title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
Title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
Title_Dict.update(dict.fromkeys(['Master','Jonkheer'], 'Master'))
# 将数据进行映射
new_title = all_data['Title'].map(Title_Dict)
new_title.value_counts()

Mr         757
Miss       262
Mrs        200
Master      62
Officer     23
Royalty      5
Name: Title, dtype: int64

In [17]:
all_data['Title'] = new_title
all_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Title
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,Mr
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,Mrs
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,Miss
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,Mrs
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,Mr


In [36]:
# ================= 3.Fare (票价) ======================
all_data[all_data['Fare'].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Title
1043,60.5,,S,,"Storey, Mr. Thomas",0,3,male,0,3701,Mr


In [66]:

# all_data[(all_data['Embarked'] == 'S') & (all_data['Pclass'] == 3)]['Fare'].mean()
# groupby + transform 可以实现分组+计算+合并 
gb_mean = all_data.groupby(['Embarked','Pclass'])['Fare'].transform('mean')
all_data['Fare'].fillna(gb_mean, inplace = True)
all_data.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1309 non-null object
Fare        1309 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Ticket      1309 non-null object
Title       1309 non-null object
dtypes: float64(2), int64(3), object(6)
memory usage: 112.6+ KB


In [71]:
# ================= 4.Family ======================
all_data['Family_size'] = all_data['Parch'] + all_data['Parch'] + 1

def map_family_label(nums):
    if nums <= 1:
        return 'Single'
    elif nums <=3:
        return 'Mid'
    else:
        return 'Large'

all_data['Family_label'] = all_data['Family_size'].map(map_family_label)

all_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Title,Family_size,Family_label
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,Mr,1,Single
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,Mrs,1,Single
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,Miss,1,Single
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,Mrs,1,Single
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,Mr,1,Single


In [72]:
all_data['Family_label'].value_counts()

Single    1002
Mid        170
Large      137
Name: Family_label, dtype: int64

In [78]:
# ================= 5.Ticket ======================
# 处理团体票

all_data['Ticket_group'] = all_data.groupby('Ticket')['Ticket'].transform('count')
all_data['Fare_pre'] = all_data['Fare'] / all_data['Ticket_group']

all_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Title,Family_size,Family_label,Ticket_group,Fare_pre
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,Mr,1,Single,1,7.25
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,Mrs,1,Single,2,35.64165
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,Miss,1,Single,1,7.925
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,Mrs,1,Single,2,26.55
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,Mr,1,Single,1,8.05


In [80]:
# ================= 6.Cabin ======================

all_data['Cabin'].value_counts()

# 将有Cabin值的按照首字母分组，没有的则设置为U
all_data['Deck'] = all_data['Cabin'].apply(lambda x : 'U' if pd.isnull(x) else str(x)[0])

all_data['Deck'].value_counts()



U    1014
C      94
B      65
D      46
E      41
A      22
F      21
G       5
T       1
Name: Deck, dtype: int64

In [92]:
# ================= 7.age ======================

def fill_missing_age(train_age, test_age):
    # 数据处理
    train_x = train_age.drop(columns='Age')
    train_y = train_age['Age']
    test_x = test_age.drop(columns='Age')
    # 线性回归进行拟合
    linear_reg = LinearRegression()
    linear_reg.fit(train_x, train_y)
    # 线性回归进行预测
    test_x['Age'] = linear_reg.predict(test_x)
    return test_x


In [102]:

# 从原始df中提取需要的特征
missing_age_df = all_data[['Age','Embarked','Pclass','Sex', 'Title', 'Family_label', 'Deck', 'Fare_pre']] 
# 将部分特征进行展开
missing_age_df = pd.get_dummies(missing_age_df,columns=['Embarked','Pclass','Sex', 'Title', 'Family_label', 'Deck'])
# 将有Age的部分作为训练集，nan的部分作为测试集
missing_age_train = missing_age_df[missing_age_df['Age'].notnull()]
missing_age_test = missing_age_df[missing_age_df['Age'].isnull()]
# 年龄预测并填充
missing_age_predict = fill_missing_age(missing_age_train, missing_age_test)
all_data.loc[(all_data['Age'].isnull()),'Age'] = missing_age_predict['Age']

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
Int64Index: 263 entries, 5 to 1308
Data columns (total 28 columns):
Age                    0 non-null float64
Fare_pre               263 non-null float64
Embarked_C             263 non-null uint8
Embarked_Q             263 non-null uint8
Embarked_S             263 non-null uint8
Pclass_1               263 non-null uint8
Pclass_2               263 non-null uint8
Pclass_3               263 non-null uint8
Sex_female             263 non-null uint8
Sex_male               263 non-null uint8
Title_Master           263 non-null uint8
Title_Miss             263 non-null uint8
Title_Mr               263 non-null uint8
Title_Mrs              263 non-null uint8
Title_Officer          263 non-null uint8
Title_Royalty          263 non-null uint8
Family_label_Large     263 non-null uint8
Family_label_Mid       263 non-null uint8
Family_label_Single    263 non-null uint8
Deck_A                 263 non-null uint8
Deck_B                 263 non-null ui

In [103]:
all_data.info()

&lt;class &#39;pandas.core.frame.DataFrame&#39;&gt;
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 16 columns):
Age             1309 non-null float64
Cabin           295 non-null object
Embarked        1309 non-null object
Fare            1309 non-null float64
Name            1309 non-null object
Parch           1309 non-null int64
Pclass          1309 non-null int64
Sex             1309 non-null object
SibSp           1309 non-null int64
Ticket          1309 non-null object
Title           1309 non-null object
Family_size     1309 non-null int64
Family_label    1309 non-null object
Ticket_group    1309 non-null int64
Fare_pre        1309 non-null float64
Deck            1309 non-null object
dtypes: float64(3), int64(5), object(8)
memory usage: 163.7+ KB


###  3. 模型构建

In [110]:
# 数据处理

# Age', 'Fare_pre 进行归一化处理
normal_data = preprocessing.StandardScaler().fit(all_data[['Age', 'Fare_pre']])
all_data[['Age', 'Fare_pre']] = normal_data.transform(all_data[['Age', 'Fare_pre']])

all_data.head() 

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Ticket,Title,Family_size,Family_label,Ticket_group,Fare_pre,Deck
0,-0.582586,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,A/5 21171,Mr,1,Single,1,-0.554243,U
1,0.610005,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,PC 17599,Mrs,1,Single,2,1.541811,C
2,-0.284438,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,STON/O2. 3101282,Miss,1,Single,1,-0.50441,U
3,0.386394,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,113803,Mrs,1,Single,2,0.870607,C
4,0.386394,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,373450,Mr,1,Single,1,-0.495182,U


In [113]:

all_data = all_data[['Age','Embarked','Pclass','Sex', 'Title', 'Family_label', 'Deck', 'Fare_pre']] 
# 将部分特征进行展开
all_data = pd.get_dummies(all_data,columns=['Embarked','Pclass','Sex', 'Title', 'Family_label', 'Deck'])

In [114]:
# 将数据集进行分割
train_data_x = all_data[:891]
train_data_y = train_lable
test_data_x = all_data[891:]

In [123]:
rf_cls = ensemble.RandomForestClassifier(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

rf_cls_grid = model_selection.GridSearchCV(rf_cls, param_grid, n_jobs=25, cv=10, scoring = 'roc_auc',verbose=1)
rf_cls_grid.fit(train_data_x, train_data_y)


Fitting 10 folds for each of 60 candidates, totalling 600 fits
[Parallel(n_jobs=25)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=25)]: Done 150 tasks      | elapsed:   19.4s
[Parallel(n_jobs=25)]: Done 400 tasks      | elapsed:   44.5s
[Parallel(n_jobs=25)]: Done 600 out of 600 | elapsed:  1.1min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             n_jobs=25,
             param_grid={&#39;criterion&#39;: [&#39;gini&#39;, &#39;entropy&#39;],
                         &#39;max_depth&#39;: [4, 5, 6, 7, 8],
                         &#39;max_features&#39;: [&#39;auto&#39;, &#39;sqrt&#39;, &#39;log2&#39;],
                         &#39;n_estimators&#39;: [200, 500]},
             scoring=&#39;roc_auc&#39;, verbose=1)

In [125]:
submit = rf_cls_grid.predict(test_data_x)



In [132]:
submit_df = pd.DataFrame(submit)

test_df = pd.read_csv('dataset/test.csv')
index_df = test_df[['PassengerId']]

index_df['Survived'] = submit_df

# submit_df = pd.concat([index_df, submit_df], axis=1)

index_df.to_csv('./submit/test1.csv', index=False)