# 作業 : (Kaggle)鐵達尼生存預測
https://www.kaggle.com/c/titanic

# [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀察計數編碼與特徵雜湊的效果

# [作業重點]
- 仿造範例, 完成計數編碼以及搭配邏輯斯迴歸的預測 (In[4], Out[4], In[5], Out[5]) 
- 仿造範例, 完成雜湊編碼, 以及計數編碼+雜湊編碼 搭配邏輯斯迴歸的預測 (In[6], Out[6], In[7], Out[7]) 
- 試著回答上述執行結果的觀察

# 作業1
* 參考範例，將鐵達尼的艙位代碼( 'Cabin' )欄位使用特徵雜湊 / 標籤編碼 / 計數編碼三種轉換後，  
與其他數值型欄位一起預估生存機率

In [37]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Read the Titanic train/test CSV files from the course data folder.
data_path = '../data/Part02/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

# Set the target column and the test passenger ids aside before they are
# removed from the feature frames.
train_Y = df_train['Survived']
ids = df_test['PassengerId']

# Remove the identifier (and, for the training frame, the label) so that only
# feature columns remain.
df_train = df_train.drop(columns=['PassengerId', 'Survived'])
df_test = df_test.drop(columns=['PassengerId'])


In [38]:
# Preview the first five rows of the training features.
df_train.head(5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [39]:
# Preview the first five rows of the test features.
df_test.head(5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [40]:
# Stack the training rows on top of the test rows in a single frame.
# ignore_index=True renumbers the combined rows 0..n-1 instead of keeping each
# frame's own row labels (both start at 0 and would otherwise repeat).
df = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=True)
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
# Partition the columns of df by dtype: 'object' columns are treated as
# categorical, every other column as numeric.
object_features = [name for name, kind in df.dtypes.items() if kind == 'object']
numeric_features = [name for name, kind in df.dtypes.items() if kind != 'object']

print(f'{len(object_features)} Object Features : {object_features}\n')
print(f'{len(numeric_features)} Numeric Features : {numeric_features}\n')

# Number of training rows; the first train_num rows of the combined frame
# are the training set because train was concatenated before test.
train_obj = train_num = train_Y.shape[0]

# Categorical part: missing values become the literal string 'None' so that
# they are encoded as a category of their own.
df_obj = df[object_features].fillna('None')

# Label-encode each categorical column independently (one encoder per column).
df_labenc = pd.DataFrame(
    {col: LabelEncoder().fit_transform(df_obj[col]) for col in object_features}
)

# Numeric part: fill missing values with the column mean.
df_num = df[numeric_features]
df_num = df_num.fillna(value=df_num.mean())

# Put encoded categoricals and numerics side by side. reset_index() adds an
# 'index' column holding the old row labels; a later cell drops it again.
df2 = pd.concat([df_labenc, df_num], axis=1).reset_index()
df2.head()

5 Object Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

5 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']



Unnamed: 0,index,Name,Sex,Ticket,Cabin,Embarked,Pclass,Age,SibSp,Parch,Fare
0,0,155,1,720,185,3,3,22.0,1,0,7.25
1,1,286,0,816,106,0,1,38.0,1,0,71.2833
2,2,523,0,914,185,3,3,26.0,0,0,7.925
3,3,422,0,65,70,3,1,35.0,1,0,53.1
4,4,22,1,649,185,3,3,35.0,0,0,8.05


In [45]:
# Count the distinct (non-null) values in each object-dtype column.
df.select_dtypes(include=["object"]).nunique()

Name        1307
Sex            2
Ticket       929
Cabin        186
Embarked       3
dtype: int64

In [85]:
# Count encoding for 'Cabin': the number of rows in df2 that share each Cabin
# code. 'Name' only serves as a column to aggregate on; 'size' counts the rows
# of every group. The result is indexed by 'Cabin'.
cabin_count = df2.groupby(['Cabin'])['Name'].agg(Cabin_count='size')

# Attach the per-cabin count to every row (the merge key 'Cabin' is matched
# against the index level of cabin_count), then remove the 'index' column that
# reset_index() added when df2 was built.
df_cabin_count = (
    df2.merge(cabin_count, on=['Cabin'], how='left')
    .drop(columns='index')
)
df_cabin_count.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Pclass,Age,SibSp,Parch,Fare,Cabin_count
0,155,1,720,185,3,3,22.0,1,0,7.25,1014
1,286,0,816,106,0,1,38.0,1,0,71.2833,2
2,523,0,914,185,3,3,26.0,0,0,7.925,1014
3,422,0,65,70,3,1,35.0,1,0,53.1,2
4,22,1,649,185,3,3,35.0,0,0,8.05,1014


In [89]:
# Score the 'Cabin' count encoding together with all other features:
# 5-fold cross-validated logistic regression on the training rows only.
estimator = LogisticRegression(max_iter=2048, solver='lbfgs')
train_X = df_cabin_count.iloc[:train_num]
cv_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(f'cabin count, all features, logistic regression: {cv_scores.mean()}')
df_cabin_count.head()

cabin count, all features, logistic regression: 0.7901428015662176


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Pclass,Age,SibSp,Parch,Fare,Cabin_count
0,155,1,720,185,3,3,22.0,1,0,7.25,1014
1,286,0,816,106,0,1,38.0,1,0,71.2833,2
2,523,0,914,185,3,3,26.0,0,0,7.925,1014
3,422,0,65,70,3,1,35.0,1,0,53.1,2
4,22,1,649,185,3,3,35.0,0,0,8.05,1014


# 作業2
* 承上題，三者比較效果何者最好?

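> 'Cabin'計數編碼 + 'Cabin'特徵雜湊 + 邏輯斯迴歸 這組的分數最高 (約 0.78227114)，但與只使用 'Cabin'計數編碼 (約 0.78227100) 的差距極小，幾乎可視為相同；兩者皆略優於標籤編碼對照組 (約 0.77890)，而只使用 'Cabin'特徵雜湊 (約 0.77329) 的分數反而最低。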

In [93]:
# Control group: label-encode every categorical column (one encoder per
# column) and score those columns alone with logistic regression.
df_temp = pd.DataFrame(
    {c: LabelEncoder().fit_transform(df_obj[c]) for c in df_obj.columns}
)
train_X = df_temp.iloc[:train_num]
estimator = LogisticRegression(max_iter=2048, solver='lbfgs')
cv_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(cv_scores.mean())
df_temp.head()

0.7789000729487724


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [109]:
# Count encoding for 'Cabin' on the label-encoded frame: number of rows per
# Cabin code, returned as a regular two-column frame ('Cabin', 'Cabin_count').
cabin_count = (
    df_temp.groupby(['Cabin'])['Name']
    .agg(Cabin_count='size')
    .reset_index()
)

# Left-merge so that every row receives the count of its own Cabin code.
df_cabincount = df_temp.merge(cabin_count, on=['Cabin'], how='left')
df_cabincount.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_count
0,155,1,720,185,3,1014
1,286,0,816,106,0,2
2,523,0,914,185,3,1014
3,422,0,65,70,3,2
4,22,1,649,185,3,1014


In [110]:
# 'Cabin' count encoding + logistic regression.
# Reuses the `estimator` object created in an earlier cell.
train_X = df_cabincount.iloc[:train_num]
cv_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(f'Score is: {cv_scores.mean()}')
train_X.head()

Score is: 0.7822710013203125


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_count
0,155,1,720,185,3,1014
1,286,0,816,106,0,2
2,523,0,914,185,3,1014
3,422,0,65,70,3,2
4,22,1,649,185,3,1014


In [111]:
# 'Cabin' feature hashing + logistic regression.
# Start from the count-encoded frame without its 'Cabin_count' column and add
# 'Cabin_hash' = hash(Cabin) % 10, i.e. ten buckets.
# NOTE(review): 'Cabin' already holds integer label codes at this point, so
# hash() is applied to integers rather than to the raw cabin strings; in the
# displayed rows the bucket equals the code modulo 10 (185 -> 5, 106 -> 6).
df_cabinhash = df_cabincount.drop(columns='Cabin_count').assign(
    Cabin_hash=lambda frame: frame['Cabin'].map(lambda code: hash(code) % 10)
)

# Reuses the `estimator` object created in an earlier cell.
train_X = df_cabinhash.iloc[:train_num]
cv_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(f'Score is: {cv_scores.mean()}')
train_X.head()

Score is: 0.77329464956041


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_hash
0,155,1,720,185,3,5
1,286,0,816,106,0,6
2,523,0,914,185,3,5
3,422,0,65,70,3,0
4,22,1,649,185,3,5


In [117]:
# 'Cabin' count encoding + 'Cabin' feature hashing + logistic regression.
# Re-attach the per-cabin counts to the frame that already carries
# 'Cabin_hash', so both encodings are present side by side.
df_cabincounthash = df_cabinhash.merge(cabin_count, on=['Cabin'], how='left')

# Reuses the `estimator` object created in an earlier cell.
train_X = df_cabincounthash.iloc[:train_num]
cv_scores = cross_val_score(estimator, train_X, train_Y, cv=5)
print(f'Score is: {cv_scores.mean()}')
train_X.head()

Score is: 0.7822711431749987


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_hash,Cabin_count
0,155,1,720,185,3,5,1014
1,286,0,816,106,0,6,2
2,523,0,914,185,3,5,1014
3,422,0,65,70,3,0,2
4,22,1,649,185,3,5,1014
