# 作業 : (Kaggle)鐵達尼生存預測
https://www.kaggle.com/c/titanic

# [作業目標]
- 試著模仿範例寫法, 在鐵達尼生存預測中, 觀察計數編碼與特徵雜湊的效果

# [作業重點]
- 仿造範例, 完成計數編碼以及搭配邏輯斯迴歸的預測 (In[4], Out[4], In[5], Out[5]) 
- 仿造範例, 完成雜湊編碼, 以及計數編碼+雜湊編碼 搭配邏輯斯迴歸的預測 (In[6], Out[6], In[7], Out[7]) 
- 試著回答上述執行結果的觀察

# 作業1
* 參考範例，將鐵達尼的艙位代碼( 'Cabin' )欄位使用特徵雜湊 / 標籤編碼 / 目標均值編碼三種轉換後，  
與其他類別型欄位一起預估生存機率

In [1]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

data_path = 'data/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

train_Y = df_train['Survived']
ids = df_test['PassengerId']
df_train = df_train.drop(['PassengerId', 'Survived'] , axis=1)
df_test = df_test.drop(['PassengerId'] , axis=1)
df = pd.concat([df_train,df_test])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
#只取類別值 (object) 型欄位, 存於 object_features 中
object_features = []
for dtype, feature in zip(df.dtypes, df.columns):
    if dtype == 'object':
        object_features.append(feature)
print(f'{len(object_features)} Object Features : {object_features}\n')

# 只留類別型欄位
df = df[object_features]
df = df.fillna('None')
train_num = train_Y.shape[0]
df.head()

5 Object Features : ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']



Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


In [7]:
df.dropna().head(10)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_count
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S,1014
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,2
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S,1014
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,2
4,"Allen, Mr. William Henry",male,373450,,S,1014
5,"Moran, Mr. James",male,330877,,Q,1014
6,"McCarthy, Mr. Timothy J",male,17463,E46,S,2
7,"Palsson, Master. Gosta Leonard",male,349909,,S,1014
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,347742,,S,1014
9,"Nasser, Mrs. Nicholas (Adele Achem)",female,237736,,C,1014


# 作業2
* 承上題，三者比較效果何者最好?

In [3]:
# 對照組 : 標籤編碼 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in df.columns:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()

0.780004837244799


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,155,1,720,185,3
1,286,0,816,106,0
2,523,0,914,185,3
3,422,0,65,70,3
4,22,1,649,185,3


In [3]:
# 'Cabin'計數編碼 + 邏輯斯迴歸
cabin_count = df.groupby(["Cabin"])["Name"].agg({'Cabin_count':'size'}).reset_index()
df = pd.merge(df , cabin_count , on=["Cabin"] , how = "left")
cabin_count.sort_values(by=['Cabin'], ascending=False).head(10)

is deprecated and will be removed in a future version
  


Unnamed: 0,Cabin,Cabin_count
186,T,1
185,,1014
184,G6,5
183,F4,4
182,F38,1
181,F33,4
180,F2,4
179,F G73,2
178,F G63,2
177,F E69,1


In [14]:
df_temp = pd.DataFrame()
for c in object_features:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp["Cabin_count"] = df["Cabin_count"]
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator , train_X , train_Y , cv=5).mean())

0.7856230275549181




In [17]:
# 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in object_features:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp["Cabin_hash"] = df["Cabin"].map(lambda x:hash(x)%8)
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()

0.7766465339403296




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_hash
0,155,1,720,185,3,7
1,286,0,816,106,0,1
2,523,0,914,185,3,7
3,422,0,65,70,3,6
4,22,1,649,185,3,7


In [18]:
# 'Cabin'計數編碼 + 'Cabin'特徵雜湊 + 邏輯斯迴歸
df_temp = pd.DataFrame()
for c in object_features:
    df_temp[c] = LabelEncoder().fit_transform(df[c])
df_temp['Cabin_hash'] = df['Cabin'].map(lambda x:hash(x) % 18)
df_temp['Cabin_count'] = df['Cabin_count']
train_X = df_temp[:train_num]
estimator = LogisticRegression()
print(cross_val_score(estimator, train_X, train_Y, cv=5).mean())
df_temp.head()

0.7845372363231691




Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,Cabin_hash,Cabin_count
0,155,1,720,185,3,5,1014
1,286,0,816,106,0,5,2
2,523,0,914,185,3,5,1014
3,422,0,65,70,3,4,2
4,22,1,649,185,3,5,1014


In [16]:
for i in df['Cabin']:
    if i ==None
    df.drop
# df.drop(["Cabin"]=="None", axis = 0)

None
C85
None
C123
None
None
E46
None
None
None
G6
C103
None
None
None
None
None
None
None
None
None
D56
None
A6
None
None
None
C23 C25 C27
None
None
None
B78
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
D33
None
B30
C52
None
None
None
None
None
B28
C83
None
None
None
F33
None
None
None
None
None
None
None
None
F G73
None
None
None
None
None
None
None
None
None
None
None
None
C23 C25 C27
None
None
None
E31
None
None
None
A5
D10 D12
None
None
None
None
D26
None
None
None
None
None
None
None
C110
None
None
None
None
None
None
None
B58 B60
None
None
None
None
E101
D26
None
None
None
F E69
None
None
None
None
None
None
None
D47
C123
None
B86
None
None
None
None
None
None
None
None
F2
None
None
C2
None
None
None
None
None
None
None
None
None
None
None
None
None
None
E33
None
None
None
B19
None
None
None
A7
None
None
C49
None
None
None
None
None
F4
None
A32
None
None
None
None
None
None
None
F2
B4
B80
None
None
None
None
None
None
None
N

In [None]:
df.drop