# import 資料集

In [1]:
import pandas as pd
train = pd.read_csv("/kaggle/input/titanic/train.csv", encoding="utf-8")
test = pd.read_csv("/kaggle/input/titanic/test.csv", encoding="utf-8")

In [2]:
# check df
train.columns.tolist(), test.columns.tolist()

(['PassengerId',
  'Survived',
  'Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked'],
 ['PassengerId',
  'Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked'])

# 資料預處理
1. 處理缺失值
    1. 數值型 -> 填補中位數
    2. 類別型 -> 填補最常出現的類別
2. One-Hot
3. 萃取更多欄位資訊

In [3]:
total = pd.concat( [train, test], axis=0 )
total = total.drop( ["PassengerId", "Survived"], axis=1 )
total

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [4]:
# Cabin
def cabin_preprocess(c):
    if pd.isna(c):
        return c
    else:
        return c[0]

total["Cabin"].apply(cabin_preprocess)
total["Cabin"] = total["Cabin"].apply(cabin_preprocess)

In [5]:
# Ticket
ticket_count = total["Ticket"].value_counts()   # 多少人持有同一張票類

def ticket_preprocess(t):
    if pd.isna(t):
        return t
    else:
        return ticket_count[t]

total["Ticket"].apply(ticket_preprocess)
total["Ticket"] = total["Ticket"].apply(ticket_preprocess)

In [6]:
# Name
total["Name"].isna().value_counts()   # 確認姓名無空值

def name_preprocess(n):
    return n.split(".")[0].split(",")[-1]
    

total["Name"].apply(name_preprocess)
total["Name"] = total["Name"].apply(name_preprocess)

In [7]:
# 找尋數值類別的缺失值 -> 填補中位數
total.median()   # Pclass 是不是 數值型類別 可再研究 (這邊先當類別型)
median = total.median().drop("Pclass")

total.fillna(median)
total = total.fillna(median)

  total.median()   # Pclass 是不是 數值型類別 可再研究 (這邊先當類別型)
  median = total.median().drop("Pclass")


In [8]:
# Embarked
# 類別型 -> 填補最常出現的值
total["Embarked"].value_counts()
most = total["Embarked"].value_counts().idxmax()

total["Embarked"].fillna(most)
total["Embarked"] = total["Embarked"].fillna(most)

In [9]:
# 類別型資料 需額外再處理 -> One hot encoding
# 剛剛 drop 的 "Pclass" 有必要做嗎? (需實際測試過, 才知道效果)
# Sex 二值型資料 可做可不做

In [10]:
# Name -> One hot encoding
name_count = total["Name"].value_counts()
name_count[ name_count > 10 ]

name_reserved = name_count[ name_count > 10 ].index

def name_onehot(n):
    if n in name_reserved:
        return n
    else:
        return None

total["Name"].apply(name_onehot)
total["Name"] = total["Name"].apply(name_onehot)

# One hot encoding
total = pd.get_dummies(total)   # 一次對所有字串類的資料 one hot encoding

In [11]:
# Pclass -> One hot encoding
total = pd.get_dummies(total, columns=["Pclass"])   # Pclass資料型態為數字, 額外one hot

total

Unnamed: 0,Age,SibSp,Parch,Ticket,Fare,Name_ Master,Name_ Miss,Name_ Mr,Name_ Mrs,Sex_female,...,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,22.0,1,0,1,7.2500,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
1,38.0,1,0,2,71.2833,0,0,0,1,1,...,0,0,0,0,1,0,0,1,0,0
2,26.0,0,0,1,7.9250,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,35.0,1,0,2,53.1000,0,0,0,1,1,...,0,0,0,0,0,0,1,1,0,0
4,35.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
414,39.0,0,0,3,108.9000,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
415,38.5,0,0,1,7.2500,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
416,28.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [12]:
# 額外增加Family欄位 -> 避免決策樹 僅單獨考慮 SibSp or ParCh 而沒有考慮到家人總數
total["Family"] = total["SibSp"] + total["Parch"]

In [13]:
# 資料分類
import numpy as np
x = np.array(total.iloc[ :len(train) ])   # x_train
y = np.array(train["Survived"])   # y_train

x_predict = np.array(total.iloc[ len(train): ])

x.shape, y.shape, x_predict.shape

((891, 26), (891,), (418, 26))

In [14]:
# 確認所有欄位有無空值
total.isna().sum()

total.isna().sum() [total.isna().sum() != 0]

Series([], dtype: int64)

# 額外補充
1. Pandas 篩選操作
2. loc vs iloc
3. 資料分類方式

In [15]:
# Pandas 篩選操作
test_df = pd.DataFrame([
    [1, 2],
    [3, 4],
    [5, 6]
])

test_df[ [True, False, True] ]

Unnamed: 0,0,1
0,1,2
2,5,6


In [16]:
# loc vs iloc
test_df = pd.DataFrame([
    [1, 2],
    [3, 4]
], index=[0, 0])

test_df.loc[0]   # 會將兩列都取出 (因兩列的index都為0)
test_df.iloc[0]   # 僅取出index為0的第一列

0    1
1    2
Name: 0, dtype: int64

In [17]:
# 資料分類方式
# 分兩份 -> train / test   -> 依照test結果, 調整模型參數

# 分三份 -> train / test / valid   -> 同上, 但最後用完全沒看過的 valid 驗證模型