資料預處理  
1. 找出缺失值
2. 填補缺失值
    1. 數值形缺失值 : 補中位數
    2. 類別形缺失值 : 捕最常出現的值
3. 針對"類別形"的資料 還要多做一個處理 (one hot encoding)
4. 資料性質相近 可以加在一起 / 資料太無意義 可以捨去
5. 特徵縮放 : 不同的特徵值可能數值上差異非常大 (但資料重要性跟數值大小無關 所以進行縮放)

資料預處理 - DataFrame : train, test, total 

In [1]:
import pandas as pd


# train_data & test_data
train = pd.read_csv("D:\Learn\Machine_Learn\_Titanic_train.csv", encoding="utf-8")
test = pd.read_csv("D:\Learn\Machine_Learn\_Titanic_test.csv", encoding="utf-8")

# total 去除ID, 生存 欄位
total = pd.concat([train, test], axis=0)
total = total.drop(["PassengerId", "Survived"], axis=1)

train.columns, test.columns, total.columns

(Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'],
       dtype='object'),
 Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
        'Cabin', 'Embarked'],
       dtype='object'))

資料預處理 - Cabin欄位

In [2]:
# 檢查 total 含有 NA 欄位
total.isna().sum()

# 查看 Cabin 大概數值 -> 第一字母可能代表 上下夾層順序
total["Cabin"].value_counts()

# 目標: 保留第一字母 如NA則繼續維持NA
# 定義流程
def cabin_check(column_data):
    if pd.isna(column_data):
        return column_data
    else:
        return column_data[0]

# apply
total["Cabin"] = total["Cabin"].apply(cabin_check)

# 檢查
total["Cabin"].value_counts()

C    94
B    65
D    46
E    41
A    22
F    21
G     5
T     1
Name: Cabin, dtype: int64

資料預處理 - Ticket欄位

In [3]:
# Ticket 查看是否多人共同持有一張票
c = total["Ticket"].value_counts()

def ticket_check(t):
    if pd.isna(t):
        return t
    else:
        return c[t]

total["Ticket"] = total["Ticket"].apply(ticket_check)

total

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,1,7.2500,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,2,71.2833,C,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,1,7.9250,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,2,53.1000,C,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,1,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,,0,0,1,8.0500,,S
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,3,108.9000,C,C
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,1,7.2500,,S
416,3,"Ware, Mr. Frederick",male,,0,0,1,8.0500,,S


資料預處理 - Name欄位

In [4]:
# 名稱中的稱謂
def name(n):
    if pd.isna(n):
        return n
    else:
        mid = n.split(".")[0].split(",")[-1]
        return mid

total["Name"] = total["Name"].apply(name)
total

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,Mr,male,22.0,1,0,1,7.2500,,S
1,1,Mrs,female,38.0,1,0,2,71.2833,C,C
2,3,Miss,female,26.0,0,0,1,7.9250,,S
3,1,Mrs,female,35.0,1,0,2,53.1000,C,S
4,3,Mr,male,35.0,0,0,1,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
413,3,Mr,male,,0,0,1,8.0500,,S
414,1,Dona,female,39.0,0,0,3,108.9000,C,C
415,3,Mr,male,38.5,0,0,1,7.2500,,S
416,3,Mr,male,,0,0,1,8.0500,,S


填補缺失值 - Age欄位

In [5]:
# 找出中位數
age_median = total["Age"].median()

# 將空缺值填入中位數
total["Age"] = total["Age"].fillna(age_median)

# 檢查
total.isna().sum()

Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

填補缺失值 - Fare欄位

In [6]:
fare_median = total["Fare"].median()
total["Fare"] = total["Fare"].fillna(fare_median)
total.isna().sum()

Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin       1014
Embarked       2
dtype: int64

填補缺失值 - Embarked欄位

In [7]:
# 找最常出現的值
total["Embarked"].value_counts()   # S最常出現

embarked_most = total["Embarked"].value_counts().idxmax()   # 找出數量最大的idx

# 填入缺失值
total["Embarked"] = total["Embarked"].fillna(embarked_most)

# 檢查
total.isna().sum()

Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin       1014
Embarked       0
dtype: int64

類別形資料轉換one hot encoding - Name 欄位

In [8]:
# 姓名種類過多 直接one hot encoding 可能會出現維度災難
name_more_than_10 = total["Name"].value_counts()

# 只想留 數量大於10的稱謂
name_reserved = name_more_than_10[name_more_than_10 > 10].index

# 製作流程
def name_check_2(column_data):
    if column_data in name_reserved:
        return column_data
    else:
        return None

# apply
total["Name"] = total["Name"].apply(name_check_2)

# 檢查
total["Name"].value_counts()

 Mr        757
 Miss      260
 Mrs       197
 Master     61
Name: Name, dtype: int64

類別形資料轉換one hot encoding - Pclass 欄位

In [9]:
total = pd.get_dummies(total, columns=["Pclass"])

類別形資料轉換one hot encoding - 所有欄位

In [10]:
# 自動判斷字串資料 然後對它one hot encoding
total = pd.get_dummies(total)

total

Unnamed: 0,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Name_ Master,Name_ Miss,...,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,22.0,1,0,1,7.2500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,38.0,1,0,2,71.2833,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,26.0,0,0,1,7.9250,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
3,35.0,1,0,2,53.1000,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
4,35.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
414,39.0,0,0,3,108.9000,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
415,38.5,0,0,1,7.2500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
416,28.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


新增欄位 - Family

In [11]:
total["Family"] = total["SibSp"] + total["Parch"]
total

Unnamed: 0,Age,SibSp,Parch,Ticket,Fare,Pclass_1,Pclass_2,Pclass_3,Name_ Master,Name_ Miss,...,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Embarked_C,Embarked_Q,Embarked_S,Family
0,22.0,1,0,1,7.2500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
1,38.0,1,0,2,71.2833,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2,26.0,0,0,1,7.9250,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
3,35.0,1,0,2,53.1000,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1
4,35.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,28.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
414,39.0,0,0,3,108.9000,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
415,38.5,0,0,1,7.2500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
416,28.0,0,0,1,8.0500,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


特徵縮放

In [12]:
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()

# fir_transform
total_scaled = scaler.fit_transform(total)

# 將array資料 放回 DF
total_scaled = pd.DataFrame(total_scaled)

# 檢查
total_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.273456,0.125,0.000000,0.0,0.014151,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1
1,0.473882,0.125,0.000000,0.1,0.139136,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.1
2,0.323563,0.000,0.000000,0.0,0.015469,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.436302,0.125,0.000000,0.1,0.103644,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1
4,0.436302,0.000,0.000000,0.0,0.015713,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,0.348616,0.000,0.000000,0.0,0.015713,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1305,0.486409,0.000,0.000000,0.2,0.212559,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1306,0.480145,0.000,0.000000,0.0,0.014151,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1307,0.348616,0.000,0.000000,0.0,0.015713,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
import numpy as np


# 將特徵縮放後的資料 重新區分為 x_train, y_train
x = np.array(total_scaled.iloc[:len(train)])
y = np.array(train["Survived"])


AttributeError: 'numpy.ndarray' object has no attribute 'encode'

K近鄰分析模型 & 交叉驗證

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


# 創立模型 & 假設近鄰有5個
clf = KNeighborsClassifier(n_neighbors=5)

# 使用交叉驗證
scores = cross_val_score(clf, x, y, cv=10, n_jobs=-1).encode("ascii", "ignore")
# sklearn 分類器 (本案 K近鄰)
# x 訓練集
# y 訓練答案
# cv 交叉驗證的分組 or 次數
# n_jobs 使用CPU核心數量 -1 使用系統上限數量
print(np.average(scores))

隨機森林模型 & 交叉驗證

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


params = {
    # 40種
    "n_estimators" : range(21, 101, 2),
    # 10種
    "max_depth" :  range(6,16)
}

clf = RandomForestClassifier()
search = GridSearchCV(clf, params, cv=10, n_jobs=-1)

search.fit(x, y)