## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [250]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier as XGBclr
from xgboost import XGBRegressor as XGBregr
import numpy as np
import pandas as pd
import copy
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_curve


In [251]:
# Read the Titanic training CSV, then split it into the label series and the feature frame.
data_path = 'data/'
titanic_raw = pd.read_csv(data_path + 'titanic_train.csv')

# 'Survived' is the prediction target; it and 'PassengerId' are removed from the features.
df_T = titanic_raw['Survived']
df = titanic_raw.drop(columns=['PassengerId', 'Survived'])
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [260]:
# Both categorical and numeric features are kept, so only the simplest feature engineering is
# applied: fill missing values with -1, label-encode object columns, min-max scale every column.
# NOTE(review): the scaler and encoder are fitted on all rows, before the train/test split
# performed in a later cell, so hold-out rows influence the scaling.
LEncoder = LabelEncoder()
MMEncoder = MinMaxScaler()
for col_name in df.columns:
    filled = df[col_name].fillna(-1)
    col_values = filled.values
    if filled.dtype == 'object':
        # Text columns become integer codes before scaling.
        col_values = LEncoder.fit_transform(list(col_values))
    # The scaler expects a 2-D input, hence the single-column reshape.
    df[col_name] = MMEncoder.fit_transform(col_values.reshape(-1, 1))
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.121348,1.0,0.275,0.125,0.0,0.769118,0.014151,0.0,1.0
1,0.0,0.213483,0.0,0.475,0.125,0.0,0.876471,0.139136,0.557823,0.333333
2,1.0,0.396629,0.0,0.325,0.0,0.0,0.983824,0.015469,0.0,1.0
3,0.0,0.305618,0.0,0.4375,0.125,0.0,0.072059,0.103644,0.380952,1.0
4,1.0,0.016854,1.0,0.4375,0.0,0.0,0.694118,0.015713,0.0,1.0


In [261]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the split repeatable.
df_D = df.values
train_X, test_X, train_Y, test_Y = train_test_split(
    df_D,
    df_T,
    test_size=0.25,
    random_state=42,
)

In [262]:
# Baseline XGBoost classifier: logistic objective, log-loss evaluation metric, fixed seed.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 7,
}
clf = XGBclr(**xgb_params)
# Bare expression: the notebook echoes the XGBclr class itself, not the configured `clf`.
XGBclr

xgboost.sklearn.XGBClassifier

In [263]:
# Fit the baseline on the training split and predict labels for the hold-out split.
# fit() hands back the fitted estimator, so training and prediction are chained.
Pred_Y = clf.fit(train_X, train_Y).predict(test_X)



In [264]:
# Baseline with the default hyper-parameters: accuracy of the hold-out predictions (Pred_Y)
# produced by the previous cell, about 0.785 in the recorded run.
# accuracy_score is reached through the imported `metrics` module because the bare name is
# never imported in this notebook and would raise NameError on a fresh kernel.
# The model is not refitted here: it was already fitted when Pred_Y was computed.
acc = metrics.accuracy_score(test_Y, Pred_Y)
print("Accuracy: ", acc)

Accuracy:  0.7847533632286996


In [265]:
# Candidate values 1..19 for the two searched hyper-parameters
# (num_1 is used as n_estimators and num_2 as max_depth in the grid-search cell).
num_1 = list(range(1, 20))
num_2 = list(range(1, 20))

In [266]:
# Hyper-parameter grid: every (n_estimators, max_depth) pair drawn from num_1 x num_2.
n_estimators = num_1
max_depth = num_2
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

# Build the search object from the model and the parameter grid
# (n_jobs=-1 runs the fits in parallel on every CPU core).
# cv is pinned to 3 folds so the search matches the recorded run on any scikit-learn version;
# left unset, scikit-learn >= 0.22 would use 5 folds instead.
grid_search = GridSearchCV(clf, param_grid, scoring="accuracy", cv=3, n_jobs=-1, verbose=1)

# Run the search for the best parameters.
grid_result = grid_search.fit(train_X, train_Y)

# 19 x 19 = 361 parameter combinations, each cross-validated on 3 folds: 1083 model fits in total.

Fitting 3 folds for each of 361 candidates, totalling 1083 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 1083 out of 1083 | elapsed:    6.7s finished


In [267]:
# Report the best mean cross-validated accuracy and the parameter combination that achieved it.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best Accuracy: %f using %s" % best_summary)

Best Accuracy: 0.818862 using {'max_depth': 9, 'n_estimators': 17}
