# 20일차 - 모델 최적화

데이터셋 불러오기 및 훈련

In [1]:
from sklearn.datasets import load_iris

# Load the iris dataset once and expose its feature matrix / label vector
# under the short names used by the grid-search cells.
iris = load_iris()
x, y = iris.data, iris.target

# GridSearchCV
GridSearchCV를 사용해 의사결정나무의 최적 하이퍼파라미터를 탐색한다.

참고할 만한 자료: [The Hyperparameter Cheat Sheet](https://medium.com/swlh/the-hyperparameter-cheat-sheet-770f1fed32ff)

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Grid search over three decision-tree hyperparameters, scored with
# 6-fold cross-validation (cv=6).
gsc = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=dict(
        max_depth=list(range(4, 11)),  # 4, 5, ..., 10
        min_samples_split=[2, 5, 7, 10],
        max_leaf_nodes=[10, 20, 30, 40],
    ),
    cv=6,
)

In [3]:
# Run the search, then report the winning parameter set, its
# cross-validated score and the corresponding estimator.
gsc.fit(x, y)
for label, value in (
    ('param:', gsc.best_params_),
    ('score:', gsc.best_score_),
    ('estimator:', gsc.best_estimator_),
):
    print(label, value)

param: {'max_depth': 7, 'max_leaf_nodes': 20, 'min_samples_split': 2}
score: 0.9666666666666667
estimator: DecisionTreeClassifier(max_depth=7, max_leaf_nodes=20)


In [4]:
import pandas as pd 
# Show the per-candidate cross-validation results as a table
# (one column per key of cv_results_).
pd.DataFrame.from_dict(gsc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_leaf_nodes,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000997,0.000014,0.000000,0.000000,4,10,2,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,16
1,0.000500,0.000500,0.000171,0.000383,4,10,5,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,16
2,0.000335,0.000474,0.000000,0.000000,4,10,7,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,16
3,0.000342,0.000484,0.000168,0.000375,4,10,10,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,16
4,0.000337,0.000477,0.000167,0.000373,4,20,2,"{'max_depth': 4, 'max_leaf_nodes': 20, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,0.000333,0.000471,0.000000,0.000000,10,30,10,"{'max_depth': 10, 'max_leaf_nodes': 30, 'min_s...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,75
108,0.000167,0.000373,0.000168,0.000375,10,40,2,"{'max_depth': 10, 'max_leaf_nodes': 40, 'min_s...",0.96,1.0,0.92,0.92,0.96,1.0,0.960000,0.032660,4
109,0.000338,0.000478,0.000167,0.000373,10,40,5,"{'max_depth': 10, 'max_leaf_nodes': 40, 'min_s...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,75
110,0.000328,0.000464,0.000167,0.000373,10,40,7,"{'max_depth': 10, 'max_leaf_nodes': 40, 'min_s...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,75


In [5]:
# Predict with the best estimator on the same samples that were passed
# to gsc.fit (in-sample predictions).
best_tree = gsc.best_estimator_
best_tree.predict(iris.data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# Score the best estimator on the same data that was passed to gsc.fit,
# so this is an in-sample score rather than a held-out one.
fitted_tree = gsc.best_estimator_
fitted_tree.score(iris.data, iris.target)

1.0

# HyperOpt
필요한 라이브러리 임포트

In [7]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import warnings
warnings.filterwarnings("ignore")

SEED = 1234

  from pandas import MultiIndex, Int64Index


데이터셋 불러오기

In [8]:
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases -- confirm the installed version still provides it.
data = load_boston()
features, targets = data.data, data.target
# Seeded split so the train/test partition is the same on every run.
train_x, test_x, train_y, test_y = train_test_split(
    features, targets, random_state=SEED
)
print(train_x.shape, train_y.shape)

# Loss function
def RMSE(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    The value is computed directly with NumPy rather than through
    ``mean_squared_error(..., squared=False)``, because the ``squared``
    keyword is deprecated and then removed in newer scikit-learn releases.
    For 2-D (multi-output) inputs the RMSE is taken per output column and
    the columns are averaged, matching what ``squared=False`` produced.

    Args:
        y_true: Ground-truth values, shape (n,) or (n, n_outputs).
        y_pred: Predicted values with a matching shape.

    Returns:
        The RMSE as a NumPy float.

    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    # Treat 1-D vectors as a single output column so (n,) and (n, 1)
    # inputs can be mixed.
    if true_arr.ndim == 1:
        true_arr = true_arr.reshape(-1, 1)
    if pred_arr.ndim == 1:
        pred_arr = pred_arr.reshape(-1, 1)

    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred have mismatched shapes: "
            f"{true_arr.shape} vs {pred_arr.shape}"
        )
    if true_arr.size == 0:
        raise ValueError("RMSE is undefined for empty input")

    per_output_rmse = np.sqrt(np.mean((true_arr - pred_arr) ** 2, axis=0))
    return per_output_rmse.mean()

(379, 13) (379,)


In [9]:
# Candidate values for the regularisation hyperparameters (reg_alpha / reg_lambda)
reg_candidate = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 100]

# Search space handed to hyperopt. The trailing counts are the number of
# distinct values each entry can take, assuming hp.quniform(label, low, high, q)
# yields round(uniform(low, high) / q) * q, i.e. both endpoints are included.
space = {
    'max_depth': hp.quniform("max_depth", 5, 15, 1), # 11 values: 5, 6, ..., 15
    'learning_rate': hp.quniform("learning_rate", 0.01, 0.05, 0.005), # 9 values: 0.010, 0.015, ..., 0.050
    'reg_alpha': hp.choice('reg_alpha', reg_candidate), # 9 values (one per reg_candidate entry)
    'reg_lambda': hp.choice('reg_lambda', reg_candidate), # 9 values (one per reg_candidate entry)
    'subsample': hp.quniform('subsample', 0.6, 1, 0.05), # 9 values: 0.60, 0.65, ..., 1.00
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1, 0.05), # 9 values: 0.60, 0.65, ..., 1.00
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1), # 10 values: 1, 2, ..., 10
    'n_estimators': hp.quniform('n_estimators', 200, 1500, 100) # 14 values: 200, 300, ..., 1500
}
# Exhaustively trying every combination of the values above would take
# 11 * 9 * 9 * 9 * 9 * 9 * 10 * 14 = 90,935,460 training runs.

In [10]:
def hyperparameter_tuning(space):
    """Objective function for hyperopt: fit one XGBRegressor and score it.

    Args:
        space: Mapping of sampled hyperparameter values with the keys
            'max_depth', 'learning_rate', 'reg_alpha', 'reg_lambda',
            'subsample', 'colsample_bytree', 'min_child_weight' and
            'n_estimators'.

    Returns:
        A dict with 'loss' (RMSE of the fitted model on test_x / test_y),
        'status' (STATUS_OK) and 'model' (the fitted regressor).
    """
    params = {
        # max_depth and n_estimators are cast to int before being passed on.
        'max_depth': int(space['max_depth']),
        'learning_rate': space['learning_rate'],
        'reg_alpha': space['reg_alpha'],
        'reg_lambda': space['reg_lambda'],
        'subsample': space['subsample'],
        'colsample_bytree': space['colsample_bytree'],
        'min_child_weight': space['min_child_weight'],
        'n_estimators': int(space['n_estimators']),
        'random_state': SEED,
    }
    model = XGBRegressor(**params)

    # NOTE(review): the test split is used both as an evaluation set for
    # early stopping and for the loss returned below -- confirm that this
    # is intended, since it lets the test data influence the tuning.
    eval_sets = [(train_x, train_y), (test_x, test_y)]
    model.fit(
        train_x,
        train_y,
        eval_set=eval_sets,
        eval_metric='rmse',
        early_stopping_rounds=20,
        verbose=0,
    )

    # The loss reported back to hyperopt is the RMSE on the test split.
    test_rmse = RMSE(test_y, model.predict(test_x))
    return {'loss': test_rmse, 'status': STATUS_OK, 'model': model}

In [11]:
# Run the TPE search for 3 evaluations of the objective, recording every
# evaluation in a fresh Trials object.
trials = Trials()
best = fmin(
    hyperparameter_tuning,
    space,
    algo=tpe.suggest,
    max_evals=3,
    trials=trials,
    rstate=np.random.default_rng(SEED),
)

100%|██████████| 3/3 [00:00<00:00,  3.98trial/s, best loss: 3.090059504932305] 


In [12]:
# Run the TPE search again with max_evals=50, passing the existing `trials`.
# NOTE(review): because `trials` is reused, hyperopt presumably counts the
# evaluations already stored in it toward max_evals -- confirm.
best = fmin(
    hyperparameter_tuning,
    space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(SEED),
)

100%|██████████| 50/50 [00:16<00:00,  2.89trial/s, best loss: 2.937918639499599]


In [13]:
# Inspect the optimised result.
# Rebuild `best` from the raw argmin stored in `trials` (the same mapping
# fmin returns) instead of decoding `best` in place. This keeps the cell
# idempotent: re-running it no longer feeds already-decoded reg_alpha /
# reg_lambda values back through reg_candidate[...] a second time.
raw_best = trials.argmin
best = dict(raw_best)
# These two are cast to int for XGBoost.
best['max_depth'] = int(raw_best['max_depth'])
best['n_estimators'] = int(raw_best['n_estimators'])
# hp.choice reports the index of the chosen option, not the option itself.
best['reg_lambda'] = reg_candidate[int(raw_best['reg_lambda'])]
best['reg_alpha'] = reg_candidate[int(raw_best['reg_alpha'])]
best['random_state'] = SEED
print(best)

{'colsample_bytree': 0.9500000000000001, 'learning_rate': 0.04, 'max_depth': 6, 'min_child_weight': 3.0, 'n_estimators': 1000, 'reg_alpha': 1, 'reg_lambda': 0.01, 'subsample': 0.9, 'random_state': 1234}


In [14]:
# Train the final regressor with the tuned hyperparameters stored in `best`.
tuned_keys = (
    'max_depth',
    'learning_rate',
    'reg_alpha',
    'reg_lambda',
    'subsample',
    'colsample_bytree',
    'min_child_weight',
    'n_estimators',
)
xgb = XGBRegressor(**{key: best[key] for key in tuned_keys}, random_state=SEED)

# Unlike the fit call inside hyperparameter_tuning, this fit passes no
# eval_set and no early stopping.
xgb.fit(train_x, train_y)
pred_y = xgb.predict(test_x)

# Put predictions and ground truth side by side, one row each.
pd.DataFrame([pred_y, test_y], index=['pred_y', 'test_y'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,117,118,119,120,121,122,123,124,125,126
pred_y,32.544144,24.099964,8.692296,21.262743,12.748672,22.618435,19.340052,16.685472,19.460136,29.339199,...,24.212835,13.36933,42.199867,19.718386,15.711754,17.579304,34.37751,19.549719,40.161812,24.694496
test_y,33.0,27.5,5.6,21.2,14.9,22.3,18.8,14.6,19.4,32.0,...,23.1,14.4,38.7,21.2,23.2,19.6,35.4,20.1,42.8,24.7
