# 20일차 - 모델 최적화

데이터셋 불러오기 및 훈련

In [1]:
from sklearn.datasets import load_iris

iris = load_iris()
x = iris.data
y = iris.target

# GridSearchCV
GridSearchCV를 사용해서 의사결정나무의 최적의 하이퍼파라미터 검색

참고할 만한 자료: [The Hyperparameter Cheat Sheet](https://medium.com/swlh/the-hyperparameter-cheat-sheet-770f1fed32ff)

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

gsc = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid={
        'max_depth': [4, 5, 6, 7, 8, 9, 10],
        'min_samples_split': [2, 5, 7, 10],
        'max_leaf_nodes': [10, 20, 30, 40]
    },
    cv=6
)

In [8]:
gsc.fit(x, y)
print('param:', gsc.best_params_)
print('score:', gsc.best_score_)
print('estimator:', gsc.best_estimator_)

param: {'max_depth': 8, 'max_leaf_nodes': 10, 'min_samples_split': 2}
score: 0.9666666666666667
estimator: DecisionTreeClassifier(max_depth=8, max_leaf_nodes=10)


In [7]:
import pandas as pd 
pd.DataFrame(gsc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_leaf_nodes,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000671,0.000754,0.000162,0.000362,4,10,2,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,63
1,0.000338,0.000478,0.000171,0.000383,4,10,5,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,15
2,0.000167,0.000373,0.000332,0.000470,4,10,7,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,63
3,0.000329,0.000465,0.000167,0.000373,4,10,10,"{'max_depth': 4, 'max_leaf_nodes': 10, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,15
4,0.000167,0.000373,0.000501,0.000501,4,20,2,"{'max_depth': 4, 'max_leaf_nodes': 20, 'min_sa...",0.96,1.0,0.92,0.92,0.92,1.0,0.953333,0.035901,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,0.000334,0.000473,0.000166,0.000371,10,30,10,"{'max_depth': 10, 'max_leaf_nodes': 30, 'min_s...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,63
108,0.000500,0.000500,0.000167,0.000372,10,40,2,"{'max_depth': 10, 'max_leaf_nodes': 40, 'min_s...",0.96,1.0,0.92,0.92,1.00,1.0,0.966667,0.035901,1
109,0.000338,0.000478,0.000162,0.000363,10,40,5,"{'max_depth': 10, 'max_leaf_nodes': 40, 'min_s...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,63
110,0.000166,0.000372,0.000166,0.000372,10,40,7,"{'max_depth': 10, 'max_leaf_nodes': 40, 'min_s...",0.96,1.0,0.92,0.92,0.88,1.0,0.946667,0.044222,63


In [9]:
gsc.best_estimator_.predict(iris.data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [11]:
gsc.best_estimator_.score(iris.data, iris.target)

1.0

# HyperOpt
필요한 라이브러리 임포트

In [24]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import warnings
warnings.filterwarnings("ignore")

SEED = 1234

데이터셋 불러오기

In [26]:
data = load_boston()
train_x, test_x, train_y, test_y = train_test_split(data.data , data.target, random_state=SEED)
print(train_x.shape, train_y.shape)

# 손실 함수
def RMSE(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

(379, 13) (379,)


In [27]:
# 하이퍼 파라미터로 전달할 reg 후보군 정의
reg_candidate = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 100]

space = {
    'max_depth': hp.quniform("max_depth", 5, 15, 1), # 10
    'learning_rate': hp.quniform("learning_rate", 0.01, 0.05, 0.005), # 8
    'reg_alpha': hp.choice('reg_alpha', reg_candidate), # 9
    'reg_lambda': hp.choice('reg_lambda', reg_candidate), # 9
    'subsample': hp.quniform('subsample', 0.6, 1, 0.05), # 8
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, 1, 0.05), # 8
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1), # 9
    'n_estimators': hp.quniform('n_estimators', 200, 1500, 100) # 13
}
# 48,522,240 회 학습해야 하는 숫자

In [32]:
def hyperparameter_tuning(space):
    model = XGBRegressor(
        max_depth=int(space['max_depth']),
        learning_rate=int(space['learning_rate']),
        reg_alpha=space['reg_alpha'],
        reg_lambda=space['reg_lambda'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=space['min_child_weight'],
        n_estimators=space['n_estimators'],
        random_state=SEED
    )
    evalution  = [(train_x, train_y), (test_x, test_y)]
    
    model.fit(train_x, train_y,
             eval_set=evalution,
             eval_metric='rmse',
             early_stopping_rounds=20,
             verbose=0)
    
    pred = modol.predict(x_test)
    rmse = RMSE(y_test, pred)
    
    # 평가 방식 선정
    return {'loss': rmse, 'status': STATUS_OK, 'model': model}

In [33]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,
     space=space,
     algo=tpe.suggest,
     max_evals=3,
     trials=trials,
     rstate=np.random.default_rng(SEED)
)

  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

job exception: 'float' object cannot be interpreted as an integer



  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]


TypeError: 'float' object cannot be interpreted as an integer