## HyperOpt 사용하기

In [4]:
from hyperopt import hp

### 1. 입력 변수명과 입력값의 검색 공간(Search Space)을 설정한다.

In [5]:
# Input variable x: range (-10, 10), step 1
# Input variable y: range (-15, 15), step 1
# Search space: a dictionary keyed by input variable name, with the sampling range as the value

search_space = dict(
    x=hp.quniform('x', -10, 10, 1),
    y=hp.quniform('y', -15, 15, 1),
)

In [6]:
import numpy as np

### 2. 목적함수를 생성한다. 

### 3. 목적 함수의 반환 최솟값을 가지는 최적 입력값을 유추한다. 

In [7]:
# Objective function: receives a dict with the same keys as search_space above
# (variable name -> sampled value) and returns a single value.

def objective_function(search_space):
    """Return ``x**2 - 20*y`` for the point given in ``search_space``.

    ``search_space`` is indexed with the keys ``'x'`` and ``'y'``.
    """
    x_val, y_val = search_space['x'], search_space['y']
    return x_val ** 2 - 20 * y_val

    # To return a dictionary instead, add `from hyperopt import STATUS_OK` and use
    # return {'loss': <value>, 'status': STATUS_OK}

from hyperopt import fmin, tpe, Trials
# Create the Trials object that is handed to fmin below; its .results and .vals
# attributes are inspected in later cells.
trial_val = Trials()

# Search for the input values that minimise objective_function using
# 5 evaluations (max_evals=5) of the TPE algorithm.
# rstate is a NumPy Generator seeded with 0.
best = fmin(fn=objective_function,
            space=search_space,
            algo=tpe.suggest,
            max_evals=5,
            trials=trial_val,
            rstate=np.random.default_rng(seed=0)
            )

print(best)

100%|█████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 370.85trial/s, best loss: -224.0]
{'x': -4.0, 'y': 12.0}


In [8]:
# Search for the input values that minimise objective_function with max_evals=20.
# NOTE(review): trial_val is the same Trials object that was passed to the previous
# fmin call, and the recorded results further down hold 20 entries in which
# entries 6-10 repeat entries 1-5. Presumably fmin resumes from the trials already
# stored, so only 15 new evaluations run here; pass a fresh Trials() if an
# independent 20-evaluation search is intended — confirm.
best = fmin(fn=objective_function,
            space=search_space,
            algo=tpe.suggest,
            max_evals=20,
            trials=trial_val,
            rstate=np.random.default_rng(seed=0)
            )

print(best)

100%|███████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 409.95trial/s, best loss: -296.0]
{'x': 2.0, 'y': 15.0}


In [9]:
# Trials attribute (1): results — one {'loss': ..., 'status': ...} dict per trial
trial_val.results

[{'loss': -64.0, 'status': 'ok'},
 {'loss': -184.0, 'status': 'ok'},
 {'loss': 56.0, 'status': 'ok'},
 {'loss': -224.0, 'status': 'ok'},
 {'loss': 61.0, 'status': 'ok'},
 {'loss': -64.0, 'status': 'ok'},
 {'loss': -184.0, 'status': 'ok'},
 {'loss': 56.0, 'status': 'ok'},
 {'loss': -224.0, 'status': 'ok'},
 {'loss': 61.0, 'status': 'ok'},
 {'loss': -296.0, 'status': 'ok'},
 {'loss': -40.0, 'status': 'ok'},
 {'loss': 281.0, 'status': 'ok'},
 {'loss': 64.0, 'status': 'ok'},
 {'loss': 100.0, 'status': 'ok'},
 {'loss': 60.0, 'status': 'ok'},
 {'loss': -39.0, 'status': 'ok'},
 {'loss': 1.0, 'status': 'ok'},
 {'loss': -164.0, 'status': 'ok'},
 {'loss': 21.0, 'status': 'ok'}]

In [10]:
# Trials attribute (2): vals — for each input variable, the list of values tried
print(trial_val.vals)

{'x': [-6.0, -4.0, 4.0, -4.0, 9.0, -6.0, -4.0, 4.0, -4.0, 9.0, 2.0, 10.0, -9.0, -8.0, -0.0, -0.0, 1.0, 9.0, 6.0, 9.0], 'y': [5.0, 10.0, -2.0, 12.0, 1.0, 5.0, 10.0, -2.0, 12.0, 1.0, 15.0, 7.0, -10.0, 0.0, -5.0, -3.0, 2.0, 4.0, 10.0, 3.0]}


In [22]:
from xgboost import XGBClassifier
from hyperopt import STATUS_OK
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset and unpack its features and labels
data = load_breast_cancer()
X, y = data.data, data.target

In [23]:
# Hold out 20% of the data as the test set, then 10% of the remaining training
# data as the validation set used for early stopping in the final fit.
# random_state is fixed so both splits are identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)

In [24]:
# Hyperparameters tuned for XGBoost: max_depth, min_child_weight, learning_rate, colsample_bytree
search_space = dict(
    max_depth=hp.quniform('max_depth', 5, 20, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 2, 1),
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
)

In [25]:
def objective_func(search_space):
    """Score one hyperparameter sample.

    Builds an XGBClassifier from the values in ``search_space``, runs 3-fold
    cross-validation on ``X_train`` / ``y_train`` with accuracy scoring, and
    returns the negated mean accuracy as ``'loss'`` together with ``STATUS_OK``.
    """
    # max_depth and min_child_weight are cast to int before reaching XGBoost
    sampled = {
        'max_depth': int(search_space['max_depth']),
        'min_child_weight': int(search_space['min_child_weight']),
        'learning_rate': search_space['learning_rate'],
        'colsample_bytree': search_space['colsample_bytree'],
    }
    model = XGBClassifier(n_estimators=100, eval_metric='logloss', **sampled)

    # cv=3 yields three fold scores; they are averaged into a single value
    fold_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=3)

    # Accuracy is negated so that a lower loss corresponds to a higher accuracy
    return {'loss': -np.mean(fold_scores), 'status': STATUS_OK}

In [26]:
# Run 50 TPE evaluations of objective_func, recorded in a fresh Trials object.
# rstate is seeded with 0, matching the earlier fmin calls in this file.
trial_val = Trials()
best = fmin(fn=objective_func,
            space=search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trial_val,
            rstate=np.random.default_rng(seed=0)
            )
best

100%|███████████████████████████████████████████████| 50/50 [00:08<00:00,  5.78trial/s, best loss: -0.9714040896944347]


{'colsample_bytree': 0.5597488496368312,
 'learning_rate': 0.19909489542505937,
 'max_depth': 5.0,
 'min_child_weight': 2.0}

In [28]:
from sklearn.metrics import accuracy_score

In [30]:
# Retrain XGBClassifier with the hyperparameters found by the search.
# The quniform values in `best` are floats (e.g. 5.0), so the integer
# parameters are cast with int().
# eval_metric and early_stopping_rounds are set on the estimator instead of being
# passed to fit(): the fit() keywords were deprecated in XGBoost 1.6 and removed in 2.0.
xgb_clf = XGBClassifier(n_estimators=400,
                        max_depth=int(best['max_depth']),
                        min_child_weight=int(best['min_child_weight']),
                        learning_rate=best['learning_rate'],
                        colsample_bytree=best['colsample_bytree'],
                        eval_metric='logloss',
                        early_stopping_rounds=50
                       )

# The validation split is the evaluation set monitored for early stopping
evals = [(X_val, y_val)]
xgb_clf.fit(X_tr, y_tr, eval_set=evals, verbose=False)

# Predict on the held-out test set
preds = xgb_clf.predict(X_test)
pred_proba = xgb_clf.predict_proba(X_test)[:, 1]

accuracy_score(y_test, preds)

0.956140350877193