# 2. 하이퍼파라미터 튜닝을 쉽고 빠르게 하는 방법
코드 공유 게시판에서 좋아요를 가장 많이 받은 글
https://dacon.io/competitions/official/235713/codeshare/2704?page=1&dtype=vote

## 순서 글로 정리
Optuna 실험해보기  
처음 보는 내용이기 때문에 바로 필사

In [2]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [7]:
# Load both splits, drop the 'index' column, and replace every NaN value
# with the literal string "NAN".
train = pd.read_csv('./data/train.csv').drop(columns=['index'])
test = pd.read_csv('./data/test.csv').drop(columns=['index'])

train = train.fillna('NAN')
test = test.fillna('NAN')

In [10]:
# Display the training frame after the preprocessing above.
train

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,NAN,2.0,-6.0,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,NAN,2.0,-47.0,2.0
26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


In [13]:
# One-hot encode the training frame.
train_ohe = pd.get_dummies(train)
# Encoding train and test independently can produce different column sets
# when a category value occurs in only one of them. Re-index the encoded test
# frame onto the training feature columns (everything except the 'credit'
# target): dummy columns missing from test are added as all-zero, test-only
# columns are dropped, and the column order matches the training features.
test_ohe = pd.get_dummies(test).reindex(
    columns=train_ohe.columns.drop('credit', errors='ignore'),
    fill_value=0,
)

In [14]:
# Target comes from the un-encoded training frame; features are the encoded
# training frame without the target column. The test features are an
# independent copy of the encoded test frame.
y = train['credit']
X = train_ohe.drop(columns=['credit'])
X_test = test_ohe.copy()

- Optuna는 블랙박스 최적화 도구입니다. 즉, 하이퍼파라미터 조합의 성능을 숫자 값으로 반환하는 objective 함수가 필요하며, Optuna는 이 값을 바탕으로 다음 시도(trial)에서 어떤 값을 샘플링할지 결정합니다.
- Optuna의 특수 인수인 trial 객체가 objective 함수에 전달됩니다.
- objective 함수 안에서는 이 trial 객체를 사용해 튜닝할 하이퍼파라미터와 탐색 범위를 지정합니다 (예: `trial.suggest_float("subsample", 0.3, 1.0)`).
- objective 함수는 모델의 logloss를 반환하고, Optuna는 이 값을 성능에 대한 피드백으로 사용합니다.

..?? 뭔솔...???!!

In [15]:
def objective(trial: Trial) -> float:
    """Train one LightGBM classifier with trial-suggested hyperparameters.

    Optuna calls this function once per trial. The regularization, tree-shape
    and sampling hyperparameters are drawn from ``trial``; the remaining
    settings are fixed. The model is fitted on a hold-out split of the
    module-level ``X`` / ``y`` with early stopping, and the multi-class log
    loss on the validation part is returned.

    Args:
        trial: Optuna trial used to sample the tunable hyperparameters.

    Returns:
        Multi-class log loss on the validation split (lower is better).
    """
    # Local import: the early-stopping callback replaces the
    # ``early_stopping_rounds`` / ``verbose`` keyword arguments of ``fit``,
    # which were deprecated in LightGBM 3.3 and removed in 4.0.
    from lightgbm import early_stopping

    params_lgb = {
        # Fixed settings.
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        # Upper bound only; early stopping decides the actual number of trees.
        "n_estimators": 10000,
        "objective": "multiclass",
        "metric": "multi_logloss",
        # Settings sampled by Optuna for this trial.
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }

    # Fixed seed: every trial is scored on the same validation rows, so score
    # differences between trials come from the hyperparameters rather than
    # from a different random split.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = LGBMClassifier(**params_lgb)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # Stop once the validation metric has not improved for 100 rounds;
        # verbose=False keeps the callback from logging its progress.
        callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
    )

    lgb_pred = model.predict_proba(X_valid)
    # Pass the fitted class order explicitly so the probability columns are
    # matched to the right labels even if a class is absent from y_valid.
    log_score = log_loss(y_valid, lgb_pred, labels=model.classes_)

    return log_score

In [16]:
# Seeded TPE sampler for the hyperparameter search.
sampler = TPESampler(seed=42)

# The objective returns a log loss, so the study minimizes it.
study = optuna.create_study(
    sampler=sampler,
    direction="minimize",
    study_name="lgbm_parameter_opt",
)

# Run ten trials, then report the best score and its hyperparameters.
study.optimize(objective, n_trials=10)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[I 2021-07-30 19:41:46,529] A new study created in memory with name: lgbm_parameter_opt
[I 2021-07-30 19:41:50,787] Trial 0 finished with value: 0.7432538731074472 and parameters: {'reg_alpha': 1.12424581642324e-05, 'reg_lambda': 0.08556428806974939, 'max_depth': 15, 'num_leaves': 154, 'colsample_bytree': 0.4936111842654619, 'subsample': 0.40919616423534183, 'subsample_freq': 1, 'min_child_samples': 88, 'max_bin': 380}. Best is trial 0 with value: 0.7432538731074472.
[I 2021-07-30 19:41:57,048] Trial 1 finished with value: 0.738817257708709 and parameters: {'reg_alpha': 2.1245096608103405e-05, 'reg_lambda': 0.0018526142807772773, 'max_depth': 20, 'num_leaves': 214, 'colsample_bytree': 0.5274034664069657, 'subsample': 0.42727747704497043, 'subsample_freq': 2, 'min_child_samples': 34, 'max_bin': 357}. Best is trial 1 with value: 0.738817257708709.
[I 2021-07-30 19:42:02,333] Trial 2 finished with value: 0.7413939006072351 and parameters: {' ... (이하 출력 생략)

Best Score: 0.7207942901383018
Best trial: {'reg_alpha': 9.145366937509386e-06, 'reg_lambda': 0.008790499283853408, 'max_depth': 14, 'num_leaves': 114, 'colsample_bytree': 0.47322294090686734, 'subsample': 0.6466238370778892, 'subsample_freq': 1, 'min_child_samples': 92, 'max_bin': 277}


In [17]:
# Visualization: plot the optimization history of the study.
optuna.visualization.plot_optimization_history(study)

In [18]:
# Parallel-coordinate plot of the study's trials.
optuna.visualization.plot_parallel_coordinate(study)

In [19]:
# Contour plots of the relationships between the listed hyperparameters.
contour_params = [
    "max_depth",
    "num_leaves",
    "colsample_bytree",
    "subsample",
    "subsample_freq",
    "min_child_samples",
    "max_bin",
]
optuna.visualization.plot_contour(study, params=contour_params)

In [20]:
# Hyperparameter importances.
optuna.visualization.plot_param_importances(study)

In [None]:
# (Empty scratch cell: the stray `d` was removed because evaluating it raised
# NameError when the notebook was run top to bottom.)