In [1]:
from typing import Tuple
import warnings

import neptune.new as neptune
import neptune.new.integrations.optuna as optuna_utils

import optuna
import numpy as np
import pandas as pd

from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.study import Study
from optuna.trial import FrozenTrial
from optuna.integration.xgboost import XGBoostPruningCallback 
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Suppress every warning for the rest of the notebook session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by
# pandas / optuna / xgboost — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
def _engineer_shared_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Apply the feature-engineering steps that train and test have in common.

    Drops the three menu text columns, derives month (``월``) and day (``일``)
    from the date column ``일자``, encodes the weekday label ``요일`` as an
    integer, and computes ``식사가능자수`` (headcount minus staff on vacation
    minus staff working from home).

    Args:
        frame: Raw frame as read from ``train.csv`` or ``test.csv``.

    Returns:
        A new frame with the columns above added; the input is not modified.
    """
    # Korean weekday abbreviations, Monday through Friday.
    weekday = {"월": 1, "화": 2, "수": 3, "목": 4, "금": 5}

    # ``drop`` returns a new frame, so the assignments below never touch the
    # object the caller passed in.
    frame = frame.drop(["조식메뉴", "중식메뉴", "석식메뉴"], axis=1)

    # Parse the date column once and reuse it for both derived columns.
    dates = pd.DatetimeIndex(frame["일자"])
    frame["월"] = dates.month
    frame["일"] = dates.day

    # Labels that are not keys of ``weekday`` come out of ``map`` as NaN.
    frame["요일"] = frame["요일"].map(weekday)

    frame["식사가능자수"] = (
        frame["본사정원수"] - frame["본사휴가자수"] - frame["현본사소속재택근무자수"]
    )
    return frame


def load_dataset(path: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read ``train.csv`` / ``test.csv`` from ``path`` and build model-ready frames.

    Args:
        path: Directory containing ``train.csv`` and ``test.csv``. A trailing
            path separator is optional.

    Returns:
        ``(train, test)``. ``train`` holds the six feature columns, the four
        label columns (``중식계``, ``석식계``, ``중식참여율``, ``석식참여율``)
        and ``요일(석식)``. ``test`` holds the six feature columns and
        ``요일(석식)``.
    """
    import os  # local import so this cell does not depend on an edit elsewhere

    # os.path.join adds the separator only when it is missing, so both
    # "dir/" and "dir" resolve to the same files.
    train = _engineer_shared_features(pd.read_csv(os.path.join(path, "train.csv")))
    test = _engineer_shared_features(pd.read_csv(os.path.join(path, "test.csv")))

    # Participation rates are built from the label columns, which only the
    # training file has.
    train["중식참여율"] = train["중식계"] / train["식사가능자수"]
    train["석식참여율"] = train["석식계"] / train["식사가능자수"]

    features = ["월", "일", "요일", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
    labels = ["중식계", "석식계", "중식참여율", "석식참여율"]

    # Explicit copies: the column assignments below then act on independent
    # frames rather than on selections of another frame.
    train = train[features + labels].copy()
    test = test[features].copy()

    # Build the 요일(석식) column by re-mapping the weekday code to its dinner rank.
    weekday_rank4dinner = {
        1: 1,
        2: 2,
        3: 5,
        4: 3,
        5: 4,
    }

    train["요일(석식)"] = train["요일"].map(weekday_rank4dinner)
    test["요일(석식)"] = test["요일"].map(weekday_rank4dinner)

    return train, test

# For this example, only the lunch headcount prediction is tuned.

In [3]:
# Build the engineered train/test frames from the competition input directory.
train, test = load_dataset("../input/predict-meals/")

# Predictor columns, selected identically from the train and test frames.
lunch_features = ["월", "일", "요일", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]

X_lunch = train[lunch_features]
y_lunch = train["중식계"]  # target: lunch total
X_test = test[lunch_features]

# Hold out 15% of the training rows for validation, with a fixed random_state.
x_train, x_valid, y_train, y_valid = train_test_split(
    X_lunch,
    y_lunch,
    test_size=0.15,
    random_state=42,
)

In [4]:
def objective(trial: optuna.trial.Trial) -> float:
    """Fit one XGBoost regressor for an Optuna trial and return its validation MAE.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_valid`` and
    ``y_valid`` splits.

    Args:
        trial: The live trial handed in by ``study.optimize``; used to draw
            hyperparameters and passed to the pruning callback.

    Returns:
        Mean absolute error of the fitted model on ``(x_valid, y_valid)``.
    """
    param = {
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and keeps the same parameter name and bounds.
        "lambda": trial.suggest_float("lambda", 1e-03, 1e-01, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
        "random_state": 42,
        "learning_rate": 0.02,
        "n_estimators": 10000,
        "eval_metric": "mae",
    }

    # "validation_1" is the second entry of eval_set below, i.e. the
    # (x_valid, y_valid) pair; its MAE is what the pruner observes.
    pruning_callback = XGBoostPruningCallback(trial, "validation_1-mae")
    model = XGBRegressor(**param)

    # NOTE(review): newer XGBoost releases expect early_stopping_rounds and
    # callbacks on the estimator constructor rather than on fit() — confirm
    # against the installed version before upgrading.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_valid, y_valid)],
        early_stopping_rounds=100,
        callbacks=[pruning_callback],  # hooks Optuna pruning into the boosting loop
        verbose=False,
    )

    preds = model.predict(x_valid)

    return mean_absolute_error(y_valid, preds)

In [5]:
%%time
# Optional Neptune experiment tracking, currently disabled.
# run = neptune.init(
#     project="ds-wook/predict-meals",
#     # api_token="Anonymous",
# )

# # Create a NeptuneCallback for Optuna
# neptune_callback = optuna_utils.NeptuneCallback(
#     run,
#     plots_update_freq=1,  # create/log plots after every trial
#     log_plot_slice=False,  # do not create/log plot_slice
#     log_plot_contour=False,  # do not create/log plot_contour
# )

# TPE sampler with a fixed seed (42).
sampler = TPESampler(seed=42)

# Median-based pruner with a 5-step warm-up; it works together with the
# pruning callback inside `objective` to stop trials early.
median_pruner = MedianPruner(n_warmup_steps=5)

study = optuna.create_study(
    direction="minimize",
    study_name="optimization",
    sampler=sampler,
    pruner=median_pruner,
)

# Run 200 trials of `objective`, minimising its return value.
study.optimize(objective, n_trials=200)
# run.stop()

[32m[I 2021-08-11 09:54:48,197][0m A new study created in memory with name: optimization[0m
[32m[I 2021-08-11 09:54:49,453][0m Trial 0 finished with value: 83.16557893700363 and parameters: {'lambda': 0.005611516415334507, 'subsample': 0.9753571532049581, 'max_depth': 16, 'min_child_weight': 180}. Best is trial 0 with value: 83.16557893700363.[0m
[32m[I 2021-08-11 09:54:49,722][0m Trial 1 finished with value: 128.07020256927657 and parameters: {'lambda': 0.0020513382630874496, 'subsample': 0.5779972601681014, 'max_depth': 4, 'min_child_weight': 260}. Best is trial 0 with value: 83.16557893700363.[0m
[32m[I 2021-08-11 09:54:51,065][0m Trial 2 finished with value: 98.93614711023827 and parameters: {'lambda': 0.015930522616241012, 'subsample': 0.8540362888980227, 'max_depth': 3, 'min_child_weight': 291}. Best is trial 0 with value: 83.16557893700363.[0m
[32m[I 2021-08-11 09:54:51,802][0m Trial 3 finished with value: 75.97353805626295 and parameters: {'lambda': 0.046225890010

[32m[I 2021-08-11 09:54:59,102][0m Trial 69 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,137][0m Trial 70 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,171][0m Trial 71 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,205][0m Trial 72 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,237][0m Trial 73 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,271][0m Trial 74 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,307][0m Trial 75 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,340][0m Trial 76 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,519][0m Trial 77 pruned. Trial was pruned at iteration 97.[0m
[32m[I 2021-08-11 09:54:59,550][0m Trial 78 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:54:59,591][0m Trial 79 pruned. Trial was pruned at iteration 5.[0

[32m[I 2021-08-11 09:55:06,401][0m Trial 156 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:55:06,466][0m Trial 157 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:55:06,516][0m Trial 158 pruned. Trial was pruned at iteration 15.[0m
[32m[I 2021-08-11 09:55:06,545][0m Trial 159 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:55:06,572][0m Trial 160 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:55:06,802][0m Trial 161 pruned. Trial was pruned at iteration 123.[0m
[32m[I 2021-08-11 09:55:06,834][0m Trial 162 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:55:07,199][0m Trial 163 pruned. Trial was pruned at iteration 130.[0m
[32m[I 2021-08-11 09:55:07,225][0m Trial 164 pruned. Trial was pruned at iteration 5.[0m
[32m[I 2021-08-11 09:55:07,488][0m Trial 165 pruned. Trial was pruned at iteration 155.[0m
[32m[I 2021-08-11 09:55:07,553][0m Trial 166 pruned. Trial was pruned a

CPU times: user 2min 34s, sys: 656 ms, total: 2min 35s
Wall time: 21.9 s


In [6]:
# Report the best trial found by the study: its objective value and parameters.
trial = study.best_trial

print("Best trial:")
print(f"  Value:  {trial.value}")
print("  Params: ")
for name, setting in trial.params.items():
    print(f"    '{name}': {setting},")

Best trial:
  Value:  72.22710911871978
  Params: 
    'lambda': 0.03418313169716836,
    'subsample': 0.6146465660294592,
    'max_depth': 17,
    'min_child_weight': 11,
