In [1]:
import warnings

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import optuna
from optuna.samplers import TPESampler
from xgboost import XGBRegressor

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")


# Font properties built from the NanumGothic TTF file (presumably for Korean
# plot labels -- not used anywhere in the visible cells).
path_gothic = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
fontprop = fm.FontProperties(size=20, fname=path_gothic)

In [2]:
# Read the three competition CSV files from the shared input directory.
path = "../input/predict-meals/"
train, test, submit = (
    pd.read_csv(path + file_name)
    for file_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Remove the three menu columns; they are not used as model inputs.
drops = ["조식메뉴", "중식메뉴", "석식메뉴"]

train = train.drop(columns=drops)
test = test.drop(columns=drops)

# Parse the date column once per frame and derive month / day-of-month from it.
train_dates = pd.DatetimeIndex(train["일자"])
test_dates = pd.DatetimeIndex(test["일자"])

train["월"] = train_dates.month
test["월"] = test_dates.month

train["일"] = train_dates.day
test["일"] = test_dates.day

# Encode the weekday label as an integer 1..5; any label missing from the
# mapping becomes NaN.
weekday = {"월": 1, "화": 2, "수": 3, "목": 4, "금": 5}

train["요일"] = train["요일"].map(weekday)
test["요일"] = test["요일"].map(weekday)

# 식사가능자수 = 본사정원수 - 본사휴가자수 - 현본사소속재택근무자수
train["식사가능자수"] = train["본사정원수"] - train["본사휴가자수"] - train["현본사소속재택근무자수"]
test["식사가능자수"] = test["본사정원수"] - test["본사휴가자수"] - test["현본사소속재택근무자수"]

# Participation rates (train only, because they are built from the targets).
train["중식참여율"] = train["중식계"] / train["식사가능자수"]
train["석식참여율"] = train["석식계"] / train["식사가능자수"]

features = ["월", "일", "요일", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
labels = ["중식계", "석식계", "중식참여율", "석식참여율"]

# Take explicit copies: a column subset is assigned into just below, which
# would otherwise be the chained-assignment pattern pandas reports as
# SettingWithCopyWarning (hidden in this notebook by the global warning filter).
train = train[features + labels].copy()
test = test[features].copy()

# Build the 요일(석식) column: the weekday code re-mapped to its dinner rank.

weekday_rank4dinner = {
    1: 1,
    2: 2,
    3: 5,
    4: 3,
    5: 4,
}

train["요일(석식)"] = train["요일"].map(weekday_rank4dinner)
test["요일(석식)"] = test["요일"].map(weekday_rank4dinner)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Inputs and target for the lunch model.
lunch_columns = ["월", "일", "요일", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
X_lunch = train[lunch_columns]
y_lunch = train["중식계"]

# 70 / 30 hold-out split with a fixed seed; the hold-out part is what the
# objective function uses for early stopping and scoring.
train_x, test_x, train_y, test_y = train_test_split(
    X_lunch,
    y_lunch,
    random_state=42,
    test_size=0.3,
)

In [30]:
def objective(trial):
    """Optuna objective for the lunch model: hold-out MAE of an XGBRegressor.

    Samples ``lambda`` (log scale), ``subsample``, ``max_depth`` and
    ``min_child_weight`` from ``trial``; ``learning_rate``, ``n_estimators``
    and ``random_state`` are fixed. ``alpha`` and ``colsample_bytree`` are not
    searched and stay at the estimator defaults.

    The model is fit on the module-level ``train_x`` / ``train_y`` with early
    stopping monitored on ``test_x`` / ``test_y``; the same hold-out pair is
    then used to compute the returned score.

    NOTE: the split variables are looked up at call time and this notebook
    rebinds them further down (dinner split, test-set features), so the lunch
    split cell must be the most recent one to have assigned them.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Mean absolute error of the predictions on the hold-out split.
    """
    param = {
        # suggest_float(log=True) is the replacement for the deprecated
        # suggest_loguniform; same range and same log-uniform sampling.
        "lambda": trial.suggest_float("lambda", 1e-3, 1e-01, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "learning_rate": 0.02,
        # Upper bound only: early stopping below decides the effective count.
        "n_estimators": 10000,
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "random_state": 42,
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
    }
    model = XGBRegressor(**param)

    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # estimator constructor rather than in fit() -- confirm against the
    # installed version before upgrading.
    model.fit(
        train_x,
        train_y,
        eval_set=[(test_x, test_y)],
        early_stopping_rounds=100,
        verbose=False,
    )

    preds = model.predict(test_x)

    return mean_absolute_error(test_y, preds)

In [31]:
# Seeded TPE search over 100 trials, minimizing the value returned by
# objective() as defined at this point in the notebook.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=100)

finished_trials = len(study.trials)
best_params = study.best_trial.params
print("Number of finished trials:", finished_trials)
print("Best trial:", best_params)

[32m[I 2021-07-09 18:30:39,948][0m A new study created in memory with name: no-name-3a94bebc-54fb-4653-afc4-a547e444acfa[0m
[32m[I 2021-07-09 18:30:43,682][0m Trial 0 finished with value: 69.24352247543756 and parameters: {'lambda': 0.005611516415334507, 'subsample': 0.9753571532049581, 'max_depth': 16, 'min_child_weight': 180}. Best is trial 0 with value: 69.24352247543756.[0m
[32m[I 2021-07-09 18:30:44,384][0m Trial 1 finished with value: 93.57701953866864 and parameters: {'lambda': 0.0020513382630874496, 'subsample': 0.5779972601681014, 'max_depth': 4, 'min_child_weight': 260}. Best is trial 0 with value: 69.24352247543756.[0m
[32m[I 2021-07-09 18:30:48,144][0m Trial 2 finished with value: 78.3061565588851 and parameters: {'lambda': 0.015930522616241012, 'subsample': 0.8540362888980227, 'max_depth': 3, 'min_child_weight': 291}. Best is trial 0 with value: 69.24352247543756.[0m
[32m[I 2021-07-09 18:30:49,848][0m Trial 3 finished with value: 66.61411154731202 and paramet

[32m[I 2021-07-09 18:31:59,148][0m Trial 32 finished with value: 61.47306392732905 and parameters: {'lambda': 0.06444871183824874, 'subsample': 0.6600354772806227, 'max_depth': 14, 'min_child_weight': 14}. Best is trial 12 with value: 61.23874230147725.[0m
[32m[I 2021-07-09 18:32:02,298][0m Trial 33 finished with value: 63.680284822843355 and parameters: {'lambda': 0.0627613868076351, 'subsample': 0.7433585061814858, 'max_depth': 14, 'min_child_weight': 1}. Best is trial 12 with value: 61.23874230147725.[0m
[32m[I 2021-07-09 18:32:04,312][0m Trial 34 finished with value: 63.259008628887365 and parameters: {'lambda': 0.09844341773249896, 'subsample': 0.6653740094126572, 'max_depth': 11, 'min_child_weight': 36}. Best is trial 12 with value: 61.23874230147725.[0m
[32m[I 2021-07-09 18:32:06,749][0m Trial 35 finished with value: 66.1863801229066 and parameters: {'lambda': 0.09711870442113799, 'subsample': 0.6901789315795475, 'max_depth': 12, 'min_child_weight': 64}. Best is trial

[32m[I 2021-07-09 18:33:24,051][0m Trial 64 finished with value: 62.88202612966464 and parameters: {'lambda': 0.05763098042562872, 'subsample': 0.6991692276471567, 'max_depth': 11, 'min_child_weight': 1}. Best is trial 12 with value: 61.23874230147725.[0m
[32m[I 2021-07-09 18:33:25,626][0m Trial 65 finished with value: 63.14910094118908 and parameters: {'lambda': 0.08623390322336445, 'subsample': 0.657137736355762, 'max_depth': 11, 'min_child_weight': 25}. Best is trial 12 with value: 61.23874230147725.[0m
[32m[I 2021-07-09 18:33:27,708][0m Trial 66 finished with value: 64.73369191628134 and parameters: {'lambda': 0.05086355267817809, 'subsample': 0.7414533142589442, 'max_depth': 9, 'min_child_weight': 57}. Best is trial 12 with value: 61.23874230147725.[0m
[32m[I 2021-07-09 18:33:29,555][0m Trial 67 finished with value: 61.12148570487513 and parameters: {'lambda': 0.03845099065210128, 'subsample': 0.6843709136349125, 'max_depth': 10, 'min_child_weight': 11}. Best is trial 6

[32m[I 2021-07-09 18:34:25,027][0m Trial 96 finished with value: 63.428979146546425 and parameters: {'lambda': 0.07110825519617572, 'subsample': 0.6725810938265009, 'max_depth': 11, 'min_child_weight': 39}. Best is trial 67 with value: 61.12148570487513.[0m
[32m[I 2021-07-09 18:34:27,471][0m Trial 97 finished with value: 63.128558459202885 and parameters: {'lambda': 0.039646758328010706, 'subsample': 0.7577460877118479, 'max_depth': 16, 'min_child_weight': 31}. Best is trial 67 with value: 61.12148570487513.[0m
[32m[I 2021-07-09 18:34:28,660][0m Trial 98 finished with value: 62.04946664683727 and parameters: {'lambda': 0.028050382521863954, 'subsample': 0.6529809146716476, 'max_depth': 12, 'min_child_weight': 6}. Best is trial 67 with value: 61.12148570487513.[0m
[32m[I 2021-07-09 18:34:30,557][0m Trial 99 finished with value: 62.937129173489566 and parameters: {'lambda': 0.021018888686276726, 'subsample': 0.701241173847251, 'max_depth': 3, 'min_child_weight': 22}. Best is t

Number of finished trials: 100
Best trial: {'lambda': 0.03845099065210128, 'subsample': 0.6843709136349125, 'max_depth': 10, 'min_child_weight': 11}


In [34]:
# Visualization: optimization history of the current study.
optuna.visualization.plot_optimization_history(study)

In [35]:
# Hyperparameter importances for the current study.
optuna.visualization.plot_param_importances(study)

In [36]:
# Best sampled hyperparameters plus the values the objective kept fixed.
# Captured here because `study` is rebound to the dinner study further down.
lunch_params = {
    **study.best_trial.params,
    "n_estimators": 10000,
    "random_state": 42,
    "learning_rate": 0.02,
}

In [None]:
# The hyperparameters in lunch_params were scored with early stopping
# (100 rounds of patience), so n_estimators=10000 is only an upper bound.
# Fitting all 10000 rounds on the full data would train far past the round
# count the tuning actually evaluated. Re-derive that round count on the same
# deterministic 70/30 split the tuning used, then refit on every row.
lunch_fit_x, lunch_valid_x, lunch_fit_y, lunch_valid_y = train_test_split(
    X_lunch, y_lunch, test_size=0.3, random_state=42
)

lunch_probe = XGBRegressor(**lunch_params)
lunch_probe.fit(
    lunch_fit_x,
    lunch_fit_y,
    eval_set=[(lunch_valid_x, lunch_valid_y)],
    early_stopping_rounds=100,
    verbose=False,
)

# best_iteration is a zero-based round index, hence the +1 for a tree count.
lunch_n_rounds = lunch_probe.best_iteration + 1

lunch_model = XGBRegressor(**{**lunch_params, "n_estimators": lunch_n_rounds})
lunch_model.fit(X_lunch, y_lunch)

In [None]:
# Competition test-set features, in the same column order the lunch model was
# trained on. NOTE: this rebinds test_x, which until here held the lunch
# validation features read by objective().
lunch_test_columns = ["월", "일", "요일", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
test_x = test[lunch_test_columns]
predict_lunch = lunch_model.predict(test_x)

In [None]:
# Store the lunch predictions in the 중식계 column of the submission frame.
submit["중식계"] = predict_lunch

In [12]:
# Inputs and target for the dinner model; the weekday is replaced by its
# dinner-rank encoding 요일(석식).
dinner_columns = ["월", "일", "요일(석식)", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
X_dinner = train[dinner_columns]
y_dinner = train["석식계"]

# Same 70 / 30 seeded hold-out split as for lunch. This rebinds the
# module-level split names that objective() reads.
train_x, test_x, train_y, test_y = train_test_split(
    X_dinner,
    y_dinner,
    random_state=42,
    test_size=0.3,
)

In [37]:
def objective(trial):
    """Optuna objective for the dinner model: hold-out MAE of an XGBRegressor.

    Samples ``colsample_bytree``, ``subsample``, ``learning_rate``,
    ``max_depth`` and ``min_child_weight`` from ``trial``; ``n_estimators``
    and ``random_state`` are fixed. ``lambda`` and ``alpha`` are not searched.

    The model is fit on the module-level ``train_x`` / ``train_y`` with early
    stopping monitored on ``test_x`` / ``test_y``, and the returned score is
    computed on that same hold-out pair.

    NOTE: this definition replaces the earlier ``objective`` defined for the
    lunch model; both read the same module-level split variables.

    Args:
        trial: Optuna trial used to draw the hyperparameters.

    Returns:
        Mean absolute error of the predictions on the hold-out split.
    """
    # Keyword arguments are evaluated left to right, so the suggest_* calls
    # happen in a fixed order and a seeded sampler stays reproducible.
    xgb_kwargs = dict(
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1),
        subsample=trial.suggest_float("subsample", 0.5, 1),
        learning_rate=trial.suggest_float("learning_rate", 1e-02, 1e-01),
        n_estimators=10000,
        max_depth=trial.suggest_int("max_depth", 3, 20),
        random_state=42,
        min_child_weight=trial.suggest_int("min_child_weight", 1, 300),
    )

    regressor = XGBRegressor(**xgb_kwargs)
    holdout = (test_x, test_y)
    regressor.fit(
        train_x,
        train_y,
        eval_set=[holdout],
        early_stopping_rounds=100,
        verbose=False,
    )

    return mean_absolute_error(test_y, regressor.predict(test_x))

In [38]:
# Seeded multivariate/grouped TPE with 20 start-up trials, run for 100 trials
# against objective() as defined at this point in the notebook.
# Rebinds `sampler` and `study`, replacing the lunch study objects.
sampler = TPESampler(
    n_startup_trials=20,
    multivariate=True,
    group=True,
    constant_liar=True,
    seed=42,
)
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(objective, n_trials=100)

finished_trials = len(study.trials)
best_params = study.best_trial.params
print("Number of finished trials:", finished_trials)
print("Best trial:", best_params)

[32m[I 2021-07-09 18:35:40,613][0m A new study created in memory with name: no-name-8462cc0b-9a05-4104-93c8-290445ee0ecb[0m
[32m[I 2021-07-09 18:35:41,174][0m Trial 0 finished with value: 64.72776107366572 and parameters: {'colsample_bytree': 0.6872700594236812, 'subsample': 0.9753571532049581, 'learning_rate': 0.07587945476302646, 'max_depth': 13, 'min_child_weight': 47}. Best is trial 0 with value: 64.72776107366572.[0m
[32m[I 2021-07-09 18:35:41,393][0m Trial 1 finished with value: 82.76895714859936 and parameters: {'colsample_bytree': 0.5779972601681014, 'subsample': 0.5290418060840998, 'learning_rate': 0.08795585311974417, 'max_depth': 13, 'min_child_weight': 213}. Best is trial 0 with value: 64.72776107366572.[0m
[32m[I 2021-07-09 18:35:42,230][0m Trial 2 finished with value: 64.63486510218836 and parameters: {'colsample_bytree': 0.5102922471479012, 'subsample': 0.9849549260809971, 'learning_rate': 0.08491983767203796, 'max_depth': 6, 'min_child_weight': 55}. Best is t

[32m[I 2021-07-09 18:36:16,951][0m Trial 27 finished with value: 64.04473257328266 and parameters: {'colsample_bytree': 0.7682871161825013, 'subsample': 0.8500613846819234, 'learning_rate': 0.09714404046923503, 'max_depth': 6, 'min_child_weight': 58}. Best is trial 5 with value: 62.12086450987758.[0m
[32m[I 2021-07-09 18:36:17,294][0m Trial 28 finished with value: 63.42593830593383 and parameters: {'colsample_bytree': 0.9732476398164983, 'subsample': 0.796756063161515, 'learning_rate': 0.09974604067818804, 'max_depth': 4, 'min_child_weight': 6}. Best is trial 5 with value: 62.12086450987758.[0m
[32m[I 2021-07-09 18:36:17,892][0m Trial 29 finished with value: 66.05056535078018 and parameters: {'colsample_bytree': 0.955989203411005, 'subsample': 0.7663067108103108, 'learning_rate': 0.07871040119996349, 'max_depth': 15, 'min_child_weight': 62}. Best is trial 5 with value: 62.12086450987758.[0m
[32m[I 2021-07-09 18:36:18,722][0m Trial 30 finished with value: 63.2214839787773 and

[32m[I 2021-07-09 18:36:44,595][0m Trial 54 finished with value: 64.1027688295143 and parameters: {'colsample_bytree': 0.9369736724647377, 'subsample': 0.6313824296633149, 'learning_rate': 0.05765548400903546, 'max_depth': 11, 'min_child_weight': 37}. Best is trial 33 with value: 61.30953867791107.[0m
[32m[I 2021-07-09 18:36:45,718][0m Trial 55 finished with value: 64.2081126007586 and parameters: {'colsample_bytree': 0.7775367536793598, 'subsample': 0.55797901080397, 'learning_rate': 0.05597371185788331, 'max_depth': 11, 'min_child_weight': 34}. Best is trial 33 with value: 61.30953867791107.[0m
[32m[I 2021-07-09 18:36:46,630][0m Trial 56 finished with value: 62.60627085859604 and parameters: {'colsample_bytree': 0.9754595090839572, 'subsample': 0.7624917842333185, 'learning_rate': 0.05954042216522823, 'max_depth': 15, 'min_child_weight': 9}. Best is trial 33 with value: 61.30953867791107.[0m
[32m[I 2021-07-09 18:36:47,044][0m Trial 57 finished with value: 81.77243497226779

[32m[I 2021-07-09 18:37:27,789][0m Trial 81 finished with value: 62.386746274832205 and parameters: {'colsample_bytree': 0.9838205614806013, 'subsample': 0.8657486201527167, 'learning_rate': 0.010652223917633308, 'max_depth': 14, 'min_child_weight': 26}. Best is trial 33 with value: 61.30953867791107.[0m
[32m[I 2021-07-09 18:37:29,152][0m Trial 82 finished with value: 62.69941079155516 and parameters: {'colsample_bytree': 0.7871481086174992, 'subsample': 0.7200898951609719, 'learning_rate': 0.015944764852986352, 'max_depth': 20, 'min_child_weight': 21}. Best is trial 33 with value: 61.30953867791107.[0m
[32m[I 2021-07-09 18:37:30,390][0m Trial 83 finished with value: 68.04526490269446 and parameters: {'colsample_bytree': 0.8993181806962331, 'subsample': 0.901878799172228, 'learning_rate': 0.0660173278871285, 'max_depth': 12, 'min_child_weight': 97}. Best is trial 33 with value: 61.30953867791107.[0m
[32m[I 2021-07-09 18:37:31,247][0m Trial 84 finished with value: 64.24622779

Number of finished trials: 100
Best trial: {'colsample_bytree': 0.9407078053937545, 'subsample': 0.6567160279948582, 'learning_rate': 0.0520414339956619, 'max_depth': 19, 'min_child_weight': 8}


In [39]:
# Visualization: optimization history of the current study.
optuna.visualization.plot_optimization_history(study)

In [40]:
# Hyperparameter importances for the current study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Best sampled hyperparameters (learning_rate included) plus the two values
# the dinner objective kept fixed.
dinner_params = {
    **study.best_trial.params,
    "n_estimators": 10000,
    "random_state": 42,
}

In [None]:
# The hyperparameters in dinner_params were scored with early stopping
# (100 rounds of patience), so n_estimators=10000 is only an upper bound.
# Fitting all 10000 rounds on the full data would train far past the round
# count the tuning actually evaluated. Re-derive that round count on the same
# deterministic 70/30 split the tuning used, then refit on every row.
dinner_fit_x, dinner_valid_x, dinner_fit_y, dinner_valid_y = train_test_split(
    X_dinner, y_dinner, test_size=0.3, random_state=42
)

dinner_probe = XGBRegressor(**dinner_params)
dinner_probe.fit(
    dinner_fit_x,
    dinner_fit_y,
    eval_set=[(dinner_valid_x, dinner_valid_y)],
    early_stopping_rounds=100,
    verbose=False,
)

# best_iteration is a zero-based round index, hence the +1 for a tree count.
dinner_n_rounds = dinner_probe.best_iteration + 1

dinner_model = XGBRegressor(**{**dinner_params, "n_estimators": dinner_n_rounds})
dinner_model.fit(X_dinner, y_dinner)

In [None]:
# Competition test-set features in the column order the dinner model was
# trained on. NOTE: this rebinds test_x, which until here held the dinner
# validation features read by objective().
dinner_test_columns = ["월", "일", "요일(석식)", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
test_x = test[dinner_test_columns]

# Predict and store the result in the 석식계 column of the submission frame.
predict_dinner = dinner_model.predict(test_x)
submit["석식계"] = predict_dinner

In [None]:
# Display the submission frame as the cell output.
submit

In [None]:
# Write the submission frame to tpe_xgb.csv, omitting the index column.
submit.to_csv("tpe_xgb.csv", index=False)

### Bayesian Ridge