In [8]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import datetime as dt
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [9]:
path = "../input/predict-meals/"
train = pd.read_csv(path + "train.csv")
test = pd.read_csv(path + "test.csv")

train[["현본사소속재택근무자수", "중식계", "석식계"]] = train[["현본사소속재택근무자수", "중식계", "석식계"]].astype(
    "int"
)
test["현본사소속재택근무자수"] = test["현본사소속재택근무자수"].astype("int")

train["일자"] = pd.to_datetime(train["일자"])
test["일자"] = pd.to_datetime(test["일자"])

train["년"] = train["일자"].dt.year
train["월"] = train["일자"].dt.month
train["일"] = train["일자"].dt.day
train["주"] = train["일자"].dt.week
train["요일"] = train["일자"].dt.weekday
train["출근"] = train["본사정원수"] - (
    train["본사휴가자수"] + train["본사출장자수"] + train["현본사소속재택근무자수"]
)
train["휴가비율"] = train["본사휴가자수"] / train["본사정원수"]
train["출장비율"] = train["본사출장자수"] / train["본사정원수"]
train["야근비율"] = train["본사시간외근무명령서승인건수"] / train["출근"]
train["재택비율"] = train["현본사소속재택근무자수"] / train["본사정원수"]

test["년"] = test["일자"].dt.year
test["월"] = test["일자"].dt.month
test["일"] = test["일자"].dt.day
test["주"] = test["일자"].dt.week
test["요일"] = test["일자"].dt.weekday
test["출근"] = test["본사정원수"] - (test["본사휴가자수"] + test["본사출장자수"] + test["현본사소속재택근무자수"])
test["휴가비율"] = test["본사휴가자수"] / test["본사정원수"]
test["출장비율"] = test["본사출장자수"] / test["본사정원수"]
test["야근비율"] = test["본사시간외근무명령서승인건수"] / test["출근"]
test["재택비율"] = test["현본사소속재택근무자수"] / test["본사정원수"]

print("done")

done


  train["주"] = train["일자"].dt.week
  test["주"] = test["일자"].dt.week


In [10]:
train["공휴일전후"] = 0
test["공휴일전후"] = 0

In [11]:
train_holidays = [
    17,
    3,
    62,
    131,
    152,
    226,
    221,
    224,
    245,
    310,
    311,
    309,
    330,
    379,
    467,
    470,
    502,
    565,
    623,
    651,
    705,
    709,
    815,
    864,
    950,
    951,
    953,
    954,
    955,
    971,
    1038,
    1099,
    1129,
    1187,
]

for holiday in train_holidays:
    train.loc[holiday - 1, "공휴일전후"] = 1
test.loc[9, "공휴일전후"] = 1
test.loc[19, "공휴일전후"] = 1

In [12]:
train = pd.get_dummies(train, columns=["공휴일전후"])
test = pd.get_dummies(test, columns=["공휴일전후"])

In [13]:
test.loc[19, "공휴일전후_0"] = 1
test.loc[19, "공휴일전후_1"] = 0

In [14]:
# -----------------------------<Full DATA>-----------------------------------

train_lunch = train[
    [
        "요일",
        "공휴일전후_0",
        "휴가비율",
        "출장비율",
        "재택비율",
        "공휴일전후_1",
        "출근",
#         "본사출장자수",
#         "현본사소속재택근무자수",
#         "본사휴가자수",
        "일",
        "주",
        "월",
        "년",
        "중식계",
    ]
]
test_lunch = test[
    [
        "요일",
        "공휴일전후_0",
        "휴가비율",
        "출장비율",
        "재택비율",
        "공휴일전후_1",
        "출근",
#         "본사출장자수",
#         "현본사소속재택근무자수",
#         "본사휴가자수",
        "일",
        "주",
        "월",
        "년",
    ]
]

train_dinner = train[
    [
        "요일",
        "공휴일전후_0",
        "휴가비율",
        "출장비율",
        "재택비율",
        "공휴일전후_1",
        "출근",
#         "본사휴가자수",
#         "현본사소속재택근무자수",
        "본사시간외근무명령서승인건수",
#         "본사출장자수",
        "석식계",
        "일",
        "주",
        "월",
        "년",
    ]
]
test_dinner = test[
    [
        "요일",
        "공휴일전후_0",
        "휴가비율",
        "출장비율",
        "재택비율",
        "공휴일전후_1",
        "출근",
#         "본사휴가자수",
#         "현본사소속재택근무자수",
        "본사시간외근무명령서승인건수",
#         "본사출장자수",
        "일",
        "주",
        "월",
        "년",
    ]
]

In [15]:
X_lunch = train_lunch.drop("중식계", axis=1)
y_lunch = train_lunch["중식계"]
X_dinner = train_dinner.drop("석식계", axis=1)
y_dinner = train_dinner["석식계"]

In [17]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(
    X_lunch, y_lunch, test_size=0.15, random_state=42
)

In [19]:
lunch_params = pd.read_pickle("../parameters/xgb_lunch_params.pkl")

lunch_model = XGBRegressor(**lunch_params)
lunch_model.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_valid, y_valid)],
    early_stopping_rounds=100,
    verbose=100,
)

[0]	validation_0-mae:873.40662	validation_1-mae:863.93726
[100]	validation_0-mae:121.74165	validation_1-mae:121.84717
[200]	validation_0-mae:47.36787	validation_1-mae:71.60096
[300]	validation_0-mae:40.23014	validation_1-mae:69.35230
[400]	validation_0-mae:35.87794	validation_1-mae:68.93666
[500]	validation_0-mae:32.29570	validation_1-mae:68.78835
[600]	validation_0-mae:29.23320	validation_1-mae:68.61154
[700]	validation_0-mae:26.58250	validation_1-mae:68.67536
[720]	validation_0-mae:26.02900	validation_1-mae:68.75520


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='mae', gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             lambda=0.001957209705589047, learning_rate=0.02, max_delta_step=0,
             max_depth=7, min_child_weight=8, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=8,
             num_parallel_tree=1, random_state=42, reg_alpha=0,
             reg_lambda=0.00195720978, scale_pos_weight=1,
             subsample=0.5134936676742582, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [20]:
lunch_model.fit(X_lunch, y_lunch)
lunch_preds = lunch_model.predict(test_lunch)

In [22]:
dinner_params = pd.read_pickle("../parameters/xgb_dinner_params.pkl")
dinner_model = XGBRegressor(**dinner_params)
dinner_model.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_valid, y_valid)],
    early_stopping_rounds=100,
    verbose=100,
)

[0]	validation_0-mae:879.97730	validation_1-mae:870.52014
[100]	validation_0-mae:251.04820	validation_1-mae:244.14703
[200]	validation_0-mae:86.06233	validation_1-mae:92.36872
[300]	validation_0-mae:51.79037	validation_1-mae:72.19988
[400]	validation_0-mae:42.41095	validation_1-mae:69.97480
[500]	validation_0-mae:37.22838	validation_1-mae:69.67126
[600]	validation_0-mae:33.35967	validation_1-mae:69.53409
[700]	validation_0-mae:30.31086	validation_1-mae:69.58132
[725]	validation_0-mae:29.61041	validation_1-mae:69.53654


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9796229501882451,
             eval_metric='mae', gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.012726386555401356,
             max_delta_step=0, max_depth=15, min_child_weight=13, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=8,
             num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.6652282725071904,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
dinner_params = pd.read_pickle("../parameters/xgb_dinner_params.pkl")
dinner_model = XGBRegressor(**dinner_params)
dinner_model.fit(X_dinner, y_dinner)
dinner_preds = dinner_model.predict(test_dinner)

In [None]:
submission = pd.read_csv(path + "sample_submission.csv")
submission["중식계"] = lunch_preds
submission["석식계"] = dinner_preds

submission.to_csv("fea_xgb2.csv", index=False)