In [1]:
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import optuna

from xgboost import XGBRegressor

# Filesystem location of the NanumGothic TrueType font.
# NOTE(review): presumably configured so Korean labels render in matplotlib
# figures -- no plotting code is visible in the cells shown here; confirm.
path_gothic = "/usr/share/fonts/truetype/nanum/NanumGothic.ttf"
# Font properties object (size 20) that points at the font file above.
fontprop = fm.FontProperties(fname=path_gothic, size=20)

In [2]:
# Input directory. The trailing slash matters: file names are appended to it
# by plain string concatenation.
path = "../input/predict-meals/"

# Read the training frame, the test frame and the sample submission, in that order.
train, test, submit = (
    pd.read_csv(path + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Menu columns (breakfast / lunch / dinner) that are discarded before modelling.
drops = ["조식메뉴", "중식메뉴", "석식메뉴"]

# Remove them from the training and the test frame alike.
train, test = (frame.drop(columns=drops) for frame in (train, test))

In [4]:
# Derive calendar features from the "일자" (date) column: "월" holds the month
# number and "일" the day of the month. The column is parsed once per frame.
for frame in (train, test):
    dates = pd.DatetimeIndex(frame["일자"])
    frame["월"] = dates.month
    frame["일"] = dates.day

In [5]:
# Integer code for each weekday label: "월" (Mon) = 1 ... "금" (Fri) = 5.
# (As a dictionary key "월" is the weekday label; it is unrelated to the
# month column that carries the same name.)
weekday = {"월": 1, "화": 2, "수": 3, "목": 4, "금": 5}

# Encode the "요일" column of both frames.
# Series.map turns every value that is not a key of `weekday` into NaN, so
# running this cell a second time on the already-encoded integers would
# silently blank the whole column. The dtype guard makes a re-run a no-op.
for frame in (train, test):
    if not pd.api.types.is_numeric_dtype(frame["요일"]):
        frame["요일"] = frame["요일"].map(weekday)

In [6]:
# "식사가능자수" = "본사정원수" - "본사휴가자수" - "현본사소속재택근무자수",
# i.e. the headcount left after removing people on leave and people working
# from home. Computed the same way for both frames.
for frame in (train, test):
    on_site = frame["본사정원수"] - frame["본사휴가자수"] - frame["현본사소속재택근무자수"]
    frame["식사가능자수"] = on_site

In [7]:
# Participation rate = meal count divided by "식사가능자수".
# Only the training frame gets these columns: the meal counts they are built
# from ("중식계", "석식계") are read from `train` alone.
for rate_col, count_col in (("중식참여율", "중식계"), ("석식참여율", "석식계")):
    train[rate_col] = train[count_col] / train["식사가능자수"]

In [8]:
# Model inputs shared by both frames.
features = ["월", "일", "요일", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
# Target columns; they are selected from the training frame only.
labels = ["중식계", "석식계", "중식참여율", "석식참여율"]

# Keep only the columns needed from here on.
# The explicit .copy() makes each result an independent frame: later cells add
# new columns to `train` and `test`, and assigning into a bare column subset
# triggers pandas' SettingWithCopyWarning under the classic (non copy-on-write)
# semantics.
train = train[features + labels].copy()
test = test[features].copy()

In [9]:
# Create a "요일(석식)" column by re-mapping the weekday code to a
# dinner-specific rank.
# NOTE(review): how the rank values were derived is not shown in this
# notebook -- confirm against the exploratory analysis.
weekday_rank4dinner = {1: 1, 2: 2, 3: 5, 4: 3, 5: 4}

# Apply the same re-mapping to the training and the test frame.
for frame in (train, test):
    frame["요일(석식)"] = frame["요일"].map(weekday_rank4dinner)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Lunch model: regress "중식계" on the six feature columns listed below.
lunch_features = ["월", "일", "요일", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
x = train[lunch_features]
y = train["중식계"]

# Hold out 30% of the rows (fixed seed) to measure the error.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# XGBoost hyper-parameters, written out as one literal.
# NOTE(review): the regularisation values look like the outcome of a tuning
# run (optuna is imported at the top) -- the search itself is not part of the
# cells shown here.
lunch_params = {
    "lambda": 0.014695381889341316,
    "alpha": 7.411440351007072,
    "colsample_bytree": 0.9,
    "subsample": 0.8,
    "learning_rate": 0.014,
    "max_depth": 5,
    "min_child_weight": 1,
    "n_estimators": 10000,
    "random_state": 42,
}

lunch_model = XGBRegressor(**lunch_params)
lunch_model.fit(train_x, train_y)

# Report the mean absolute error on the held-out rows.
y_pred = lunch_model.predict(test_x)
print(f"MAE: {mean_absolute_error(test_y, y_pred)}")

MAE: 79.6927486862267


In [11]:
# Fit the lunch model on every training row (`x`, `y` still hold the full
# lunch features and target), then predict for the real test frame.
lunch_model.fit(x, y)

# Select the test columns in exactly the order the model was fitted on.
test_x = test[list(x.columns)]
y_pred = lunch_model.predict(test_x)

In [12]:
# Write the lunch predictions into the submission frame.
submit["중식계"] = y_pred

In [13]:
# Dinner model: regress "석식계" on the six feature columns listed below.
# The weekday is represented by the dinner-specific "요일(석식)" column
# instead of the plain "요일" code.
dinner_features = ["월", "일", "요일(석식)", "식사가능자수", "본사출장자수", "본사시간외근무명령서승인건수"]
x = train[dinner_features]
y = train["석식계"]

# Hold out 30% of the rows (fixed seed) to measure the error.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# XGBoost hyper-parameters, written out as one literal.
# NOTE(review): the regularisation values look like the outcome of a tuning
# run (optuna is imported at the top) -- the search itself is not part of the
# cells shown here.
dinner_params = {
    "lambda": 0.014750443224224807,
    "alpha": 0.0010890646914942937,
    "colsample_bytree": 1.0,
    "subsample": 0.6,
    "learning_rate": 0.012,
    "max_depth": 7,
    "min_child_weight": 2,
    "n_estimators": 10000,
    "random_state": 42,
}

dinner_model = XGBRegressor(**dinner_params)
dinner_model.fit(train_x, train_y)

# Report the mean absolute error on the held-out rows.
y_pred = dinner_model.predict(test_x)
print(f"MAE: {mean_absolute_error(test_y, y_pred)}")

MAE: 63.43847495547974


In [14]:
# Fit the dinner model on every training row (`x`, `y` still hold the full
# dinner features and target), then predict for the real test frame.
dinner_model.fit(x, y)

# Select the test columns in exactly the order the model was fitted on.
test_x = test[list(x.columns)]
y_pred = dinner_model.predict(test_x)

In [15]:
# Write the dinner predictions (the current `y_pred`) into the submission frame.
submit["석식계"] = y_pred

In [16]:
# Save the submission frame to CSV in the working directory, without the index column.
submit.to_csv("xgb_baseline3.csv", index=False)