In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm

import warnings 
warnings.filterwarnings(action='ignore')

from matplotlib import font_manager, rc
import os

# Korean-capable font for matplotlib labels. This path only exists on Windows
# machines that have the font installed, so the font is configured only when
# the file is actually present; otherwise matplotlib's default font is kept
# and the rest of the notebook can still run.
font_path = "C:/Windows/Fonts/SeoulHangangB.ttf"
if os.path.isfile(font_path):
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
else:
    # print() rather than warnings.warn(): warnings are filtered out earlier in this cell.
    print(f"Font file not found: {font_path}; keeping matplotlib's default font.")

In [2]:
# Load the training table. Per the train.info() output further down, this CSV
# already contains the engineered columns (e.g. 전주중식계) next to the raw ones.
train = pd.read_csv('dataset/data/train.csv')

In [3]:
# Load the test table.
# NOTE(review): this path has no 'data/' sub-folder (unlike the train path) and the
# file is named 'test_date.csv' — confirm that this is the intended file.
test = pd.read_csv('dataset/test_date.csv')

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   일자              1205 non-null   object 
 1   요일              1205 non-null   int64  
 2   본사정원수           1205 non-null   int64  
 3   본사휴가자수          1205 non-null   int64  
 4   본사출장자수          1205 non-null   int64  
 5   본사시간외근무명령서승인건수  1205 non-null   int64  
 6   현본사소속재택근무자수     1205 non-null   float64
 7   조식메뉴            1205 non-null   object 
 8   중식메뉴            1205 non-null   object 
 9   석식메뉴            1205 non-null   object 
 10  중식계             1205 non-null   float64
 11  석식계             1205 non-null   float64
 12  년               1205 non-null   int64  
 13  월               1205 non-null   int64  
 14  일               1205 non-null   int64  
 15  식사가능자           1205 non-null   float64
 16  전주중식계           1205 non-null   int64  
 17  전주석식계           1205 non-null   i

In [5]:
test.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '년', '월', '일', '식사가능자'],
      dtype='object')

## 식사가능자

In [4]:
# Number of people able to eat on site: headcount minus the people on leave,
# on business trips, and working from home.
test['식사가능자'] = (
    test['본사정원수']
    .sub(test['본사휴가자수'])
    .sub(test['본사출장자수'])
    .sub(test['현본사소속재택근무자수'])
)

## 요일평균 식계

In [5]:
def make_dow_avg(df_test, df_prev):
    """Attach per-weekday mean lunch and dinner counts to ``df_test``.

    The means of '중식계' (lunch total) and '석식계' (dinner total) are computed
    per '요일' (weekday code) on ``df_prev`` and looked up by weekday code for
    every row of ``df_test``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with a '요일' column. It is modified in place: the columns
        '요일평균중식계' and '요일평균석식계' are added or overwritten.
    df_prev : pandas.DataFrame
        Historical frame with '요일', '중식계' and '석식계' columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df_test`` object, for call chaining.

    Notes
    -----
    Weekday codes that never occur in ``df_prev`` get 0, matching the previous
    default value of these columns.
    """
    # Aggregate only the two target columns: averaging the whole frame would
    # also try to average the text (menu/date) columns.
    dow_means = df_prev.groupby('요일')[['중식계', '석식계']].mean()

    # Look the means up by weekday *key*, not by position in the result, and
    # assign whole columns instead of writing through chained indexing
    # (df[col][mask] = value), which may silently write to a temporary copy.
    weekday = df_test['요일']
    df_test['요일평균중식계'] = weekday.map(dow_means['중식계']).fillna(0)
    df_test['요일평균석식계'] = weekday.map(dow_means['석식계']).fillna(0)

    return df_test

## 월평균식계

In [6]:
def make_month_avg(df_test, df_prev):
    """Attach per-month mean lunch and dinner counts to ``df_test``.

    The means of '중식계' (lunch total) and '석식계' (dinner total) are computed
    per '월' (month) on ``df_prev`` and looked up by month for every row of
    ``df_test``.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame with a '월' column. It is modified in place: the columns
        '월평균중식계' and '월평균석식계' are added or overwritten.
    df_prev : pandas.DataFrame
        Historical frame with '월', '중식계' and '석식계' columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df_test`` object, for call chaining.

    Notes
    -----
    Months that never occur in ``df_prev`` get 0, matching the previous
    default value of these columns.
    """
    # Aggregate only the two target columns: averaging the whole frame would
    # also try to average the text (menu/date) columns.
    month_means = df_prev.groupby('월')[['중식계', '석식계']].mean()

    # Look the means up by month *key*. Indexing the value array with
    # ``month - 1`` is only correct when every month 1..12 is present in
    # df_prev; with a gap it picks the wrong month or raises IndexError.
    month = df_test['월']
    df_test['월평균중식계'] = month.map(month_means['중식계']).fillna(0)
    df_test['월평균석식계'] = month.map(month_means['석식계']).fillna(0)

    return df_test

## 공휴일전후

In [7]:
# Flag column, 0 by default and 1 for the marked rows.
# NOTE(review): row labels 10 and 20 are hard-coded — presumably the test days
# adjacent to a public holiday; confirm against the calendar.
test['공휴일전후'] = 0
# One .loc write instead of test[col][row] = 1: chained indexing may assign to
# a temporary copy and leave `test` unchanged.
test.loc[[10, 20], '공휴일전후'] = 1

# 모델

## 점심

In [8]:
from pycaret.regression import *

In [9]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '년', '월', '일',
       '식사가능자', '전주중식계', '전주석식계', '요일평균중식계', '요일평균석식계', '월평균중식계', '월평균석식계',
       '공휴일전후'],
      dtype='object')

In [10]:
# Columns fed to the lunch experiment; the target '중식계' is the last entry.
lunch_columns = [
    '요일', '본사시간외근무명령서승인건수', '전주중식계', '요일평균중식계', '월평균중식계',
    '공휴일전후', '본사휴가자수', '본사출장자수', '식사가능자', '중식계',
]
X_train_ln = train[lunch_columns]

In [11]:
# Initialise the PyCaret regression experiment for the lunch target '중식계'.
# session_id fixes the experiment's seed value; normalize=True is passed to setup.
# The printed summary reports the feature types PyCaret inferred on its own
# (7 numeric, 2 categorical), since categorical_features is left commented out.
reg = setup(session_id=1,
            data=X_train_ln,
            target='중식계',
            #numeric_imputation = 'mean',
            normalize = True,
            #categorical_features=['월', '요일', '공휴일전후'],
            silent=True)

Unnamed: 0,Description,Value
0,session_id,1
1,Target,중식계
2,Original Data,"(1205, 10)"
3,Missing Values,False
4,Numeric Features,7
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(843, 13)"


In [120]:
# Rank the candidate regressors by MAE and keep the best five.
# NOTE(review): in the visible cells this lunch `top5` is not used again before
# the dinner section reassigns the name.
top5 = compare_models(n_select=5, sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,73.1334,9304.8669,95.6047,0.7848,0.1171,0.0882,0.025
huber,Huber Regressor,74.9167,10050.6822,99.4267,0.7677,0.1249,0.0903,0.014
ridge,Ridge Regression,75.3434,9917.3244,98.8562,0.7711,0.1228,0.091,0.419
lar,Least Angle Regression,75.3711,9928.307,98.9243,0.7708,0.1232,0.091,0.008
lr,Linear Regression,75.3715,9928.4604,98.9251,0.7708,0.1232,0.091,0.023
br,Bayesian Ridge,75.3955,9923.28,98.8622,0.7709,0.1226,0.0911,0.006
rf,Random Forest Regressor,75.738,10036.3431,99.406,0.7693,0.1235,0.0924,0.071
lasso,Lasso Regression,75.7525,10015.2415,99.2591,0.7686,0.1228,0.0916,0.02
lightgbm,Light Gradient Boosting Machine,75.8789,9927.7634,99.2398,0.7702,0.1234,0.0918,0.049
et,Extra Trees Regressor,75.9928,10214.8996,100.3683,0.765,0.1253,0.0931,0.054


In [12]:
# Create a gradient-boosting regressor ('gbr') with criterion='mae', then tune
# it with MAE as the optimisation metric.
gbr_l = tune_model(create_model('gbr', criterion='mae'), optimize='MAE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,86.3805,12550.9529,112.031,0.7469,0.1418,0.1052
1,76.3429,11290.4856,106.2567,0.673,0.1145,0.0832
2,65.1661,7970.8641,89.2797,0.8106,0.1112,0.0808
3,60.7936,6880.6362,82.9496,0.8544,0.1128,0.0815
4,58.048,5672.4515,75.3157,0.8563,0.0875,0.0666
5,67.2914,9522.2819,97.5822,0.7893,0.1259,0.0831
6,80.309,11922.0444,109.1881,0.7313,0.1258,0.0931
7,68.508,6832.9709,82.6618,0.7915,0.097,0.0789
8,73.4903,9136.4033,95.5845,0.8123,0.1085,0.0857
9,71.5605,9624.0493,98.1022,0.8221,0.1393,0.0947


In [13]:
# predict_model is called without new data; the printed table is a single row of
# metrics (presumably computed on PyCaret's hold-out split — confirm).
pred_holdouts = predict_model(gbr_l)
# Finalized lunch model, used later to score the test rows.
final_model_l = finalize_model(gbr_l)
# Bare expression: displays the fitted estimator and its hyperparameters.
final_model_l

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,67.522,8337.7513,91.3113,0.8071,0.1157,0.0845


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='mae', init=None,
                          learning_rate=0.05, loss='ls', max_depth=4,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.05, min_impurity_split=None,
                          min_samples_leaf=2, min_samples_split=4,
                          min_weight_fraction_leaf=0.0, n_estimators=260,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=1, subsample=0.8, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

## 저녁

In [16]:
# Columns fed to the dinner experiment; the target '석식계' is the last entry.
dinner_columns = [
    '요일', '본사시간외근무명령서승인건수', '전주석식계', '요일평균석식계', '월평균석식계',
    '공휴일전후', '본사휴가자수', '본사출장자수', '식사가능자', '석식계',
]
X_train_dn = train[dinner_columns]

In [17]:
# Initialise the PyCaret regression experiment for the dinner target '석식계'.
# This replaces the lunch experiment bound to `reg`; session_id differs (2 vs 1).
# As for lunch, feature types are inferred by PyCaret because
# categorical_features is left commented out.
reg = setup(session_id=2,
            data=X_train_dn,
            target='석식계',
            #numeric_imputation = 'mean',
            normalize = True,
            #categorical_features=['월', '요일', '공휴일전후'],
            silent=True)

Unnamed: 0,Description,Value
0,session_id,2
1,Target,석식계
2,Original Data,"(1205, 10)"
3,Missing Values,False
4,Numeric Features,7
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(843, 13)"


In [18]:
# Rank the candidate regressors for the dinner target by MAE and keep the best
# five; this list is what blend_models receives in the next cell.
top5 = compare_models(n_select=5, sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,65.6956,9730.4975,98.2011,0.4631,1.0544,0.13,0.047
gbr,Gradient Boosting Regressor,66.9076,9674.7951,98.0204,0.478,1.0503,0.1321,0.02
lightgbm,Light Gradient Boosting Machine,67.6686,9689.6621,97.9266,0.4776,1.0355,0.1354,0.039
rf,Random Forest Regressor,67.9291,10198.3036,100.5112,0.4513,1.0555,0.1339,0.065
huber,Huber Regressor,68.006,10952.9215,103.9915,0.4201,1.0779,0.1325,0.006
knn,K Neighbors Regressor,68.5529,10250.2645,100.7932,0.447,1.036,0.1427,0.006
par,Passive Aggressive Regressor,69.1323,11394.344,106.1306,0.3928,1.0841,0.1361,0.007
lasso,Lasso Regression,70.819,10548.2511,102.277,0.4342,1.0642,0.1398,0.006
lr,Linear Regression,71.0086,10524.4809,102.198,0.4312,1.0604,0.1401,0.008
ridge,Ridge Regression,71.0129,10511.5423,102.1306,0.4326,1.0606,0.1402,0.005


In [19]:
# Combine the five selected dinner models into one blended estimator
# (the finalized object printed later is a VotingRegressor).
blended_d = blend_models(top5, optimize='MAE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,67.8305,11678.1826,108.0656,0.4727,1.3119,0.1365
1,62.1728,8179.6177,90.4412,0.2418,0.2337,0.1407
2,65.0434,10802.5087,103.9351,0.4517,1.1414,0.111
3,62.2686,7057.8246,84.0109,0.5541,0.8783,0.1334
4,60.6977,8395.3052,91.6259,0.5346,0.9366,0.121
5,60.1126,7815.4293,88.4049,0.4852,0.9161,0.1185
6,67.0025,9452.2664,97.2228,0.5867,1.2498,0.1308
7,57.9494,6777.51,82.3256,0.6482,1.056,0.1166
8,65.6323,9381.7706,96.8595,0.5514,1.099,0.1259
9,67.1135,12101.6055,110.0073,0.5406,1.6796,0.0981


In [128]:
# Stand-alone Extra Trees model with criterion='mae', apparently for comparison.
# NOTE(review): `et_d` is not used by any later visible cell (the blended model
# is the one finalized), and this cell's execution count is out of sequence.
et_d = create_model('et', criterion='mae')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,70.3089,12095.1836,109.9781,0.4539,1.3105,0.1399
1,69.7558,11467.6603,107.0872,-0.0629,0.446,0.1554
2,64.8721,10542.6419,102.6774,0.4649,1.1264,0.1111
3,65.7452,7095.7637,84.2364,0.5517,0.8232,0.1439
4,63.951,9120.1388,95.4994,0.4945,0.9397,0.127
5,61.391,7688.2595,87.6827,0.4936,0.9177,0.1262
6,68.0401,9731.8926,98.6504,0.5745,1.217,0.1318
7,58.4755,6446.4877,80.29,0.6654,1.014,0.1215
8,71.422,10965.6032,104.7168,0.4757,1.0964,0.1404
9,66.1804,11762.5741,108.4554,0.5535,1.6694,0.0965


In [20]:
# predict_model is called without new data; the printed table is a single row of
# metrics (presumably computed on PyCaret's hold-out split — confirm).
# Note that this overwrites the lunch `pred_holdouts`.
pred_holdouts = predict_model(blended_d)
# Finalized dinner model, used later to score the test rows.
final_model_d = finalize_model(blended_d)
# Bare expression: displays the fitted estimator and its hyperparameters.
final_model_d

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,65.1066,9785.7263,98.9228,0.4965,1.1245,0.1288


VotingRegressor(estimators=[('et',
                             ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                                 criterion='mse',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=-1,
                                                 oob_score=False,
                                  

# 추론

## Test 5일마다 split

In [25]:
# Submission template; its '중식계' and '석식계' columns are filled with
# predictions in a later cell.
submission_df = pd.read_csv('./sample_submission.csv')

In [30]:
res = []

# Raw test columns needed to build the model features for one chunk.
first_chunk_columns = ['일자', '월', '요일', '본사시간외근무명령서승인건수', '공휴일전후',
                       '본사휴가자수', '본사출장자수', '식사가능자']

# First five test rows, taken as an explicit copy: later cells add feature
# columns to X_test, and writing into a slice of another frame is exactly what
# triggers pandas' SettingWithCopy behaviour.
X_test = test[first_chunk_columns].iloc[:5].copy()

In [31]:
X_test

Unnamed: 0,일자,월,요일,본사시간외근무명령서승인건수,공휴일전후,본사휴가자수,본사출장자수,식사가능자
0,2021-01-27,1,2,5,0,88,182,2355.0
1,2021-01-28,1,1,409,0,104,212,2319.0
2,2021-01-29,1,0,0,0,270,249,2170.0
3,2021-02-01,2,3,538,0,108,154,2340.0
4,2021-02-02,2,4,455,0,62,186,2362.0


In [32]:
from datetime import timedelta

# Weekday and month averages are computed from the training history.
X_test = make_dow_avg(X_test, train)
X_test = make_month_avg(X_test, train)

# Same weekday one week earlier, formatted as 'YYYY-MM-DD' so that it can be
# compared with the date strings stored in train['일자'].
index = pd.DatetimeIndex(X_test['일자']) - timedelta(weeks=1)
prev_week_keys = index.strftime('%Y-%m-%d')

# Date-keyed lookup instead of one boolean scan of `train` per test row. It also
# avoids X_test[col][i] = <one-element Series>, which relied on chained
# assignment and on implicit Series -> scalar conversion, and it works for any
# number of rows rather than a hard-coded five.
# Assumes the dates in train['일자'] are unique — TODO confirm.
counts_by_date = train.set_index('일자')[['중식계', '석식계']]
# Dates missing from `train` yield NaN here rather than an error.
X_test['전주중식계'] = counts_by_date['중식계'].reindex(prev_week_keys).to_numpy()
X_test['전주석식계'] = counts_by_date['석식계'].reindex(prev_week_keys).to_numpy()

In [49]:
# Score the prepared test rows with the finalized lunch and dinner models; the
# predictions are read from the 'Label' column of each result in the next cell.
# NOTE(review): X_test carries columns neither model was trained on ('일자', '월',
# and the other meal's lag/average columns) — confirm the pipelines ignore them.
pred_ln = predict_model(final_model_l, X_test)
pred_dn = predict_model(final_model_d, X_test)

In [50]:
# Copy the predictions into the leading rows of the submission frame.
# .loc on the frame itself replaces submission_df[col][:5] = ..., a chained
# assignment that may write to a temporary copy; the row count comes from the
# predictions instead of a hard-coded 5.
for column, preds in (('중식계', pred_ln), ('석식계', pred_dn)):
    # Ensure a float column so the float predictions are stored without a dtype clash.
    submission_df[column] = submission_df[column].astype('float64')
    target_rows = submission_df.index[:len(preds)]
    # to_numpy(): assign positionally, independent of the prediction frame's index.
    submission_df.loc[target_rows, column] = preds['Label'].to_numpy()

In [51]:
submission_df

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,904.510674,389.667138
1,2021-01-28,904.011825,443.825444
2,2021-01-29,594.587829,276.191645
3,2021-02-01,1174.937702,588.83285
4,2021-02-02,1033.847466,548.33717
5,2021-02-03,0.0,0.0
6,2021-02-04,0.0,0.0
7,2021-02-05,0.0,0.0
8,2021-02-08,0.0,0.0
9,2021-02-09,0.0,0.0


In [52]:
# Write the submission file (the 'submission' directory must already exist).
# NOTE(review): the visible cells only fill the first five rows; the displayed
# frame still shows 0.0 for the later dates — confirm before submitting.
submission_df.to_csv('submission/sub_f1.csv', index=False)

In [None]:

# Previous-week lag features for the training table: for every row, the lunch
# and dinner counts recorded on the same weekday one week earlier.
# NOTE(review): this cell sits after the modelling cells and has no execution
# count, while the loaded train.csv already lists these columns — presumably
# this is the code that produced them; confirm and move it before the models.

# pd.Timedelta keeps this cell independent of the `timedelta` import that
# lives in a different cell.
idx = pd.DatetimeIndex(train['일자']) - pd.Timedelta(weeks=1)
prev_week_keys = idx.strftime('%Y-%m-%d')

# Date-keyed lookup instead of one boolean scan of the frame per row.
# Assumes the dates in train['일자'] are unique — TODO confirm.
counts_by_date = train.set_index('일자')[['중식계', '석식계']]

for target_col, lag_col in (('중식계', '전주중식계'), ('석식계', '전주석식계')):
    lagged = pd.Series(
        counts_by_date[target_col].reindex(prev_week_keys).to_numpy(),
        index=train.index,
    )
    # Same fallback as the former bare `except:` branch: when no row exists for
    # the date one week earlier, the row's own value is used.
    # NOTE(review): that fallback copies the target into a feature (leakage for
    # those rows) — consider NaN or imputation instead.
    # int64 matches the dtype train.info() reports for these columns.
    train[lag_col] = lagged.fillna(train[target_col]).astype('int64')

train[['중식계', '전주중식계', '석식계', '전주석식계', '일자']].head(10)