In [10]:
# 실행 환경 확인

import pandas as pd
import numpy as np
import sklearn
import scipy
import statsmodels
import mlxtend
import sys
import xgboost as xgb

# Report the interpreter version, then the name and version of each library
# imported above, so the run environment is recorded next to the results.
print(sys.version)
for module in (pd, np, sklearn, scipy, mlxtend, statsmodels, xgb):
    print(module.__name__, module.__version__)

3.7.4 (tags/v3.7.4:e09359112e, Jul  8 2019, 20:34:20) [MSC v.1916 64 bit (AMD64)]
pandas 0.25.1
numpy 1.18.5
sklearn 0.21.3
scipy 1.5.2
mlxtend 0.15.0.0
statsmodels 0.11.1
xgboost 0.80


In [11]:
# Load the electricity readings; both timestamp columns ('Date' and 'DateHour')
# are parsed as datetimes at read time.
df_elec = pd.read_csv(
    'elec.csv',
    parse_dates=['Date', 'DateHour'],
)
df_elec

Unnamed: 0,Date,DateHour,Minute,Value
0,2021-01-01 00:15:00,2021-01-01 00:00:00,15분,62
1,2021-01-01 00:30:00,2021-01-01 00:00:00,30분,61
2,2021-01-01 00:45:00,2021-01-01 00:00:00,45분,61
3,2021-01-01 01:00:00,2021-01-01 00:00:00,60분,61
4,2021-01-01 01:15:00,2021-01-01 01:00:00,15분,96
...,...,...,...,...
24667,2021-09-14 23:00:00,2021-09-14 22:00:00,60분,114
24668,2021-09-14 23:15:00,2021-09-14 23:00:00,15분,117
24669,2021-09-14 23:30:00,2021-09-14 23:00:00,30분,119
24670,2021-09-14 23:45:00,2021-09-14 23:00:00,45분,112


In [12]:
# Load the hourly side information (production, weather, tariff, staffing).
# 'Date' is parsed as a datetime here because it is later used as the join key
# against the DatetimeIndex derived from 'DateHour'; leaving it as strings makes
# that join depend on implicit string-to-datetime coercion.
df_info = pd.read_csv('info.csv', parse_dates=['Date'])
df_info

Unnamed: 0,Date,생산량,기온,풍속,습도,강수량,전기요금(계절),공장인원,인건비
0,2021-01-01 00:00:00,0,-3.2,2.4,71,0.0,109.8,0.000000,1.5
1,2021-01-01 01:00:00,0,-4.5,1.5,77,0.0,109.8,0.000000,1.5
2,2021-01-01 02:00:00,0,-3.9,2.6,58,0.0,109.8,0.000000,1.5
3,2021-01-01 03:00:00,0,-4.1,2.6,56,0.0,109.8,0.000000,1.5
4,2021-01-01 04:00:00,0,-4.6,2.6,60,0.0,109.8,0.000000,1.5
...,...,...,...,...,...,...,...,...,...
6163,2021-09-14 19:00:00,1497,21.7,3.6,85,9.4,167.2,2.442088,1.5
6164,2021-09-14 20:00:00,45,22.2,4.2,78,9.4,167.2,0.087891,1.5
6165,2021-09-14 21:00:00,149,22.2,4.3,76,9.4,167.2,0.290448,1.5
6166,2021-09-14 22:00:00,66,22.0,2.5,79,9.4,167.2,0.148984,1.5


In [13]:
# Reshape to one row per 'DateHour' with one column per 'Minute' label,
# each cell holding the corresponding 'Value'.
df_elec1 = (
    df_elec
    .set_index(['DateHour', 'Minute'])['Value']
    .unstack()
)

In [14]:
# Build the modelling table: calendar features + hourly totals + side
# information + next-day target + six daily lags.

# Holiday dates, converted to `date` objects because they are compared against
# calendar dates (not timestamps) below.
holi = pd.to_datetime(
    ["2021-01-01", "2021-02-11", "2021-02-12", "2021-03-01", "2021-05-05", "2021-05-19", "2021-08-16"]
).date
min_cols = ['15분', '30분', '45분', '60분']
df_elec1 = df_elec1.assign(
    DayName = df_elec1.index.weekday,
    Hour = df_elec1.index.hour,
    # 1 for morning hours (0-11), 0 for hours 12-23. The previous condition
    # (hour >= 12) set the flag for the afternoon, contradicting the name "AM".
    AM = (df_elec1.index.hour < 12).astype(int),
    Weekend_yn = df_elec1.index.weekday.isin([5, 6]).astype(int),
    # index.date is a NumPy array, hence np.isin rather than a pandas .isin.
    Holiday_yn = np.isin(df_elec1.index.date, holi).astype(int),
    Avg = df_elec1[min_cols].mean(axis=1),
    TotalHour = df_elec1[min_cols].sum(axis=1),
)
df_info1 = df_info.fillna(0)
# Inner join on the hourly timestamp: only hours present in both tables remain.
df_basetable1 = df_elec1.join(df_info1.set_index('Date'), how='inner')
# Target = TotalHour 24 rows ahead.
# NOTE(review): shift() moves by rows, not by time, so this (and the lags
# below) assumes one row per consecutive hour with no gaps - confirm.
df_basetable1['target'] = df_basetable1['TotalHour'].shift(-24)
# lag_i = TotalHour 24*i rows earlier, for i = 1..6. dropna() then removes every
# row containing a NaN, which includes the leading rows without a full lag
# history and the trailing rows without a target.
df_prob6 = pd.concat(
    [df_basetable1] +
    [df_basetable1['TotalHour'].shift(24 * i).rename('lag_{}'.format(i)) for i in range(1, 7)],
    axis=1
).dropna()

In [15]:
# Split at 2021-08-14: rows before the cutoff form the training set
# (df_prob6_train); rows from the cutoff onward form df_prob6_test, which is
# treated as if only its inputs were known.
train_period = df_prob6.index < '2021-08-14'
test_period = df_prob6.index >= '2021-08-14'
df_prob6_train = df_prob6.loc[train_period].copy()
df_prob6_test = df_prob6.loc[test_period].copy()

In [16]:
# Within the training set, carve out a second split at 2021-07-14:
# df_train is used for fitting and df_test for local evaluation.
fit_period = df_prob6_train.index < '2021-07-14'
eval_period = df_prob6_train.index >= '2021-07-14'
df_train = df_prob6_train.loc[fit_period].copy()
df_test = df_prob6_train.loc[eval_period].copy()

In [20]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

In [18]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Baseline model: standardise the current hourly total and the six daily lags,
# then fit an RBF-kernel SVR on them.
X_svr = ['TotalHour', *(f'lag_{k}' for k in range(1, 7))]
reg_svm = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=10, gamma=0.1),
)
reg_svm.fit(df_train[X_svr], df_train['target'])

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svr',
                 SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
                     gamma=0.1, kernel='rbf', max_iter=-1, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [21]:
# Mean absolute error of the baseline SVR on the local evaluation split.
mean_absolute_error(
    y_true=df_test['target'],
    y_pred=reg_svm.predict(df_test[X_svr]),
)

148.65000821712516

In [26]:
# Produce a submission from the baseline: refit the SVR pipeline on the full
# training set, predict the test period, and write it to answer6.csv with a
# single 'TotalHour' column indexed like df_prob6_test.
reg_svm.fit(df_prob6_train[X_svr], df_prob6_train['target'])
pd.DataFrame(
    {'TotalHour': reg_svm.predict(df_prob6_test[X_svr])},
    index=df_prob6_test.index,
).to_csv('answer6.csv')

In [28]:
# Add DayName, which ranked high in feature importance, to the SVR inputs.
from sklearn.preprocessing import OneHotEncoder

# First entry is the categorical weekday; the rest are the numeric features.
X_svr_2 = ['DayName', 'TotalHour'] + [f'lag_{k}' for k in range(1, 7)]
ct = ColumnTransformer([
    # One-hot encode the weekday column.
    ('ohe', OneHotEncoder(sparse=False, categories='auto'), X_svr_2[:1]),
    # Standardise the hourly total and its six daily lags.
    ('std', StandardScaler(), X_svr_2[1:]),
])

reg_svm_2 = make_pipeline(ct, SVR(kernel='rbf', C=10, gamma=0.1))
reg_svm_2.fit(df_train[X_svr_2], df_train['target'])

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ohe',
                                                  OneHotEncoder(categorical_features=None,
                                                                categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                n_values=None,
                                                                sparse=False),
                                                  ['DayName']),
                                             

In [30]:
# A slight improvement over the baseline is visible.
mean_absolute_error(
    y_true=df_test['target'],
    y_pred=reg_svm_2.predict(df_test[X_svr_2]),
)

145.77442370868258

In [31]:
# Try the GradientBoostingRegressor that was used in problem 4.
from sklearn.ensemble import GradientBoostingRegressor

# Feature names are kept as one comma-separated string and split into a list.
X_gb = [
    name.strip()
    for name in "15분, 30분, 45분, 60분, DayName, Hour, AM, Weekend_yn, Holiday_yn, Avg, TotalHour, 생산량, 기온, 풍속, 습도, 강수량, 전기요금(계절), 공장인원, 인건비".split(',')
]
reg_gb = GradientBoostingRegressor(
    n_estimators=75, max_depth=10, min_samples_leaf=4,
    random_state=123,  # fixed seed for reproducible fits
)
reg_gb.fit(df_train[X_gb], df_train['target'])

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=10,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=4, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=75,
                          n_iter_no_change=None, presort='auto',
                          random_state=123, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [33]:
# Considerably improved compared with the SVR models.
# Predict with X_gb, the feature list reg_gb was fitted on; the previously used
# name X_cols is not defined anywhere in this notebook and fails on a fresh kernel.
mean_absolute_error(df_test['target'], reg_gb.predict(df_test[X_gb]))

112.8586555932534

In [37]:
from sklearn.ensemble import VotingRegressor
# Combine the SVR pipeline and the gradient-boosting model into a voting ensemble.
reg_vt = VotingRegressor([
    ('svr', reg_svm_2),
    ('gb', reg_gb)
])
# Ensemble input = gradient-boosting features + the six daily lags, so that the
# SVR pipeline finds the columns its ColumnTransformer selects. Built from X_gb:
# the previously used name X_cols is not defined anywhere in this notebook.
# Note that the gradient-boosting member therefore also sees the lag columns,
# unlike the standalone reg_gb fitted earlier.
X_vt = X_gb + ['lag_{}'.format(i) for i in range(1, 7)]
reg_vt.fit(df_train[X_vt], df_train['target'])

VotingRegressor(estimators=[('svr',
                             Pipeline(memory=None,
                                      steps=[('columntransformer',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('ohe',
                                                                               OneHotEncoder(categorical_features=None,
                                                                                             categories='auto',
                                                                                             drop=None,
                                                                                         

In [38]:
# Mean absolute error of the voting ensemble on the local evaluation split.
mean_absolute_error(
    y_true=df_test['target'],
    y_pred=reg_vt.predict(df_test[X_vt]),
)

146.83497429612407

In [40]:
# Submission from the gradient-boosting model: refit reg_gb on the full
# training set, predict the test period, and write it to answer6.csv
# (the same file name the baseline submission used).
X_gb = [
    name.strip()
    for name in "15분, 30분, 45분, 60분, DayName, Hour, AM, Weekend_yn, Holiday_yn, Avg, TotalHour, 생산량, 기온, 풍속, 습도, 강수량, 전기요금(계절), 공장인원, 인건비".split(',')
]
reg_gb.fit(df_prob6_train[X_gb], df_prob6_train['target'])
pd.DataFrame(
    {'TotalHour': reg_gb.predict(df_prob6_test[X_gb])},
    index=df_prob6_test.index,
).to_csv('answer6.csv')