앞서 전처리 과정에서 이상치 건물을 확인  
다중공선성을 고려한 열 재설정

In [1]:
import sys
import sktime
import tqdm as tq
import xgboost as xgb
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from xgboost import XGBRegressor

pd.set_option('display.max_columns', 30)

In [17]:
train = pd.read_csv('./data/train.csv')
train.head(3)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88


In [18]:
test = pd.read_csv('./data/test.csv')
test.head(3)

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75


In [20]:
## 변수 영문명으로 변경
cols = ['num_date_time','num', 'date_time', 'temp', 'prec','wind' ,'hum', 'hr', 'mj/m2', 'power']
train.columns = cols
train.head()

Unnamed: 0,num_date_time,num,date_time,temp,prec,wind,hum,hr,mj/m2,power
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


In [21]:
# train.csv 에만 있는 열 제거 num_date_time, hr, mj/m2
train = train.drop(['num_date_time','hr','mj/m2'], axis=1)
train.head()

Unnamed: 0,num,date_time,temp,prec,wind,hum,power
0,1,20220601 00,18.6,,0.9,42.0,1085.28
1,1,20220601 01,18.0,,1.1,45.0,1047.36
2,1,20220601 02,17.7,,1.5,45.0,974.88
3,1,20220601 03,16.7,,1.4,48.0,953.76
4,1,20220601 04,18.4,,2.8,43.0,986.4


In [22]:
train.isnull().sum()

num               0
date_time         0
temp              0
prec         160069
wind             19
hum               9
power             0
dtype: int64

In [24]:
# 강수량 결측치 0.0으로 채우기
train['prec'].fillna(0.0, inplace=True)

In [32]:
train.isnull().sum()

num           0
date_time     0
temp          0
prec          0
wind         19
hum           9
power         0
dtype: int64

In [37]:
train.wind = train.wind.interpolate(method = 'polynomial', order = 3)
train.hum = train.hum.interpolate(method = 'polynomial', order = 3)

In [38]:
train.isnull().sum()

num          0
date_time    0
temp         0
prec         0
wind         0
hum          0
power        0
dtype: int64

In [39]:
# 시간 관련 변수들 생성
train.date_time = pd.to_datetime(train.date_time)
train['hour'] = train.date_time.dt.hour
train['weekday'] = train.date_time.dt.weekday
train['month'] = train.date_time.dt.month
train['day'] = train.date_time.dt.day
train['weekend'] = train['weekday'].isin([5,6]).astype(int)
train.head()

Unnamed: 0,num,date_time,temp,prec,wind,hum,power,hour,weekday,month,day,weekend
0,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,1085.28,0,2,6,1,0
1,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,1047.36,1,2,6,1,0
2,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,974.88,2,2,6,1,0
3,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,953.76,3,2,6,1,0
4,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,986.4,4,2,6,1,0


In [41]:
train.day.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int64)

In [44]:
## 건물별, 요일별, 시간별 발전량 평균 넣어주기
power_mean = pd.pivot_table(train, values = 'power', index = ['num', 'hour', 'weekday'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train['day_hour_mean'] = train.progress_apply(lambda x : power_mean.loc[(power_mean.num == x['num']) & (power_mean.hour == x['hour']) & (power_mean.weekday == x['weekday']) ,'power'].values[0], axis = 1)
train.head()

  0%|          | 0/204000 [00:00<?, ?it/s]

100%|██████████| 204000/204000 [02:40<00:00, 1267.71it/s]


Unnamed: 0,num,date_time,temp,prec,wind,hum,power,hour,weekday,month,day,weekend,day_hour_mean
0,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,1085.28,0,2,6,1,0,1774.744615
1,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,1047.36,1,2,6,1,0,1687.347692
2,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,974.88,2,2,6,1,0,1571.483077
3,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,953.76,3,2,6,1,0,1522.153846
4,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,986.4,4,2,6,1,0,1506.793846


In [45]:
## 건물별 시간별 발전량 평균 넣어주기
power_hour_mean = pd.pivot_table(train, values = 'power', index = ['num', 'hour'], aggfunc = np.mean).reset_index()
tqdm.pandas()
train['hour_mean'] = train.progress_apply(lambda x : power_hour_mean.loc[(power_hour_mean.num == x['num']) & (power_hour_mean.hour == x['hour']) ,'power'].values[0], axis = 1)
train.head()

100%|██████████| 204000/204000 [01:57<00:00, 1739.22it/s]


Unnamed: 0,num,date_time,temp,prec,wind,hum,power,hour,weekday,month,day,weekend,day_hour_mean,hour_mean
0,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,1085.28,0,2,6,1,0,1774.744615,1706.318118
1,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,1047.36,1,2,6,1,0,1687.347692,1622.620235
2,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,974.88,2,2,6,1,0,1571.483077,1506.971294
3,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,953.76,3,2,6,1,0,1522.153846,1437.365647
4,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,986.4,4,2,6,1,0,1506.793846,1447.321412


In [46]:
## 건물별 시간별 발전량 표준편차 넣어주기
power_hour_std = pd.pivot_table(train, values = 'power', index = ['num', 'hour'], aggfunc = np.std).reset_index()
tqdm.pandas()
train['hour_std'] = train.progress_apply(lambda x : power_hour_std.loc[(power_hour_std.num == x['num']) & (power_hour_std.hour == x['hour']) ,'power'].values[0], axis = 1)
train.head()

100%|██████████| 204000/204000 [01:14<00:00, 2750.00it/s]


Unnamed: 0,num,date_time,temp,prec,wind,hum,power,hour,weekday,month,day,weekend,day_hour_mean,hour_mean,hour_std
0,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,1085.28,0,2,6,1,0,1774.744615,1706.318118,446.882767
1,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,1047.36,1,2,6,1,0,1687.347692,1622.620235,439.662704
2,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,974.88,2,2,6,1,0,1571.483077,1506.971294,412.071906
3,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,953.76,3,2,6,1,0,1522.153846,1437.365647,391.205981
4,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,986.4,4,2,6,1,0,1506.793846,1447.321412,381.099697


In [48]:
### 공휴일 변수 추가
### 공휴일이면 1 아니면 0
### 6월 1일, 6월 6일, 8월 15일 임시공휴일로 지정이므로 1로 변경
train['holiday'] = train.apply(lambda x : 0 if x['weekday']<5 else 1, axis = 1)
train.loc[('2022-06-01'<=train.date_time)&(train.date_time<'2022-06-02'), 'holiday'] = 1
train.loc[('2022-06-06'<=train.date_time)&(train.date_time<'2022-06-07'), 'holiday'] = 1
train.loc[('2022-08-15'<=train.date_time)&(train.date_time<'2022-08-16'), 'holiday'] = 1
train.head()

Unnamed: 0,num,date_time,temp,prec,wind,hum,power,hour,weekday,month,day,weekend,day_hour_mean,hour_mean,hour_std,holiday
0,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,1085.28,0,2,6,1,0,1774.744615,1706.318118,446.882767,1
1,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,1047.36,1,2,6,1,0,1687.347692,1622.620235,439.662704,1
2,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,974.88,2,2,6,1,0,1571.483077,1506.971294,412.071906,1
3,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,953.76,3,2,6,1,0,1522.153846,1437.365647,391.205981,1
4,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,986.4,4,2,6,1,0,1506.793846,1447.321412,381.099697,1
