In [1]:
#Library imports
import numpy as np
import pandas as pd
import math
import os
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error

#Deep Learning Library
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Reshape, GRU, RNN

tf.keras.backend.set_floatx('float64')

#### Load data

In [2]:
# 60개의 건물 * 85일 24시간 = 122400

train = pd.read_csv('energy/train.csv', encoding='cp949')

print(train.shape)
train.head()

(122400, 10)


Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [3]:
# 60개의 건물 * 7일 24시간 = 10080

test = pd.read_csv('energy/test.csv', encoding='cp949')

print(test.shape)
test.head()

(10080, 9)


Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,


In [4]:
submission = pd.read_csv('energy/sample_submission.csv', encoding='cp949')

print(submission.shape)
submission.head()

(10080, 2)


Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,0
1,1 2020-08-25 01,0
2,1 2020-08-25 02,0
3,1 2020-08-25 03,0
4,1 2020-08-25 04,0


#### Deep Learning Model 전력사용량만 변수로 사용

In [5]:
#전력사용량(kWh) 정규화

mini = train.iloc[:,2].min()
size = train.iloc[:,2].max() - train.iloc[:,2].min()
train.iloc[:,2] = (train.iloc[:,2] - mini)/size

In [6]:
train.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,0.461072,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,0.458624,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,0.457017,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,0.453729,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,0.453437,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [7]:
input_window = 996 # 임의의 수
output_window = 24 # 168 : 7일 24시간
window = 12 # window는 12시간 마다는 12시간 마다
num_features = 1 # baseline은 feature 하나만 사용
num_power = 60
end_ = 168
lstm_units = 32
dropout = 0.2
EPOCH = 30
BATCH_SIZE = 128

In [8]:
#train을 tensor로 변경 (60, 24*85, 1)
train_x = tf.reshape(train.iloc[:,2].values, [num_power, 24*85, num_features])
print(f'train_x.shape:{train_x.shape}')

train_x.shape:(60, 2040, 1)


In [9]:
#train_window_x np.zeros를 만듬 (60, 85, 996, 1)
train_window_x = np.zeros((train_x.shape[0], (train_x.shape[1]-(input_window + output_window))//window, input_window, num_features))
train_window_y = np.zeros((train_x.shape[0], (train_x.shape[1]-(input_window + output_window))//window, output_window, num_features))
print(f'train_window_x:{train_window_x.shape}')
print(f'train_window_y:{train_window_y.shape}')

train_window_x:(60, 85, 996, 1)
train_window_y:(60, 85, 24, 1)


In [10]:
#train_window_x에 train값 채워넣기
for example in range(train_x.shape[0]):
    for start in range(0, train_x.shape[1]-(input_window + output_window), window):
        end=start+input_window
        train_window_x[example, start//window, :] = train_x[example, start:end, :]
        train_window_y[example, start//window, :] = train_x[example, end:end+output_window, :]

In [11]:
#new_train_x, reshape를 통해 lstm에 알맞은 형태로 집어넣기
new_train_x = tf.reshape(train_window_x, [-1, input_window, num_features])
new_train_y = tf.reshape(train_window_y, [-1, output_window, num_features])
print(f'new_train_x:{new_train_x.shape}')
print(f'new_train_y:{new_train_y.shape}')

new_train_x:(5100, 996, 1)
new_train_y:(5100, 24, 1)


In [12]:
#층 쌓기

model = Sequential([
    LSTM(lstm_units, return_sequences=False, recurrent_dropout=dropout),
    Dense(output_window * num_features, kernel_initializer = tf.initializers.zeros()),
    Reshape([output_window, num_features])
])

In [13]:
#compile

model.compile(optimizer='rmsprop', loss='mae', metrics=['mae'])

# epoch가 끝날 때마다 점(.)을 출력해 훈련 진행 과정을 표시합니다.
class PrintDot(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs):
        if epoch% 10 == 0: print('')
        print('.', end='')
        

# 가장 좋은 성능을 낸 val_loss가 적은 model만 남겨 놓았습니다.
save_best_only = tf.keras.callbacks.ModelCheckpoint(filepath='lstm_model.h5', monitor='val_loss', save_best_only=True)
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

# 검증 손실이 10epoch 동안 좋아지지 않으면 학습률은 0.1배로 재구성하는 명령어입니다.
reduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10)

In [14]:
# fit

model.fit(new_train_x, new_train_y, epochs=EPOCH, batch_size=BATCH_SIZE, validation_split=0.2, verbose=0, callbacks=[PrintDot(), early_stop, save_best_only, reduceLR])

model.summary()


..........
..........
..........Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 32)                4352      
_________________________________________________________________
dense (Dense)                (None, 24)                792       
_________________________________________________________________
reshape (Reshape)            (None, 24, 1)             0         
Total params: 5,144
Trainable params: 5,144
Non-trainable params: 0
_________________________________________________________________


In [15]:
prediction = np.zeros((num_power, end_, num_features))
new_test_x = train_x

for i in range(end_//output_window):
    start_ = i * output_window
    next_ = model.predict(new_test_x[:, -input_window:, :])
    new_test_x = tf.concat([new_test_x, next_], axis=1)
    print(new_test_x.shape)
    prediction[:,start_: start_ + output_window, :] = next_
    
prediction = prediction * size + mini

(60, 2064, 1)
(60, 2088, 1)
(60, 2112, 1)
(60, 2136, 1)
(60, 2160, 1)
(60, 2184, 1)
(60, 2208, 1)


In [16]:
submission['answer'] = prediction.reshape([-1,1])
submission

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,8824.452133
1,1 2020-08-25 01,8847.972716
2,1 2020-08-25 02,8780.386005
3,1 2020-08-25 03,8748.747465
4,1 2020-08-25 04,8751.227653
...,...,...
10075,60 2020-08-31 19,5456.702919
10076,60 2020-08-31 20,4924.870355
10077,60 2020-08-31 21,4401.827953
10078,60 2020-08-31 22,4255.104692


In [17]:
submission.to_csv('baseline_submission1.csv', index=False)