In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import os

In [3]:
# 재생산성을 위해 시드 고정
np.random.seed(7)
random.seed(7)
tf.random.set_seed(7)

In [4]:
train = pd.read_csv('../input/train.csv', index_col = 'id')
test = pd.read_csv('../input/test.csv', index_col = 'id')

In [6]:
# 기상청 데이터만 추출
X_train = train.loc[:,'X00':'X39']

# standardization을 위해 평균과 표준편차 구하기
MEAN = X_train.mean()
STD = X_train.std()

# 표준편차가 0일 경우 대비하여 1e-07 추가 
X_train = (X_train - MEAN) / (STD + 1e-07)

In [7]:
# RNN 모델에 입력 할 수 있는 시계열 형태로 데이터 변환 
def convert_to_timeseries(df, interval):
    sequence_list = []
    target_list = []
    
    for i in tqdm(range(df.shape[0] - interval)):
        sequence_list.append(np.array(df.iloc[i:i+interval,:-1]))
        target_list.append(df.iloc[i+interval,-1])
    
    sequence = np.array(sequence_list)
    target = np.array(target_list)
    
    return sequence, target

In [29]:
_sequence.shape

(4308, 12, 40)

In [30]:
_target.shape

(4308,)

In [8]:
y_columns = ['Y15','Y16']

In [9]:
# t시점 이전 120분의 데이터로 t시점의 온도를 추정할 수 있는 학습데이터 형성
sequence = np.empty((0, 12, 40))
target = np.empty((0,))
for column in y_columns :
    
    concat = pd.concat([X_train, train[column]], axis = 1)

    _sequence, _target = convert_to_timeseries(concat.head(144*30), interval = 12)

    sequence = np.vstack((sequence, _sequence))
    target = np.hstack((target, _target))

100%|██████████| 4308/4308 [00:01<00:00, 3411.60it/s]
100%|██████████| 4308/4308 [00:01<00:00, 3472.77it/s]


In [32]:
# convert_to_timeseries 함수를 쓰기 위한 dummy feature 생성
X_train['dummy'] = 0

In [33]:
X_train

Unnamed: 0_level_0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,X31,X32,X33,X34,X35,X36,X37,X38,X39,dummy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-2.236377,0.053243,-0.288997,-0.911345,-0.286742,0.284562,0.174782,-2.160752,0.315216,0.396435,...,-2.109779,-1.766623,0.297981,-1.015881,0.482017,-0.274946,0.577720,0.240703,-0.234576,0
1,-2.321347,0.083006,0.177482,0.477320,-0.286742,0.284562,0.174782,-2.183598,0.343622,0.424634,...,-2.091664,-1.842501,0.297981,-1.015881,0.070290,-0.274946,0.582841,0.289700,-0.234576,0
2,-2.300105,0.112769,-0.382293,0.904602,-0.286742,0.255107,0.204162,-2.183598,0.372029,0.481033,...,-2.146009,-1.956318,0.268916,-1.015881,0.269124,-0.274946,0.582841,0.311477,-0.234576,0
3,-2.300105,0.083006,-0.009110,-0.804525,-0.286742,0.255107,0.174782,-2.206443,0.343622,0.452834,...,-2.200355,-2.013226,0.297981,-1.015881,0.056231,-0.274946,0.593083,0.344142,-0.234576,0
4,-2.342589,0.083006,-0.662181,0.263679,-0.286742,0.255107,0.204162,-2.206443,0.372029,0.452834,...,-2.254700,-2.013226,0.297981,-1.015881,-0.336416,-0.274946,0.618688,0.371363,-0.234576,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4747,-0.069655,-0.303908,-0.568885,-0.697704,-0.286742,-0.422352,-0.383432,0.009577,-0.366541,-0.280350,...,-0.388845,-0.173188,-0.457686,0.930244,0.102425,-0.274946,0.838894,0.022936,-0.126289,0
4748,-0.069655,-0.303908,-0.942069,-0.804525,-0.286742,-0.451807,-0.383432,-0.013269,-0.366541,-0.280350,...,-0.388845,-0.154219,-0.457686,0.930244,-0.468972,-0.274946,0.849136,0.050157,-0.126289,0
4749,-0.112139,-0.274146,-0.568885,-0.911345,-0.286742,-0.422352,-0.412812,-0.058960,-0.394948,-0.252150,...,-0.370730,-0.154219,-0.457686,0.930244,0.459924,-0.274946,0.874742,0.039269,-0.126289,0
4750,-0.175867,-0.274146,-0.568885,-0.697704,-0.286742,-0.422352,-0.354052,-0.081806,-0.338135,-0.252150,...,-0.388845,-0.135249,-0.428622,0.930244,0.919853,-0.274946,0.884984,0.088266,-0.126289,0


In [34]:
# train set에서 도출된 평균과 표준편차로 standardization 실시 
test = (test - MEAN) / (STD + 1e-07)

In [35]:
# convert_to_timeseries 함수를 쓰기 위한 dummy feature 생성
test['dummy'] = 0

In [36]:
# train과 test 기간을 합쳐서 120분 간격으로 학습데이터 재구축
X_test, _ = convert_to_timeseries(pd.concat([X_train, test], axis = 0), interval=12)

100%|██████████| 16260/16260 [00:05<00:00, 2739.85it/s]


In [37]:
# test set 기간인 후반부 80일에 맞게 자르기 
X_test = X_test[-11520:, :, :]

In [38]:
# 만들어 두었던 dummy feature 제거
X_train.drop('dummy', axis = 1, inplace = True)
test.drop('dummy', axis = 1, inplace = True)

In [39]:
# 간단한 lstm 모델 구축하기 
simple_lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape=sequence.shape[-2:]),
    tf.keras.layers.Dense(256, activation='linear'),
    tf.keras.layers.Dense(128, activation='linear'),
    tf.keras.layers.Dense(1)
])

simple_lstm_model.compile(optimizer='adam', loss='mse')

In [40]:
# loss가 4미만으로 떨어지면 학습 종료 시키는 기능
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs = None):
        if(logs.get('loss') < 4):
            print('\n Loss is under 4, cancelling training')
            self.model.stop_training = True

In [42]:
callbacks = myCallback()


# 모델 학습
simple_lstm_model.fit(    
    sequence, target,
    epochs=60,
    batch_size=128,
    verbose=2,
    shuffle=False,
    callbacks = [callbacks]
)


Train on 8616 samples
Epoch 1/60
8616/8616 - 3s - loss: 153.0562
Epoch 2/60
8616/8616 - 1s - loss: 10.5764
Epoch 3/60
8616/8616 - 1s - loss: 6.9967
Epoch 4/60
8616/8616 - 1s - loss: 5.7360
Epoch 5/60
8616/8616 - 1s - loss: 5.1107
Epoch 6/60
8616/8616 - 1s - loss: 4.7538
Epoch 7/60
8616/8616 - 1s - loss: 4.5645
Epoch 8/60
8616/8616 - 1s - loss: 4.6706
Epoch 9/60
8616/8616 - 1s - loss: 4.8616
Epoch 10/60
8616/8616 - 1s - loss: 5.6113
Epoch 11/60
8616/8616 - 1s - loss: 6.8359
Epoch 12/60
8616/8616 - 1s - loss: 11.2073
Epoch 13/60
8616/8616 - 1s - loss: 12.8778
Epoch 14/60
8616/8616 - 1s - loss: 9.3709
Epoch 15/60
8616/8616 - 1s - loss: 6.6584
Epoch 16/60
8616/8616 - 1s - loss: 4.6112
Epoch 17/60
8616/8616 - 1s - loss: 4.1208
Epoch 18/60

 Loss is under 4, cancelling training
8616/8616 - 1s - loss: 3.8928


<tensorflow.python.keras.callbacks.History at 0x7f90759c0668>

In [43]:
# LSTM 레이어는 고정
simple_lstm_model.layers[0].trainable = False

In [44]:
# fine tuning 할 때 사용할 학습데이터 생성 (Y18)
finetune_X, finetune_y = convert_to_timeseries(pd.concat([X_train.tail(432), train['Y18'].tail(432)], axis = 1), interval=12)

100%|██████████| 420/420 [00:00<00:00, 2193.75it/s]


In [45]:
# LSTM 레이어는 고정 시켜두고, DNN 레이어에 대해서 fine tuning 진행 (Transfer Learning)
finetune_history = simple_lstm_model.fit(
            finetune_X, finetune_y,
            epochs=20,
            batch_size=64,
            shuffle=False,
            verbose = 2)

Train on 420 samples
Epoch 1/20
420/420 - 0s - loss: 13.5430
Epoch 2/20
420/420 - 0s - loss: 17.2472
Epoch 3/20
420/420 - 0s - loss: 7.6325
Epoch 4/20
420/420 - 0s - loss: 6.1783
Epoch 5/20
420/420 - 0s - loss: 3.5916
Epoch 6/20
420/420 - 0s - loss: 2.6432
Epoch 7/20
420/420 - 0s - loss: 2.1865
Epoch 8/20
420/420 - 0s - loss: 1.7699
Epoch 9/20
420/420 - 0s - loss: 1.5259
Epoch 10/20
420/420 - 0s - loss: 1.3655
Epoch 11/20
420/420 - 0s - loss: 1.2133
Epoch 12/20
420/420 - 0s - loss: 1.1420
Epoch 13/20
420/420 - 0s - loss: 1.0342
Epoch 14/20
420/420 - 0s - loss: 0.9988
Epoch 15/20
420/420 - 0s - loss: 0.9121
Epoch 16/20
420/420 - 0s - loss: 0.8895
Epoch 17/20
420/420 - 0s - loss: 0.8235
Epoch 18/20
420/420 - 0s - loss: 0.8058
Epoch 19/20
420/420 - 0s - loss: 0.7538
Epoch 20/20
420/420 - 0s - loss: 0.7383


In [50]:
X_test

array([[[ 0.18525395, -0.27414579, -0.10240563, ...,  0.64941447,
         -0.11861225, -0.12628906],
        [ 0.16401156, -0.27414579,  0.08418623, ...,  0.6442934 ,
         -0.13494475, -0.12628906],
        [ 0.14276918, -0.27414579, -1.2219568 , ...,  0.66989875,
         -0.11861225, -0.12628906],
        ...,
        [-0.11213939, -0.27414579, -0.56888529, ...,  0.8747415 ,
          0.03926857, -0.12628906],
        [-0.17586653, -0.27414579, -0.56888529, ...,  0.88498364,
          0.08826607, -0.12628906],
        [-0.23959367, -0.30390844, -0.47558936, ...,  0.92595219,
          0.07193357, -0.12628906]],

       [[ 0.16401156, -0.27414579,  0.08418623, ...,  0.6442934 ,
         -0.13494475, -0.12628906],
        [ 0.14276918, -0.27414579, -1.2219568 , ...,  0.66989875,
         -0.11861225, -0.12628906],
        [ 0.1215268 , -0.30390844,  1.01714554, ...,  0.69038302,
         -0.09139142, -0.12628906],
        ...,
        [-0.17586653, -0.27414579, -0.56888529, ...,  

In [46]:
# 예측하기 
finetune_pred = simple_lstm_model.predict(X_test)

In [47]:
# 제출 파일 만들기
submit = pd.DataFrame({'id':range(144*33, 144*113),
              'Y18':finetune_pred.reshape(1,-1)[0]})

In [48]:
submit.to_csv('baseline_result.csv', index = False)