# Library

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
from datetime import datetime
import pandas as pd 
import numpy as np 
import keras
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import train_test_split
from keras.models import Sequential 
from keras.layers import Dense, Dropout, LSTM
from sklearn.metrics import mean_squared_error
from itertools import product
import pickle

from numba import cuda
import gc

# Switch the working directory to the project folder so that the relative
# paths used later in the notebook ('rf_data/', 'water_data/', 'models/', the
# pickle and CSV outputs) resolve against it.
# NOTE(review): this absolute path is machine-specific; adjust it before
# running on another machine.
os.chdir(r'C:\Users\Nyoths\Desktop\한강')

# GPU 설정

In [None]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Memory growth has to be configured identically on every visible GPU,
    # so enable it for all of them rather than only for the first device.
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
  except RuntimeError as e:
    # Memory growth must be set at program start, before the GPUs are
    # initialized; otherwise TensorFlow raises a RuntimeError.
    print(e)

# Load Data

In [None]:
def read_path_csv(path):
    """Read every ``.csv`` file in a directory into a single DataFrame.

    Parameters
    ----------
    path : str
        Directory that holds the CSV files. A trailing path separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        Rows of all CSV files stacked in file-name order with a fresh
        0..n-1 index. An empty DataFrame when the directory contains no
        ``.csv`` files.
    """
    # os.listdir returns names in arbitrary order; sort for a reproducible result.
    csv_names = sorted(name for name in os.listdir(path) if name.endswith('.csv'))

    frames = []
    for name in csv_names:
        # os.path.join works whether or not `path` ends with a separator.
        frames.append(pd.read_csv(os.path.join(path, name)))
        print('read', name)

    if not frames:
        # pd.concat raises on an empty list; keep the "empty frame" contract.
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load every CSV under 'rf_data/' and order the rows by ascending 'ymdhm',
# renumbering the index from 0.
rf_data = (
    read_path_csv('rf_data/')
    .sort_values('ymdhm')
    .reset_index(drop=True)
)

read rf_2012.csv
read rf_2013.csv
read rf_2014.csv
read rf_2015.csv
read rf_2016.csv
read rf_2017.csv
read rf_2018.csv
read rf_2019.csv
read rf_2020.csv
read rf_2021.csv
read rf_2022.csv


In [None]:
# Load every CSV under 'water_data/' and order the rows by ascending 'ymdhm',
# renumbering the index from 0.
water_data = (
    read_path_csv('water_data/')
    .sort_values('ymdhm')
    .reset_index(drop=True)
)

read data_2012.csv
read data_2013.csv
read data_2014.csv
read data_2015.csv
read data_2016.csv
read data_2017.csv
read data_2018.csv
read data_2019.csv
read data_2020.csv
read data_2021.csv
read data_2022.csv


# 전처리

In [None]:
# Helper that removes all rows falling inside a given date span.
def drop_date_range(df, start, end):
    """Return a copy of ``df`` without the rows dated ``start``..``end``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'ymd'`` column holding ``YYYY-MM-DD`` strings.
    start, end : str
        First and last day to remove (inclusive), as ``YYYY-MM-DD``.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with the index renumbered from 0.
    """
    first_day = datetime.strptime(start, "%Y-%m-%d")
    last_day = datetime.strptime(end, "%Y-%m-%d")

    # One calendar day per entry, both endpoints included.
    n_days = (last_day - first_day).days + 1
    excluded_days = pd.date_range(first_day, periods=n_days).strftime("%Y-%m-%d").tolist()

    keep_mask = ~df['ymd'].isin(excluded_days)
    return df.loc[keep_mask].reset_index(drop=True)

In [None]:
# Join the two sources on their shared timestamp column.
df = pd.merge(rf_data, water_data, on='ymdhm')
# Calendar helper columns sliced out of the timestamp string.
df['ymd'] = df['ymdhm'].str[0:10]   # date (first 10 characters)
df['year'] = df['ymdhm'].str[0:4]   # year
df['month'] = df['ymdhm'].str[5:7]  # month
df['day'] = df['ymdhm'].str[8:10]   # day: both digits ([8:9] kept only the first one)
df = df.drop(['fw_1018680'], axis=1)  # drop the fw_1018680 column
df = drop_date_range(df, '2021-10-14', '2021-10-31')  # remove the span where tide_level is missing
# Linear interpolation of the remaining gaps. Only numeric columns are
# interpolated: the string columns (timestamp and calendar helpers) cannot be
# interpolated, and newer pandas versions warn or raise when they are passed in.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].interpolate(method="linear")
# Drop the trailing 6912 rows reserved as test data
# (6912 = 48 * 144, i.e. 48 windows of the 144-row day used further below).
df = df[:-6912]

In [None]:
# Fit one MinMaxScaler on the input features and one on the targets.

X_col_list = [
    'tototf', 'tide_level',
    'wl_1018662', 'fw_1018662', 'wl_1018680', 'wl_1018683',
    'fw_1018683', 'wl_1019630', 'fw_1019630',
]

y_col_list = ['wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']

data_scaler = MinMaxScaler()
data_scaler.fit(df.loc[:, X_col_list])

target_scaler = MinMaxScaler()
target_scaler.fit(df.loc[:, y_col_list])

# `data` and `target` end up bound to empty lists, as in the original cell,
# so no reference to the column subsets is kept around.
data, target = [], []

In [None]:
def create_data_set(df, window_size, X_col_list, y_col_list, data_scaler, target_scaler,
                    year_list=('2012', '2013', '2014', '2015', '2016', '2017',
                               '2018', '2019', '2020', '2021', '2022')):
    """Build sliding-window samples, one year at a time.

    For every year in ``year_list`` the rows of that year are scaled and cut
    into windows of ``window_size`` consecutive rows; the target of a window
    is the row that immediately follows it. Windows never span two years
    because each year is processed separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame with a ``'year'`` column (compared against the entries
        of ``year_list``) plus the feature and target columns.
    window_size : int
        Number of consecutive rows per input window.
    X_col_list, y_col_list : list of str
        Feature and target column names.
    data_scaler, target_scaler
        Fitted objects exposing ``transform`` for features / targets.
    year_list : sequence of str, optional
        Years to process, in order. Defaults to 2012..2022.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows stacked as ``(n_samples, window_size, n_features)`` and the
        matching targets as ``(n_samples, n_targets)``.
    """
    data_list = []
    target_list = []

    for year in year_list:
        df_year = df[df['year'] == year].reset_index(drop=True)
        if df_year.empty:
            # Nothing to window for this year; scikit-learn scalers also
            # reject zero-row input, so skip instead of failing.
            continue

        data = data_scaler.transform(df_year.loc[:, X_col_list])
        target = target_scaler.transform(df_year.loc[:, y_col_list])

        # Rows [i - window_size, i) are the input, row i is the target.
        for i in range(window_size, len(data)):
            data_list.append(data[i - window_size:i])
            target_list.append(target[i])

    return np.array(data_list), np.array(target_list)

In [None]:
# Build the windowed data set.
window_size = 144  # one day of data per input window
X_col_list = [
    'tototf', 'tide_level',
    'wl_1018662', 'fw_1018662', 'wl_1018680', 'wl_1018683',
    'fw_1018683', 'wl_1019630', 'fw_1019630',
]

y_col_list = ['wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630']

data_scaled, target_scaled = create_data_set(
    df, window_size, X_col_list, y_col_list, data_scaler, target_scaler
)

print('X shape:', data_scaled.shape, 'Y shape:', target_scaled.shape)

# Persist the scalers separately; they are needed again at prediction time.
with open('data_scaler_samemodel_opt.pickle', 'wb') as f:
    pickle.dump(data_scaler, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('target_scaler_samemodel_opt.pickle', 'wb') as f:
    pickle.dump(target_scaler, f, protocol=pickle.HIGHEST_PROTOCOL)

X shape: (265248, 144, 9) Y shape: (265248, 4)


In [None]:
# Order-preserving (unshuffled) split: first 70% for training, last 30% for testing.
x_train, x_test, y_train, y_test = train_test_split(
    data_scaled,
    target_scaled,
    test_size=0.3,
    shuffle=False,
)

In [None]:
# Shapes of the arrays produced by the train/test split.
x_train_shape, y_train_shape = np.shape(x_train), np.shape(y_train)
x_test_shape, y_test_shape = np.shape(x_test), np.shape(y_test)

print('x_train:', x_train_shape, 'y_train:', y_train_shape)
print('x_test:', x_test_shape, 'y_test:', y_test_shape)

x_train: (185673, 144, 9) y_train: (185673, 4)
x_test: (79575, 144, 9) y_test: (79575, 4)


# LSTM 

In [None]:
def make_lstm(lstm_layers, dense_layers, dropout, input_shape=(144, 9), output_shape=4):
    """Build and compile a stacked LSTM regression model.

    Architecture: one sequence-returning LSTM per entry of ``lstm_layers``,
    followed by one final LSTM (sized like the last entry) that collapses the
    sequence to a vector; then a Dense(swish) + Dropout pair per entry of
    ``dense_layers``, one more Dense(swish) sized like the last entry, and a
    linear output layer.

    For a two-entry ``lstm_layers`` this is exactly the stack the previous
    implementation produced (three LSTM layers). Other lengths previously
    gave an invalid model: a single entry never collapsed the sequence, and
    three or more entries stacked an LSTM on top of a non-sequence output.

    Parameters
    ----------
    lstm_layers : sequence of int
        Units of the sequence-returning LSTM layers.
    dense_layers : sequence of int
        Units of the Dense + Dropout pairs.
    dropout : float
        Dropout rate applied after each Dense layer of ``dense_layers``.
    input_shape : tuple, optional
        Shape of one input sample, passed to the first LSTM layer.
    output_shape : int, optional
        Number of units of the output layer.

    Returns
    -------
    keras.models.Sequential
        Model compiled with MSE loss and the Adam optimizer.
    """
    model = Sequential()

    for i, layer_size in enumerate(lstm_layers):
        if i == 0:
            # Only the first layer declares the input shape.
            model.add(LSTM(layer_size, input_shape=input_shape, return_sequences=True))
        else:
            model.add(LSTM(layer_size, return_sequences=True))
    if lstm_layers:
        # Final LSTM without return_sequences reduces the sequence to one vector.
        model.add(LSTM(lstm_layers[-1]))

    for layer_size in dense_layers:
        model.add(Dense(layer_size, activation='swish'))
        model.add(Dropout(dropout))

    if dense_layers:
        # Extra Dense layer (no dropout) before the output, as before;
        # guarded so an empty `dense_layers` no longer raises IndexError.
        model.add(Dense(dense_layers[-1], activation='swish'))
    # Honor `output_shape` instead of a hard-coded 4.
    model.add(Dense(output_shape))
    model.compile(loss='mse', optimizer='adam')

    return model

In [None]:
# Grid parameters. Every option list repeats one value, so the product below
# yields 3 * 2 * 2 * 3 = 36 identical parameter tuples.
lstm_layer_opts = [(256, 256)] * 3
dense_layer_opts = [(256, 256)] * 2
dropout_opts = [0.15] * 2
batch_size_opts = [64] * 3

param_gird = list(
    product(
        lstm_layer_opts,
        dense_layer_opts,
        dropout_opts,
        batch_size_opts,
    )
)
print('param gird length:', len(param_gird))

param gird length: 36


In [None]:
result_pilename = 'lstm_앙상블용 동일모델.csv'

# model.save() below writes into 'models/'; create it up front so a missing
# directory does not abort the run after the first (long) training.
os.makedirs('models', exist_ok=True)

# model setting
early_stopping = keras.callbacks.EarlyStopping(patience = 15, restore_best_weights = True)
# The loss usually converges around 1e-6, so let the learning rate decay down to 1e-8.
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr = 0.00000001)

for i, params in enumerate(param_gird,1):
    # Drop the Keras global state left behind by the previous model. Without
    # this, every model created in the loop keeps consuming memory;
    # gc.collect() alone does not release it.
    keras.backend.clear_session()
    gc.collect()

    # Re-split with a fresh shuffle for every model.
    x_train, x_test, y_train, y_test = train_test_split(data_scaled, target_scaled, test_size=0.3, shuffle=True)

    # Hyper-parameters of this run.
    lstm_layers, dense_layers, dropout, batch_size = params
    # Build the model.
    model = make_lstm(lstm_layers = lstm_layers, dense_layers=dense_layers, dropout=dropout)
    model.summary()
    # Train the model.
    # NOTE(review): the test split doubles as validation data for early
    # stopping / LR scheduling, so test_RMSE below is not an independent estimate.
    model.fit(x_train, y_train, batch_size=batch_size, epochs=500, callbacks=[early_stopping, reduce_lr], validation_data=(x_test, y_test))
    model_name = '256same_model_opt_lstm{0}'.format(i)
    model.save('models/'+model_name+'.h5')

    # Free host memory before running the predictions.
    gc.collect()

    # Predict on the training split.
    train_predictions = model.predict(x_train, verbose=1)

    # Undo the target scaling for the training split.
    val = target_scaler.inverse_transform(y_train)
    predictions = target_scaler.inverse_transform(train_predictions)

    # Training RMSE in original units.
    train_RMSE = mean_squared_error(val, predictions)**0.5

    # Predict on the test split.
    test_predictions = model.predict(x_test, verbose=1)

    # Undo the target scaling for the test split.
    val = target_scaler.inverse_transform(y_test)
    predictions = target_scaler.inverse_transform(test_predictions)

    # Test RMSE in original units.
    test_RMSE = mean_squared_error(val, predictions)**0.5

    result_df = pd.DataFrame({'model':[model_name],
                          'lstm_layer':[lstm_layers],
                          'dense_layer':[dense_layers],
                          'dropout':[dropout],
                          'batch_size':[batch_size],
                          'train_RMSE':[train_RMSE],
                          'test_RMSE':[test_RMSE]
                         })

    # Free the prediction buffers that are no longer referenced.
    gc.collect()

    # Write the header only when the result file is first created; append afterwards.
    if not os.path.exists(result_pilename):
        result_df.to_csv(result_pilename, index=False, mode='w')

    else:
        result_df.to_csv(result_pilename, index=False, mode='a', header=False)

    print('')
    print(result_df)
    print('')
    print('-'*268)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_15 (LSTM)              (None, 144, 256)          272384    
                                                                 
 lstm_16 (LSTM)              (None, 144, 256)          525312    
                                                                 
 lstm_17 (LSTM)              (None, 256)               525312    
                                                                 
 dense_20 (Dense)            (None, 256)               65792     
                                                                 
 dropout_10 (Dropout)        (None, 256)               0         
                                                                 
 dense_21 (Dense)            (None, 256)               65792     
                                                                 
 dropout_11 (Dropout)        (None, 256)              