In [3]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from matplotlib import rc
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dropout, Dense, LSTM
from tensorflow.keras.layers import LSTM,GRU
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

matplotlib.rcParams['font.family'] ='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] =False

In [4]:
# hyper params
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

SEQ_LEN = 14
PRED_LEN = 1

BATCH_SIZE = 16
DROPOUT = 0.2


model_path = os.path.join(os.getcwd(), 'model')
data_path = os.path.join(os.getcwd(), 'data')
result_path = os.path.join(os.getcwd(), 'result')

try:
    os.makedirs(model_path)
    os.makedirs(result_path)    

except FileExistsError:
    pass

In [5]:
# 국제
dataset1 = pd.read_csv(os.path.join(data_path, "국제_원유가격.csv"), parse_dates=['기간'], encoding='utf-8')
dataset1 = dataset1.sort_values('기간')

# 국내
dataset2 = pd.read_csv(os.path.join(data_path, "주유소_평균판매가격.csv"), parse_dates=['구분'], encoding='utf-8')
dataset2 = dataset2.sort_values('구분')

In [6]:
total_dataset = pd.merge(left=dataset1, right=dataset2, left_on='기간', right_on='구분')
del total_dataset['구분']
total_dataset

Unnamed: 0,기간,Dubai,Brent,WTI,고급휘발유,보통휘발유,자동차용경유,실내등유
0,2008-04-15,103.66,111.31,113.79,1861.80,1681.33,1585.35,1159.41
1,2008-04-16,105.16,112.66,114.93,1871.39,1692.15,1600.81,1168.47
2,2008-04-17,106.39,112.43,114.86,1874.54,1686.56,1594.53,1174.24
3,2008-04-18,105.83,113.92,116.69,1877.81,1689.68,1602.15,1179.31
4,2008-04-21,107.96,114.43,117.48,1881.82,1695.54,1610.82,1191.62
...,...,...,...,...,...,...,...,...
3637,2022-05-16,106.65,114.24,114.20,2177.84,1958.73,1970.51,1481.83
3638,2022-05-17,110.88,111.93,112.40,2183.42,1963.26,1976.49,1485.38
3639,2022-05-18,109.79,109.11,109.59,2185.95,1967.33,1981.61,1489.18
3640,2022-05-19,105.52,112.04,112.21,2191.74,1972.11,1986.76,1493.57


In [7]:
def to_sequences(data, seq_len, pred_len):
    x_train = []
    y_train = []

    for i in range(seq_len, len(data) - pred_len + 1):
        x_train.append(data[i - seq_len: i, 0])
        y_train.append(data[i + pred_len - 1: i + pred_len, 1])

    return np.array(x_train), np.array(y_train)

def preprocess(data_raw, seq_len, pred_len, train_split):

    x_data, y_data = to_sequences(data_raw, seq_len, pred_len)
    num_train = int(train_split * x_data.shape[0])

    X_train = x_data[:num_train, :]
    y_train = y_data[:num_train, :]

    X_test = x_data[num_train:, :]
    y_test = y_data[num_train:, :]
    
    return X_train, y_train, X_test, y_test

In [8]:
X_train.shape

NameError: name 'X_train' is not defined

In [2]:
def train_model(X_train, y_train, BATCH_SIZE):
    model = keras.Sequential()
    model.add(LSTM(64, activation='relu', input_shape=(1,X_train.shape[1] ), return_sequences=True))

    model.add(LSTM(32, activation='relu', return_sequences=False))
    model.add(Dropout(rate=DROPOUT))

    model.add(Dense(y_train.shape[1]))

    model.compile(
        loss='mean_squared_error', 
        optimizer='adam'
    )

    history = model.fit(
        X_train, 
        y_train, 
        epochs=50, 
        batch_size=BATCH_SIZE, 
        shuffle=False,
        validation_split=0.1
    )

    return model, history

In [8]:
# Prediction
def draw_plot_test(model, df, X_test, y_test, scaler):
    y_hat = model.predict(X_test)

    y_test_ = [y[0] for y in y_test]
    y_hat = [y[0] for y in y_hat]

    y_test_inverse = scaler.inverse_transform(pd.DataFrame(
        {df.columns[0]: X_test[:,-1], 
        df.columns[1]: y_test_}
        ))
    y_hat_inverse = scaler.inverse_transform(pd.DataFrame(
        {df.columns[0]: X_test[:,-1], 
        df.columns[1]: y_hat}
        ))

    fig, ax1 = plt.subplots(figsize=(20, 10))
    ax1.plot(total_dataset.iloc[-len(X_test):, 0], y_test_inverse[:, 1], label="Actual Price (left)", color='green')
    ax1.plot(total_dataset.iloc[-len(X_test):, 0], y_hat_inverse[:, 1], label="Predicted Price (left)", color='red')
    ax1.set_ylabel(f'Price ({df.columns[1]}, won)')

    ax2 = ax1.twinx()
    ax2.plot(total_dataset.iloc[-len(X_test):, 0], y_test_inverse[:, 0], label="Brent Oil (right)", color='blue')
    ax2.set_ylabel(f'Price (Brent, $)')

    plt.title(f'Oil Price ({df.columns[1]})')
    plt.xlabel('Time [days]')
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')
    plt.savefig(os.path.join(result_path, f"predict_test_{df.columns[1]}.png"))
    plt.close()

In [10]:
def draw_plot_total(model, df, X_total, y_total, scaler):
    y_total_hat = model.predict(X_total)

    y_total_ = [y[0] for y in y_total]
    y_total_hat = [y[0] for y in y_total_hat]

    y_test_inverse = scaler.inverse_transform(pd.DataFrame(
        {df.columns[0]: X_total[:,-1], 
        df.columns[1]: y_total_}
        ))
    y_hat_inverse = scaler.inverse_transform(pd.DataFrame(
        {df.columns[0]: X_total[:,-1], 
        df.columns[1]: y_total_hat}
        ))

    fig, ax1 = plt.subplots(figsize=(20, 10))
    ax1.plot(total_dataset.iloc[-len(X_total):, 0], y_test_inverse[:, 1], label="Actual Price (left)", color='green')
    ax1.plot(total_dataset.iloc[-len(X_total):, 0], y_hat_inverse[:, 1], label="Predicted Price (left)", color='red')
    ax1.set_ylabel(f'Price ({df.columns[1]}, won)')

    ax2 = ax1.twinx()
    ax2.plot(total_dataset.iloc[-len(X_total):, 0], y_test_inverse[:, 0], label="Brent Oil (right)", color='blue')
    ax2.set_ylabel(f'Price (Brent, $)')

    plt.title(f'Oil Price ({df.columns[1]})')
    plt.xlabel('Time [days]')
    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')
    plt.savefig(os.path.join(result_path, f"predict_total_{df.columns[1]}.png"))
    plt.close()

In [9]:
# total code for 4 different oils
df1 = total_dataset.loc[:,['Brent', '고급휘발유']]
df2 = total_dataset.loc[:,['Brent', '보통휘발유']]
df3 = total_dataset.loc[:,['Brent', '자동차용경유']]
df4 = total_dataset.loc[:,['Brent', '실내등유']]

result_df = pd.DataFrame({
    'Oil_Name': ['고급휘발유', '보통휘발유', '자동차용경유', '실내등유'],
})

test_mse = []

for i in range(4):
    df = globals()[f"df{i+1}"]
    oil_name = globals()[f"df{i+1}"].columns[1]
    scaled_df = scaler.fit_transform(df)
    scaler = MinMaxScaler()
    
    X_train, y_train, X_test, y_test = preprocess(scaled_df, SEQ_LEN, PRED_LEN, train_split = 0.95)
    trained_model, history = train_model(X_train, y_train, BATCH_SIZE)
    trained_model.save(os.path.join(model_path, f"LSTM_model_{oil_name}.h5"))
    
    X_total = np.concatenate([X_train, X_test])
    y_total = np.concatenate([y_train, y_test])
    draw_plot_test(trained_model, df, X_test, y_test, scaler)
    draw_plot_total(trained_model, df, X_total, y_total, scaler)

    test_mse.append(trained_model.evaluate(X_test, y_test))

result_df['test_mse'] = list(map(lambda x: round(x, 5), test_mse))
result_df.to_csv(os.path.join(result_path, 'result.csv'), index=False)

NameError: name 'scaler' is not defined