## Preliminaries

In [162]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Data preview

In [163]:
df1 = pd.read_csv('CCXI.csv', index_col="Date", parse_dates=["Date"])
df2 = pd.read_csv('ESI.csv', index_col="Date", parse_dates=["Date"])
df3 = pd.read_csv('gtrends.csv', index_col="date", parse_dates=["date"])
df3 = df3.drop(columns='Unnamed: 0')


In [164]:
df = pd.concat([df1, df2, df3], axis=1)
df = df.dropna()
df

Unnamed: 0,CCLI,CCCI,ESI,CCESI,hits_금리,hits_주식,hits_물가,hits_불황,hits_대출,hits_자살,...,hits_수입,hits_이자,hits_취업,hits_경기침체,hits_노동자,hits_실업,hits_금융,hits_보험,hits_환율,hits_금값
2004-03-01,100.9,101.5,105.4,102.0,27.0,11.0,12.0,0.0,42.0,8.0,...,67.0,29.0,70.0,0.0,25.0,79.0,100.0,64.0,32.0,6.0
2004-04-01,100.9,101.5,108.0,101.5,43.0,10.0,56.0,0.0,37.0,8.0,...,68.0,31.0,77.0,0.0,41.0,100.0,73.0,64.0,27.0,0.0
2004-05-01,100.5,101.1,99.2,100.8,25.0,10.0,47.0,38.0,38.0,6.0,...,53.0,32.0,77.0,0.0,51.0,58.0,86.0,63.0,27.0,4.0
2004-06-01,100.2,100.9,96.7,99.9,25.0,13.0,23.0,38.0,30.0,6.0,...,60.0,39.0,74.0,0.0,17.0,74.0,65.0,55.0,34.0,11.0
2004-07-01,99.8,100.5,95.4,99.1,35.0,8.0,84.0,82.0,32.0,6.0,...,73.0,55.0,76.0,0.0,18.0,68.0,72.0,64.0,32.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-01,100.4,101.1,106.2,106.4,48.0,52.0,39.0,9.0,62.0,9.0,...,25.0,75.0,57.0,4.0,30.0,55.0,70.0,73.0,56.0,6.0
2021-12-01,100.2,101.8,104.5,106.0,45.0,44.0,44.0,6.0,63.0,9.0,...,23.0,77.0,60.0,7.0,24.0,64.0,68.0,77.0,51.0,7.0
2022-01-01,100.1,102.4,105.6,105.5,72.0,49.0,45.0,9.0,76.0,10.0,...,24.0,86.0,63.0,6.0,21.0,65.0,74.0,82.0,59.0,9.0
2022-02-01,99.8,102.6,105.7,104.8,66.0,44.0,51.0,4.0,72.0,12.0,...,24.0,89.0,61.0,1.0,16.0,60.0,69.0,81.0,52.0,13.0


In [165]:
pio.templates.default = "plotly_white"

plot_template = dict(
    layout=go.Layout({
        "font_size": 18,
        "xaxis_title_font_size": 24,
        "yaxis_title_font_size": 24})
    )

fig = px.line(df1, labels=dict(
    Date="Date", value="CCXI"
    ))
fig.update_layout(
  template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
    )
fig.show()

In [166]:
fig = px.line(df2, labels=dict(
    Date="Date", value="ESI"
    ))
fig.update_layout(
  template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
    )
fig.show()

## Data normalization

In [167]:
target = "CCCI"
features = ["CCCI", "CCLI", "hits_금리", "hits_환율"]

df_train = df.loc[:"2019-03-01"]

target_mean = df_train[target].mean()
target_std = df_train[target].std()

for c in df_train.columns:
    mu = df_train.loc[:,c].mean()
    std = df_train.loc[:,c].std()
    
    df.loc[:,c] = (df.loc[:,c] - mu) / std
    
df_train = df.loc[:"2019-03-01"]
df_test  = df.loc["2019-04-01":]
test_start = "2019-04-01"

In [192]:
df_show = df[features]
fig = px.line(df_show, labels=dict(
    Date="Date", value="CCCI"
    ))
fig.update_layout(
  template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
    )
fig.show()

## Dataset

In [168]:
class CCXI(Dataset):
    def __init__(self, target, features, seq_length = 5, train = True):
        if train:
            dataset = df_train
        else:
            dataset = df_test
        self.y = torch.tensor(dataset[target].values).float()
        self.X = torch.tensor(dataset[features].values).float()
        self.seq_length = seq_length
        
        if train:
            self.past = torch.zeros_like(self.X[0]).repeat(seq_length, 1)
        else:
            self.past = torch.tensor(df_train.iloc[(df_train.shape[0]-seq_length):,].loc[:,features].values).float()
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, i):
        if i >= self.seq_length:
            i_start = i - self.seq_length
            X = self.X[i_start:i, :]
        else:
            pad = self.past[i:]
            X = self.X[:i, :]
            X = torch.cat((pad, X), 0)
        return X, self.y[i]

In [169]:
seq_length = 12
batch_size = 8

train_dataset = CCXI(target, features, seq_length, train = True)
test_dataset = CCXI(target, features, seq_length, train = False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

X, y = next(iter(train_loader))
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: torch.Size([8, 12, 4])
Target shape: torch.Size([8])


## LSTM model

In [179]:
class LSTM(nn.Module):
    def __init__(self, num_features, hidden_units):
        super().__init__()
        self.num_features = num_features
        self.hidden_units = hidden_units
        self.num_layers = 12
        
        self.lstm = nn.LSTM(
            input_size = num_features,
            hidden_size = hidden_units,
            batch_first = True,
            num_layers = self.num_layers)
        
        self.linear = nn.Linear(in_features = self.hidden_units, out_features = 1)
        
    def forward(self, x):
        batch_size = x.shape[0]
        
        _, (hn, _) = self.lstm(x)
        out = self.linear(hn[0]).flatten()
        
        return out
    
num_hidden_units = 12
model = LSTM(num_features = len(features), hidden_units = num_hidden_units).to(device)

## Loss & Optimizer

In [180]:
learning_rate = 5e-4

loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Train

In [181]:
num_epochs = 150
num_batch = len(train_loader)

train_losses = []
test_losses = []

for epoch in range(num_epochs):
    total_loss = 0
    # train set
    for X, y in train_loader:
        optimizer.zero_grad()
        
        output = model(X.to(device))
        loss = loss_function(output, y.to(device))
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    train_avg = total_loss / num_batch * 1000
    # test set
    with torch.no_grad():
        test_loss = 0
        
        for X, y in test_loader:
            output = model(X.to(device))
            test_loss += loss_function(output, y.to(device)).item()
    
    test_avg = test_loss / num_batch * 1000
    train_losses.append(train_avg)
    test_losses.append(test_avg)
    if (epoch + 1) % 50 == 0:
        print(f"({epoch+1:3d}/{num_epochs}) Train loss: {train_avg:.4f}, Test loss: {test_avg:.4f}")  

( 50/150) Train loss: 79.8250, Test loss: 303.6610
(100/150) Train loss: 46.8179, Test loss: 241.6301
(150/150) Train loss: 34.0660, Test loss: 246.9126


In [182]:
df_loss = pd.DataFrame({"Train":train_losses, "Test":test_losses}, index=list(range(len(train_losses))))
fig = px.line(df_loss[50:], labels=dict(Epoch="Epoch", value="Loss"))
fig.update_layout(
  template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
    )
fig.show()

## Prediction

In [183]:
def predict(data_loader, model):
    output = torch.tensor([])
    model.eval()
    with torch.no_grad():
        for X, _ in data_loader:
            y_hat = model(X.to(device)).cpu()
            output = torch.cat((output, y_hat), 0)
    return output

train_eval_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

train_hat = predict(train_eval_loader, model).numpy()
test_hat = predict(test_loader, model).numpy()
df["forecast"] = np.concatenate((train_hat, test_hat), 0)

df_out = df.loc[:, [target, "forecast"]].copy()

for c in df_out.columns:
    df_out[c] = df_out[c] * target_std + target_mean

In [184]:
new_test_hat = torch.tensor([]).numpy()
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
for i, (feat, _) in enumerate(test_loader, 0):
    if i >= seq_length:
        i_start = i - seq_length
        X = new_test_hat[i_start:i]
        X = np.concatenate((X, ))
    else:
        pad = feat[0,:12-i,0]
        X = new_test_hat[:i]
        X = np.concatenate((pad, X), 0)
    X = np.concatenate((X.reshape((12,1)), feat[0,:,1:]), 1)
    X = X.reshape((1,12,len(features)))
    y_hat = model(torch.tensor(X).to(device)).cpu().detach().numpy()
    new_test_hat = np.concatenate((new_test_hat, y_hat), 0)

df["forecast2"] = np.concatenate((train_hat, new_test_hat), 0)
df_out2 = df.loc[:, [target, "forecast2"]].copy()

for c in df_out2.columns:
    df_out2[c] = df_out2[c] * target_std + target_mean

## Plot

In [185]:
fig = px.line(df_out, labels=dict(Date="Date", value=target))
fig.add_vline(x=test_start, line_width=4, line_dash="dash")
fig.update_layout(
    template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
)
fig.show()

In [186]:
fig = px.line(df_out2, labels=dict(Date="Date", value=target))
fig.add_vline(x=test_start, line_width=4, line_dash="dash")
fig.update_layout(
    template=plot_template, legend=dict(orientation='h', y=1.02, title_text="")
)
fig.show()

In [187]:
MS = (df_out2[target] - df_out2["forecast2"])**2
train_MSE = (MS.loc[:"2019-03-01"]).mean()
test_MSE = (MS.loc[test_start:]).mean()

In [188]:
print("train_MSE:", train_MSE)
print("test_MSE :", test_MSE)

train_MSE: 0.03695309772172907
test_MSE : 1.7370794771687352


In [189]:
df_out2.to_csv('./predict3.csv')