In [1]:
!pip install -U finance-datareader

Collecting finance-datareader
  Downloading finance_datareader-0.9.94-py3-none-any.whl.metadata (466 bytes)
Collecting requests-file (from finance-datareader)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading finance_datareader-0.9.94-py3-none-any.whl (89 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.3/89.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, finance-datareader
Successfully installed finance-datareader-0.9.94 requests-file-2.1.0


In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, Subset
from sklearn.model_selection import train_test_split
import FinanceDataReader as fdr

In [3]:
raw_data = fdr.DataReader('005930','2018')
raw_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,51380,51400,50780,51020,169485,0.001177
2018-01-03,52540,52560,51420,51620,200270,0.01176
2018-01-04,52120,52180,50640,51080,233909,-0.010461
2018-01-05,51300,52120,51200,52120,189623,0.02036
2018-01-08,52400,52520,51500,52020,167673,-0.001919


In [130]:
X = raw_data.iloc[1:]
y = raw_data['Close'].shift(1).dropna().values.reshape(-1, 1)
print(X.shape, y.shape)

(1698, 6) (1698, 1)


In [131]:
# Minmaxscaling
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
X_scaled = X_scaler.fit_transform(X)
y_scaled = y_scaler.fit_transform(y)
print(X_scaled.shape, y_scaled.shape)

(1698, 6) (1698, 1)


In [132]:
WINDOW_SIZE = 5 # 5일을 보고 1일을 예측
X_train = []
for index in range(len(X_scaled)-WINDOW_SIZE):
    X_train.append(X_scaled[index:index+WINDOW_SIZE])
X_train = np.array(X_train)
y_train = y_scaled[WINDOW_SIZE:]

X_train = torch.from_numpy(X_train).float()
y_train = torch.from_numpy(y_train).float()

In [133]:
# 데이터셋 만들기
dataset = TensorDataset(X_train, y_train)
print(dataset.tensors[0].shape) # X_train
print(dataset.tensors[1].shape) # y_train

torch.Size([1693, 5, 6])
torch.Size([1693, 1])


In [134]:
# Set Validation data
VALIDATION_RATE = 0.2
train_index, validation_index = train_test_split(
    range(len(dataset)),
    test_size = VALIDATION_RATE
)
print(len(train_index))
print(len(validation_index))

1354
339


In [135]:
# Set Dataset
train_dataset = Subset(dataset, train_index)
validation_dataset = Subset(dataset, validation_index)

In [136]:
BATCH_SIZE = 128
train_batches = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valiadation_batches = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [137]:
# Modeling
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size = input_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout
        )
        self.fc = nn.Sequential(
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, 1)
        )
    def forward(self, x):
        output, _ = self.lstm(x)
        output = output[:, -1, :] # 모델의 출력으로 나온 데이터 중 마지막 결과물만 필요(many to one)
        y = self.fc(output)
        return y

In [140]:
input_size = 6
hidden_size = 32
num_layers = 2
dropout = 0

model = LSTM(input_size, hidden_size, num_layers, dropout)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [141]:
num_epoch = 100
early_stop_threshold = 30
train_losses, validation_losses, lowest_loss = list(), list(), np.inf

for epoch in range(num_epoch):
    train_loss, val_loss = 0,0
    model.train()
    for train_x_batch, train_y_batch in train_batches:
        pred_y = model(train_x_batch)
        loss = loss_func(pred_y, train_y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss = train_loss/len(train_batches)
    train_losses.append(train_loss)


    model.eval()
    with torch.no_grad():
        for val_x_batch, val_y_batch in valiadation_batches:
            pred_val_y = model(val_x_batch)
            loss = loss_func(pred_val_y, val_y_batch)
            val_loss += loss.item()
        val_loss = val_loss/ len(valiadation_batches)
        validation_losses.append(val_loss)

        # Early Stopping
        if validation_losses[-1] < lowest_loss:
            lowest_loss = validation_losses[-1]
            lowest_epoch = epoch
            best_model = torch.save(model.state_dict(), 'best_model.pt') # save best model
        else:
            if (early_stop_threshold > 0) & (lowest_epoch + early_stop_threshold < epoch):
                print(f'Early Stopped {epoch} epochs')
                print(f'Best Model Epochs {lowest_epoch}')
                break
    print(f'{epoch+1}/{num_epoch} Train Loss : {train_loss}, Val Loss : {val_loss}')


1/100 Train Loss : 0.15868171033534137, Val Loss : 0.19669311245282492
2/100 Train Loss : 0.11862970753149553, Val Loss : 0.1540565937757492
3/100 Train Loss : 0.08991622992537239, Val Loss : 0.12385080009698868
4/100 Train Loss : 0.06281781162727963, Val Loss : 0.09502536058425903
5/100 Train Loss : 0.038260474293069405, Val Loss : 0.07237907747427623
6/100 Train Loss : 0.020406773483211346, Val Loss : 0.049694045136372246
7/100 Train Loss : 0.008496812724677677, Val Loss : 0.02966746191183726
8/100 Train Loss : 0.0027923230690331284, Val Loss : 0.0132187536607186
9/100 Train Loss : 0.0007681555037958209, Val Loss : 0.005412618474413951
10/100 Train Loss : 0.0008247419503856112, Val Loss : 0.0038938659708946943
11/100 Train Loss : 0.0008981167907636104, Val Loss : 0.010832744960983595
12/100 Train Loss : 0.0007790166313167323, Val Loss : 0.0002578072405109803
13/100 Train Loss : 0.0007743998678713317, Val Loss : 0.0003383734244077156
14/100 Train Loss : 0.0007069781460648466, Val Loss

In [142]:
model.load_state_dict(torch.load('best_model.pt', weights_only=True))
test_batches = DataLoader(train_dataset, batch_size=BATCH_SIZE)
y_test_pred_list, y_test_list = list(), list()

model.eval()
with torch.no_grad():
    for x_test_batch, y_test_batch in test_batches:
        pred_test_y = model(x_test_batch)
        y_test_pred_list.append(pred_test_y) # 예측값
        y_test_list.append(y_test_batch) # 실제값
print(len(y_test_pred_list))

11


In [143]:
# 배치 풀기
y_test_pred = torch.cat(y_test_pred_list, 0)
y_test = torch.cat(y_test_list, 0)

In [144]:
# 스케일 복원
predict_data = y_scaler.inverse_transform(y_test_pred)
real_data = y_scaler.inverse_transform(y_test)

In [146]:
# RMSE
from sklearn.metrics import mean_squared_error
RMSE = mean_squared_error(real_data, predict_data) ** 0.5
print(RMSE)

168.987629126449


In [147]:
concat_data = np.concatenate((predict_data, real_data), axis = 1)
result_df = pd.DataFrame(concat_data, columns=['predict', 'real'])
result_df.head()

Unnamed: 0,predict,real
0,43621.363341,43750.000023
1,77745.988661,78000.000775
2,49078.100967,49220.000096
3,68461.105716,68500.001341
4,46979.361066,47220.000216


In [148]:
import plotly.graph_objects as go

# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=result_df.index, y=result_df.predict,
                    mode='lines',
                    name='predict'))
fig.add_trace(go.Scatter(x=result_df.index, y=result_df.real,
                    mode='lines',
                    name='real'))
fig.show()