# RNN을 사용한 주식 예측

In [95]:
import FinanceDataReader as fdr
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm

import torch

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Data

In [71]:
df = fdr.DataReader('005930', '2018-05-04', '2024-06-30')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-05-04,53000,53900,51800,51900,39565391,-0.020755
2018-05-08,52600,53200,51900,52600,23104720,0.013487
2018-05-09,52600,52800,50900,50900,16128305,-0.032319
2018-05-10,51700,51700,50600,51600,13905263,0.013752
2018-05-11,52000,52200,51200,51300,10314997,-0.005814


In [13]:
# df.reset_index(inplace=True)

In [72]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-05-04,53000,53900,51800,51900,39565391,-0.020755
2018-05-08,52600,53200,51900,52600,23104720,0.013487
2018-05-09,52600,52800,50900,50900,16128305,-0.032319
2018-05-10,51700,51700,50600,51600,13905263,0.013752
2018-05-11,52000,52200,51200,51300,10314997,-0.005814
...,...,...,...,...,...,...
2024-06-24,79700,80900,79500,80600,15454227,0.007500
2024-06-25,80600,81800,80100,80800,19088458,0.002481
2024-06-26,80100,81400,79900,81300,17783242,0.006188
2024-06-27,81300,81600,80500,81600,11739720,0.003690


# Scaling

In [73]:
def min_max_scaling(col):
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(df[[col]])

    return scaled

In [74]:
cols = df.columns

for col in cols[:-1]:
    scaled = min_max_scaling(col)
    df[col] = scaled

In [75]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-05-04,0.294229,0.275338,0.283951,0.269841,0.409302,-0.020755
2018-05-08,0.28666,0.263514,0.28585,0.282913,0.217675,0.013487
2018-05-09,0.28666,0.256757,0.266857,0.251167,0.136459,-0.032319
2018-05-10,0.269631,0.238176,0.261159,0.264239,0.110579,0.013752
2018-05-11,0.275307,0.246622,0.272555,0.258637,0.068783,-0.005814


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1514 entries, 2018-05-04 to 2024-06-28
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    1514 non-null   float64
 1   High    1514 non-null   float64
 2   Low     1514 non-null   float64
 3   Close   1514 non-null   float64
 4   Volume  1514 non-null   float64
 5   Change  1514 non-null   float64
dtypes: float64(6)
memory usage: 82.8 KB


In [77]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Change
count,1514.0,1514.0,1514.0,1514.0,1514.0,1514.0
mean,0.468158,0.424636,0.470156,0.461212,0.133048,0.000408
std,0.246939,0.221609,0.24616,0.243099,0.092281,0.015767
min,0.0,0.0,0.0,0.0,0.0,-0.063877
25%,0.237465,0.216216,0.238841,0.232493,0.074069,-0.00975
50%,0.449385,0.408784,0.451092,0.445378,0.111852,0.0
75%,0.674551,0.606419,0.677113,0.663866,0.166025,0.008578
max,1.0,1.0,1.0,1.0,1.0,0.104706


In [78]:
dfx = df.drop(['Close', 'Change'], axis=1)
dfy = df[['Close']]

In [79]:
dfx.head()

Unnamed: 0_level_0,Open,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-04,0.294229,0.275338,0.283951,0.409302
2018-05-08,0.28666,0.263514,0.28585,0.217675
2018-05-09,0.28666,0.256757,0.266857,0.136459
2018-05-10,0.269631,0.238176,0.261159,0.110579
2018-05-11,0.275307,0.246622,0.272555,0.068783


In [80]:
dfy.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2018-05-04,0.269841
2018-05-08,0.282913
2018-05-09,0.251167
2018-05-10,0.264239
2018-05-11,0.258637


In [81]:
X = dfx.values.tolist()
y = dfy.values.tolist()

In [82]:
X[:5]

[[0.2942289498580889,
  0.27533783783783783,
  0.28395061728395066,
  0.40930157034640163],
 [0.2866603595080417,
  0.2635135135135135,
  0.28584995251661927,
  0.21767480857837296],
 [0.2866603595080417,
  0.2567567567567567,
  0.2668566001899335,
  0.13645893255485592],
 [0.2696310312204352,
  0.23817567567567566,
  0.2611585944919279,
  0.11057940797567432],
 [0.27530747398297073,
  0.2466216216216216,
  0.2725546058879392,
  0.06878335680219218]]

In [83]:
y[:5]

[[0.2698412698412699],
 [0.28291316526610644],
 [0.2511671335200747],
 [0.26423902894491136],
 [0.2586367880485527]]

# CustomDataset
- 10일간의 데이터를 사용하여 다음날 종가(close) 예측

In [84]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, X, y, window_size):
        super().__init__()
        
        self.window_size = window_size
        self.data = list()
        l = len(y)
        
        for i in range(l - self.window_size):
            self.data.append((torch.FloatTensor(X[i:i+window_size]), torch.FloatTensor(y[i+window_size])))

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

In [85]:
window_size = 10
dataset = CustomDataset(X, y, window_size)

In [86]:
dataset[0][0]

tensor([[0.2942, 0.2753, 0.2840, 0.4093],
        [0.2867, 0.2635, 0.2858, 0.2177],
        [0.2867, 0.2568, 0.2669, 0.1365],
        [0.2696, 0.2382, 0.2612, 0.1106],
        [0.2753, 0.2466, 0.2726, 0.0688],
        [0.2564, 0.2280, 0.2479, 0.1223],
        [0.2412, 0.2162, 0.2327, 0.1665],
        [0.2223, 0.2128, 0.2336, 0.1340],
        [0.2431, 0.2179, 0.2384, 0.0694],
        [0.2356, 0.2078, 0.2374, 0.0268]])

In [87]:
dataset[0][1]

tensor([0.2344])

In [88]:
len(dataset)

1504

In [89]:
train_size = int(len(dataset) * 0.8)
train_dataset = dataset[:train_size]
test_dataset = dataset[train_size:]

In [90]:
print(len(train_dataset))
print(len(test_dataset))

1203
301


In [94]:
batch_size = 100

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model

In [91]:
class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        self.rnn = torch.nn.RNN(input_size, hidden_size, batch_first = True)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        outputs, h_0 = self.rnn(x)

        return outputs[:, -1, :] # or h_0.squeeze()

# train

In [96]:
def training(model, train_dataloader, criterion, optimizer, epoch, num_epochs):
    model.train()
    train_loss = 0.0

    tbar = tqdm(train_dataloader)
    for X, y in tbar:
        output = model(X)
        loss = criterion(output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        tbar.set_description(f'Epoch/Epochs [{epoch+1}/{num_epochs}] Train Loss: {loss.item():.4f}')

    train_loss = train_loss / len(train_dataloader)

    return model, train_loss

def training_loop(model, train_dataloader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model, train_loss = training(model, train_dataloader, criterion, optimizer, epoch, num_epochs)

        print(f'Train Loss: {train_loss:.4f}')

    return model
    

In [107]:
model = RNN(4, 50, 1)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [108]:
training_loop(model, train_dataloader, criterion, optimizer, 10)

  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.1981


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.1141


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0524


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0229


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0132


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0113


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0105


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0101


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0091


  0%|          | 0/13 [00:00<?, ?it/s]

Train Loss: 0.0082


RNN(
  (rnn): RNN(4, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=1, bias=True)
)