# 삼성전자 주가 데이터 다운로드
- Yahoo Finance 에서 주가 데이터 다운로드 (https://finance.yahoo.com/)
    - 검색 키워드 '005930.KS' 입력
- 검색 후 Historical Data 선택

![yahoo finance](figures/rnn/21_yahoo_stock1.png)

- `Start Date: 2000년 1월 4일 End Date: 오늘날짜` 선택
- **Apply 버튼** 클릭 후 다운로드
  
![yahoo finance](figures/rnn/22_yahoo_stock2.png)

In [22]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torchinfo

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split  

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

# DataLoading

In [2]:
df = pd.read_csv('dataset/005930.KS.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6122 entries, 0 to 6121
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       6122 non-null   object 
 1   Open       6122 non-null   float64
 2   High       6122 non-null   float64
 3   Low        6122 non-null   float64
 4   Close      6122 non-null   float64
 5   Adj Close  6122 non-null   float64
 6   Volume     6122 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 334.9+ KB


In [3]:
# Date를 index로 변환
df = df.set_index('Date')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,6000.0,6110.0,5660.0,6110.0,4449.711426,74195000
2000-01-05,5800.0,6060.0,5520.0,5580.0,4063.72876,74680000
2000-01-06,5750.0,5780.0,5580.0,5620.0,4092.860107,54390000
2000-01-07,5560.0,5670.0,5360.0,5540.0,4034.597656,40305000
2000-01-10,5600.0,5770.0,5580.0,5770.0,4202.100586,46880000


In [5]:
df.drop(columns='Adj Close', inplace=True)

# Dataset 구성
## input, output data
- input (X)) feature 구성: \[Open, High, Low, Close, Volumn  (Adj Close 제외)\] 50일치
- output (y) : Close - input 다음날 Close가격

In [6]:
df_y = df['Close'].to_frame()
df_X = df

df_X.shape, df_y.shape

((6122, 5), (6122, 1))

In [7]:
df_X

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-04,6000.0,6110.0,5660.0,6110.0,74195000
2000-01-05,5800.0,6060.0,5520.0,5580.0,74680000
2000-01-06,5750.0,5780.0,5580.0,5620.0,54390000
2000-01-07,5560.0,5670.0,5360.0,5540.0,40305000
2000-01-10,5600.0,5770.0,5580.0,5770.0,46880000
...,...,...,...,...,...
2024-05-30,74800.0,75200.0,73500.0,73500.0,28551273
2024-05-31,74500.0,74700.0,73500.0,73500.0,26198776
2024-06-03,74400.0,76400.0,74200.0,75700.0,15706268
2024-06-04,74900.0,76100.0,74900.0,75300.0,14098053


## 전처리
- feature scaling
    - feature 간의 scaling(단위)을 맞추는 작업.
- X: Standard Scaling (평균: 0, 표준편차: 1)
- y: MinMax Scaling (최소: 0, 최대: 1)  => X의 scale과 비슷한 값으로 변환.

In [8]:
X_scaler = StandardScaler()
y_scaler = MinMaxScaler()

X = X_scaler.fit_transform(df_X)
y = y_scaler.fit_transform(df_y)


## Input Sequential Data 구성
- X: 50일치 데이터(ex:1일 ~ 50일), y: 51일째 주가. (ex: 51일)
    - 50일의 연속된 주식데이터를 학습하여 51일째 주가를 예측한다.
    - X의 한개의 데이터가 50일치 주가데이터가 된다.

![img](figures/rnn/20_stock_dataset.png)

[연속된 날짜가 5인 경우]

In [11]:
time_steps = 50 # seq_length (몇일치 주가를 하나의 데이터로 묶을지)
data_X = []  # input data들을 모을 리스트 X 1개 shape : (50, 5)
data_y = []  # output data들을 모을 리스트

for idx in range(0, y.size - time_steps) : # 데이터를 구성할 수 있는 51개 행이 남을 때까지 반복
    # idx : 0 / X : 0 ~ 49 / y : 50
    _X = X[idx:time_steps+idx]
    _y = y[time_steps+idx]
    data_X.append(_X)
    data_y.append(_y)

np.shape(data_X)  # (6072:batch, 50:seq_len, 5:개별 seq의 feature수)

(6072, 50, 5)

## Train / test set 분리

In [86]:
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2)
np.shape(X_train), np.shape(X_test)

((4857, 50, 5), (1215, 50, 5))

In [87]:
# list -> ndarray 변환 (list -> tensor 생성 시 속도가 느림)
X_train, X_test, y_train, y_test = (np.array(X_train, dtype='float32'), np.array(X_test, dtype='float32'), np.array(y_train, dtype='float32'), np.array(y_test, dtype='float32'))

## Dataset, DataLoader 구성

In [88]:
# Dataset
# 메모리에 있는 tensor가 raw data일 때 TensorDataset으로 생성
train_set = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
test_set = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))

len(train_set), len(test_set)

(4857, 1215)

In [89]:
train_loader = DataLoader(train_set, batch_size=200, shuffle=True, drop_last=True)
test_loader = DataLoader(test_set, batch_size=200)

# 모델 정의

In [90]:
class StockPriceModel(torch.nn.Module) :

    def __init__(self, input_size, hidden_size, num_layers, bidirectional=True, dropout_rate=0.3) :
        super().__init__()
        # X -> LSTM (마지막 hidden) -> dropout -> linear -> y
        self.lstm = nn.LSTM(
            input_size = input_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            bidirectional= bidirectional,
            dropout = dropout_rate
        )
        self.dropout = nn.Dropout(dropout_rate)
        i_features= hidden_size * 2 if bidirectional else hidden_size
        self.lr = torch.nn.Linear(i_features, 1) # 출력 : 주가 가격 1개
        self.sigmoid = nn.Sigmoid() # 0 ~ 1 범위이므로 sigmoid 사용해서 범위 맞춤

    def forward(self, X) :
        # input X : (batch, seq_len, input_size) (200, 50, 5)
        X = X.transpose(1,0)

        # (seq_len, batch, input_size) (50, 200, 5)
        out, _ = self.lstm(X)  # _ : (hidden_state, cell_state)

        # out : [seq_len, batch, hidden_size * 2]
        # [batch, hidden * 2]
        last_out = self.dropout(out[-1]) 

        last_out = self.lr(last_out)
        return self.sigmoid(last_out)


In [91]:
model = StockPriceModel(input_size=5, hidden_size=32, num_layers=1, bidirectional=True, dropout_rate=0.3)

model = model.to(device)


### train

In [92]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = nn.MSELoss()

In [94]:
train_loss_list = []
test_loss_list = []

for epoch in range(100) :
    model.train()
    train_loss = 0.0
    for X_train, y_train in train_loader :
        X_train, y_train = X_train.to(device), y_train.to(device)
        pred = model(X_train)
        loss = loss_fn(pred, y_train)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()
    train_loss_list.append(train_loss/len(train_loader))

    model.eval()
    test_loss = 0.0
    with torch.no_grad() :
        for X_test, y_test in test_loader :
            X_test, y_test = X_test.to(device), y_test.to(device)
            pred_test = model(X_test)
            test_loss = test_loss + loss_fn(pred_test, y_test).item()
        test_loss_list.append(test_loss/len(test_loader))
    if epoch % 10 == 0 or epoch == (100 - 1) :
        print(f'[{epoch}/{100}] train loss : {train_loss}, valid_loss : {test_loss}')

[0/100] train loss : 2.3268619030714035, valid_loss : 0.6752043440937996
[10/100] train loss : 0.41467976197600365, valid_loss : 0.09642817731946707
[20/100] train loss : 0.12666495982557535, valid_loss : 0.029751735273748636
[30/100] train loss : 0.06539894547313452, valid_loss : 0.012983705382794142
[40/100] train loss : 0.041931468644179404, valid_loss : 0.006648026523180306
[50/100] train loss : 0.03020723961526528, valid_loss : 0.0041975697968155146
[60/100] train loss : 0.02547951735323295, valid_loss : 0.0029998615791555494
[70/100] train loss : 0.022748567920643836, valid_loss : 0.0022880843898747116
[80/100] train loss : 0.020308992243371904, valid_loss : 0.0018061205773847178
[90/100] train loss : 0.019469086779281497, valid_loss : 0.0016125383117469028
[99/100] train loss : 0.018137942242901772, valid_loss : 0.001425305410521105


In [96]:
new_data = df_X.tail(50)
new_X = X_scaler.transform(new_data)
print(new_X.shape)

(50, 5)


In [97]:
new_X = torch.tensor(new_X, dtype=torch.float32).unsqueeze(0)
new_X.shape

torch.Size([1, 50, 5])

In [98]:
model.eval()
with torch.no_grad() :
    pred = model(new_X)
pred

tensor([[0.8520]])

In [100]:
y_scaler.inverse_transform(pred)

array([[77933.16716075]])

# 마지막 데이터로 다음날 주식가격 추론