In [102]:
from tqdm.auto import tqdm
import random
import os

import torch

import pandas as pd
import numpy as np

In [103]:
def reset_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

reset_seeds()

In [104]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [105]:
!pip install -U finance-datareader



In [106]:
import FinanceDataReader as fdr

In [107]:
df_krx = fdr.StockListing('KRX')
df_krx.head()

Unnamed: 0,Code,ISU_CD,Name,Market,Dept,Close,ChangeCode,Changes,ChagesRatio,Open,High,Low,Volume,Amount,Marcap,Stocks,MarketId
0,5930,KR7005930003,삼성전자,KOSPI,,73300,1,200,0.27,73800,74000,73200,15410215,1133779657093,437585060915000,5969782550,STK
1,660,KR7000660001,SK하이닉스,KOSPI,,140000,1,3300,2.41,137000,140000,137000,5007248,697503272200,101920331100000,728002365,STK
2,373220,KR7373220003,LG에너지솔루션,KOSPI,,422000,2,-500,-0.12,426500,431000,422000,340392,144729522000,98748000000000,234000000,STK
3,207940,KR7207940008,삼성바이오로직스,KOSPI,,703000,1,5000,0.72,702000,705000,700000,59712,41965580000,50035322000000,71174000,STK
4,5935,KR7005931001,삼성전자우,KOSPI,,59300,2,-600,-1.0,59900,60100,59300,1607884,95862972500,48797181310000,822886700,STK


In [108]:
df_kospi = df_krx[df_krx['Market'] == 'KOSPI']
df_kospi

Unnamed: 0,Code,ISU_CD,Name,Market,Dept,Close,ChangeCode,Changes,ChagesRatio,Open,High,Low,Volume,Amount,Marcap,Stocks,MarketId
0,005930,KR7005930003,삼성전자,KOSPI,,73300,1,200,0.27,73800,74000,73200,15410215,1133779657093,437585060915000,5969782550,STK
1,000660,KR7000660001,SK하이닉스,KOSPI,,140000,1,3300,2.41,137000,140000,137000,5007248,697503272200,101920331100000,728002365,STK
2,373220,KR7373220003,LG에너지솔루션,KOSPI,,422000,2,-500,-0.12,426500,431000,422000,340392,144729522000,98748000000000,234000000,STK
3,207940,KR7207940008,삼성바이오로직스,KOSPI,,703000,1,5000,0.72,702000,705000,700000,59712,41965580000,50035322000000,71174000,STK
4,005935,KR7005931001,삼성전자우,KOSPI,,59300,2,-600,-1.00,59900,60100,59300,1607884,95862972500,48797181310000,822886700,STK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2771,002785,KR7002781003,진흥기업우B,KOSPI,,5020,1,25,0.50,4990,5040,4895,548,2692490,4303234360,857218,STK
2773,014915,KR7014911002,성문전자우,KOSPI,,6730,2,-350,-4.94,7540,7540,6700,3076,21072330,4038000000,600000,STK
2774,001525,KR7001521004,동양우,KOSPI,,6150,1,30,0.49,6120,6290,6060,1139,7017870,3799045650,617731,STK
2777,002787,KR7002782001,진흥기업2우B,KOSPI,,12750,1,160,1.27,12610,12750,12470,35,440350,3758802000,294808,STK


In [109]:
def item_code_by_item_name(item_name):
    """
    종목명을 받아 종목코드를 찾아 반환하는 함수
    """
    item_code_list = df_kospi.loc[df_kospi["Name"] == item_name, "Code"].tolist()
    if len(item_code_list) > 0:
        item_code = item_code_list[0]
        return item_code
    else:
        return "없는 주식입니다 !"

In [110]:
item_code_by_item_name("카카오")

'035720'

In [111]:
def find_item_list(item_name, year=2020):
    """
    종목명을 넘겨주면 일별시세를 반환하는 함수
    내부에서 종목명으로 종목코드를 반환하는 함수(item_code_by_item_name)로
    종목의 시세를 수집합니다.
    """

    item_code = item_code_by_item_name(item_name)
    if item_code:
        df_day = fdr.DataReader(item_code, str(year),"2023-11-30")
        return df_day
    else:
        return False

In [112]:
df_삼성전자 = find_item_list("삼성전자", year = 2020)

df_SK하이닉스 = find_item_list("SK하이닉스", year = 2020)

df_LG에솔 = find_item_list("LG에너지솔루션", year = 2020)

df_삼성바이오 = find_item_list("삼성바이오로직스", year = 2020)

df_삼성우 = find_item_list("삼성전자우", year = 2020)

In [134]:
df_삼성전자

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,55500,56000,55000,55200,12993228
2020-01-03,56000,56600,54900,55500,15422255
2020-01-06,54900,55600,54600,55500,10278951
2020-01-07,55700,56400,55600,55800,10009778
2020-01-08,56200,57400,55900,56800,23501171
...,...,...,...,...,...
2023-11-24,72400,72600,71700,71700,6676685
2023-11-27,71500,72100,71100,71300,9113857
2023-11-28,71400,72700,71300,72700,13283081
2023-11-29,72400,72800,72200,72700,9283933


In [113]:
df_삼성전자.drop(columns='Change', inplace =True)

In [114]:

df_SK하이닉스.drop(columns='Change', inplace =True)
df_LG에솔.drop(columns='Change', inplace =True)
df_삼성바이오.drop(columns='Change', inplace =True)
df_삼성우.drop(columns='Change', inplace =True)


In [115]:
data_list = [df_삼성전자,df_SK하이닉스,df_LG에솔,df_삼성바이오,df_삼성우]

In [116]:
data_list2 = []
for df in data_list:
    data_list2.append(df.to_numpy())

In [118]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt

In [119]:
def transform_to_series_data(data,mins,sizes,input_len=20,pred_len=14):
    # 스케일링
    data = (data - mins) / sizes

    x_list = []
    y_list = []
    for i in range(input_len, data.shape[0] + 1 - pred_len):
        x = data[i-input_len:i] # 입력데이터 seq(20) , feature(5)
        x_list.append(x)
        y = data[i:i+pred_len, 3] # seq(5)
        y_list.append(y)
    x = np.array(x_list) # batch, seq(10), feature(5)
    y = np.array(y_list) # batch, seq(5)

    return x, y

In [120]:
from torch.utils.data import Dataset

In [121]:
class FinanceDataset(Dataset):
    def __init__(self,x,y):
        self.x = x # batch, seq(10), feature(5)
        self.y = y # batch,seq(6)

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self,idx):
        item = {}
        item["x"] = torch.Tensor(self.x[idx])
        item["y"] = torch.Tensor(self.y[idx])
        return item

In [122]:
from torch.utils.data import DataLoader
import torch.nn as nn

In [123]:
class LSTMModel(nn.Module):
    def __init__(self, input_size=5, output_size=14, n_hidden=16, num_layers=1, bi_direction=False) -> None:
        super().__init__()
        self.n_hidden = n_hidden
        self.num_layers = num_layers
        self.direction_num = 2 if bi_direction else 1

        self.lstm_layer = nn.LSTM(input_size=input_size, hidden_size=self.n_hidden,
                            num_layers=self.num_layers, bidirectional=bi_direction)
        self.dense_layer = nn.Sequential(
            nn.Linear(self.n_hidden, self.n_hidden//2),
            nn.ReLU(),
            nn.Linear(self.n_hidden//2, output_size)
        )

    def forward(self, x):

        init_hidden = torch.zeros(
            self.num_layers*self.direction_num, x.shape[0], self.n_hidden
        ).to(device)
        init_cell = torch.zeros(
            self.num_layers*self.direction_num, x.shape[0], self.n_hidden
        ).to(device)

        out, (hidden_state, cell_state) = self.lstm_layer(x.transpose(1,0), (init_hidden, init_cell))

        predict = self.dense_layer(hidden_state[-1])
        return predict

In [124]:
class EarlyStopper(object):

    def __init__(self, num_trials, save_path):
        self.num_trials = num_trials
        self.trial_counter = 0
        self.best_loss = np.inf
        self.save_path = save_path

    def is_continuable(self, model, loss):
        if loss < self.best_loss:
            self.best_loss = loss
            self.trial_counter = 0 # 초기화
            torch.save(model, self.save_path)
            return True
        elif self.trial_counter + 1 < self.num_trials:
            self.trial_counter += 1 # 기존 시도횟수 + 1
            return True
        else:
            return False

In [125]:
def train_loop(dataloader, model, loss_fn, optimizer, device):
    epoch_loss = 0
    model.train()
    for batch in dataloader:
        pred = model(batch["x"].to(device))
        loss = loss_fn(pred, batch["y"].to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    epoch_loss /= len(dataloader)

    return epoch_loss

In [126]:
@torch.inference_mode()
def test_loop(dataloader,model,loss_fn, device):
    epoch_loss = 0
    model.eval()

    pred_list = []
    for batch in dataloader:
        pred = model(batch["x"].to(device))

        if batch.get("y") is not None:
            loss = loss_fn(pred, batch["y"].to(device))
            epoch_loss += loss.item()

        _pred = pred.to("cpu").numpy()
        pred_list.append(_pred)

    epoch_loss /= len(dataloader)

    pred = np.concatenate(pred_list)
    return epoch_loss, pred

In [127]:
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold

In [128]:
data_list2

[array([[   55500,    56000,    55000,    55200, 12993228],
        [   56000,    56600,    54900,    55500, 15422255],
        [   54900,    55600,    54600,    55500, 10278951],
        ...,
        [   71400,    72700,    71300,    72700, 13283081],
        [   72400,    72800,    72200,    72700,  9283933],
        [   72700,    72800,    72200,    72800, 15783714]]),
 array([[  96000,   96200,   94100,   94700, 2342070],
        [  96500,   97900,   94300,   94500, 3021380],
        [  93000,   95300,   92800,   94300, 2577573],
        ...,
        [ 132100,  132100,  129900,  131400, 2562226],
        [ 130200,  131000,  128900,  130400, 2227853],
        [ 129900,  133900,  129900,  133900, 3625365]]),
 array([[  597000,   598000,   450000,   505000, 15946992],
        [  476000,   483000,   445000,   450000,  4559773],
        [  458000,   495500,   441000,   477000,  2918435],
        ...,
        [  436500,   440500,   434000,   435000,   140561],
        [  440000,   447500

In [129]:

input_size_dec = 1
hidden_size = 32
is_attention = False

n_splits = 5
batch_size = 32
epochs = 1000
loss_fn = torch.nn.MSELoss()

SEED = 42
cv = KFold(n_splits=n_splits,shuffle=True,random_state=SEED)


reset_seeds(SEED)

all_best_loss_list = []
all_predictions_list = []
data_list2= data_list2[:1]
for i, df in enumerate(data_list2):
        print(f"Training and predicting for stock {i + 1}...")

        mins = df.min(axis=0)
        sizes = df.max(axis=0) - mins
        scaled_data = (df - mins) / sizes  # 스케일링

        x_train, y_train = transform_to_series_data(df,mins,sizes)

        input_size_enc = x_train.shape[2]

        dt = FinanceDataset(x_train,y_train)
        dl = DataLoader(dt, batch_size = 2)

        best_loss_list = []
        predictions_list = []
        for j, (tri,vai) in enumerate(cv.split(x_train)):

                # 학습용
                train_dt = FinanceDataset(x_train[tri], y_train[tri])
                train_dl = DataLoader(train_dt,batch_size=batch_size,shuffle=False)

                # 검증용
                valid_dt = FinanceDataset(x_train[vai], y_train[vai])
                valid_dl = DataLoader(valid_dt,batch_size=batch_size,shuffle=False)

                model = LSTMModel().to(device)
                optimizer = torch.optim.Adam(model.parameters())
                early_stopper = EarlyStopper(num_trials=30, save_path=f'best_model_{i}.pt')

                for epoch in tqdm(range(epochs)):
                        train_loop(train_dl, model, loss_fn, optimizer, device)
                        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

                        pred = pred * sizes[3] + mins[3] # 원래의 수치로 복원
                        true = y_train[vai] * sizes[3] + mins[3] # 원래의 수치로 복원
                        score = mean_absolute_percentage_error(true,pred)

                        if not early_stopper.is_continuable(model, valid_loss):
                                print(f'Cross Validation:{i} >> best loss: {early_stopper.best_loss}')
                                best_loss_list.append(early_stopper.best_loss)
                                break
                        predictions_list.append(pred)

        all_best_loss_list.append(best_loss_list)



Training and predicting for stock 1...


  0%|          | 0/1000 [00:00<?, ?it/s]

Cross Validation:0 >> best loss: 0.0021889280760660768


  0%|          | 0/1000 [00:00<?, ?it/s]

Cross Validation:0 >> best loss: 0.0027754454446646073


  0%|          | 0/1000 [00:00<?, ?it/s]

Cross Validation:0 >> best loss: 0.002687903839008262


  0%|          | 0/1000 [00:00<?, ?it/s]

Cross Validation:0 >> best loss: 0.002609305258374661


  0%|          | 0/1000 [00:00<?, ?it/s]

Cross Validation:0 >> best loss: 0.003323047790521135


In [130]:
data_list2

[array([[   55500,    56000,    55000,    55200, 12993228],
        [   56000,    56600,    54900,    55500, 15422255],
        [   54900,    55600,    54600,    55500, 10278951],
        ...,
        [   71400,    72700,    71300,    72700, 13283081],
        [   72400,    72800,    72200,    72700,  9283933],
        [   72700,    72800,    72200,    72800, 15783714]])]

In [131]:
pred.shape


(187, 14)

In [132]:
columns = [ '오늘날짜+1', '오늘날짜+2', '오늘날짜+3', '오늘날짜+4', '오늘날짜+5'
        , '오늘날짜+6', '오늘날짜+7', '오늘날짜+8', '오늘날짜+9', '오늘날짜+10'
        , '오늘날짜+11', '오늘날짜+12', '오늘날짜+13', '오늘날짜+14']
df_pred_삼성 = pd.DataFrame(pred, columns=columns)
df_pred_삼성

Unnamed: 0,오늘날짜+1,오늘날짜+2,오늘날짜+3,오늘날짜+4,오늘날짜+5,오늘날짜+6,오늘날짜+7,오늘날짜+8,오늘날짜+9,오늘날짜+10,오늘날짜+11,오늘날짜+12,오늘날짜+13,오늘날짜+14
0,56834.988281,56813.601562,56608.617188,57037.976562,56679.402344,57235.484375,57005.371094,57153.230469,56996.265625,57139.562500,57037.039062,57417.351562,57551.523438,57424.015625
1,58835.636719,58618.378906,58579.398438,58744.527344,59007.406250,58934.816406,59282.195312,59063.847656,59375.257812,59037.054688,59629.058594,59172.527344,59532.171875,59429.179688
2,60218.492188,60083.843750,60084.863281,60180.257812,60406.015625,60268.890625,60511.320312,60470.007812,60689.058594,60534.398438,60807.695312,60589.617188,60750.398438,60801.210938
3,60130.640625,60025.320312,59985.351562,60134.667969,60283.640625,60228.937500,60423.351562,60389.437500,60545.820312,60425.656250,60652.476562,60499.835938,60651.992188,60697.187500
4,55324.363281,55257.585938,55033.742188,55539.523438,55106.078125,55807.347656,55512.558594,55656.378906,55481.027344,55596.925781,55559.437500,55952.156250,56169.781250,55946.218750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,64912.921875,65188.035156,64991.976562,65229.761719,64412.273438,65205.531250,64401.859375,65086.734375,64568.535156,65460.035156,64146.835938,65597.570312,65111.781250,65097.609375
183,64372.843750,64695.914062,64396.742188,64869.062500,63303.335938,64988.628906,63407.585938,64507.000000,63572.464844,64951.039062,62945.437500,65337.062500,64731.492188,64382.390625
184,66150.187500,66089.890625,66023.898438,66093.867188,65556.148438,66295.062500,65737.359375,66095.164062,66035.390625,66505.187500,65875.593750,66745.078125,66670.437500,66175.125000
185,67112.632812,67254.890625,67161.976562,67141.804688,66798.687500,67119.398438,66800.203125,67147.218750,67057.218750,67593.664062,66821.421875,67632.023438,67325.281250,67232.875000


In [133]:
# # 예측 결과 시각화
# plt.figure(figsize=(10, 6))
# plt.plot(y_test, label='Actual Prices')
# plt.plot(y_pred, label='LSTM Predictions')
# plt.title('Stock Price Prediction with LSTM')
# plt.xlabel('Days')
# plt.ylabel('Closing Price')
# plt.legend()
# plt.show()