In [1]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import lightning as L
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

from external_library import StockDataset, StockPriceDataModule

Successfully Imported


In [2]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", '#FF7D00', '#FF006D', '#ADFF02', '#8F00FF']

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

tqdm.pandas()

In [3]:
L.seed_everything(42)

Seed set to 42


42

# Load Data

In [4]:
df = pd.read_csv("NVDA_110721_Final_1_fdr.csv")
df = df.drop('Date', axis=1)
df.head()


Unnamed: 0,Close,MA5,MA20,MA60,RSI14,std,upperb,lowerb,%K,%D,SP500,NASDAQ_COMP
0,3.675,3.543,3.78225,4.29975,40.927948,0.196449,4.175147,3.389353,40.601489,26.315786,1343.800049,2834.429932
1,3.75,3.588,3.767125,4.281833,45.013998,0.185909,4.138942,3.395308,51.879688,36.090216,1345.02002,2858.830078
2,3.69,3.636,3.754875,4.262042,42.482342,0.182302,4.119478,3.390272,50.000021,47.493733,1337.430054,2842.800049
3,3.6,3.645,3.74225,4.238708,38.944195,0.183921,4.110092,3.374408,35.294105,45.724605,1331.939941,2839.959961
4,3.465,3.636,3.72175,4.21425,34.326053,0.191057,4.103864,3.339636,11.999989,32.431371,1304.890015,2764.790039


In [5]:
train_size = int(len(df) * .9)
train_size

2264

In [6]:
train_df, test_df = df[:train_size], df[train_size+1:]
train_df.shape, test_df.shape

((2264, 12), (251, 12))

In [7]:
scaler = MinMaxScaler()

In [8]:
scaler.fit(train_df)
train_df = pd.DataFrame(
    scaler.transform(train_df),
    index=train_df.index,
    columns=train_df.columns
)
train_df

Unnamed: 0,Close,MA5,MA20,MA60,RSI14,std,upperb,lowerb,%K,%D,SP500,NASDAQ_COMP
0,0.008116,0.006553,0.008491,0.014538,0.327279,0.022327,0.010340,0.007087,0.406015,0.252223,0.106943,0.059137
1,0.008850,0.007002,0.008333,0.014329,0.387794,0.020892,0.009994,0.007156,0.518797,0.352500,0.107476,0.062031
2,0.008263,0.007480,0.008205,0.014098,0.350300,0.020401,0.009808,0.007098,0.500000,0.469489,0.104158,0.060130
3,0.007383,0.007569,0.008072,0.013826,0.297900,0.020621,0.009718,0.006915,0.352941,0.451339,0.101757,0.059793
4,0.006063,0.007480,0.007858,0.013540,0.229505,0.021593,0.009659,0.006513,0.120000,0.314963,0.089929,0.050877
...,...,...,...,...,...,...,...,...,...,...,...,...
2259,0.986921,0.999776,0.977557,0.974899,0.699842,0.628450,0.980370,0.974175,0.780552,0.765087,0.917518,0.966967
2260,0.972277,1.000000,0.983618,0.980833,0.647310,0.624443,0.985343,0.981547,0.701414,0.697269,0.930216,0.974310
2261,0.963232,0.992545,0.988320,0.987440,0.615381,0.619361,0.988923,0.987600,0.652530,0.712179,0.925411,0.965218
2262,0.969759,0.987013,0.993464,0.993367,0.630691,0.610222,0.992338,0.994831,0.687805,0.680462,0.929416,0.968700


In [9]:
test_df = pd.DataFrame(
    scaler.transform(test_df),
    index=test_df.index,
    columns=test_df.columns
)
test_df

Unnamed: 0,Close,MA5,MA20,MA60,RSI14,std,upperb,lowerb,%K,%D,SP500,NASDAQ_COMP
2265,0.992959,0.999392,1.009366,1.011865,0.654377,0.632325,1.009962,1.008637,0.708814,0.730503,0.951844,0.992770
2266,0.962743,0.999293,1.014043,1.017399,0.548284,0.591998,1.008574,1.020645,0.386432,0.583775,0.934195,0.963746
2267,0.969075,0.999153,1.017729,1.022714,0.564638,0.572626,1.009220,1.028001,0.411229,0.497415,0.925437,0.952094
2268,0.991272,0.997376,1.024353,1.028772,0.619646,0.511912,1.006748,1.045618,0.634819,0.472111,0.935835,0.972623
2269,0.971128,0.995125,1.029665,1.034890,0.549769,0.426998,0.999681,1.065892,0.431913,0.487664,0.926666,0.956709
...,...,...,...,...,...,...,...,...,...,...,...,...
2511,1.912431,1.974304,2.019687,1.936474,0.616105,1.110244,1.999673,2.043030,0.478162,0.671980,1.432088,1.459938
2512,1.826843,1.955649,2.025734,1.943855,0.448129,0.976432,1.986414,2.072413,0.053434,0.398088,1.425848,1.447862
2513,1.748099,1.918017,2.023138,1.949301,0.334134,1.035781,1.992374,2.059476,0.031406,0.174777,1.411475,1.434117
2514,1.808605,1.883501,2.023876,1.956944,0.433826,1.023411,1.991311,2.062389,0.303788,0.115146,1.381448,1.416058


In [10]:
def create_sequences(input_data:pd.DataFrame, target_column, sequence_length):

    sequences = []
    data_size = len(input_data)

    for i in tqdm(range(data_size - sequence_length)):

        sequence = input_data[i:i+sequence_length]

        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]

        sequences.append((sequence, label))

    return sequences   

In [11]:
sample_data = pd.DataFrame(dict(feature_1=[1,2,3,4,5],
                                label=[6,7,8,9,10]))

sample_sequences = create_sequences(sample_data, 'label', sequence_length=3)

  0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
print(sample_sequences[1][0])

print(sample_sequences[1][1])


   feature_1  label
1          2      7
2          3      8
3          4      9
10


In [13]:
SEQUENCE_LENGTH = 40

train_sequences = create_sequences(train_df, 'Close', SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, 'Close', SEQUENCE_LENGTH)

test_sequences[0][0]

  0%|          | 0/2224 [00:00<?, ?it/s]

  0%|          | 0/211 [00:00<?, ?it/s]

Unnamed: 0,Close,MA5,MA20,MA60,RSI14,std,upperb,lowerb,%K,%D,SP500,NASDAQ_COMP
2265,0.992959,0.999392,1.009366,1.011865,0.654377,0.632325,1.009962,1.008637,0.708814,0.730503,0.951844,0.99277
2266,0.962743,0.999293,1.014043,1.017399,0.548284,0.591998,1.008574,1.020645,0.386432,0.583775,0.934195,0.963746
2267,0.969075,0.999153,1.017729,1.022714,0.564638,0.572626,1.00922,1.028001,0.411229,0.497415,0.925437,0.952094
2268,0.991272,0.997376,1.024353,1.028772,0.619646,0.511912,1.006748,1.045618,0.634819,0.472111,0.935835,0.972623
2269,0.971128,0.995125,1.029665,1.03489,0.549769,0.426998,0.999681,1.065892,0.431913,0.487664,0.926666,0.956709
2270,0.995575,0.995658,1.034727,1.041081,0.609959,0.385162,0.998432,1.078581,0.678158,0.578946,0.944156,0.973414
2271,1.010097,1.005304,1.040398,1.047442,0.642961,0.344242,0.997867,1.091785,0.824427,0.643786,0.938813,0.978736
2272,1.01017,1.013675,1.045641,1.053607,0.643131,0.298139,0.996185,1.105398,0.84664,0.78561,0.949701,0.997412
2273,1.048845,1.025402,1.051767,1.060197,0.725205,0.334937,1.006943,1.105916,0.933043,0.872772,0.959972,1.016096
2274,1.070114,1.045566,1.05886,1.066839,0.763359,0.394167,1.021734,1.103697,1.0,0.932813,0.965176,1.020647


# Pytorch Dataset

In [14]:
# class StockDataset(Dataset):
    
#     def __init__(self, sequences):
#         self.sequences = sequences

#     def __len__(self):
#         return len(self.sequences)

#     def __getitem__(self, idx):  # Dataset contains tuple (data, label)
#         sequence, label = self.sequences[idx]

#         return dict(
#             sequence = torch.Tensor(sequence.to_numpy()),
#             label = torch.tensor(label).float()
#         )

In [15]:
# class StockPriceDataModule(L.LightningDataModule):

#     def __init__(self, train_sequences, test_sequences, batch_size=8):
#         super().__init__()
#         self.train_sequences = train_sequences
#         self.test_sequences = test_sequences
#         self.batch_size = batch_size

#     def setup(self):
#         self.train_dataset = StockDataset(self.train_sequences)
#         self.test_dataset = StockDataset(self.test_sequences)

#     def train_dataloader(self):
#         return DataLoader(
#             self.train_dataset,
#             batch_size = self.batch_size,
#             shuffle = False,
#             num_workers = 2                  # making it faster?
#         )
    
#     def val_dataloader(self):
#         return DataLoader(
#             self.test_dataset,
#             batch_size = 1,
#             shuffle = False,
#             num_workers = 1
#         )

In [16]:
N_EPOCHS = 8
BATCH_SIZE = 64

data_module = StockPriceDataModule(train_sequences, test_sequences, batch_size=BATCH_SIZE)
data_module.setup()

In [17]:
train_dataset = StockDataset(train_sequences)

In [18]:
for item in train_dataset:
    print(item['sequence'].shape)
    print(item['label'].shape)
    print(item['label'])
    break

torch.Size([40, 12])
torch.Size([])
tensor(0.0100)


# LSTM Model

In [19]:
train_df.shape

(2264, 12)

In [20]:
class PricePredictionModel(nn.Module):
    
    def __init__(self, n_features, n_hidden=128, n_layers=2):
        super().__init__()
        self.n_hidden = n_hidden
        self.lstm = nn.LSTM(
            input_size = n_features, 
            hidden_size = n_hidden,
            batch_first = True,        # batch_size as the first parameter
            num_layers = n_layers,
            dropout = 0.2
        )

        self.regressor = nn.Linear(n_hidden, 1)  # output line

    def forward(self, x):
        self.lstm.flatten_parameters()

        _, (hidden, _) = self.lstm(x)   # see the  document for reference
        out = hidden[-1]

        return self.regressor(out)

In [21]:
class StockPricePredictor(L.LightningModule):

    def __init__(self, n_features: int):
        super().__init__()
        self.model = PricePredictionModel(n_features)
        self.criterion = nn.MSELoss()

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels.unsqueeze(dim=1))
        return loss, output
    
    def training_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        self.log("validation_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        sequences = batch["sequence"]
        labels = batch["label"]
        loss, outputs = self(sequences, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=0.0001)

In [22]:
model = StockPricePredictor(n_features=train_df.shape[1])

In [23]:
for item in data_module.train_dataloader():
    print(item['sequence'].shape)
    print(item['label'].shape)
    break

Successfully Imported
Successfully Imported
torch.Size([64, 40, 12])
torch.Size([64])


In [24]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [25]:
checkpoint_callback = ModelCheckpoint(
    dirpath = "checkpoints",
    filename = "best checkpoint",
    save_top_k = 1,
    verbose= True,
    monitor = 'val_loss',
    mode = "min"          # minimum value
)

logger = TensorBoardLogger("lightning_logs", name='stock-price')

early_stopping_callback = EarlyStopping(
    monitor='val_loss', patience=2)

device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')
print(f"{device} 사용 가능합니다")

trainer = L.Trainer(
    logger=logger,
    callbacks=[early_stopping_callback, checkpoint_callback],
    max_epochs=N_EPOCHS,
)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


mps:0 사용 가능합니다


In [26]:
trainer.fit(model, data_module)


  | Name      | Type                 | Params
---------------------------------------------------
0 | model     | PricePredictionModel | 204 K 
1 | criterion | MSELoss              | 0     
---------------------------------------------------
204 K     Trainable params
0         Non-trainable params
204 K     Total params
0.820     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/sihun/anaconda3/envs/madelion/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


Successfully Imported


/Users/sihun/anaconda3/envs/madelion/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Successfully Imported
Successfully Imported


/Users/sihun/anaconda3/envs/madelion/lib/python3.11/site-packages/lightning/pytorch/loops/fit_loop.py:293: The number of training batches (35) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Successfully Imported


Validation: |          | 0/? [00:00<?, ?it/s]

RuntimeError: Early stopping conditioned on metric `val_loss` which is not available. Pass in or modify your `EarlyStopping` callback to use any of the following: `train_loss`, `validation_loss`