In [93]:
# %pip install --quiet pytorch-lightning

In [94]:
# %pip install --quiet yfinance

In [95]:
# %pip install --quiet tqdm

In [96]:
# %pip install --quiet tensorboard

Import required libraries

In [97]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
import math
import matplotlib

import yfinance as yf
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader

Create configs for layout, size and colors

In [98]:
%matplotlib inline
%config InLineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale = 1.1)

BRIGHT_COLORS_PALETTE = ['#01BEFE','#FFDD00','#FF7D00','#FF006D','#ADFF02','#8F00FF']

sns.set_palette(sns.color_palette(BRIGHT_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

tqdm.pandas()

Set pytorch lightning seed to 42

In [99]:
pl.seed_everything(42)

Global seed set to 42


42

Read yfinance symbol/s and put them into a dataframe

In [100]:
symbol = "MSFT"
msft = yf.Ticker(symbol)

# get all stock info
msft.info

# get historical market data
hist = msft.history(period="5y")

df = pd.DataFrame(hist)
df.index = pd.to_datetime(df.index)
df = df.sort_values(by='Date')

df.head()


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-08-28 00:00:00-04:00,104.160636,104.691195,104.01852,104.463814,19151500,0.0,0.0
2018-08-29 00:00:00-04:00,104.643819,106.140763,104.473281,106.131287,20818000,0.0,0.0
2018-08-30 00:00:00-04:00,105.799677,106.690265,105.581772,106.064957,22798700,0.0,0.0
2018-08-31 00:00:00-04:00,105.818609,106.851306,105.657541,106.424965,23222700,0.0,0.0
2018-09-04 00:00:00-04:00,105.022779,106.074428,104.425899,105.83757,22634600,0.0,0.0


In [101]:
df.index

DatetimeIndex(['2018-08-28 00:00:00-04:00', '2018-08-29 00:00:00-04:00',
               '2018-08-30 00:00:00-04:00', '2018-08-31 00:00:00-04:00',
               '2018-09-04 00:00:00-04:00', '2018-09-05 00:00:00-04:00',
               '2018-09-06 00:00:00-04:00', '2018-09-07 00:00:00-04:00',
               '2018-09-10 00:00:00-04:00', '2018-09-11 00:00:00-04:00',
               ...
               '2023-08-15 00:00:00-04:00', '2023-08-16 00:00:00-04:00',
               '2023-08-17 00:00:00-04:00', '2023-08-18 00:00:00-04:00',
               '2023-08-21 00:00:00-04:00', '2023-08-22 00:00:00-04:00',
               '2023-08-23 00:00:00-04:00', '2023-08-24 00:00:00-04:00',
               '2023-08-25 00:00:00-04:00', '2023-08-28 00:00:00-04:00'],
              dtype='datetime64[ns, America/New_York]', name='Date', length=1258, freq=None)

Check the shape of the dataframe

In [102]:
df.shape

(1258, 7)

Preprocess the price data

In [103]:
df['Prev_close'] = df.shift(1)['Close']
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Prev_close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-08-28 00:00:00-04:00,104.160636,104.691195,104.01852,104.463814,19151500,0.0,0.0,
2018-08-29 00:00:00-04:00,104.643819,106.140763,104.473281,106.131287,20818000,0.0,0.0,104.463814
2018-08-30 00:00:00-04:00,105.799677,106.690265,105.581772,106.064957,22798700,0.0,0.0,106.131287
2018-08-31 00:00:00-04:00,105.818609,106.851306,105.657541,106.424965,23222700,0.0,0.0,106.064957
2018-09-04 00:00:00-04:00,105.022779,106.074428,104.425899,105.83757,22634600,0.0,0.0,106.424965


In [104]:
df['Close_change'] = df.progress_apply(
        lambda row:0 if np.isnan(row.Prev_close) else row.Close - row.Prev_close, 
        axis=1
    )
df.head()

  0%|          | 0/1258 [00:00<?, ?it/s]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Prev_close,Close_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-08-28 00:00:00-04:00,104.160636,104.691195,104.01852,104.463814,19151500,0.0,0.0,,0.0
2018-08-29 00:00:00-04:00,104.643819,106.140763,104.473281,106.131287,20818000,0.0,0.0,104.463814,1.667473
2018-08-30 00:00:00-04:00,105.799677,106.690265,105.581772,106.064957,22798700,0.0,0.0,106.131287,-0.06633
2018-08-31 00:00:00-04:00,105.818609,106.851306,105.657541,106.424965,23222700,0.0,0.0,106.064957,0.360008
2018-09-04 00:00:00-04:00,105.022779,106.074428,104.425899,105.83757,22634600,0.0,0.0,106.424965,-0.587395


In [105]:
rows = []

In [106]:
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    row_data = dict(
        day_of_week=row.name.dayofweek,
        day_of_month=row.name.week,
        week_of_year=row.name.week,
        month=row.name.month,
        open=row.Open,
        high=row.High,
        low=row.Low,
        close_change=row.Close_change,
        close=row.Close
    )
    
    rows.append(row_data)
    
features_df = pd.DataFrame(rows)

  0%|          | 0/1258 [00:00<?, ?it/s]

In [107]:
features_df.shape

(1258, 9)

In [108]:
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,1,35,35,8,104.160636,104.691195,104.01852,0.0,104.463814
1,2,35,35,8,104.643819,106.140763,104.473281,1.667473,106.131287
2,3,35,35,8,105.799677,106.690265,105.581772,-0.06633,106.064957
3,4,35,35,8,105.818609,106.851306,105.657541,0.360008,106.424965
4,1,36,36,9,105.022779,106.074428,104.425899,-0.587395,105.83757


Set the train size

In [109]:
train_size = int(len(features_df) * 0.8)
train_size

1006

In [110]:
features_df[train_size + 1:]

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
1007,0,35,35,8,263.306293,264.841450,261.325430,-2.832581,262.692230
1008,1,35,35,8,264.118405,264.494744,258.165902,-2.238434,260.453796
1009,2,35,35,8,262.850676,264.554189,258.829495,-1.485626,258.968170
1010,3,35,35,9,256.392997,258.393688,252.966112,-1.059814,257.908356
1011,4,35,35,9,259.195973,262.206864,252.035142,-4.298431,253.609924
...,...,...,...,...,...,...,...,...,...
1253,1,34,34,8,325.500000,326.079987,321.459991,0.579987,322.459991
1254,2,34,34,8,323.820007,329.200012,323.459991,4.540009,327.000000
1255,3,34,34,8,332.850006,332.980011,319.959991,-7.029999,319.970001
1256,4,34,34,8,321.470001,325.359985,318.799988,3.010010,322.980011


Create the train and test dataframes

In [111]:
train_df, test_df = features_df[:train_size], features_df[train_size + 1:]
train_df.shape, test_df.shape

((1006, 9), (251, 9))

In [112]:
train_df

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,1,35,35,8,104.160636,104.691195,104.018520,0.000000,104.463814
1,2,35,35,8,104.643819,106.140763,104.473281,1.667473,106.131287
2,3,35,35,8,105.799677,106.690265,105.581772,-0.066330,106.064957
3,4,35,35,8,105.818609,106.851306,105.657541,0.360008,106.424965
4,1,36,36,9,105.022779,106.074428,104.425899,-0.587395,105.837570
...,...,...,...,...,...,...,...,...,...
1001,4,33,33,8,286.135674,286.482331,282.827636,-3.981598,283.411987
1002,0,34,34,8,279.380932,279.757301,274.567449,-8.319611,275.092377
1003,1,34,34,8,273.794922,276.191749,272.764865,-1.297455,273.794922
1004,2,34,34,8,272.774785,274.577378,272.477637,-0.643768,273.151154


In [113]:
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [114]:
test_df.head()


Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
1007,0,35,35,8,263.306293,264.84145,261.32543,-2.832581,262.69223
1008,1,35,35,8,264.118405,264.494744,258.165902,-2.238434,260.453796
1009,2,35,35,8,262.850676,264.554189,258.829495,-1.485626,258.96817
1010,3,35,35,9,256.392997,258.393688,252.966112,-1.059814,257.908356
1011,4,35,35,9,259.195973,262.206864,252.035142,-4.298431,253.609924


In [115]:
train_df = pd.DataFrame(
    scaler.transform(train_df),
    index=train_df.index,
    columns=train_df.columns
)

In [116]:
test_df = pd.DataFrame(
    scaler.transform(test_df),
    index=test_df.index,
    columns=test_df.columns
)

In [117]:
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month,open,high,low,close_change,close
0,-0.5,0.307692,0.307692,0.272727,-0.890316,-0.90855,-0.881827,0.084297,-0.879891
1,0.0,0.307692,0.307692,0.272727,-0.886428,-0.896993,-0.87815,0.164065,-0.866446
2,0.5,0.307692,0.307692,0.272727,-0.877129,-0.892612,-0.869186,0.081124,-0.866981
3,1.0,0.307692,0.307692,0.272727,-0.876976,-0.891328,-0.868573,0.101519,-0.864078
4,-0.5,0.346154,0.346154,0.454545,-0.883379,-0.897522,-0.878533,0.056198,-0.868814


Define the create sequences function

In [118]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length):
    sequences = []
    data_size = len(input_data)
    
    for i in tqdm(range(data_size - sequence_length)):
        sequence = input_data[i:i+sequence_length]
        
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        
        sequences.append((sequence,label))
        
    return sequences

In [119]:
sample_data = pd.DataFrame(dict(
    feature_1=[1,2,3,4,5],
    label=[6,7,8,9,10]
))

sample_data.head()

Unnamed: 0,feature_1,label
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [120]:
sample_sequences = create_sequences(sample_data, "label", sequence_length=3)

  0%|          | 0/2 [00:00<?, ?it/s]

In [121]:
len(sample_sequences)

2

In [122]:
print(sample_sequences[0][0])
print()
print(f"label: {sample_sequences[0][1]}")

   feature_1  label
0          1      6
1          2      7
2          3      8

label: 9


In [123]:
SEQUENCE_LENGTH = 120

train_sequences = create_sequences(train_df, "close", SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", SEQUENCE_LENGTH)

  0%|          | 0/886 [00:00<?, ?it/s]

  0%|          | 0/131 [00:00<?, ?it/s]

In [124]:
train_sequences[0][0].shape

(120, 9)

In [125]:
len(train_sequences), len(test_sequences)

(886, 131)

Create the stock dataset that inherits from "Dataset"

In [126]:
class StockDataset(Dataset):
    def __init__(self,sequences):
        self.sequences = sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, id):
        sequence, label = self.sequences[id]
        
        return dict(
            sequence=torch.Tensor(sequence.to_numpy()),
            label=torch.tensor(label).float()
        )
        

Create the stock price data module 

In [127]:
class StockPriceDataModule(pl.LightningDataModule):
    def __init__(self, train_sequences, test_sequences, batch_size=8):
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size
        
    def setup(self):
        self.train_dataset = StockDataset(self.train_sequences)
        self.test_dataset = StockDataset(self.test_sequences)
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=0
        )
       
    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=0
        ) 
        
    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=0
        ) 

In [128]:
N_EPOCHS = 8
BATCH_SIZE = 64

data_module = StockPriceDataModule(train_sequences,test_sequences,batch_size=BATCH_SIZE)
data_module.setup()

In [129]:
train_dataset = StockDataset(train_sequences)

In [130]:
for i in train_dataset:
    print(i['sequence'].shape)
    print(i['label'].shape)
    print(i['label'])
    break

torch.Size([120, 9])
torch.Size([])
tensor(-0.8792)


Price prediction model creation

In [131]:
class PricePredictionModel(nn.Module):
    def __init__(self,n_features, n_hidden=128, n_layers=2):
        super().__init__()
        
        self.n_hidden = n_hidden
        
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            batch_first=False,
            num_layers=n_layers,
            dropout=0.2
        )
        
        self.regressor = nn.Linear(n_hidden, 1)
        
    def forward(self, x):
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]
        
        return self.regressor(out)

In [132]:
class StockPricePredictor(pl.LightningModule):
    def __init__(self, n_features):
        super().__init__()
        self.model = PricePredictionModel(n_features)
        self.criteria = nn.MSELoss()
        
    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criteria(output, labels.unsqueeze(dim=1))
        return loss, output
    
    def training_step(self, batch, batch_id):
        sequences = batch['sequence']
        labels = batch['label']
        
        loss, outputs = self(sequences, labels)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_id):
        sequences = batch['sequence']
        labels = batch['label']
        
        loss, outputs = self(sequences, labels)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_id):
        sequences = batch['sequence']
        labels = batch['label']
        
        loss, outputs = self(sequences, labels)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return optim.AdamW(self.parameters(), lr=0.0001)

In [133]:
model = StockPricePredictor(n_features=train_df.shape[1])

In [134]:
for i in data_module.train_dataloader():
    print(i['sequence'].shape)
    print(i['label'].shape)
    break

torch.Size([64, 120, 9])
torch.Size([64])


In [135]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 13220), started 0:31:24 ago. (Use '!kill 13220' to kill it.)

In [136]:
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

logger = TensorBoardLogger('Lightning_logs', name=f"{symbol}-price")

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    callbacks=[early_stopping_callbacks],
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

NameError: name 'early_stopping_callbacks' is not defined