In [1]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc 

import pandas as pd 
import numpy as np
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8
tqdm.pandas()

In [3]:
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [4]:
df = pd.read_csv("../data/01_raw/fetched_market_chart.csv", parse_dates=["timestamp"])
df = df.sort_values(by="timestamp").reset_index(drop=True)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_av,trades,tb_base_av,tb_quote_av,ignore
0,2017-08-17,4261.48,4485.39,4200.74,4285.08,795.150377,1503014399999,3454770.0,3427,616.248541,2678216.0,8733.911395
1,2017-08-18,4285.08,4371.52,3938.77,4108.37,1199.888264,1503100799999,5086958.0,5233,972.86871,4129123.0,9384.141409
2,2017-08-19,4108.37,4184.69,3850.0,4139.98,381.309763,1503187199999,1549484.0,2153,274.336042,1118002.0,9184.085529
3,2017-08-20,4120.98,4211.08,4032.62,4086.29,467.083022,1503273599999,1930364.0,2321,376.795947,1557401.0,10125.414084
4,2017-08-21,4069.13,4119.62,3911.79,4016.0,691.74306,1503359999999,2797232.0,3972,557.356107,2255663.0,11706.76997


In [5]:
df.shape

(1621, 12)

## Preprocessing

In [6]:
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    row_data = dict(
        day_of_week=row["timestamp"].dayofweek,
        day_of_month=row["timestamp"].day,
        week_of_year=row["timestamp"].week,
        month_of_year=row["timestamp"].month,
        open=row["open"],
        high=row["high"],
        low=row["low"],
        close=row["close"],
        close_change=row["close"] - row["open"],
    )
    rows.append(row_data)

features_df = pd.DataFrame(rows)

  0%|          | 0/1621 [00:00<?, ?it/s]

In [7]:
features_df.shape

(1621, 9)

In [8]:
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month_of_year,open,high,low,close,close_change
0,3,17,33,8,4261.48,4485.39,4200.74,4285.08,23.6
1,4,18,33,8,4285.08,4371.52,3938.77,4108.37,-176.71
2,5,19,33,8,4108.37,4184.69,3850.0,4139.98,31.61
3,6,20,33,8,4120.98,4211.08,4032.62,4086.29,-34.69
4,0,21,34,8,4069.13,4119.62,3911.79,4016.0,-53.13


In [9]:
train_size = int(len(features_df) * 0.9)
train_size

1458

In [10]:
train_df, test_df = features_df[:train_size], features_df[train_size + 1:]
train_df.shape, test_df.shape

((1458, 9), (162, 9))

In [11]:
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [12]:
train_df = pd.DataFrame(scaler.transform(train_df), index=train_df.index, columns=train_df.columns)
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month_of_year,open,high,low,close,close_change
0,0.0,0.066667,0.230769,0.272727,-0.964447,-0.960736,-0.953254,-0.963698,-0.033745
1,0.333333,0.133333,0.230769,0.272727,-0.963665,-0.964434,-0.962104,-0.969551,-0.061151
2,0.666667,0.2,0.230769,0.272727,-0.969518,-0.970503,-0.965103,-0.968504,-0.032649
3,1.0,0.266667,0.230769,0.272727,-0.9691,-0.969645,-0.958934,-0.970282,-0.04172
4,-1.0,0.333333,0.269231,0.272727,-0.970818,-0.972616,-0.963016,-0.97261,-0.044243


In [13]:
test_df = pd.DataFrame(scaler.transform(test_df), index=test_df.index, columns=test_df.columns)
test_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month_of_year,open,high,low,close,close_change
1459,1.0,-0.066667,0.192308,0.272727,0.453309,0.432204,0.44192,0.450164,-0.049928
1460,-1.0,0.0,0.230769,0.272727,0.450173,0.454341,0.447325,0.414642,-0.183716
1461,-0.666667,0.066667,0.230769,0.272727,0.414652,0.42531,0.403949,0.374721,-0.201888
1462,-0.333333,0.133333,0.230769,0.272727,0.374731,0.387633,0.398114,0.37503,-0.035696
1463,0.0,0.2,0.230769,0.272727,0.374844,0.421185,0.388805,0.443103,0.245044


In [14]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length: int):

    sequences = []
    data_size = len(input_data)

    for i in range(data_size - sequence_length):
        sequence = input_data[i:i+sequence_length]
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        sequences.append((sequence, label))
        
    return sequences

## Example with dummy df

In [15]:
sample_data = pd.DataFrame(dict(feature_1=[1, 2, 3, 4, 5], label=[6, 7, 8, 9, 10]))
sample_data.head()

Unnamed: 0,feature_1,label
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [16]:
sample_sequences = create_sequences(sample_data, "label", 3)
print(sample_sequences)

[(   feature_1  label
0          1      6
1          2      7
2          3      8, 9), (   feature_1  label
1          2      7
2          3      8
3          4      9, 10)]


In [17]:
len(sample_sequences)

2

In [18]:
SEQUENCE_LENGTH = 60

train_sequences = create_sequences(train_df, "close", SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", SEQUENCE_LENGTH)

In [19]:
train_sequences[0][0].shape

(60, 9)

In [20]:
type(train_sequences)

list

In [21]:
type(train_sequences[0])

tuple

In [22]:
type(train_sequences[0][1])

numpy.float64

In [23]:
import pandas as pd
import torch

from torch.utils.data import Dataset
from typing import List, Tuple


class CryptoDataset(Dataset):
    """
    Dataset class for the LSTM model used by PyTorch Lightning.
    """
    def __init__(self, sequences: List[Tuple[pd.DataFrame]]):
        self.sequences = sequences


    def __len__(self):
        return len(self.sequences)


    def __getitem__(self, index: int):
        sequence, label = self.sequences[index]
        return dict(
            sequence=torch.tensor(sequence.to_numpy()),
            label=torch.tensor(label).float(),
        )

In [24]:
train_dataset = CryptoDataset(train_sequences)

In [25]:
type(train_dataset[0]["label"])

torch.Tensor

In [26]:
train_df.shape

(1458, 9)

In [52]:
def extract_features_from_dataset(data: pd.DataFrame) -> pd.DataFrame:
    """
    Format market chart data to a pandas dataframe.
    """
    data = data.sort_values(by="timestamp").reset_index(drop=True)
    rows = []
    for _, row in data.iterrows():
        row_data = dict(
            day_of_week=row["timestamp"].dayofweek,
            day_of_month=row["timestamp"].day,
            week_of_year=row["timestamp"].week,
            month_of_year=row["timestamp"].month,
            open=row["open"],
            high=row["high"],
            low=row["low"],
            close=row["close"],
            close_change=row["close"] - row["open"],
        )
        rows.append(row_data)
    return pd.DataFrame(rows)


def split_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Split data into training and test sets.
    """
    train_size = int(len(data) * 0.9)
    train_df, test_df = data[:train_size], data[train_size + 1:]
    return train_df, test_df


def scale_data(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    """
    Scale data to have a mean of 0 and a standard deviation of 1.
    """
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(train_df)

    scaled_train_df = pd.DataFrame(
        scaler.transform(train_df),
        index=train_df.index, 
        columns=train_df.columns,
    )
    scaled_test_df = pd.DataFrame(
        scaler.transform(test_df),
        index=test_df.index,
        columns=test_df.columns,
    )
    return scaled_train_df, scaled_test_df


def create_sequences(
    input_data: pd.DataFrame, 
    target_column: str, 
    sequence_length: int
    ) -> List[Tuple[pd.DataFrame, float]]:
    """
    Create sequences from the input data.
    """
    sequences = []
    size = len(input_data)
    for i in range(size - sequence_length):
        sequence = input_data[i: i + sequence_length]
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        sequences.append((sequence, label))
    return sequences


def split_train_and_val_sequences(
    sequences: List[Tuple[pd.DataFrame, float]],
    val_size: float,
) -> Tuple[List[Tuple[pd.DataFrame, float]]]:
    """
    Split sequences into training and validation sets.
    """
    train_sequences, val_sequences = [], []
    for sequence, label in sequences:
        if len(train_sequences) < len(sequences) * (1 - val_size):
            train_sequences.append((sequence, label))
        else:
            val_sequences.append((sequence, label))
    return train_sequences, val_sequences

In [54]:
extracted_features = extract_features_from_dataset(df)
splitted_train_data, splitted_test_data = split_data(extracted_features)
scaled_train_data, scaled_test_data = scale_data(splitted_train_data, splitted_test_data)
sequences_train_data = create_sequences(scaled_train_data, "close", 7)
train_sequences, val_sequences = split_train_and_val_sequences(sequences_train_data, 0.2)
test_sequences = create_sequences(scaled_test_data, "close", 7)

(7, 9)
(7, 9)


In [87]:
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn

from pytorch_lightning import (
    Trainer, 
    callbacks,
    seed_everything
)

from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple, Dict


class CryptoDataset(Dataset):
    """
    Dataset class for the LSTM model used by PyTorch Lightning.
    """
    def __init__(self, sequences: List[Tuple[pd.DataFrame, float]]):
        self.sequences = sequences


    def __len__(self):
        return len(self.sequences)


    def __getitem__(self, index: int):
        sequence, label = self.sequences[index]
        return (torch.Tensor(sequence.to_numpy()),torch.tensor(label).float())


class LSTMDataLoader(pl.LightningDataModule):
    """
    Data loader for the LSTM model.
    """
    def __init__(self,
        train_sequences: List[Tuple[pd.DataFrame, float]],
        val_sequences: List[Tuple[pd.DataFrame, float]],
        test_sequences: List[Tuple[pd.DataFrame, float]],
        train_batch_size: int,
        val_batch_size: int,
        train_workers: int = 2,
        val_workers: int = 1,
    ):
        super().__init__()
        self.train_sequences = train_sequences
        self.val_sequences = val_sequences
        self.test_sequences = test_sequences
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.train_workers = train_workers
        self.val_workers = val_workers
        self.test_workers = val_workers


    def setup(self, stage: str = None):
        """
        Load the data.
        """
        self.train_dataset = CryptoDataset(self.train_sequences)
        self.val_dataset = CryptoDataset(self.val_sequences)
        self.test_dataset = CryptoDataset(self.test_sequences)

    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset, 
            batch_size=self.train_batch_size, 
            shuffle=False,
            num_workers=self.train_workers
        )


    def val_dataloader(self):
        return DataLoader(
            self.val_dataset, 
            batch_size=self.val_batch_size, 
            shuffle=False,
            num_workers=self.val_workers
        )


    def test_dataloader(self):
        return DataLoader(
            self.test_dataset, 
            batch_size=self.val_batch_size, 
            shuffle=False,
            num_workers=self.test_workers
        )


class PricePredictionModel(nn.Module):
    """
    Standard LSTM model with PyTorch Lightning.
    """
    def __init__(self,
        batch_size: int,
        dropout_rate: float,
        hidden_size: int,
        number_of_features: int,
        number_of_layers: int,
        criterion: nn.Module = nn.MSELoss(),
    ):
        super().__init__()
        self.batch_size = batch_size
        self.criterion = criterion
        self.dropout_rate = dropout_rate
        self.hidden_size = hidden_size
        self.n_features = number_of_features
        self.number_of_layers = number_of_layers
        self.criterion = criterion

        self.lstm = nn.LSTM(
            batch_first=True,
            dropout=self.dropout_rate,
            hidden_size=self.hidden_size,
            input_size=self.n_features,
            num_layers=self.number_of_layers,
        )

        self.regressor = nn.Linear(self.hidden_size, 1)


    def forward(self, x):
        """
        Forward pass through the model.

        lstm_out = (batch_size, sequence_length, hidden_size)
        """
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]
        return self.regressor(out)


class PricePredictor(pl.LightningModule):
    """
    Training model with PyTorch Lightning.
    """
    def __init__(self,
        batch_size: int,
        dropout_rate: float,
        hidden_size: int,
        learning_rate: float,
        number_of_features: int,
        number_of_layers: int,
    ):
        super().__init__()
        self.model = PricePredictionModel(
            batch_size, dropout_rate, hidden_size, number_of_features, number_of_layers,
        )
        self.learning_rate = learning_rate


    def forward(self, x, labels=None):
        output = self.model(x)
        print(output)
        if labels is not None:
            loss = self.model.criterion(output, labels.unsqueeze(dim=1))
        else: loss = 0
        return loss, output

        
    def training_step(self, batch, batch_idx):
        sequences, labels = batch
        loss, outputs = self(sequences, labels)
        self.log("train_loss", loss, on_step=True, on_epoch=True)
        # return {"loss": loss}


    def validation_step(self, batch, batch_idx):
        sequences, labels = batch
        loss, outputs = self(sequences, labels)
        self.log("val_loss", loss, on_step=True, on_epoch=True)
        # return {"loss": loss}


    def test_step(self, batch, batch_idx):
        sequences, labels = batch
        loss, outputs = self(sequences, labels)
        self.log("test_loss", loss, on_step=True, on_epoch=True)
        # return {"loss": loss}


    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)


In [88]:
def training_loop(
    train_sequences: List[Tuple[pd.DataFrame, float]], 
    val_sequences: List[Tuple[pd.DataFrame, float]],
    test_sequences: List[Tuple[pd.DataFrame, float]],
    parameters: Dict):
    """
    Training loop for the LSTM model.

    Args:
        train_sequences: List of training sequences.
        val_sequences: List of validation sequences.
        test_sequences: List of test sequences.
        parameters: Dictionary of training parameters.
    """
    seed_everything(1)


    model = PricePredictor(
        batch_size=parameters["train_batch_size"],
        dropout_rate=parameters["dropout_rate"],
        hidden_size=parameters["hidden_size"],
        learning_rate=parameters["learning_rate"],
        number_of_features=parameters["number_of_features"],
        number_of_layers=parameters["number_of_layers"],
    )

    data_module = LSTMDataLoader(
        train_sequences=train_sequences, 
        val_sequences=val_sequences,
        test_sequences=test_sequences,
        train_batch_size=parameters["train_batch_size"], 
        val_batch_size=parameters["val_batch_size"],
        train_workers=parameters["train_workers"],
        val_workers=parameters["val_workers"],
    )
    data_module.setup()

    checkpoint_callback = callbacks.ModelCheckpoint(
        filename="best-checkpoint",
        dirpath="data/06_models/checkpoints",
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min",
    )

    early_stopping_callback = callbacks.EarlyStopping(
        monitor="val_loss",
        patience=2
    )

    trainer = Trainer(
        max_epochs=parameters["max_epochs"],
        logger=False,
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stopping_callback],
        gpus=1,
        log_every_n_steps=1,
        progress_bar_refresh_rate=10,
    )

    trainer.fit(model, data_module)
    trainer.test(model, data_module)

In [89]:
TRAINING_PARAMS = {
    "train_batch_size": 16,
    "val_batch_size": 1,
    "train_workers": 2,
    "val_workers": 1,
    "max_epochs": 100,
    "hidden_size": 128,
    "number_of_features": 9,
    "number_of_layers": 2,
    "dropout_rate": 0.2,
    "learning_rate": 0.0001,
}

In [90]:
training_loop(train_sequences, val_sequences, test_sequences, TRAINING_PARAMS)

Global seed set to 1
  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                 | Params
-----------------------------------------------
0 | model | PricePredictionModel | 203 K 
-----------------------------------------------
203 K     Trainable params
0         Non-trainable params
203 K     Total params
0.814     Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]tensor([[-0.0567]], device='cuda:0')
tensor([[-0.0568]], device='cuda:0')
                                                              

Global seed set to 1


Epoch 0:   0%|          | 0/363 [00:00<?, ?it/s] tensor([[-0.0565],
        [-0.0505],
        [-0.0530],
        [-0.0511],
        [-0.0498],
        [-0.0512],
        [-0.0566],
        [-0.0527],
        [-0.0529],
        [-0.0506],
        [-0.0526],
        [-0.0521],
        [-0.0524],
        [-0.0577],
        [-0.0522],
        [-0.0544]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-0.0590],
        [-0.0585],
        [-0.0558],
        [-0.0552],
        [-0.0581],
        [-0.0488],
        [-0.0585],
        [-0.0527],
        [-0.0555],
        [-0.0538],
        [-0.0531],
        [-0.0578],
        [-0.0545],
        [-0.0530],
        [-0.0576],
        [-0.0559]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-0.0507],
        [-0.0583],
        [-0.0535],
        [-0.0603],
        [-0.0550],
        [-0.0539],
        [-0.0504],
        [-0.0524],
        [-0.0564],
        [-0.0505],
        [-0.0554],
        [-0.0566],
        [-0.0514],
        



tensor([[-0.0544],
        [-0.0563],
        [-0.0498],
        [-0.0508],
        [-0.0537],
        [-0.0505],
        [-0.0515],
        [-0.0534],
        [-0.0501],
        [-0.0474],
        [-0.0537],
        [-0.0504],
        [-0.0453],
        [-0.0466],
        [-0.0516],
        [-0.0478]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-0.0531],
        [-0.0540],
        [-0.0520],
        [-0.0532],
        [-0.0526],
        [-0.0532],
        [-0.0521],
        [-0.0470],
        [-0.0529],
        [-0.0484],
        [-0.0505],
        [-0.0538],
        [-0.0533],
        [-0.0526],
        [-0.0549],
        [-0.0538]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[-0.0508],
        [-0.0531],
        [-0.0506],
        [-0.0530],
        [-0.0484],
        [-0.0483],
        [-0.0492],
        [-0.0515],
        [-0.0510],
        [-0.0502],
        [-0.0576],
        [-0.0545],
        [-0.0487],
        [-0.0500],
        [-0.0507],
        [-0.0482]],

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]tensor([[-0.0596]], device='cuda:0')
tensor([[-0.0589]], device='cuda:0')
tensor([[-0.0596]], device='cuda:0')
tensor([[-0.0596]], device='cuda:0')
tensor([[-0.0598]], device='cuda:0')
tensor([[-0.0592]], device='cuda:0')
tensor([[-0.0591]], device='cuda:0')
tensor([[-0.0586]], device='cuda:0')
tensor([[-0.0579]], device='cuda:0')
tensor([[-0.0585]], device='cuda:0')
Testing:   6%|▋         | 10/155 [00:00<00:01, 89.71it/s]tensor([[-0.0590]], device='cuda:0')
tensor([[-0.0601]], device='cuda:0')
tensor([[-0.0610]], device='cuda:0')
tensor([[-0.0616]], device='cuda:0')
tensor([[-0.0617]], device='cuda:0')
tensor([[-0.0620]], device='cuda:0')
tensor([[-0.0629]], device='cuda:0')
tensor([[-0.0619]], device='cuda:0')
tensor([[-0.0614]], device='cuda:0')
tensor([[-0.0613]], device='cuda:0')
tensor([[-0.0608]], device='cuda:0')
tensor([[-0.0605]], device='cuda:0')
tensor([[-0.0602]], device='cuda:0')
tensor([[-0.0605]], device='cuda:0')
tensor([[-0.0613]], device='