In [92]:
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc 

import pandas as pd 
import numpy as np
from tqdm.notebook import tqdm
import pytorch_lightning as pl
from sklearn.preprocessing import MinMaxScaler

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [93]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8
tqdm.pandas()

In [94]:
pl.seed_everything(42)

Global seed set to 42


42

## Load Data

In [95]:
df = pd.read_csv("../data/01_raw/fetched_market_chart.csv", parse_dates=["timestamp"])
df = df.sort_values(by="timestamp").reset_index(drop=True)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_av,trades,tb_base_av,tb_quote_av,ignore
0,2017-08-17,4261.48,4485.39,4200.74,4285.08,795.150377,1503014399999,3454770.0,3427,616.248541,2678216.0,8733.911395
1,2017-08-18,4285.08,4371.52,3938.77,4108.37,1199.888264,1503100799999,5086958.0,5233,972.86871,4129123.0,9384.141409
2,2017-08-19,4108.37,4184.69,3850.0,4139.98,381.309763,1503187199999,1549484.0,2153,274.336042,1118002.0,9184.085529
3,2017-08-20,4120.98,4211.08,4032.62,4086.29,467.083022,1503273599999,1930364.0,2321,376.795947,1557401.0,10125.414084
4,2017-08-21,4069.13,4119.62,3911.79,4016.0,691.74306,1503359999999,2797232.0,3972,557.356107,2255663.0,11706.76997


In [96]:
df.shape

(1616, 12)

## Preprocessing

In [97]:
rows = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    row_data = dict(
        day_of_week=row["timestamp"].dayofweek,
        day_of_month=row["timestamp"].day,
        week_of_year=row["timestamp"].week,
        month_of_year=row["timestamp"].month,
        open=row["open"],
        high=row["high"],
        low=row["low"],
        close=row["close"],
        close_change=row["close"] - row["open"],
    )
    rows.append(row_data)

features_df = pd.DataFrame(rows)

  0%|          | 0/1616 [00:00<?, ?it/s]

In [98]:
features_df.shape

(1616, 9)

In [99]:
features_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month_of_year,open,high,low,close,close_change
0,3,17,33,8,4261.48,4485.39,4200.74,4285.08,23.6
1,4,18,33,8,4285.08,4371.52,3938.77,4108.37,-176.71
2,5,19,33,8,4108.37,4184.69,3850.0,4139.98,31.61
3,6,20,33,8,4120.98,4211.08,4032.62,4086.29,-34.69
4,0,21,34,8,4069.13,4119.62,3911.79,4016.0,-53.13


In [100]:
train_size = int(len(features_df) * 0.9)
train_size

1454

In [101]:
train_df, test_df = features_df[:train_size], features_df[train_size + 1:]
train_df.shape, test_df.shape

((1454, 9), (161, 9))

In [102]:
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_df)

In [103]:
train_df = pd.DataFrame(scaler.transform(train_df), index=train_df.index, columns=train_df.columns)
train_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month_of_year,open,high,low,close,close_change
0,0.0,0.066667,0.230769,0.272727,-0.964447,-0.960736,-0.953254,-0.963698,-0.033745
1,0.333333,0.133333,0.230769,0.272727,-0.963665,-0.964434,-0.962104,-0.969551,-0.061151
2,0.666667,0.2,0.230769,0.272727,-0.969518,-0.970503,-0.965103,-0.968504,-0.032649
3,1.0,0.266667,0.230769,0.272727,-0.9691,-0.969645,-0.958934,-0.970282,-0.04172
4,-1.0,0.333333,0.269231,0.272727,-0.970818,-0.972616,-0.963016,-0.97261,-0.044243


In [104]:
test_df = pd.DataFrame(scaler.transform(test_df), index=test_df.index, columns=test_df.columns)
test_df.head()

Unnamed: 0,day_of_week,day_of_month,week_of_year,month_of_year,open,high,low,close,close_change
1455,-0.333333,-0.333333,0.192308,0.272727,0.404176,0.411781,0.436554,0.401715,-0.047098
1456,0.0,-0.266667,0.192308,0.272727,0.401714,0.394718,0.383477,0.364886,-0.189071
1457,0.333333,-0.2,0.192308,0.272727,0.364931,0.44889,0.398591,0.477528,0.428202
1458,0.666667,-0.133333,0.192308,0.272727,0.477536,0.457269,0.457833,0.453301,-0.137054
1459,1.0,-0.066667,0.192308,0.272727,0.453309,0.432204,0.44192,0.450164,-0.049928


In [105]:
def create_sequences(input_data: pd.DataFrame, target_column, sequence_length: int):

    sequences = []
    data_size = len(input_data)

    for i in range(data_size - sequence_length):
        sequence = input_data[i:i+sequence_length]
        label_position = i + sequence_length
        label = input_data.iloc[label_position][target_column]
        sequences.append((sequence, label))
        
    return sequences

## Example with dummy df

In [106]:
sample_data = pd.DataFrame(dict(feature_1=[1, 2, 3, 4, 5], label=[6, 7, 8, 9, 10]))
sample_data.head()

Unnamed: 0,feature_1,label
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [107]:
sample_sequences = create_sequences(sample_data, "label", 3)
print(sample_sequences)

[(   feature_1  label
0          1      6
1          2      7
2          3      8, 9), (   feature_1  label
1          2      7
2          3      8
3          4      9, 10)]


In [108]:
len(sample_sequences)

2

In [109]:
SEQUENCE_LENGTH = 60

train_sequences = create_sequences(train_df, "close", SEQUENCE_LENGTH)
test_sequences = create_sequences(test_df, "close", SEQUENCE_LENGTH)

In [110]:
train_sequences[0][0].shape

(60, 9)

In [137]:
type(train_sequences)

list

In [147]:
type(train_sequences[0])

tuple

In [126]:
type(train_sequences[0][1])

numpy.float64

In [128]:
import pandas as pd
import torch

from torch.utils.data import Dataset
from typing import List, Tuple


class CryptoDataset(Dataset):
    """
    Dataset class for the LSTM model used by PyTorch Lightning.
    """
    def __init__(self, sequences: List[Tuple[pd.DataFrame]]):
        self.sequences = sequences


    def __len__(self):
        return len(self.sequences)


    def __getitem__(self, index: int):
        sequence, label = self.sequences[index]
        return dict(
            sequence=torch.tensor(sequence.to_numpy()),
            label=torch.tensor(label).float(),
        )

In [140]:
train_dataset = CryptoDataset(train_sequences)

In [136]:
type(train_dataset[0]["label"])

torch.Tensor

In [146]:
train_df.shape

(1454, 9)