In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import os
from os import path
from typing import Optional, Iterable, Sized, Iterator

import numpy as np
import pandas as pd

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.optim import AdamW

from accelerate import Accelerator

from gluonts.dataset.pandas import PandasDataset
from gluonts.dataset.split import split, InputDataset, LabelDataset
from gluonts.time_feature import (
    time_features_from_frequency_str,
    TimeFeature,
    get_lags_for_frequency,
)
from gluonts.dataset.field_names import FieldName
from gluonts.transform import (
    AddAgeFeature,
    AddObservedValuesIndicator,
    AddTimeFeatures,
    AsNumpyArray,
    Chain,
    ExpectedNumInstanceSampler,
    InstanceSplitter,
    RemoveFields,
    SelectFields,
    SetField,
    TestSplitSampler,
    Transformation,
    ValidationSplitSampler,
    VstackFeatures,
    RenameFields,
)
from gluonts.transform.sampler import InstanceSampler
from gluonts.itertools import Cyclic, Cached
from gluonts.dataset.loader import as_stacked_batches

from tqdm import tqdm

In [3]:
accelerator = Accelerator()
device = accelerator.device

In [4]:
class ModelLogger:
    """Interface for objects that receive debug messages from the model.

    Both methods do nothing here; subclasses override them as needed.
    """

    def log(self, msg):
        """Accept ``msg`` and discard it."""
        return None

    def reset(self):
        """Reset logger state; this base class keeps none."""
        return None

class BasicModelLogger(ModelLogger):
    """Logger that prints every message to stdout behind a fixed prefix."""

    def __init__(self, msg_prefix):
        # Text placed in front of each printed message.
        self.msg_prefix = msg_prefix

    def log(self, msg):
        """Print ``msg`` as ``"<msg_prefix> - <msg>"``."""
        print(f"{self.msg_prefix} - {msg}")

    def reset(self):
        """Nothing to reset: the logger holds no per-run state."""
        return None

class NoopModelLogger(ModelLogger):
    """Logger that silently drops every message."""

    def log(self, msg):
        """Ignore ``msg``."""
        return None

    def reset(self):
        """Nothing to reset."""
        return None

# Model

In [5]:
class TransformerModel(nn.Module):
    """Causal Transformer encoder that regresses one scalar per input window.

    Each time step's features are concatenated with a learned embedding of the
    series id, projected to ``d_model`` by a linear layer, passed through a
    stack of ``TransformerEncoderLayer`` blocks and finally mapped to a single
    value. Only the value at the last time step is returned.

    Args:
        num_input_features: Number of real-valued features per time step.
        num_classes: Number of distinct series ids (embedding table size).
        embedding_dim: Size of the series-id embedding.
        d_model: Width of the encoder.
        nhead: Number of attention heads.
        d_hid: Width of the encoder's feed-forward sub-layer.
        nlayers: Number of encoder layers.
        dropout: Dropout probability inside the encoder layers.
        logger: Object with a ``log(msg)`` method; receives tensor shapes at
            each stage of ``forward``.
    """

    def __init__(
        self,
        num_input_features: int,
        num_classes: int,
        embedding_dim: int,
        d_model: int,
        nhead: int,
        d_hid: int,
        nlayers: int,
        dropout: float,
        logger: "ModelLogger",
    ):
        super().__init__()
        self.model_type = 'Transformer'
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.input_ff = nn.Linear(num_input_features + embedding_dim, d_model)
        # Kept as an attribute, but not applied in ``forward``.
        self.input_ff_sigmoid = nn.Sigmoid()
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.final_linear = nn.Linear(d_model, 1)
        self.init_weights()
        self.logger = logger

    def init_weights(self) -> None:
        """Zero the biases and draw the weights of both linear layers from U(-0.1, 0.1)."""
        initrange = 0.1
        self.input_ff.bias.data.zero_()
        self.input_ff.weight.data.uniform_(-initrange, initrange)
        self.final_linear.bias.data.zero_()
        self.final_linear.weight.data.uniform_(-initrange, initrange)

    def forward(
        self,
        src: Tensor,
        item_ids: Tensor,
        src_mask: Optional[Tensor] = None,
    ) -> Tensor:
        """Predict one value per window.

        Args:
            src: Features of shape ``[batch_size, seq_len, num_input_features]``.
            item_ids: Series ids of shape ``[batch_size]``.
            src_mask: Optional attention mask of shape ``[seq_len, seq_len]``.
                When omitted, a square causal mask is generated.

        Returns:
            Tensor of shape ``[batch_size]`` holding the output at the last
            time step of every window.
        """
        seq_len = src.size(dim=1)

        # Broadcast the per-series embedding along the time axis.
        # idea from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py#L1290
        embedded = self.embedding(item_ids)
        embedded = embedded.unsqueeze(dim=1).expand(-1, seq_len, -1)

        output = torch.cat((src, embedded), dim=-1)
        self.logger.log(f"{output.size()}")

        output = self.input_ff(output)
        self.logger.log(f"input_ff - {output.size()}")

        if src_mask is None:
            # Square causal mask: masked positions are float('-inf'), unmasked
            # positions are 0.0. It is placed on the input's own device so the
            # module does not depend on a notebook-level `device` variable.
            src_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(src.device)
        self.logger.log(f"src_mask - {src_mask.size()}")

        output = self.transformer_encoder(output, src_mask)
        self.logger.log(f"encoder - {output.size()}")

        output = self.final_linear(output)
        self.logger.log(f"final_linear - {output.size()}")

        # Keep only the last time step: [batch, seq, 1] -> [batch, 1] -> [batch].
        output = output[:, -1, :]
        output = output.squeeze(dim=1)
        self.logger.log(f"output - {output.size()}")

        return output

## Load data

In [6]:
df = pd.read_csv(
    "optiver-trading-at-the-close/train.csv",
    dtype={
        "imbalance_size": np.float32,
        "reference_price": np.float32,
        "matched_size": np.float32,
    },
)
raw_df = df

In [7]:
df = raw_df

In [8]:
df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3.180603e+06,1,0.999812,13380277.00,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,1.666039e+05,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,3.028799e+05,-1,0.999561,1819368.00,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,1.191768e+07,-1,1.000171,18389746.00,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,4.475500e+05,-1,0.999532,17860614.00,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


## Data pre-processing and features

In [9]:
feat_dynamic_real = ["imbalance_size", "reference_price", "matched_size"]
# feat_dynamic_real = []
num_input_features = len(feat_dynamic_real)

## Add time index to data (for gluonts)

In [10]:
max_time_id = df["time_id"].max()
dti_by_time_id = pd.date_range("2018-01-01", periods=max_time_id + 1, freq="min")
print(dti_by_time_id)
df_period_index_by_time_id = dti_by_time_id.to_period("1min")
print(df_period_index_by_time_id)
df_index_by_time_id_series = df_period_index_by_time_id.to_series(index=np.arange(max_time_id + 1))
print(df_index_by_time_id_series)

DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 00:01:00',
               '2018-01-01 00:02:00', '2018-01-01 00:03:00',
               '2018-01-01 00:04:00', '2018-01-01 00:05:00',
               '2018-01-01 00:06:00', '2018-01-01 00:07:00',
               '2018-01-01 00:08:00', '2018-01-01 00:09:00',
               ...
               '2018-01-19 08:45:00', '2018-01-19 08:46:00',
               '2018-01-19 08:47:00', '2018-01-19 08:48:00',
               '2018-01-19 08:49:00', '2018-01-19 08:50:00',
               '2018-01-19 08:51:00', '2018-01-19 08:52:00',
               '2018-01-19 08:53:00', '2018-01-19 08:54:00'],
              dtype='datetime64[ns]', length=26455, freq='T')
PeriodIndex(['2018-01-01 00:00', '2018-01-01 00:01', '2018-01-01 00:02',
             '2018-01-01 00:03', '2018-01-01 00:04', '2018-01-01 00:05',
             '2018-01-01 00:06', '2018-01-01 00:07', '2018-01-01 00:08',
             '2018-01-01 00:09',
             ...
             '2018-01-19 08:45', '2018-

In [11]:
df["timestamp_by_time_id"] = df["time_id"].map(df_index_by_time_id_series)

In [12]:
df["timestamp"] = df["timestamp_by_time_id"]
df.index = df["timestamp"]

In [13]:
df.head()

Unnamed: 0_level_0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id,timestamp_by_time_id,timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-01-01 00:00,0,0,0,3180603.0,1,0.999812,13380277.0,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0,2018-01-01 00:00,2018-01-01 00:00
2018-01-01 00:00,1,0,0,166603.9,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1,2018-01-01 00:00,2018-01-01 00:00
2018-01-01 00:00,2,0,0,302879.9,-1,0.999561,1819368.0,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2,2018-01-01 00:00,2018-01-01 00:00
2018-01-01 00:00,3,0,0,11917680.0,-1,1.000171,18389746.0,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3,2018-01-01 00:00,2018-01-01 00:00
2018-01-01 00:00,4,0,0,447550.0,-1,0.999532,17860614.0,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4,2018-01-01 00:00,2018-01-01 00:00


## Group by stock_id

In [14]:
df_grouped = df.groupby("stock_id")
print(len(df_grouped))
print(df_grouped.size())

200
stock_id
0      26455
1      26455
2      26455
3      26455
4      26455
       ...  
195    26455
196    26455
197    26455
198    26455
199    21615
Length: 200, dtype: int64


## Create gluonts dataset

In [15]:
freq = "1min"

In [16]:
# One frame per stock, re-indexed onto the full per-minute period index and
# with the (now redundant) grouping column removed.
dfs_dict = {
    item_id: gdf.reindex(df_index_by_time_id_series).drop("stock_id", axis=1)
    for item_id, gdf in df_grouped
}

In [17]:
# Replace missing values (including the rows introduced by re-indexing) with 0.0.
# New frames are built instead of mutating in place, and the loop variable is
# not called `df`, so the notebook-wide `df` is not rebound to the last stock's
# frame as a side effect of this cell.
dfs_dict = {item_id: stock_df.fillna(0.0) for item_id, stock_df in dfs_dict.items()}

In [18]:
dataset = PandasDataset(dfs_dict, target="target", feat_dynamic_real=feat_dynamic_real, freq=freq, assume_sorted=False)
print(dataset)
print(len(dataset))

PandasDataset<size=200, freq=1min, num_feat_dynamic_real=3, num_past_feat_dynamic_real=0, num_feat_static_real=0, num_feat_static_cat=0, static_cardinalities=[]>
200


## Prepare datasets

### Pytorch dataset pipeline

In [19]:
training_set_max_date_id = 480 - 20

In [20]:
prediction_length = 1
seq_len = 55

training_batch_size = 256
validation_batch_size = 256

In [21]:
class StockTrainingDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over the rows of a single stock.

    Item ``i`` is the window of ``context_length`` consecutive rows starting
    at row ``i``; consecutive items overlap by ``context_length - 1`` rows.

    Args:
        stock_df: DataFrame holding the rows of one stock, in time order.
        feature_names: Columns returned as model features.
        target_col: Column returned as the target.
        item_id: Identifier returned unchanged with every item.
        context_length: Number of rows per window.
    """

    def __init__(self, stock_df, feature_names, target_col, item_id, context_length):
        super().__init__()
        self.features = stock_df[feature_names]
        self.targets = stock_df[target_col]
        self.item_id = item_id
        self.context_length = context_length
        # A frame shorter than one window holds no items; clamp so that
        # ``len()`` never receives a negative number.
        self.total_size = max(0, self.features.shape[0] - context_length + 1)
        # Materialise the arrays once: slicing them per item is much cheaper
        # than going through ``DataFrame.iloc`` on every access.
        self._feature_values = self.features.values
        self._target_values = self.targets.values

    def __len__(self):
        """Number of complete windows available."""
        return self.total_size

    def __getitem__(self, idx):
        """Return ``(features, item_id, targets)`` for window ``idx``.

        ``features`` has ``context_length`` rows and one column per feature;
        ``targets`` has ``context_length`` entries. Negative indices count
        from the end.

        Raises:
            IndexError: If ``idx`` does not address a complete window.
        """
        if idx < 0:
            idx += self.total_size
        if not 0 <= idx < self.total_size:
            raise IndexError(f"window index out of range for dataset of size {self.total_size}")
        end_idx = idx + self.context_length
        return self._feature_values[idx:end_idx], self.item_id, self._target_values[idx:end_idx]

In [22]:
class CustomLengthSequentialSampler(torch.utils.data.sampler.Sampler[int]):
    """Sequential sampler that stops after at most ``max_samples`` indices.

    Yields ``0, 1, 2, ...`` in order, capped at the smaller of the data
    source's length and ``max_samples``.
    """

    data_source: Sized

    def __init__(self, data_source: Sized, max_samples: int) -> None:
        self.data_source = data_source
        # Never promise more indices than the data source holds.
        self.num_samples = min(len(data_source), max_samples)

    def __iter__(self) -> Iterator[int]:
        yield from range(self.num_samples)

    def __len__(self) -> int:
        return self.num_samples

In [23]:
# Split every stock's rows by date: rows before `training_set_max_date_id`
# form the training part, the remaining rows form the validation part.
stock_training_datasets, stock_validation_datasets = [], []
for item_id, gdf in df_grouped:
    in_training_period = gdf["date_id"] < training_set_max_date_id
    training_df, validation_df = gdf[in_training_period], gdf[~in_training_period]
    n_training_rows, n_validation_rows = training_df.shape[0], validation_df.shape[0]
    # Both parts must be non-empty and together account for every row.
    assert n_training_rows > 0 \
        and n_validation_rows > 0 \
        and n_training_rows + n_validation_rows == gdf.shape[0], f"{item_id} invalid shape"
    for split_df, split_datasets in (
        (training_df, stock_training_datasets),
        (validation_df, stock_validation_datasets),
    ):
        split_datasets.append(StockTrainingDataset(split_df, feat_dynamic_real, "target", item_id, seq_len))
print(len(stock_training_datasets), len(stock_validation_datasets))

200 200


In [24]:
full_training_dataset = torch.utils.data.ConcatDataset(stock_training_datasets)
full_validation_dataset = torch.utils.data.ConcatDataset(stock_validation_datasets)
print(len(full_training_dataset), len(full_validation_dataset))

4996180 220200


In [25]:
# TODO: change to RandomSampler
training_sampler = CustomLengthSequentialSampler(full_training_dataset, 256 * 100)
validation_sampler = torch.utils.data.SequentialSampler(full_validation_dataset)

In [26]:
training_dataloader = torch.utils.data.DataLoader(
    full_training_dataset,
    batch_size=training_batch_size,
    sampler=training_sampler,
)
validation_dataloader = torch.utils.data.DataLoader(
    full_validation_dataset,
    batch_size=validation_batch_size,
    sampler=validation_sampler,
)

In [27]:
training_sample_batch = next(iter(training_dataloader))
print(training_sample_batch[0].size(), training_sample_batch[1].size(), training_sample_batch[2].size())
validation_sample_batch = next(iter(validation_dataloader))
print(validation_sample_batch[0].size(), validation_sample_batch[1].size(), validation_sample_batch[2].size())

torch.Size([256, 55, 3]) torch.Size([256]) torch.Size([256, 55])
torch.Size([256, 55, 3]) torch.Size([256]) torch.Size([256, 55])


### Gluonts data pipeline

In [28]:
# Split the data for training and testing
test_dataset_size = 55 * 20
training_data, test_gen = split(dataset, offset=-test_dataset_size)
test_dataset = test_gen.generate_instances(prediction_length=prediction_length, windows=int(test_dataset_size / prediction_length))

In [29]:
train_dataset = training_data
test_data_input_dataset = InputDataset(test_dataset)
test_data_label_dataset = LabelDataset(test_dataset)

train_dataset_iter = iter(train_dataset)
test_data_input_dataset_iter = iter(test_data_input_dataset)
test_data_label_dataset_iter = iter(test_data_label_dataset)

In [30]:
def print_data_sample(sample):
    """Print a short description of one dataset entry.

    ``sample`` is a mapping with at least the keys ``"start"``, ``"target"``
    (an object exposing ``.shape``) and ``"item_id"``. When the optional key
    ``"feat_dynamic_real"`` is present, its type, shape and dtype are
    printed as well.
    """
    print(f"sample snippet: {sample}")
    print(f"sample dict keys: {sample.keys()}")
    print(f'start: {sample["start"]} {type(sample["start"])}')
    print(f'target shape: {type(sample["target"])} {sample["target"].shape}')
    print(f'item id: {sample["item_id"]}')

    # Dynamic features are optional; report their absence and stop early.
    if "feat_dynamic_real" not in sample:
        print("no feat_dynamic_real")
        return
    print(f'feat_dynamic_real": {type(sample["feat_dynamic_real"])} {sample["feat_dynamic_real"].shape} {sample["feat_dynamic_real"].dtype}')

In [31]:
train_sample = next(train_dataset_iter)
print(len(training_data))
print_data_sample(train_sample)

200
sample snippet: {'start': Period('2018-01-01 00:00', 'T'), 'target': array([-3.029704  ,  0.38981438,  4.220009  , ..., -4.580021  ,
       -6.740093  ,  0.61035156]), 'item_id': 0, 'feat_dynamic_real': array([[3.1806028e+06, 1.2997728e+06, 1.2997728e+06, ..., 3.3379682e+06,
        3.3363655e+06, 3.3363655e+06],
       [9.9981201e-01, 1.0000260e+00, 9.9991900e-01, ..., 1.0022500e+00,
        1.0022500e+00, 1.0022500e+00],
       [1.3380277e+07, 1.5261107e+07, 1.5261107e+07, ..., 4.8395436e+07,
        4.8397036e+07, 4.8397036e+07]], dtype=float32)}
sample dict keys: dict_keys(['start', 'target', 'item_id', 'feat_dynamic_real'])
start: 2018-01-01 00:00 <class 'pandas._libs.tslibs.period.Period'>
target shape: <class 'numpy.ndarray'> (25355,)
item id: 0
feat_dynamic_real": <class 'numpy.ndarray'> (3, 25355) float32


In [32]:
test_sample = next(test_data_input_dataset_iter)
print(len(test_data_input_dataset))
print_data_sample(test_sample)

test_sample = next(test_data_label_dataset_iter)
print(len(test_data_label_dataset))
print_data_sample(test_sample)

print("----------")

test_sample = next(test_data_input_dataset_iter)
print(len(test_data_input_dataset))
print_data_sample(test_sample)

test_sample = next(test_data_label_dataset_iter)
print(len(test_data_label_dataset))
print_data_sample(test_sample)

220000
sample snippet: {'start': Period('2018-01-01 00:00', 'T'), 'target': array([-3.029704  ,  0.38981438,  4.220009  , ..., -4.580021  ,
       -6.740093  ,  0.61035156]), 'item_id': 0, 'feat_dynamic_real': array([[3.1806028e+06, 1.2997728e+06, 1.2997728e+06, ..., 3.3363655e+06,
        3.3363655e+06, 1.0247004e+06],
       [9.9981201e-01, 1.0000260e+00, 9.9991900e-01, ..., 1.0022500e+00,
        1.0022500e+00, 1.0003310e+00],
       [1.3380277e+07, 1.5261107e+07, 1.5261107e+07, ..., 4.8397036e+07,
        4.8397036e+07, 1.5222857e+07]], dtype=float32)}
sample dict keys: dict_keys(['start', 'target', 'item_id', 'feat_dynamic_real'])
start: 2018-01-01 00:00 <class 'pandas._libs.tslibs.period.Period'>
target shape: <class 'numpy.ndarray'> (25355,)
item id: 0
feat_dynamic_real": <class 'numpy.ndarray'> (3, 25356) float32
220000
sample snippet: {'start': Period('2018-01-18 14:35', 'T'), 'target': array([2.900362]), 'item_id': 0, 'feat_dynamic_real': array([[1.0247004e+06, 1.7997806e+06]

In [33]:
raw_df[raw_df["stock_id"] == 0].iloc[-38:-30][feat_dynamic_real + ["target"]]

Unnamed: 0_level_0,imbalance_size,reference_price,matched_size,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-19 08:17,2713164.75,0.999368,24343336.0,-2.570152
2018-01-19 08:18,2263878.5,0.999368,24792622.0,-5.270243
2018-01-19 08:19,3197117.0,0.999368,24907852.0,-7.500052
2018-01-19 08:20,3197117.0,0.999193,24907852.0,-4.699826
2018-01-19 08:21,3269983.25,0.999193,24907852.0,-5.300045
2018-01-19 08:22,3269983.25,0.999193,24907852.0,1.100302
2018-01-19 08:23,3269983.25,0.998842,24907852.0,0.9799
2018-01-19 08:24,3391427.5,0.998842,24907852.0,-1.369715


In [34]:
def create_transformation() -> Transformation:
    """Build the per-series preprocessing chain (similar to ``torchvision.transforms.Compose``).

    The chain applies, in order:

    1. ``AsNumpyArray`` on the item id (0-d, ``int``).
    2. ``AsNumpyArray`` on the target (1-d, i.e. the univariate case).
    3. ``VstackFeatures`` stacking the dynamic real features into ``FEAT_TIME``.
    4. ``RenameFields`` mapping the gluonts field names to the Hugging Face
       style names used downstream (``values``, ``time_features``, ...).

    The observed-values indicator, calendar time features and age feature are
    not part of this chain.
    """
    # Convert the raw fields to NumPy arrays.
    to_numpy_steps = [
        AsNumpyArray(
            field=FieldName.ITEM_ID,
            expected_ndim=0,
            dtype=int,
        ),
        AsNumpyArray(
            field=FieldName.TARGET,
            expected_ndim=1,
        ),
    ]

    # Stack the dynamic real features under the FEAT_TIME key.
    stack_step = VstackFeatures(
        output_field=FieldName.FEAT_TIME,
        input_fields=[FieldName.FEAT_DYNAMIC_REAL],
    )

    # Rename to match the Hugging Face field names.
    rename_step = RenameFields(
        mapping={
            FieldName.ITEM_ID: "static_categorical_features",
            FieldName.FEAT_STATIC_REAL: "static_real_features",
            FieldName.FEAT_TIME: "time_features",
            FieldName.TARGET: "values",
            FieldName.OBSERVED_VALUES: "observed_mask",
        }
    )

    return Chain([*to_numpy_steps, stack_step, rename_step])

In [35]:
def create_instance_splitter(
    prediction_length: int,
    context_length: int,
    mode: str,
    train_sampler: Optional[InstanceSampler] = None,
    validation_sampler: Optional[InstanceSampler] = None,
) -> Transformation:
    """Build the ``InstanceSplitter`` that cuts windows out of a series.

    Args:
        prediction_length: Length of the future part of each window.
        context_length: Length of the past part of each window.
        mode: One of ``"train"``, ``"validation"`` or ``"test"``; selects the
            instance sampler.
        train_sampler: Sampler used in ``"train"`` mode; defaults to an
            ``ExpectedNumInstanceSampler`` with ``num_instances=1.0``.
        validation_sampler: Sampler used in ``"validation"`` mode; defaults
            to a ``ValidationSplitSampler``.

    Returns:
        An ``InstanceSplitter`` operating on the ``"values"`` field and
        carrying ``"time_features"`` along as a time-series field.
    """
    assert mode in ["train", "validation", "test"]

    # One factory per mode, so only the sampler that is needed gets built.
    sampler_factories = {
        "train": lambda: train_sampler
        or ExpectedNumInstanceSampler(num_instances=1.0, min_future=prediction_length),
        "validation": lambda: validation_sampler
        or ValidationSplitSampler(min_future=prediction_length),
        "test": TestSplitSampler,
    }
    chosen_sampler = sampler_factories[mode]()

    return InstanceSplitter(
        target_field="values",
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=chosen_sampler,
        past_length=context_length,
        future_length=prediction_length,
        time_series_fields=["time_features"],
    )

In [36]:
def create_train_dataloader(
    prediction_length: int,
    context_length: int,
    data,
    batch_size: int,
    num_batches_per_epoch: int,
    shuffle_buffer_length: Optional[int] = None,
    cache_data: bool = True,
    **kwargs,
) -> Iterable:
    """Build an iterable of stacked training batches.

    The data is transformed in training mode, optionally cached, cycled
    endlessly and cut into windows by the ``"train"`` instance splitter.

    Args:
        prediction_length: Length of the future part of each window.
        context_length: Length of the past part of each window.
        data: Dataset to draw training windows from.
        batch_size: Number of windows per batch.
        num_batches_per_epoch: Number of batches produced per pass.
        shuffle_buffer_length: Forwarded to ``as_stacked_batches``.
        cache_data: Wrap the transformed data in ``Cached`` when true.
        **kwargs: Accepted and ignored.

    Returns:
        Batches as dicts of ``torch`` tensors containing the prediction
        inputs plus ``"future_values"``.
    """
    training_field_names = [
        "past_time_features",
        "past_values",
        "future_time_features",
        "static_categorical_features",
        "future_values",
    ]

    transformed_data = create_transformation().apply(data, is_train=True)
    if cache_data:
        transformed_data = Cached(transformed_data)

    splitter = create_instance_splitter(prediction_length, context_length, "train")

    # Cycle through the transformed series forever; the splitter samples
    # windows from that stream.
    training_instances = splitter.apply(Cyclic(transformed_data).stream())

    return as_stacked_batches(
        training_instances,
        batch_size=batch_size,
        shuffle_buffer_length=shuffle_buffer_length,
        field_names=training_field_names,
        output_type=torch.tensor,
        num_batches_per_epoch=num_batches_per_epoch,
    )

In [37]:
def create_backtest_dataloader(
    prediction_length: int,
    context_length: int,
    data,
    batch_size: int,
    **kwargs,
):
    """Build an iterable of stacked batches for back-testing.

    Windows are cut with the ``"validation"`` instance splitter, which is
    applied in training mode (``is_train=True``).

    Args:
        prediction_length: Length of the future part of each window.
        context_length: Length of the past part of each window.
        data: Dataset to draw windows from.
        batch_size: Number of windows per batch.
        **kwargs: Accepted and ignored.

    Returns:
        Batches as dicts of ``torch`` tensors with the prediction inputs only.
    """
    prediction_field_names = [
        "past_time_features",
        "past_values",
        "future_time_features",
        "static_categorical_features",
    ]

    transformed_data = create_transformation().apply(data)

    # Validation splitter, deliberately run in train mode.
    splitter = create_instance_splitter(prediction_length, context_length, "validation")
    backtest_instances = splitter.apply(transformed_data, is_train=True)

    return as_stacked_batches(
        backtest_instances,
        batch_size=batch_size,
        output_type=torch.tensor,
        field_names=prediction_field_names,
    )

In [38]:
def create_test_dataloader(
    prediction_length: int,
    context_length: int,
    data,
    batch_size: int,
    **kwargs,
):
    """Build an iterable of stacked batches for inference.

    Both the transformation chain and the ``"test"`` instance splitter are
    applied in test mode (``is_train=False``).

    Args:
        prediction_length: Length of the future part of each window.
        context_length: Length of the past part of each window.
        data: Dataset to draw windows from.
        batch_size: Number of windows per batch.
        **kwargs: Accepted and ignored.

    Returns:
        Batches as dicts of ``torch`` tensors with the prediction inputs only.
    """
    prediction_field_names = [
        "past_time_features",
        "past_values",
        "future_time_features",
        "static_categorical_features",
    ]

    transformed_data = create_transformation().apply(data, is_train=False)

    # Test splitter, run in test mode.
    splitter = create_instance_splitter(prediction_length, context_length, "test")
    test_instances = splitter.apply(transformed_data, is_train=False)

    return as_stacked_batches(
        test_instances,
        batch_size=batch_size,
        output_type=torch.tensor,
        field_names=prediction_field_names,
    )

In [39]:
train_dataloader = create_train_dataloader(
    prediction_length=prediction_length,
    context_length=seq_len,
    data=train_dataset,
    batch_size=256,
    num_batches_per_epoch=100,
)

test_dataloader = create_backtest_dataloader(
    prediction_length=prediction_length,
    context_length=seq_len,
    data=test_data_input_dataset,
    batch_size=64,
)

In [40]:
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.shape, v.type())
print(len(list(iter(train_dataloader))))

past_time_features torch.Size([256, 55, 3]) torch.FloatTensor
past_values torch.Size([256, 55]) torch.FloatTensor
future_time_features torch.Size([256, 1, 3]) torch.FloatTensor
static_categorical_features torch.Size([256]) torch.LongTensor
future_values torch.Size([256, 1]) torch.FloatTensor
100


In [41]:
test_sample_batch = next(iter(test_dataloader))
for k, v in test_sample_batch.items():
    print(k, v.shape, v.type())

past_time_features torch.Size([64, 55, 3]) torch.FloatTensor
past_values torch.Size([64, 55]) torch.FloatTensor
future_time_features torch.Size([64, 1, 3]) torch.FloatTensor
static_categorical_features torch.Size([64]) torch.LongTensor


## Hyperparameters

In [42]:
num_classes = len(df_grouped)
embedding_dim = 4
d_model = 32
nhead = 4
d_hid = 32
nlayers = 2
dropout = 0.1

## Create model

In [43]:
model = TransformerModel(
    num_input_features,
    num_classes,
    embedding_dim,
    d_model,
    nhead,
    d_hid,
    nlayers,
    dropout,
    NoopModelLogger(),
)

In [44]:
criterion = nn.L1Loss()
validation_criterion = nn.L1Loss(reduction="none")

## Sanity check: feed the model two identical samples and confirm its output is deterministic

In [45]:
# Build a tiny throw-away model and feed it the same window twice. In eval
# mode dropout is disabled, so both rows of the batch must yield the same
# prediction.
sample_model = TransformerModel(
    num_input_features,
    num_classes,
    4,
    4,
    2,
    4,
    2,
    dropout,
    BasicModelLogger("test_model_with_2_identical_samples"),
)
sample_model.eval()
sample_input_features = training_sample_batch[0][0]
sample_input_features = sample_input_features.expand(2, -1, -1)
sample_input_item_id = training_sample_batch[1][0]
sample_input_item_id = sample_input_item_id.expand(2)
print(sample_input_features.size(), sample_input_item_id.size(), sample_input_item_id)
sample_output = sample_model(sample_input_features, sample_input_item_id)
sample_targets = training_sample_batch[2][0]
sample_targets = sample_targets.expand(2, -1)
# Only the target at the last time step of each window is scored.
sample_actual_targets = sample_targets[:, -1]
print(sample_output.size(), sample_output, sample_targets.size(), sample_actual_targets.size(), sample_actual_targets)
# Check the property this cell exists for, rather than relying on eyeballing the print.
assert torch.allclose(sample_output[0], sample_output[1]), "identical inputs produced different outputs"
sample_loss = criterion(sample_output, sample_actual_targets)
print(sample_loss)
# Drop every temporary so that nothing from this check leaks into later cells.
del sample_model, sample_input_features, sample_input_item_id, sample_output, sample_targets, sample_actual_targets, sample_loss

torch.Size([2, 55, 3]) torch.Size([2]) tensor([0, 0])
test_model_with_2_identical_samples - torch.Size([2, 55, 7])
test_model_with_2_identical_samples - input_ff - torch.Size([2, 55, 4])
test_model_with_2_identical_samples - src_mask - torch.Size([55, 55])
test_model_with_2_identical_samples - encoder - torch.Size([2, 55, 4])
test_model_with_2_identical_samples - final_linear - torch.Size([2, 55, 1])
test_model_with_2_identical_samples - output - torch.Size([2])
torch.Size([2]) tensor([-0.0099, -0.0099], grad_fn=<SqueezeBackward1>) torch.Size([2, 55]) torch.Size([2]) tensor([9.7597, 9.7597], dtype=torch.float64)
tensor(9.7696, dtype=torch.float64, grad_fn=<MeanBackward0>)


## Training

In [46]:
model = model.to(device)

In [47]:
optimizer = AdamW(model.parameters(), lr=6e-4, betas=(0.9, 0.95), weight_decay=1e-1)

In [48]:
model, optimizer = accelerator.prepare(model, optimizer)
training_dataloader, validation_dataloader = accelerator.prepare(training_dataloader, validation_dataloader)

In [49]:
num_epochs = 2

In [50]:
exp_training_num_batches = math.ceil(len(training_sampler) / training_batch_size)
exp_validation_num_batches = math.ceil(len(validation_sampler) / validation_batch_size)
print(exp_training_num_batches, exp_validation_num_batches)

100 861


In [51]:
checkpoint_dir_path = "transformer_encoder_checkpoints"
checkpoint_prefix = "20240313_test"

In [52]:
def train_epoch(model, training_dataloader, epoch, criterion):
    """Run one optimisation pass over ``training_dataloader``.

    Uses the notebook-level ``optimizer`` and ``accelerator``. Each batch is
    indexed as ``(features, item_id, targets)`` and only the target at the
    last time step of every window is scored.

    Args:
        model: Callable taking ``(features, item_id)``.
        training_dataloader: Iterable of batches.
        epoch: Epoch number, used only for the progress-bar label.
        criterion: Loss returning a scalar tensor.

    Returns:
        Mean of the per-batch losses, or ``np.nan`` when no batch was seen.
    """
    progress = tqdm(
        enumerate(training_dataloader),
        desc=f"Training epoch {epoch}",
        total=exp_training_num_batches,
    )

    loss_sum = 0.0
    num_batches = 0
    for _, batch in progress:
        features, item_id = batch[0], batch[1]
        # Score only the last step of each target window.
        actual_targets = batch[2][:, -1]

        optimizer.zero_grad()
        loss = criterion(model(features, item_id), actual_targets)
        accelerator.backward(loss)
        optimizer.step()

        loss_sum += loss.item()
        num_batches += 1

    if num_batches == 0:
        return np.nan
    return loss_sum / num_batches


def validation_epoch(model, validation_dataloader, epoch, criterion):
    """Evaluate ``model`` on ``validation_dataloader`` without gradients.

    Each batch is indexed as ``(features, item_id, targets)`` and only the
    target at the last time step of every window is scored.

    Args:
        model: Callable taking ``(features, item_id)``.
        validation_dataloader: Iterable of batches.
        epoch: Epoch number, used only for the progress-bar label.
        criterion: Loss function. With ``reduction="none"`` every sample
            contributes one entry to the returned array; a scalar-reducing
            criterion contributes one entry per batch.

    Returns:
        ``(validation_avg_loss, validation_losses)``: the mean of all
        collected losses and the 1-d array of those losses. For an empty
        dataloader the result is ``(np.nan, empty array)``.
    """
    validation_iter = enumerate(validation_dataloader)
    validation_iter = tqdm(validation_iter, desc=f"Validation epoch {epoch}", total=exp_validation_num_batches)

    # Collect per-batch arrays and join them once at the end; concatenating
    # onto a growing array inside the loop would copy it on every batch.
    batch_losses = []
    with torch.no_grad():
        for idx, batch in validation_iter:
            features, item_id, targets = batch[0], batch[1], batch[2]
            actual_targets = targets[:, -1]
            output = model(features, item_id)
            loss = criterion(output, actual_targets)
            # atleast_1d lets scalar (already reduced) losses be concatenated too.
            batch_losses.append(np.atleast_1d(loss.cpu().numpy()))

    if not batch_losses:
        return np.nan, np.empty(0)

    validation_losses = np.concatenate(batch_losses)
    validation_avg_loss = np.mean(validation_losses)
    return validation_avg_loss, validation_losses


# torch.save does not create missing directories, so make sure the checkpoint
# directory exists before the first epoch finishes.
os.makedirs(checkpoint_dir_path, exist_ok=True)

for epoch in range(num_epochs):
    model.train()
    avg_training_loss = train_epoch(model, training_dataloader, epoch, criterion)

    model.eval()
    validation_avg_loss, validation_losses = validation_epoch(model, validation_dataloader, epoch, validation_criterion)

    # One checkpoint per epoch, holding model and optimizer state plus the
    # epoch's losses.
    # https://pytorch.org/tutorials/beginner/saving_loading_models.html
    checkpoint_dict = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'avg_training_loss': avg_training_loss,
        'validation_avg_loss': validation_avg_loss,
    }
    checkpoint_path = path.join(checkpoint_dir_path, f"{checkpoint_prefix}_epoch{epoch}.pt")
    torch.save(checkpoint_dict, checkpoint_path)

    print(f"Epoch {epoch} - training loss: {avg_training_loss}, validation loss: {validation_avg_loss} / {validation_losses.shape}, checkpoint_path: {checkpoint_path}")

Training epoch 0: 100%|███████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.36it/s]
Validation epoch 0: 100%|█████████████████████████████████████████████████████████████| 861/861 [00:51<00:00, 16.64it/s]


Epoch 0 - training loss: 4.463226436177743, validation loss: 5.7897948037601505 / (220200,), checkpoint_path: transformer_encoder_checkpoints/20240313_test_epoch0.pt


Training epoch 1: 100%|███████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.27it/s]
Validation epoch 1: 100%|█████████████████████████████████████████████████████████████| 861/861 [00:58<00:00, 14.80it/s]

Epoch 1 - training loss: 4.461385493080297, validation loss: 5.790021116168703 / (220200,), checkpoint_path: transformer_encoder_checkpoints/20240313_test_epoch1.pt



