# Transformer encoder

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
from os import path
import random

import numpy as np
import pandas as pd

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.optim import AdamW

from accelerate import Accelerator

import lightning as L
from lightning.pytorch import seed_everything
from lightning.pytorch.callbacks import RichProgressBar
from lightning.pytorch import loggers as pl_loggers

from gluonts.dataset.pandas import PandasDataset
from gluonts.dataset.split import split, InputDataset, LabelDataset
from gluonts.time_feature import (
    time_features_from_frequency_str,
    TimeFeature,
    get_lags_for_frequency,
)
from gluonts.dataset.field_names import FieldName
from gluonts.transform import (
    AddAgeFeature,
    AddObservedValuesIndicator,
    AddTimeFeatures,
    AsNumpyArray,
    Chain,
    ExpectedNumInstanceSampler,
    InstanceSplitter,
    RemoveFields,
    SelectFields,
    SetField,
    TestSplitSampler,
    Transformation,
    ValidationSplitSampler,
    VstackFeatures,
    RenameFields,
)
from gluonts.transform.sampler import InstanceSampler
from gluonts.itertools import Cyclic, Cached
from gluonts.dataset.loader import as_stacked_batches

from tqdm import tqdm

from typing import Optional, Iterable, Sized, Iterator

In [3]:
# https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
# Fix the global seed (42) so repeated runs are comparable. `workers=True`
# presumably extends the seeding to dataloader workers - see the link above.
seed_everything(42, workers=True)

Seed set to 42


42

In [4]:
class ModelLogger:
    """Interface of the debug loggers handed to the model.

    Both hooks are no-ops here; subclasses override whichever they need.
    """

    def log(self, msg):
        """Accept a message and discard it."""
        return None

    def reset(self):
        """Clear logger state; the base class has none."""
        return None

class BasicModelLogger(ModelLogger):
    """Logger that prints every message to stdout behind a fixed prefix."""

    def __init__(self, msg_prefix):
        # Text emitted in front of every logged message.
        self.msg_prefix = msg_prefix

    def log(self, msg):
        """Print ``msg`` in the form ``<prefix> - <msg>``."""
        print(f"{self.msg_prefix} - {msg}")

    def reset(self):
        """Nothing to reset: this logger keeps no per-call state."""
        return None

class NoopModelLogger(ModelLogger):
    """Logger that silently drops everything; used when shape tracing is unwanted."""

    def log(self, msg):
        """Ignore ``msg``."""
        return None

    def reset(self):
        """Nothing to reset."""
        return None

# Model

In [5]:
class TransformerModel(nn.Module):
    """Transformer encoder that regresses one scalar per input window.

    Each time step's feature vector is concatenated with a learned embedding
    of the series id, projected to ``d_model``, run through a stack of
    ``TransformerEncoderLayer`` blocks, and mapped to one value per step.
    Only the value at the last time step is returned.

    Args:
        num_input_features: Width of the per-step feature vector in ``src``.
        num_classes: Number of distinct item ids (embedding table size).
        embedding_dim: Size of the item-id embedding.
        d_model: Width of the encoder input/output.
        nhead: Number of attention heads.
        d_hid: Third positional argument of ``TransformerEncoderLayer``
            (its feed-forward width).
        nlayers: Number of encoder layers.
        dropout: Dropout probability used inside the encoder layers.
        logger: Object exposing ``log(msg)``; receives intermediate tensor
            sizes during ``forward``.
    """

    def __init__(
        self,
        num_input_features: int,
        num_classes: int,
        embedding_dim: int,
        d_model: int,
        nhead: int,
        d_hid: int,
        nlayers: int,
        dropout: float,
        # Quoted so defining this class does not require ModelLogger to exist yet.
        logger: "ModelLogger",
    ):
        super().__init__()
        self.model_type = 'Transformer'
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.input_ff = nn.Linear(num_input_features + embedding_dim, d_model)
        # Not applied in forward(); kept so the module layout is unchanged.
        self.input_ff_sigmoid = nn.Sigmoid()
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.final_linear = nn.Linear(d_model, 1)
        self.logger = logger

    def forward(
        self,
        src: Tensor,
        item_ids: Tensor,
        src_mask: Optional[Tensor] = None,
    ) -> Tensor:
        """Predict one value per sample from its feature window.

        Args:
            src: Features of shape ``[batch_size, seq_len, num_input_features]``.
            item_ids: Integer ids of shape ``[batch_size]``, one per sample.
            src_mask: Optional attention mask of shape ``[seq_len, seq_len]``.
                When omitted, a square causal mask is generated on the same
                device as ``src``.

        Returns:
            Tensor of shape ``[batch_size]``: the output at the last time step.
        """
        seq_len = src.size(dim=1)

        embedded = self.embedding(item_ids)
        # Repeat the per-sample embedding along the time axis.
        # idea from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py#L1290
        embedded = embedded.unsqueeze(dim=1).expand(-1, seq_len, -1)

        output = torch.cat((src, embedded), dim=-1)
        self.logger.log(f"{output.size()}")

        output = self.input_ff(output)
        self.logger.log(f"input_ff - {output.size()}")

        if src_mask is None:
            # Square causal mask: masked positions are float('-inf'), unmasked
            # positions are 0.0. It must live on the input's device, otherwise
            # the encoder fails as soon as the model runs on an accelerator.
            src_mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=src.device)
        self.logger.log(f"src_mask - {src_mask.size()}")

        output = self.transformer_encoder(output, src_mask)
        self.logger.log(f"encoder - {output.size()}")

        output = self.final_linear(output)
        self.logger.log(f"final_linear - {output.size()}")

        # Keep only the last time step, then drop the singleton output axis.
        output = output[:, -1, :]
        output = output.squeeze(dim=1)
        self.logger.log(f"output - {output.size()}")

        return output

In [6]:
# https://stackoverflow.com/questions/49433936/how-do-i-initialize-weights-in-pytorch
# https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.xavier_uniform_
def init_weights(m):
    """Initialise one module in place; meant for ``nn.Module.apply``.

    ``nn.Linear`` modules (including subclasses) get Xavier-uniform weights
    and a constant bias of 0.01. Every other module is left untouched.

    Args:
        m: The module visited by ``apply``.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers created with bias=False expose `bias` as None.
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0.01)

## Load data

In [7]:
# Load the training CSV. Three columns are read as float32 explicitly; all
# other columns keep whatever dtype pandas infers.
df = pd.read_csv(
    "../optiver-trading-at-the-close/train.csv",
    dtype={
        "imbalance_size": np.float32,
        "reference_price": np.float32,
        "matched_size": np.float32,
    },
)
# Second name bound to the same DataFrame object (no copy is made).
raw_df = df

In [8]:
# Rebind `df` to the frame loaded above - presumably so the processing cells
# below can be re-run without reading the CSV again.
df = raw_df

In [9]:
# Display the frame as the cell's output.
df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3.180603e+06,1,0.999812,13380277.00,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,1.666039e+05,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,3.028799e+05,-1,0.999561,1819368.00,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,1.191768e+07,-1,1.000171,18389746.00,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,4.475500e+05,-1,0.999532,17860614.00,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


## Data pre-processing and features

In [10]:
# Columns used as the per-time-step input features of the model.
feat_dynamic_real = ["imbalance_size", "reference_price", "matched_size"]
# feat_dynamic_real = []
# Width of the feature vector; later passed to TransformerModel.
num_input_features = len(feat_dynamic_real)
print(num_input_features)

3


## Process NA values

In [11]:
# Row mask: True where at least one of the feature columns is NaN.
any_na_values_mask = df[feat_dynamic_real].isna().any(axis=1)
# Prints the mask's shape and the shape of its True subset (rows with a missing feature).
print(any_na_values_mask.shape, any_na_values_mask[any_na_values_mask].shape)

(5237980,) (220,)


In [12]:
def process_na(df, any_na_values_mask, method):
    """Return a frame with missing values handled according to ``method``.

    Args:
        df: Input DataFrame; it is not modified.
        any_na_values_mask: Boolean row mask aligned with ``df``; True marks
            rows to discard when rows are dropped.
        method: ``"fill_as_0"`` replaces every NaN in ``df`` with 0.0. Any
            other value drops the rows flagged by ``any_na_values_mask``.

    Returns:
        A new DataFrame produced by the selected strategy.
    """
    # Filling ignores the mask entirely: fillna covers every column.
    if method == "fill_as_0":
        return df.fillna(0.0)
    # Fallback for every other method name: keep only the unflagged rows.
    rows_to_keep = ~any_na_values_mask
    return df[rows_to_keep]

In [13]:
# Fill missing values with 0.0 instead of dropping the affected rows.
# NOTE(review): in this mode process_na calls fillna on the whole frame, so
# NaNs in every column are replaced, not only those in the feature columns.
df = process_na(df, any_na_values_mask, "fill_as_0")

In [14]:
# Recompute the mask on the processed frame; the second shape printed should
# now be empty if no feature value is missing any more.
any_na_values_mask = df[feat_dynamic_real].isna().any(axis=1)
print(any_na_values_mask.shape, any_na_values_mask[any_na_values_mask].shape)

(5237980,) (0,)


## Group by stock_id

In [15]:
# One group per stock. The number of groups is later passed to
# TransformerModel as `num_classes`.
df_grouped = df.groupby("stock_id")
num_classes = len(df_grouped)
print(num_classes)
# Row count per stock.
print(df_grouped.size())

200
stock_id
0      26455
1      26455
2      26455
3      26455
4      26455
       ...  
195    26455
196    26455
197    26455
198    26455
199    21615
Length: 200, dtype: int64


## Normalize features per stock

In [16]:
# Show stock 0's features before normalization.
display(df_grouped.get_group(0)[feat_dynamic_real].head())
# Standardize each feature column within its own stock group (subtract the
# group mean, divide by the group std) and write the result back into `df`.
# NOTE(review): the statistics use every row of the group, including the
# dates that are later held out for validation - confirm this look-ahead is
# acceptable.
# NOTE(review): a group whose feature has zero std would produce NaN here.
df[feat_dynamic_real] = df_grouped[feat_dynamic_real].transform(lambda x: (x - x.mean()) / x.std())
# Show the same rows after normalization.
display(df_grouped.get_group(0)[feat_dynamic_real].head())

Unnamed: 0,imbalance_size,reference_price,matched_size
0,3180602.75,0.999812,13380277.0
191,1299772.75,1.000026,15261107.0
382,1299772.75,0.999919,15261107.0
573,1299772.75,1.000133,15261107.0
764,1218204.375,1.000455,15342675.0


Unnamed: 0,imbalance_size,reference_price,matched_size
0,-0.123787,-0.013458,-0.587909
191,-0.349814,0.114021,-0.498553
382,-0.349814,0.050282,-0.498553
573,-0.349814,0.177796,-0.498553
764,-0.359617,0.369619,-0.494678


## Data hyperparameters

In [17]:
# Split threshold: rows with date_id below this value are used for training,
# the remaining rows for validation.
# NOTE(review): 480 looks like the last date_id in the data - confirm.
training_set_max_date_id = 480 - 20

In [18]:
# NOTE(review): prediction_length is not referenced by any visible cell.
prediction_length = 1
# Number of consecutive rows in each input window.
seq_len = 55

# Samples per batch for the training / validation dataloaders.
training_batch_size = 256
validation_batch_size = 256

## Prepare PyTorch datasets

In [19]:
class StockTrainingDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over the rows of one stock.

    Item ``i`` covers rows ``i .. i + context_length - 1`` of ``stock_df``.

    Args:
        stock_df: DataFrame holding the rows of a single stock, in order.
        feature_names: Columns returned as the feature window.
        target_col: Column returned as the target window.
        item_id: Identifier returned unchanged with every item.
        context_length: Number of rows per window.
    """

    def __init__(self, stock_df, feature_names, target_col, item_id, context_length):
        super().__init__()
        # Materialise the arrays once: slicing an ndarray per item is much
        # cheaper than going through DataFrame.iloc on every access.
        self.features = stock_df[feature_names].to_numpy()
        self.targets = stock_df[target_col].to_numpy()
        self.item_id = item_id
        self.context_length = context_length
        # Clamp at zero so a frame shorter than one window gives an empty
        # dataset instead of a negative length.
        self.total_size = max(0, self.features.shape[0] - context_length + 1)

    def __len__(self):
        """Return the number of complete windows."""
        return self.total_size

    def __getitem__(self, idx):
        """Return ``(features, item_id, targets)`` for window ``idx``.

        Returns:
            ``features`` with shape ``[context_length, len(feature_names)]``,
            the stored ``item_id``, and ``targets`` with shape
            ``[context_length]``.

        Raises:
            IndexError: If ``idx`` does not address a complete window.
        """
        start_idx = idx + self.total_size if idx < 0 else idx
        # Without this check, slicing would silently return a short or empty
        # window, and plain iteration over the dataset would never terminate.
        if not 0 <= start_idx < self.total_size:
            raise IndexError(
                f"index {idx} is out of range for dataset of size {self.total_size}"
            )
        end_idx = start_idx + self.context_length
        return self.features[start_idx:end_idx], self.item_id, self.targets[start_idx:end_idx]

In [20]:
class CustomLengthSequentialSampler(torch.utils.data.sampler.Sampler[int]):
    """Sequential sampler that stops after at most ``max_samples`` indices.

    Yields ``0, 1, 2, ...`` in order, capped at whichever is smaller: the size
    of ``data_source`` or ``max_samples``.
    """

    data_source: Sized

    def __init__(self, data_source: Sized, max_samples: int) -> None:
        available = len(data_source)
        self.data_source = data_source
        # The cap never exceeds what the source can provide.
        self.num_samples = max_samples if max_samples < available else available

    def __iter__(self) -> Iterator[int]:
        """Return a fresh iterator over the first ``num_samples`` indices."""
        indices = range(self.num_samples)
        return iter(indices)

    def __len__(self) -> int:
        """Return how many indices one pass yields."""
        return self.num_samples

In [21]:
# Build one sliding-window dataset per stock for each split.
stock_training_datasets = []
stock_validation_datasets = []
for item_id, gdf in df_grouped:
    # Split by date: rows before the cutoff train, the rest validate.
    training_mask = gdf["date_id"] < training_set_max_date_id
    training_df = gdf[training_mask]
    validation_df = gdf[~training_mask]
    # Both splits must be non-empty and together account for every row.
    assert training_df.shape[0] > 0 \
        and validation_df.shape[0] > 0 \
        and training_df.shape[0] + validation_df.shape[0] == gdf.shape[0], f"{item_id} invalid shape"
    # The group key (stock_id) is the item id returned with every sample.
    stock_training_datasets.append(StockTrainingDataset(training_df, feat_dynamic_real, "target", item_id, seq_len))
    stock_validation_datasets.append(StockTrainingDataset(validation_df, feat_dynamic_real, "target", item_id, seq_len))
print(len(stock_training_datasets), len(stock_validation_datasets))

200 200


In [22]:
# Chain the per-stock datasets into one dataset per split.
full_training_dataset = torch.utils.data.ConcatDataset(stock_training_datasets)
full_validation_dataset = torch.utils.data.ConcatDataset(stock_validation_datasets)
# Total number of windows in each split.
print(len(full_training_dataset), len(full_validation_dataset))

4996180 220200


In [23]:
# TODO: change to RandomSampler
# Both splits are currently read in dataset order, i.e. one stock after
# another, so a training batch is not a random mix of stocks.
training_sampler = torch.utils.data.SequentialSampler(full_training_dataset)
validation_sampler = torch.utils.data.SequentialSampler(full_validation_dataset)

In [24]:
# Batch loaders for the two splits; both use 4 worker processes and pinned memory.
training_dataloader = torch.utils.data.DataLoader(
    full_training_dataset,
    batch_size=training_batch_size,
    sampler=training_sampler,
    # https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading
    num_workers=4,
    # https://pytorch.org/docs/stable/data.html#memory-pinning
    pin_memory=True,
)
validation_dataloader = torch.utils.data.DataLoader(
    full_validation_dataset,
    batch_size=validation_batch_size,
    sampler=validation_sampler,
    num_workers=4,
    pin_memory=True,
)

In [25]:
# Pull one batch from each loader and print the sizes of its three parts
# (features, item ids, targets) as a shape check.
training_sample_batch = next(iter(training_dataloader))
print(training_sample_batch[0].size(), training_sample_batch[1].size(), training_sample_batch[2].size())
validation_sample_batch = next(iter(validation_dataloader))
print(validation_sample_batch[0].size(), validation_sample_batch[1].size(), validation_sample_batch[2].size())

torch.Size([256, 55, 3]) torch.Size([256]) torch.Size([256, 55])
torch.Size([256, 55, 3]) torch.Size([256]) torch.Size([256, 55])


## Hyperparameters

In [26]:
# Size of the learned per-stock embedding vector.
embedding_dim = 4
# Width of the encoder's input and output.
d_model = 32
# Number of attention heads.
nhead = 4
# Passed as the third positional argument of TransformerEncoderLayer.
d_hid = 32
# Number of stacked encoder layers.
nlayers = 2
# Dropout probability inside the encoder layers.
dropout = 0.1

## Create model

In [27]:
# Arguments are positional, in the order of TransformerModel.__init__.
model = TransformerModel(
    num_input_features,
    num_classes,
    embedding_dim,
    d_model,
    nhead,
    d_hid,
    nlayers,
    dropout,
    # Silent logger: no shape tracing for the real model.
    NoopModelLogger(),
)
# Run init_weights over the model's modules.
model.apply(init_weights)

TransformerModel(
  (embedding): Embedding(200, 4)
  (input_ff): Linear(in_features=7, out_features=32, bias=True)
  (input_ff_sigmoid): Sigmoid()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=32, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=32, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (final_linear): Linear(in_features=32, out_features=1, bias=True)
)

In [28]:
# L1 loss for both training and validation, kept as separate instances.
criterion = nn.L1Loss()
validation_criterion = nn.L1Loss()

## Sanity check: two identical samples must give identical outputs (the model is deterministic in eval mode)

In [29]:
# Tiny throwaway model with a printing logger so every intermediate tensor
# size shows up in the cell output.
sample_model = TransformerModel(
    num_input_features=num_input_features,
    num_classes=num_classes,
    embedding_dim=4,
    d_model=4,
    nhead=2,
    d_hid=4,
    nlayers=2,
    dropout=dropout,
    logger=BasicModelLogger("test_model_with_2_identical_samples"),
)
sample_model.apply(init_weights)
# Eval mode turns dropout off, so the forward pass is deterministic.
sample_model.eval()
# Duplicate the first training sample into a batch of two identical rows.
sample_input_features = training_sample_batch[0][0]
sample_input_features = sample_input_features.expand(2, -1, -1)
sample_input_item_id = training_sample_batch[1][0]
sample_input_item_id = sample_input_item_id.expand(2)
print(sample_input_features.size(), sample_input_item_id.size(), sample_input_item_id)
sample_output = sample_model(sample_input_features, sample_input_item_id)
sample_targets = training_sample_batch[2][0]
sample_targets = sample_targets.expand(2, -1)
# Only the target at the last time step is compared with the prediction.
sample_actual_targets = sample_targets[:, -1]
print(sample_output.size(), sample_output, sample_targets.size(), sample_actual_targets.size(), sample_actual_targets)
# The actual check this cell is named after: identical inputs must give
# identical predictions. Printing alone would let a regression go unnoticed.
assert torch.allclose(sample_output[0], sample_output[1]), "identical samples produced different outputs"
sample_loss = criterion(sample_output, sample_actual_targets)
print(sample_loss)
# Drop every temporary created above so nothing leaks into later cells.
del sample_model, sample_input_features, sample_input_item_id, sample_output, sample_targets, sample_actual_targets, sample_loss

torch.Size([2, 55, 3]) torch.Size([2]) tensor([0, 0])
test_model_with_2_identical_samples - torch.Size([2, 55, 7])
test_model_with_2_identical_samples - input_ff - torch.Size([2, 55, 4])
test_model_with_2_identical_samples - src_mask - torch.Size([55, 55])
test_model_with_2_identical_samples - encoder - torch.Size([2, 55, 4])
test_model_with_2_identical_samples - final_linear - torch.Size([2, 55, 1])
test_model_with_2_identical_samples - output - torch.Size([2])
torch.Size([2]) tensor([-1.7799, -1.7799], grad_fn=<SqueezeBackward1>) torch.Size([2, 55]) torch.Size([2]) tensor([9.7597, 9.7597], dtype=torch.float64)
tensor(11.5395, dtype=torch.float64, grad_fn=<MeanBackward0>)


## Tensorboard

In [30]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/

## Module and trainer (Lightning)

In [31]:
class TransformerModelModule(L.LightningModule):
    """Lightning wrapper that trains a model on last-time-step targets.

    A batch is indexed as ``(features, item_id, targets)``. The wrapped model
    is called with ``(features, item_id)`` and its output is compared with
    ``targets[:, -1]``.

    Args:
        model: Module called as ``model(features, item_id)``.
        criterion: Loss used in ``training_step``.
        validation_criterion: Loss used during validation.
        lr: AdamW learning rate.
        betas: AdamW beta coefficients.
        weight_decay: AdamW weight decay.
    """

    def __init__(
        self,
        model,
        criterion,
        validation_criterion,
        lr=6e-4,
        betas=(0.9, 0.95),
        weight_decay=1e-1,
    ):
        super().__init__()
        self.model = model
        self.criterion = criterion
        self.validation_criterion = validation_criterion
        # Optimizer settings; the defaults are the values that used to be
        # hard-coded in configure_optimizers.
        self.lr = lr
        self.betas = betas
        self.weight_decay = weight_decay
        # Per-step buffers used to recompute the epoch loss by hand.
        self.validation_step_outputs = []
        self.validation_step_actual_targets = []

    def _predict_last_step(self, batch):
        """Run the model on a batch; return ``(output, last-step targets)``."""
        features, item_id, targets = batch[0], batch[1], batch[2]
        actual_targets = targets[:, -1]
        output = self.model(features, item_id)
        return output, actual_targets

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        output, actual_targets = self._predict_last_step(batch)
        loss = self.criterion(output, actual_targets)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and buffer predictions and targets."""
        output, actual_targets = self._predict_last_step(batch)
        loss = self.validation_criterion(output, actual_targets)
        # lightning will take weighted-average on loss per step based on batch size;
        # "val_loss_manual" (logged at epoch end) is the cross-check for that.
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        # detach() so the buffers can never keep an autograd graph alive.
        self.validation_step_outputs.append(output.detach())
        self.validation_step_actual_targets.append(actual_targets.detach())

    def on_validation_epoch_end(self):
        """Recompute the epoch loss over all buffered batches and log it."""
        # torch.cat raises on an empty list, so skip epochs that ran no step.
        if not self.validation_step_outputs:
            return
        # cat is used instead of stack, last step may have different batch size
        all_preds = torch.cat(self.validation_step_outputs)
        all_actual_targets = torch.cat(self.validation_step_actual_targets)
        manual_loss = self.validation_criterion(all_preds, all_actual_targets)
        self.log("val_loss_manual", manual_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.validation_step_outputs.clear()  # free memory
        self.validation_step_actual_targets.clear()  # free memory

    def configure_optimizers(self):
        """Return AdamW over the wrapped model's parameters."""
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.lr,
            betas=self.betas,
            weight_decay=self.weight_decay,
        )
        return [optimizer]

In [32]:
# Wrap the bare model in its LightningModule. The guard keeps this cell
# idempotent: re-running it would otherwise wrap the already-wrapped module
# again, nesting one LightningModule inside another.
if not isinstance(model, TransformerModelModule):
    model = TransformerModelModule(model, criterion, validation_criterion)

In [33]:
# Values passed to the Trainer below.
num_epochs = 1
# 1.0 as a float means every training batch is used each epoch; the
# commented alternative caps the epoch at 100 batches for quick runs.
limit_train_batches = 1.0
# limit_train_batches = 100
gradient_clip_val = 0.5

In [34]:
# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.tensorboard.html
# https://lightning.ai/docs/pytorch/stable/extensions/logging.html
# TODO: custom version name
# Logger rooted at the current directory. NOTE(review): the TensorBoard cell
# reads from lightning_logs/ - confirm that is where this logger writes.
tb_logger = pl_loggers.TensorBoardLogger(".", version=None)

In [35]:
# Trainer configured from the values defined in the cells above.
trainer = L.Trainer(
    max_epochs=num_epochs,
    limit_train_batches=limit_train_batches,
    # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html#gradient-clipping
    gradient_clip_val=gradient_clip_val,
    callbacks=[
        # https://lightning.ai/docs/pytorch/stable/common/progress_bar.html#richprogressbar
        RichProgressBar(leave=True)
    ],
    logger=tb_logger,
    # https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
    deterministic=True,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


In [36]:
# Baseline: validation loss of the freshly initialised model, before any training.
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

Output()

[{'val_loss': 5.980157116437966, 'val_loss_manual': 5.980157116437957}]

## Training

In [37]:
# Train for the configured number of epochs, validating on the held-out split.
trainer.fit(
    model=model,
    train_dataloaders=training_dataloader,
    val_dataloaders=validation_dataloader,
)

Output()

`Trainer.fit` stopped: `max_epochs=1` reached.


In [38]:
# Validation loss after training, for comparison with the pre-training baseline.
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

Output()

[{'val_loss': 5.789177744009405, 'val_loss_manual': 5.789177744009411}]