# Transformer encoder

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# imported source files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.optim import AdamW

import lightning as L
from lightning.pytorch import seed_everything
from lightning.pytorch.callbacks import RichProgressBar
from lightning.pytorch import loggers as pl_loggers

In [3]:
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, CompositeGroupedDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import (
    BasicFeaturesPreprocessor,
    DupletsTripletsPreprocessor,
    MovingAvgPreProcessor,
    RemoveIrrelevantFeaturesDataPreprocessor,
    DropTargetNADataPreprocessor,
    FarNearPriceFillNaPreprocessor,
    MovingAvgFillNaPreprocessor,
    RemoveRecordsByStockDateIdPreprocessor,
)
from data_preprocessor.stock_feature_engineering import (
    StockNormalizeFeaturesPreprocessor,
)
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor

from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator, TimeSeriesKFoldDataGenerator

In [4]:
# Seed the random number generators once, up front, for reproducible runs.
# workers=True additionally asks Lightning to seed DataLoader worker processes.
# https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
seed_everything(42, workers=True)

[rank: 0] Seed set to 42


42

# Hyperparameters / Setup

In [5]:
# Version name of this run's log folder; None leaves the choice to TensorBoardLogger.
model_folder_version_name = None
# change to a string for a specific name
# model_folder_version_name = "test"

# TensorBoard logger rooted at the current directory; it is handed to the Trainer below.
# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.tensorboard.html
# https://lightning.ai/docs/pytorch/stable/extensions/logging.html
tb_logger = pl_loggers.TensorBoardLogger(".", version=model_folder_version_name)

## Train-validation split (align with LightGBM experiment/setup for fair comparison)

In [6]:
# Number of folds requested from the splitter; the split cell asserts that exactly
# this many train/validation pairs are returned.
N_fold = 5
# NOTE(review): test_set_ratio=0.1 presumably sets the validation share of each fold —
# confirm against TimeSeriesKFoldDataGenerator.
time_series_k_fold_data_generator = TimeSeriesKFoldDataGenerator(n_fold=N_fold, test_set_ratio=0.1)

## Data hyperparameters

In [7]:
# NOTE(review): prediction_length is not referenced by any other cell in this notebook.
prediction_length = 1
# Window length: number of consecutive rows of one stock that form a single sample
# (passed to StockTrainingDataset as context_length).
seq_len = 55

# Batch sizes used by the training and validation DataLoaders.
training_batch_size = 256
validation_batch_size = 256

## Model hyperparameters

In [8]:
# Size of the learned per-stock embedding that is concatenated to every time step.
embedding_dim = 4
# Width of the Transformer encoder; PyTorch requires d_model to be divisible by nhead.
d_model = 32
# Number of attention heads per encoder layer.
nhead = 4
# Hidden size of the feed-forward sub-layer inside each encoder layer.
d_hid = 32
# Number of stacked encoder layers.
nlayers = 2
# Dropout probability used inside the encoder layers.
dropout = 0.1

## Training hyperparameters

In [9]:
# Passed to the Trainer as max_epochs.
num_epochs = 2
# Passed to the Trainer; 1.0 means every training batch is used in each epoch.
limit_train_batches = 1.0
# limit_train_batches = 100
# Learning rate handed to the AdamW optimizer.
learning_rate = 1e-3
# Passed to the Trainer as gradient_clip_val.
gradient_clip_val = 0.5

# Model

TODO:
1. Decide whether to keep the sigmoid after the input projection (`input_ff_sigmoid`).
2. Explore further model enhancements.

In [10]:
class TransformerModel(nn.Module):
    """Causally masked Transformer encoder that regresses one scalar per window.

    Each time step of the input window is concatenated with a learned embedding
    of the window's item id, projected to ``d_model``, squashed by a sigmoid and
    passed through a stack of Transformer encoder layers. A final linear layer
    maps every position to a scalar, and the value at the last position is
    returned as the prediction for the window.
    """

    def __init__(
        self,
        num_input_features: int,
        num_classes: int,
        embedding_dim: int,
        d_model: int,
        nhead: int,
        d_hid: int,
        nlayers: int,
        dropout: float,
    ):
        """Build the layers.

        Args:
            num_input_features: Number of real-valued features per time step.
            num_classes: Number of rows in the item-id embedding table; every
                item id passed to ``forward`` must be smaller than this.
            embedding_dim: Size of the item-id embedding vector.
            d_model: Width of the encoder (must be divisible by ``nhead``).
            nhead: Number of attention heads per encoder layer.
            d_hid: Feed-forward hidden size inside each encoder layer.
            nlayers: Number of stacked encoder layers.
            dropout: Dropout probability inside the encoder layers.
        """
        super().__init__()
        self.model_type = 'Transformer'
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.input_ff = nn.Linear(num_input_features + embedding_dim, d_model)
        self.input_ff_sigmoid = nn.Sigmoid()
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.final_linear = nn.Linear(d_model, 1)

    def forward(
        self,
        src: Tensor,
        item_ids: Tensor,
        src_mask: Tensor = None,
    ) -> Tensor:
        """Predict one value per window.

        Args:
            src: Features of shape ``[batch_size, seq_len, num_input_features]``.
            item_ids: Integer item ids of shape ``[batch_size]``.
            src_mask: Optional ``[seq_len, seq_len]`` attention mask. When it is
                ``None`` a causal (square subsequent) mask is generated.

        Returns:
            Tensor of shape ``[batch_size]`` holding the output at the last
            position of every window.
        """
        seq_len = src.size(dim=1)

        embedded = self.embedding(item_ids)
        # [batch_size, embedding_dim]
        # Repeat the per-window embedding along the time axis.
        # idea from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py#L1290
        embedded = embedded.unsqueeze(dim=1).expand(-1, seq_len, -1)
        # [batch_size, seq_len, embedding_dim]

        output = torch.cat((src, embedded), dim=-1)
        # [batch_size, seq_len, num_input_features + embedding_dim]

        output = self.input_ff(output)
        # [batch_size, seq_len, d_model]

        # TODO: do we need sigmoid?
        output = self.input_ff_sigmoid(output)

        if src_mask is None:
            # Square causal mask: masked positions hold float('-inf'),
            # unmasked positions hold 0.0.
            src_mask = nn.Transformer.generate_square_subsequent_mask(seq_len)
            # `.to(output)` matches the dtype and device of the activations, so no
            # explicit device handling is needed under Lightning.
            # https://pytorch.org/docs/stable/generated/torch.Tensor.to.html#torch.Tensor.to
            src_mask = src_mask.to(output)
            # [seq_len, seq_len]

        output = self.transformer_encoder(output, src_mask)
        # [batch_size, seq_len, d_model]

        output = self.final_linear(output)
        # [batch_size, seq_len, 1]

        # take the "last" prediction, which includes all previous information
        output = output[:, -1, :]
        # [batch_size, 1]
        output = output.squeeze(dim=1)
        # [batch_size]

        return output

In [11]:
# https://stackoverflow.com/questions/49433936/how-do-i-initialize-weights-in-pytorch
# https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.xavier_uniform_
def init_weights(m):
    """Initialise an ``nn.Linear`` module in place; leave other modules untouched.

    Intended to be passed to ``nn.Module.apply``, which calls it on every
    sub-module. Weights get Xavier-uniform values and the bias, when the layer
    has one, is set to 0.01.

    Args:
        m: Any module; only instances of ``nn.Linear`` (including subclasses)
            are modified.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers created with bias=False expose m.bias as None.
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0.01)

## Load data

In [12]:
# Read the competition training data. Every column named below is parsed as
# float32; stock_id is deliberately absent from the mapping so that it keeps an
# integer dtype (int64 / long) and can be used as an embedding index.
df = pd.read_csv(
    "../optiver-trading-at-the-close/train.csv",
    dtype={
        column_name: np.float32
        for column_name in (
            "date_id",
            "seconds_in_bucket",
            "imbalance_size",
            "imbalance_buy_sell_flag",
            "reference_price",
            "matched_size",
            "far_price",
            "near_price",
            "bid_price",
            "bid_size",
            "ask_price",
            "ask_size",
            "wap",
            "target",
            "time_id",
        )
    },
)
# Keep a handle on the untouched frame; the next cell works on a copy of it.
raw_df = df

In [13]:
# Work on a deep copy so that raw_df stays available as an untouched reference
# and the pipeline can be restarted from this cell without re-reading the CSV.
df = raw_df.copy(deep=True)

In [14]:
# Preview the loaded frame before any preprocessing.
df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0.0,0.0,3.180603e+06,1.0,0.999812,13380277.00,,,0.999812,60651.500000,1.000026,8493.030273,1.000000,-3.029704,0.0,0_0_0
1,1,0.0,0.0,1.666039e+05,-1.0,0.999896,1642214.25,,,0.999896,3233.040039,1.000660,20605.089844,1.000000,-5.519986,0.0,0_0_1
2,2,0.0,0.0,3.028799e+05,-1.0,0.999561,1819368.00,,,0.999403,37956.000000,1.000298,18995.000000,1.000000,-8.389950,0.0,0_0_2
3,3,0.0,0.0,1.191768e+07,-1.0,1.000171,18389746.00,,,0.999999,2324.899902,1.000214,479032.406250,1.000000,-4.010201,0.0,0_0_3
4,4,0.0,0.0,4.475500e+05,-1.0,0.999532,17860614.00,,,0.999394,16485.539062,1.000016,434.100006,1.000000,-7.349849,0.0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480.0,540.0,2.440723e+06,-1.0,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,1.000434,319862.406250,1.000328,2.310276,26454.0,480_540_195
5237976,196,480.0,540.0,3.495105e+05,-1.0,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,1.000900,93393.070312,1.000819,-8.220077,26454.0,480_540_196
5237977,197,480.0,540.0,0.000000e+00,0.0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,0.995883,180038.312500,0.995797,1.169443,26454.0,480_540_197
5237978,198,480.0,540.0,1.000899e+06,1.0,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,0.999210,669893.000000,0.999008,-1.540184,26454.0,480_540_198


In [15]:
# Generate every train/validation pair, then keep only the last pair for this run.
(
    k_fold_train_dfs,
    k_fold_val_dfs,
    k_fold_num_train_val_sets,
) = time_series_k_fold_data_generator.generate(df)

k_fold_train_df, k_fold_val_df = k_fold_train_dfs[-1], k_fold_val_dfs[-1]
# The row indices are used later to route each stock's rows into train or validation.
k_fold_train_df_index, k_fold_val_df_index = k_fold_train_df.index, k_fold_val_df.index

# Sanity check: the generator returned exactly N_fold pairs.
print(len(k_fold_train_dfs), len(k_fold_val_dfs), k_fold_num_train_val_sets, N_fold)
assert k_fold_num_train_val_sets == N_fold
assert len(k_fold_train_dfs) == N_fold
assert len(k_fold_val_dfs) == N_fold

# Sanity check: the selected pair covers every row of df.
print(k_fold_train_df.shape, k_fold_val_df.shape, df.shape)
assert len(k_fold_train_df) + len(k_fold_val_df) == len(df)

5 5 5 5
(4714182, 17) (523798, 17) (5237980, 17)


## Data pre-processing and features

In [16]:
# Preprocessing steps applied to the whole frame, in list order.
# Commented-out entries are steps that are currently disabled.
processors = [
    # Drops all rows of the listed (stock_id, date_id) pairs.
    # NOTE(review): the reason these four pairs are excluded is not recorded here.
    RemoveRecordsByStockDateIdPreprocessor([
        {"stock_id": 19, "date_id": 438},
        {"stock_id": 101, "date_id": 328},
        {"stock_id": 131, "date_id": 35},
        {"stock_id": 158, "date_id": 388},
    ]),
    # NOTE(review): presumably fills missing far_price / near_price values — confirm in the class.
    FarNearPriceFillNaPreprocessor(),
    # ReduceMemUsageDataPreprocessor(verbose=True),
    # BasicFeaturesPreprocessor(),
    # DupletsTripletsPreprocessor(),
    # Adds the wap_mov_avg_* columns (see the column listing printed after processing).
    MovingAvgPreProcessor("wap"),
    # NOTE(review): presumably fills missing moving-average values with 1.0 — confirm in the class.
    MovingAvgFillNaPreprocessor("wap", 1.0),
    # StockIdFeaturesPreProcessor(),
    # DropTargetNADataPreprocessor(),
    # RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
    # FillNaPreProcessor(),
    # PolynomialFeaturesPreProcessor(),
]
processor = CompositeDataPreprocessor(processors)

In [17]:
# Run the preprocessing pipeline and rebind df to its result.
# NOTE(review): this cell is not idempotent — re-running it feeds the already
# processed frame back into the pipeline. Re-run from the `raw_df.copy` cell instead.
df = processor.apply(df)

CompositeDataPreprocessor - original df shape: (5237980, 17)
Processing RemoveRecordsByStockDateIdPreprocessor...
RemoveRecordsByStockDateIdPreprocessor - removing 220 records
RemoveRecordsByStockDateIdPreprocessor took 0.55s. New df shape: (5237760, 17).
Processing FarNearPriceFillNaPreprocessor...
FarNearPriceFillNaPreprocessor took 0.06s. New df shape: (5237760, 17).
Processing MovingAvgPreProcessor...
MovingAvgPreProcessor took 28.28s. New df shape: (5237760, 21).
Processing MovingAvgFillNaPreprocessor...
MovingAvgFillNaPreprocessor took 0.13s. New df shape: (5237760, 21).
CompositeDataPreprocessor - final df shape: (5237760, 21)


In [18]:
# Inspect the processed frame: column list first, then a preview of the rows.
print(df.columns)
display(df)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id', 'wap_mov_avg_3_1',
       'wap_mov_avg_6_3', 'wap_mov_avg_12_6', 'wap_mov_avg_24_12'],
      dtype='object')


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_price,ask_size,wap,target,time_id,row_id,wap_mov_avg_3_1,wap_mov_avg_6_3,wap_mov_avg_12_6,wap_mov_avg_24_12
0,0,0.0,0.0,3.180603e+06,1.0,0.999812,13380277.00,1.000000,1.000000,0.999812,...,1.000026,8493.030273,1.000000,-3.029704,0.0,0_0_0,1.000000,1.000000,1.000000,1.000000
1,1,0.0,0.0,1.666039e+05,-1.0,0.999896,1642214.25,1.000000,1.000000,0.999896,...,1.000660,20605.089844,1.000000,-5.519986,0.0,0_0_1,1.000000,1.000000,1.000000,1.000000
2,2,0.0,0.0,3.028799e+05,-1.0,0.999561,1819368.00,1.000000,1.000000,0.999403,...,1.000298,18995.000000,1.000000,-8.389950,0.0,0_0_2,1.000000,1.000000,1.000000,1.000000
3,3,0.0,0.0,1.191768e+07,-1.0,1.000171,18389746.00,1.000000,1.000000,0.999999,...,1.000214,479032.406250,1.000000,-4.010201,0.0,0_0_3,1.000000,1.000000,1.000000,1.000000
4,4,0.0,0.0,4.475500e+05,-1.0,0.999532,17860614.00,1.000000,1.000000,0.999394,...,1.000016,434.100006,1.000000,-7.349849,0.0,0_0_4,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480.0,540.0,2.440723e+06,-1.0,1.000317,28280362.00,0.999734,0.999734,1.000317,...,1.000434,319862.406250,1.000328,2.310276,26454.0,480_540_195,1.000345,1.000304,1.000318,1.000202
5237976,196,480.0,540.0,3.495105e+05,-1.0,1.000643,9187699.00,1.000129,1.000386,1.000643,...,1.000900,93393.070312,1.000819,-8.220077,26454.0,480_540_196,1.000816,1.000710,1.000560,1.000506
5237977,197,480.0,540.0,0.000000e+00,0.0,0.995789,12725436.00,0.995789,0.995789,0.995789,...,0.995883,180038.312500,0.995797,1.169443,26454.0,480_540_197,0.995958,0.996070,0.996130,0.996436
5237978,198,480.0,540.0,1.000899e+06,1.0,0.999210,94773272.00,0.999210,0.999210,0.998970,...,0.999210,669893.000000,0.999008,-1.540184,26454.0,480_540_198,0.999116,0.999218,0.999305,0.999313


In [19]:
# Real-valued, per-time-step columns fed to the model.
# Columns of df that are NOT listed here: stock_id (fed separately as the embedding
# index), imbalance_buy_sell_flag, target, time_id and row_id.
feat_dynamic_real = [
    "date_id",
    "seconds_in_bucket",
    "imbalance_size",
    "reference_price",
    "matched_size",
    "far_price",
    "near_price",
    "bid_price",
    "bid_size",
    "ask_price",
    "ask_size",
    "wap",
    "wap_mov_avg_3_1",
    "wap_mov_avg_6_3",
    "wap_mov_avg_12_6",
    "wap_mov_avg_24_12",
]
# Input width of the model, before the stock embedding is appended.
num_input_features = len(feat_dynamic_real)
print(num_input_features)

16


In [20]:
# The selected feature columns must not contain missing values: a NaN input
# would propagate through the network. Print the row count and the number of
# offending rows, then fail fast if there are any.
any_na_values_mask = df[feat_dynamic_real].isna().any(axis=1)
print(any_na_values_mask.shape, any_na_values_mask[any_na_values_mask].shape)
assert not any_na_values_mask.any(), (
    f"{int(any_na_values_mask.sum())} rows contain NaN in feat_dynamic_real columns"
)

(5237760,) (0,)


## Group by stock_id

TODO: also group by `date_id`, or use an embedding for `date_id`?

In [21]:
# One group per stock; the group key is later used as the model's item id.
df_grouped = df.groupby("stock_id")
# Number of distinct stocks, used as the size of the embedding table.
# NOTE(review): this only works as an embedding size if the stock ids are exactly
# 0 .. num_classes - 1 (true for the output shown below) — confirm if stocks are ever dropped.
num_classes = len(df_grouped)
print(num_classes)
print(df_grouped.size())

200
stock_id
0      26455
1      26455
2      26455
3      26455
4      26455
       ...  
195    26455
196    26455
197    26455
198    26455
199    21615
Length: 200, dtype: int64


In [22]:
# Split every stock's rows into a training part and a validation part.
# Rows whose index belongs to the selected training fold go to training; all
# remaining rows of the stock go to validation.
df_grouped_train_map, df_grouped_val_map = {}, {}
df_grouped_train_map_sizes_sum = df_grouped_val_map_sizes_sum = 0

for item_id, gdf in df_grouped:
    training_k_fold_idx_filter_mask = gdf.index.isin(k_fold_train_df_index)
    training_df = gdf[training_k_fold_idx_filter_mask]
    validation_df = gdf[~training_k_fold_idx_filter_mask]

    df_grouped_train_map[item_id] = training_df
    df_grouped_val_map[item_id] = validation_df
    df_grouped_train_map_sizes_sum += len(training_df)
    df_grouped_val_map_sizes_sum += len(validation_df)

    # Every stock must appear on both sides, and no row may be lost or duplicated.
    assert (
        len(training_df) > 0
        and len(validation_df) > 0
        and len(training_df) + len(validation_df) == len(gdf)
    ), f"{item_id} invalid shape, training_df: {training_df.shape}, validation_df: {validation_df.shape}"

print(df_grouped_train_map_sizes_sum, df_grouped_val_map_sizes_sum, k_fold_train_df_index.shape, k_fold_val_df_index.shape)
# Difference between the grouped row count and the fold row count; the fold indices
# were taken before preprocessing, so rows removed there show up as a negative number.
print(
    (df_grouped_train_map_sizes_sum + df_grouped_val_map_sizes_sum)
    - (len(k_fold_train_df_index) + len(k_fold_val_df_index))
)

4714017 523743 (4714182,) (523798,)
-220


## Stock-based feature engineering

In [23]:
# Columns to normalise per stock: the size-like candidates below that are
# actually used as model features. The list follows the order of
# feat_dynamic_real so it is identical on every run (iterating a set of strings
# can yield a different order after each kernel restart).
normalize_columns = [
    column
    for column in feat_dynamic_real
    if column in {"imbalance_size", "matched_size", "bid_size", "ask_size"}
]
print(normalize_columns)

['bid_size', 'matched_size', 'ask_size', 'imbalance_size']


In [24]:
# Per-stock preprocessing steps; they operate on the {stock_id: DataFrame} maps.
stock_processors = [
    StockNormalizeFeaturesPreprocessor(normalize_columns),
]
stock_processor = CompositeGroupedDataPreprocessor(stock_processors)

In [25]:
# Fit on the training split only, so the validation rows do not influence the
# fitted statistics.
stock_processor.fit(df_grouped_train_map)

CompositeGroupedDataPreprocessor - fit - start - processors: 1
Processing StockNormalizeFeaturesPreprocessor...
StockNormalizeFeaturesPreprocessor took 0.50s.
CompositeGroupedDataPreprocessor - fit - end - processors: 1


In [26]:
# Apply the processor fitted on the training split to both splits.
# NOTE(review): the maps are rebound to the processed result, so re-running this
# cell would presumably process already-processed data — re-run from the split cell.
df_grouped_train_map = stock_processor.apply(df_grouped_train_map)
df_grouped_val_map = stock_processor.apply(df_grouped_val_map)

CompositeGroupedDataPreprocessor - apply - start - processors: 1
Processing StockNormalizeFeaturesPreprocessor...
StockNormalizeFeaturesPreprocessor took 0.90s.
CompositeGroupedDataPreprocessor - apply - end - processors: 1
CompositeGroupedDataPreprocessor - apply - start - processors: 1
Processing StockNormalizeFeaturesPreprocessor...
StockNormalizeFeaturesPreprocessor took 0.57s.
CompositeGroupedDataPreprocessor - apply - end - processors: 1


In [27]:
# Spot-check stock 0 after per-stock processing: training rows, then validation rows.
display(df_grouped_train_map[0])
display(df_grouped_val_map[0])

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_price,ask_size,wap,target,time_id,row_id,wap_mov_avg_3_1,wap_mov_avg_6_3,wap_mov_avg_12_6,wap_mov_avg_24_12
0,0,0.0,0.0,-0.097735,1.0,0.999812,-0.562402,1.0,1.0,0.999812,...,1.000026,-0.572806,1.000000,-3.029704,0.0,0_0_0,1.000000,1.000000,1.000000,1.000000
191,0,0.0,10.0,-0.335304,1.0,1.000026,-0.470957,1.0,1.0,0.999812,...,1.000026,-0.220622,0.999892,0.389814,1.0,0_10_0,0.999946,1.000000,1.000000,1.000000
382,0,0.0,20.0,-0.335304,1.0,0.999919,-0.470957,1.0,1.0,0.999812,...,0.999919,-0.487525,0.999842,4.220009,2.0,0_20_0,0.999911,0.999911,1.000000,1.000000
573,0,0.0,30.0,-0.335304,1.0,1.000133,-0.470957,1.0,1.0,1.000026,...,1.000133,0.311051,1.000085,5.450249,3.0,0_30_0,0.999940,0.999955,1.000000,1.000000
764,0,0.0,40.0,-0.345607,1.0,1.000455,-0.466991,1.0,1.0,1.000241,...,1.000455,-0.148168,1.000317,3.169775,4.0,0_40_0,1.000081,1.000027,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4713380,0,433.0,170.0,0.090040,-1.0,0.999697,-0.572387,1.0,1.0,0.999697,...,0.999882,0.637466,0.999712,-3.020167,23832.0,433_170_0,1.000047,1.000423,1.000468,1.000286
4713580,0,433.0,180.0,0.211295,-1.0,0.999328,-0.505898,1.0,1.0,0.998960,...,0.999328,-0.379403,0.999187,0.280142,23833.0,433_180_0,0.999669,1.000172,1.000354,1.000228
4713780,0,433.0,190.0,0.137473,-1.0,0.999144,-0.466369,1.0,1.0,0.998960,...,0.999328,-0.169834,0.999069,-0.479817,23834.0,433_190_0,0.999323,0.999878,1.000231,1.000170
4713980,0,433.0,200.0,0.137473,-1.0,0.998960,-0.466369,1.0,1.0,0.998960,...,0.999328,-0.074576,0.999058,-3.259778,23835.0,433_200_0,0.999105,0.999576,1.000097,1.000117


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_price,ask_size,wap,target,time_id,row_id,wap_mov_avg_3_1,wap_mov_avg_6_3,wap_mov_avg_12_6,wap_mov_avg_24_12
4714380,0,433.0,220.0,0.134918,-1.0,0.998406,-0.466369,1.000000,1.000000,0.998406,...,0.998591,-0.061130,0.998446,5.960464,23837.0,433_220_0,0.998722,0.999022,0.999790,0.999981
4714580,0,433.0,230.0,0.134918,-1.0,0.998591,-0.466369,1.000000,1.000000,0.998591,...,0.998775,1.641277,0.998629,9.599924,23838.0,433_230_0,0.998579,0.998842,0.999632,0.999924
4714780,0,433.0,240.0,0.134918,-1.0,0.998775,-0.466369,1.000000,1.000000,0.998775,...,0.998960,-0.417637,0.998886,7.909536,23839.0,433_240_0,0.998654,0.998791,0.999482,0.999878
4714980,0,433.0,250.0,-0.085818,-1.0,0.999052,-0.380420,1.000000,1.000000,0.998960,...,0.999144,-0.199153,0.999072,16.570091,23840.0,433_250_0,0.998862,0.998792,0.999335,0.999852
4715180,0,433.0,260.0,-0.052557,-1.0,0.999236,-0.393075,1.000000,1.000000,0.998775,...,0.999144,-0.544559,0.999030,19.609928,23841.0,433_260_0,0.998996,0.998787,0.999182,0.999812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5236980,0,480.0,500.0,-0.499480,0.0,0.999017,0.836955,0.999017,0.999017,0.999017,...,0.999193,2.651762,0.999065,3.999472,26450.0,480_500_0,0.999158,0.999185,0.999024,0.998936
5237180,0,480.0,510.0,-0.499480,0.0,0.998842,0.836955,0.998842,0.998842,0.998842,...,0.999017,2.062411,0.998941,3.190041,26451.0,480_510_0,0.999030,0.999190,0.999027,0.998944
5237380,0,480.0,520.0,-0.439417,-1.0,0.999193,0.813836,0.999017,0.999017,0.999193,...,0.999368,0.324581,0.999289,-0.169873,26452.0,480_520_0,0.999098,0.999166,0.999057,0.998952
5237580,0,480.0,530.0,-0.439417,-1.0,0.999193,0.813836,0.999017,0.999017,0.999193,...,0.999368,2.103471,0.999291,3.110170,26453.0,480_530_0,0.999174,0.999166,0.999088,0.998969


## Final features

In [28]:
# update feat_dynamic_real if needed after per-stock feature engineering
# (currently a no-op placeholder: the feature list is unchanged)
feat_dynamic_real = feat_dynamic_real

## Prepare Pytorch datasets

In [29]:
class StockTrainingDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over the rows of one stock.

    Sample ``idx`` is the window of ``context_length`` consecutive rows that
    starts at row ``idx``, so a frame with ``n`` rows yields
    ``n - context_length + 1`` samples, or none when ``n < context_length``.
    """

    def __init__(self, stock_df, feature_names, target_col, item_id, context_length):
        """Store the data for one stock.

        Args:
            stock_df: DataFrame holding the rows of a single stock, in the order
                in which windows should be cut.
            feature_names: List of column names used as model inputs.
            target_col: Name of the target column.
            item_id: Identifier returned unchanged with every sample.
            context_length: Number of rows per window.
        """
        super().__init__()
        self.features = stock_df[feature_names]
        self.targets = stock_df[target_col]
        self.item_id = item_id
        self.context_length = context_length
        # Convert to NumPy once so that __getitem__ slices plain arrays instead of
        # going through pandas indexing for every sample.
        self._feature_values = self.features.to_numpy()
        self._target_values = self.targets.to_numpy()
        # Valid start indices are 0 .. n - context_length; clamp at 0 so that a
        # frame shorter than one window gives an empty dataset rather than a
        # negative length (which makes len() raise).
        self.total_size = max(self.features.shape[0] - context_length + 1, 0)

    def __len__(self):
        """Return the number of windows available."""
        return self.total_size

    def __getitem__(self, idx):
        """Return ``(features, item_id, targets)`` for the window starting at ``idx``.

        ``features`` is a 2-D array (rows x feature columns) and ``targets`` a 1-D
        array with one value per row of the window.
        """
        start_idx = idx
        end_idx = idx + self.context_length
        features = self._feature_values[start_idx:end_idx]
        targets = self._target_values[start_idx:end_idx]
        return features, self.item_id, targets

In [30]:
def _build_stock_datasets(grouped_map):
    """Return one StockTrainingDataset per (item id, DataFrame) entry of grouped_map."""
    return [
        StockTrainingDataset(stock_df, feat_dynamic_real, "target", stock_item_id, seq_len)
        for stock_item_id, stock_df in grouped_map.items()
    ]


# One sliding-window dataset per stock, separately for training and validation.
stock_training_datasets = _build_stock_datasets(df_grouped_train_map)
stock_validation_datasets = _build_stock_datasets(df_grouped_val_map)

print(len(stock_training_datasets), len(stock_validation_datasets))

200 200


In [31]:
# Chain the per-stock datasets into one dataset per split. Windows never span two
# stocks because each window is cut inside a single per-stock dataset.
full_training_dataset = torch.utils.data.ConcatDataset(stock_training_datasets)
full_validation_dataset = torch.utils.data.ConcatDataset(stock_validation_datasets)
print(len(full_training_dataset), len(full_validation_dataset))

4703217 512943


In [32]:
# Training windows are drawn in random order; validation windows are visited in
# dataset order.
training_sampler = torch.utils.data.RandomSampler(full_training_dataset)
validation_sampler = torch.utils.data.SequentialSampler(full_validation_dataset)

In [33]:
# Batched loaders for both splits. Each uses 4 worker processes and pinned host memory.
training_dataloader = torch.utils.data.DataLoader(
    full_training_dataset,
    batch_size=training_batch_size,
    sampler=training_sampler,
    # https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading
    num_workers=4,
    # https://pytorch.org/docs/stable/data.html#memory-pinning
    pin_memory=True,
)
validation_dataloader = torch.utils.data.DataLoader(
    full_validation_dataset,
    batch_size=validation_batch_size,
    sampler=validation_sampler,
    num_workers=4,
    pin_memory=True,
)

In [34]:
# Pull one batch from each loader and print tensor shapes and types as a sanity check.
# Each batch is a triple: (features, item ids, targets).
training_sample_batch = next(iter(training_dataloader))
print("training_sample_batch", training_sample_batch[0].size(), training_sample_batch[1].size(), training_sample_batch[2].size())
print("training_sample_batch", training_sample_batch[0].type(), training_sample_batch[1].type(), training_sample_batch[2].type())
validation_sample_batch = next(iter(validation_dataloader))
print("validation_sample_batch", validation_sample_batch[0].size(), validation_sample_batch[1].size(), validation_sample_batch[2].size())
print("validation_sample_batch", validation_sample_batch[0].type(), validation_sample_batch[1].type(), validation_sample_batch[2].type())

training_sample_batch torch.Size([256, 55, 16]) torch.Size([256]) torch.Size([256, 55])
training_sample_batch torch.FloatTensor torch.LongTensor torch.FloatTensor
validation_sample_batch torch.Size([256, 55, 16]) torch.Size([256]) torch.Size([256, 55])
validation_sample_batch torch.FloatTensor torch.LongTensor torch.FloatTensor


## Create model

In [35]:
# Instantiate the network from the hyperparameters defined at the top of the notebook.
model = TransformerModel(
    num_input_features=num_input_features,
    num_classes=num_classes,
    embedding_dim=embedding_dim,
    d_model=d_model,
    nhead=nhead,
    d_hid=d_hid,
    nlayers=nlayers,
    dropout=dropout,
)
# nn.Module.apply visits every sub-module, so init_weights re-initialises all
# nn.Linear layers, including the ones inside the encoder layers.
model.apply(init_weights)

TransformerModel(
  (embedding): Embedding(200, 4)
  (input_ff): Linear(in_features=20, out_features=32, bias=True)
  (input_ff_sigmoid): Sigmoid()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=32, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=32, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (final_linear): Linear(in_features=32, out_features=1, bias=True)
)

In [36]:
# Mean absolute error is used both for training and for validation.
criterion = nn.L1Loss()
validation_criterion = nn.L1Loss()

## Tensorboard

Start TensorBoard with the `tensorboard --logdir=lightning_logs/` command.

In [37]:
# %reload_ext tensorboard
# %tensorboard --logdir=lightning_logs/

## Module and trainer (lightning)

In [38]:
class TransformerModelModule(L.LightningModule):
    """Lightning wrapper that trains and validates a window-regression model.

    Every batch is a triple ``(features, item_id, targets)``. The wrapped model is
    called as ``model(features, item_id)`` and its output is compared with the
    target of the LAST time step of each window.
    """

    def __init__(self, model, criterion, validation_criterion, lr, betas=(0.9, 0.95), weight_decay=1e-1):
        """Store the model, the losses and the optimizer settings.

        Args:
            model: Module called as ``model(features, item_id)``.
            criterion: Loss used in ``training_step``.
            validation_criterion: Loss used in validation.
            lr: Learning rate for AdamW.
            betas: AdamW ``betas``; defaults to the value previously hard-coded.
            weight_decay: AdamW ``weight_decay``; defaults to the value previously
                hard-coded.
        """
        super().__init__()
        self.model = model
        self.criterion = criterion
        self.validation_criterion = validation_criterion
        self.lr = lr
        self.betas = betas
        self.weight_decay = weight_decay
        # Per-batch predictions and targets collected during a validation epoch.
        self.validation_step_outputs = []
        self.validation_step_actual_targets = []

    def _predict_last_step(self, batch):
        """Run the model on a batch; return (predictions, targets of the last time step)."""
        features, item_id, targets = batch[0], batch[1], batch[2]
        actual_targets = targets[:, -1]
        output = self.model(features, item_id)
        return output, actual_targets

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it once per epoch."""
        output, actual_targets = self._predict_last_step(batch)
        loss = self.criterion(output, actual_targets)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss for one batch and keep its predictions and targets."""
        output, actual_targets = self._predict_last_step(batch)
        loss = self.validation_criterion(output, actual_targets)
        # Logged once per epoch. In the runs recorded in this notebook the value
        # agrees with the manually computed "val_loss_manual" up to float rounding.
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        # Detach so the stored tensors can never keep an autograd graph alive.
        self.validation_step_outputs.append(output.detach())
        self.validation_step_actual_targets.append(actual_targets.detach())

    def on_validation_epoch_end(self):
        """Log the loss over all collected validation predictions, then reset the buffers."""
        # TODO: remove manual calculation of validation loss if we can confirm lightning will take weighted average
        if not self.validation_step_outputs:
            # torch.cat raises on an empty list; nothing was collected, nothing to log.
            return
        # cat is used instead of stack, last step may have different batch size
        all_preds = torch.cat(self.validation_step_outputs)
        all_actual_targets = torch.cat(self.validation_step_actual_targets)
        manual_loss = self.validation_criterion(all_preds, all_actual_targets)
        self.log("val_loss_manual", manual_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.validation_step_outputs.clear()  # free memory
        self.validation_step_actual_targets.clear()  # free memory

    def configure_optimizers(self):
        """Return a single AdamW optimizer over the wrapped model's parameters."""
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.lr,
            betas=self.betas,
            weight_decay=self.weight_decay,
        )
        return [optimizer]

In [39]:
# Wrap the network in the Lightning module that owns the losses and the optimizer.
# NOTE(review): this rebinds `model` from the TransformerModel to its wrapper, so
# re-running this cell without re-running the "Create model" cell wraps the wrapper.
model = TransformerModelModule(
    model,
    criterion,
    validation_criterion,
    learning_rate,
)

In [40]:
# Trainer configured from the training hyperparameters; metrics go to the
# TensorBoard logger created in the setup section.
trainer = L.Trainer(
    max_epochs=num_epochs,
    limit_train_batches=limit_train_batches,
    # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html#gradient-clipping
    gradient_clip_val=gradient_clip_val,
    callbacks=[
        # https://lightning.ai/docs/pytorch/stable/common/progress_bar.html#richprogressbar
        RichProgressBar(leave=True),
    ],
    logger=tb_logger,
    # https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
    deterministic=True,
)

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/si ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


In [41]:
# Baseline: validation loss of the freshly initialised model, before any training.
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'val_loss': 6.159514427185059, 'val_loss_manual': 6.159517765045166}]

## Training

In [42]:
# Train for up to num_epochs epochs, validating on the validation loader.
trainer.fit(
    model=model,
    train_dataloaders=training_dataloader,
    val_dataloaders=validation_dataloader,
)

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

Output()

`Trainer.fit` stopped: `max_epochs=2` reached.


In [43]:
# Validation loss after training, for comparison with the pre-training baseline.
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'val_loss': 5.947511672973633, 'val_loss_manual': 5.947508811950684}]