# Transformer encoder

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import math
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.optim import AdamW

import lightning as L
from lightning.pytorch import seed_everything
from lightning.pytorch.callbacks import RichProgressBar
from lightning.pytorch import loggers as pl_loggers
from lightning.pytorch.callbacks import ModelCheckpoint

In [None]:
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, CompositeGroupedDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import (
    BasicFeaturesPreprocessor,
    DupletsTripletsPreprocessor,
    MovingAvgPreProcessor,
    RemoveIrrelevantFeaturesDataPreprocessor,
    DropTargetNADataPreprocessor,
    FarNearPriceFillNaPreprocessor,
    MovingAvgFillNaPreprocessor,
    RemoveRecordsByStockDateIdPreprocessor,
    RemoveIrrelevantFeaturesDataTransformer,
    EWMAPreProcessor, 
    EWMAFillNaPreprocessor
)
from data_preprocessor.stock_feature_engineering import (
    StockNormalizeFeaturesPreprocessor,
)
from data_preprocessor.deep_feature_synthesis import StockDateIdPreprocessor, FeatureToolsDFSTransformer
from data_preprocessor.normalization import NormalizationDataTransformer
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor
from data_preprocessor.stockid_features import StockIdFeaturesPreProcessor, StockIdFeaturesDataTransformer
from data_preprocessor.deep_feature_synthesis import DfsPreProcessor
from data_preprocessor.stocks_pca_preprocessor import StocksPcaPreProcessor

from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator, TimeSeriesKFoldDataGenerator
from utils.dataframe_utils import get_df_summary_str

In [None]:
# https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
seed_everything(42, workers=True)

# Hyperparameters / Setup

In [None]:
checkpoint_dir = "transformer_encoder_by_time_checkpoints/20240430_transformer_stockid"

In [None]:
model_folder_version_name = None
# change to a string for a specific name
# model_folder_version_name = "test"

# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.tensorboard.html
# https://lightning.ai/docs/pytorch/stable/extensions/logging.html
tb_logger = pl_loggers.TensorBoardLogger(".", version=model_folder_version_name)

## Train-validation split (align with LightGBM experiment/setup for fair comparison)

In [None]:
N_fold = 5
time_series_k_fold_data_generator = TimeSeriesKFoldDataGenerator(n_fold=N_fold, test_set_ratio=0.1)

## Data hyperparameters

In [None]:
prediction_length = 1
seq_len = 55 * 2

training_batch_size = 256
validation_batch_size = 256

## Model hyperparameters

In [None]:
embedding_dim = 4
d_model = 32
nhead = 4
d_hid = 32
nlayers = 2
dropout = 0.1

## Training hyperparameters

In [None]:
num_epochs = 10
limit_train_batches = 1.0
# limit_train_batches = 2
learning_rate = 1e-3
gradient_clip_val = 0.5

# Model

TODO:
1. Decide whether to keep the sigmoid (`input_ff_sigmoid`) that is applied after the input projection.
2. Explore further model enhancements.

In [None]:
class TransformerModel(nn.Module):
    """Transformer encoder that regresses one scalar per sequence position.

    For every position the real-valued features are concatenated with a
    learned embedding of that position's integer id, projected to ``d_model``,
    passed through a sigmoid and a stack of encoder layers, and finally mapped
    to a single value. No positional encoding is added.

    Args:
        num_input_features: Number of real-valued features per position.
        num_classes: Number of distinct ids the embedding table can hold.
        embedding_dim: Width of the id embedding.
        d_model: Width of the encoder input and output.
        nhead: Number of attention heads in each encoder layer.
        d_hid: Width of the feed-forward sub-layer in each encoder layer.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(
        self,
        num_input_features: int,
        num_classes: int,
        embedding_dim: int,
        d_model: int,
        nhead: int,
        d_hid: int,
        nlayers: int,
        dropout: float,
    ):
        super().__init__()
        self.model_type = 'Transformer'
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        # projects [features | id embedding] to the encoder width
        self.input_ff = nn.Linear(num_input_features + embedding_dim, d_model)
        self.input_ff_sigmoid = nn.Sigmoid()
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.final_linear = nn.Linear(d_model, 1)

    def forward(
        self,
        src: Tensor,
        item_ids: Tensor,
        src_mask: Optional[Tensor] = None,
    ) -> Tensor:
        """Predict one value for every position of every sequence.

        Args:
            src: Real-valued features, ``[batch, seq_len, num_input_features]``.
            item_ids: Integer ids for the embedding lookup, ``[batch, seq_len]``.
            src_mask: Optional attention mask handed to the encoder unchanged.
                ``None`` lets every position attend to every other position;
                pass e.g. a square causal mask of shape ``[seq_len, seq_len]``
                to restrict attention.

        Returns:
            Predictions of shape ``[batch, seq_len]``.
        """
        embedded = self.embedding(item_ids)
        # [batch, seq_len, embedding_dim]

        output = torch.cat((src, embedded), dim=-1)
        # [batch, seq_len, num_input_features + embedding_dim]

        output = self.input_ff(output)
        # [batch, seq_len, d_model]

        # TODO: do we need sigmoid?
        output = self.input_ff_sigmoid(output)

        output = self.transformer_encoder(output, src_mask)
        # [batch, seq_len, d_model]

        output = self.final_linear(output)
        # [batch, seq_len, 1]

        # keep the prediction of every position, only drop the trailing axis
        return output.squeeze(dim=2)
        # [batch, seq_len]

In [None]:
# https://stackoverflow.com/questions/49433936/how-do-i-initialize-weights-in-pytorch
# https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.xavier_uniform_
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; intended for ``Module.apply``.

    The weight matrix gets Xavier-uniform values and the bias is set to 0.01.
    Modules that are not ``nn.Linear`` are left untouched.

    Args:
        m: Any module; ``Module.apply`` calls this once per sub-module.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    # a layer built with bias=False has bias set to None; there is nothing to fill
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.01)

## Load data

In [None]:
# Columns that are parsed as float32. "stock_id" is deliberately left out of
# the dtype mapping: it should be int64 / long so it can be used for embedding.
_float32_columns = [
    "date_id",
    "seconds_in_bucket",
    "imbalance_size",
    "imbalance_buy_sell_flag",
    "reference_price",
    "matched_size",
    "far_price",
    "near_price",
    "bid_price",
    "bid_size",
    "ask_price",
    "ask_size",
    "wap",
    "target",
]
_column_dtypes = {name: np.float32 for name in _float32_columns}
_column_dtypes["time_id"] = np.int64

df = pd.read_csv(
    "../optiver-trading-at-the-close/train.csv",
    dtype=_column_dtypes,
    usecols=["stock_id", *_float32_columns, "time_id"],
)
# untouched copy, so later cells can start over without reading the csv again
raw_df = df.copy(deep=True)

In [None]:
# deep copy for easy reference to raw df without reloading from csv
df = raw_df.copy(deep=True)

In [None]:
display(df)
print(df.dtypes)

In [None]:
# Generate all time-series folds and keep the last pair for the checks below.
k_fold_train_dfs, k_fold_val_dfs, k_fold_num_train_val_sets = time_series_k_fold_data_generator.generate(df)
k_fold_train_df, k_fold_val_df = k_fold_train_dfs[-1], k_fold_val_dfs[-1]
k_fold_train_df_index, k_fold_val_df_index = k_fold_train_df.index, k_fold_val_df.index

print(len(k_fold_train_dfs), len(k_fold_val_dfs), k_fold_num_train_val_sets, N_fold)
# sanity check: exactly N_fold training sets and N_fold validation sets came back
assert k_fold_num_train_val_sets == N_fold
assert len(k_fold_train_dfs) == N_fold
assert len(k_fold_val_dfs) == N_fold

print(k_fold_train_df.shape, k_fold_val_df.shape, df.shape)
# sanity check: the row counts of the last fold add up to the row count of df
assert len(k_fold_train_df) + len(k_fold_val_df) == len(df)

## Data pre-processing and features

In [None]:
processors = [    
    ReduceMemUsageDataPreprocessor(verbose=True),
    RemoveRecordsByStockDateIdPreprocessor([
        {"stock_id": 19, "date_id": 438},
        {"stock_id": 101, "date_id": 328},
        {"stock_id": 131, "date_id": 35},
        {"stock_id": 158, "date_id": 388},
    ]),
    FarNearPriceFillNaPreprocessor(),
    # BasicFeaturesPreprocessor(),
    # DupletsTripletsPreprocessor(enable_triplets=False),
    # MovingAvgPreProcessor("wap"),
    # MovingAvgFillNaPreprocessor("wap", 1.0),
    # EWMAPreProcessor("wap", 10),
    # EWMAFillNaPreprocessor("wap", 1.0),
    # StockIdFeaturesPreProcessor(),   
    # StocksPcaPreProcessor(),
    # DTWKMeansPreprocessor(),
    # DfsPreProcessor(),
    # StockDateIdPreprocessor(), 
    # FeatureToolsDFSPreprocessor(),
    # DropTargetNADataPreprocessor(),    
    # RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
    # FillNaPreProcessor(1.0),
    # PolynomialFeaturesPreProcessor(),
]
processor = CompositeDataPreprocessor(processors)

In [None]:
df = processor.apply(df)

In [None]:
print(df.columns)
display(df)

In [None]:
not_include_in_feat = ['stock_id', 'target', 'time_id']

In [None]:
feat_dynamic_real = [item for item in df.columns if item not in not_include_in_feat]
feat_dynamic_real

In [None]:
# feat_dynamic_real = [
#     "date_id",
#     "seconds_in_bucket",
#     "imbalance_size",
#     "reference_price",
#     "matched_size",
#     "far_price",
#     "near_price",
#     "bid_price",
#     "bid_size",
#     "ask_price",
#     "ask_size",
#     "wap",
#     # "wap_mov_avg_3_1",
#     # "wap_mov_avg_6_3",
#     # "wap_mov_avg_12_6",
#     # "wap_mov_avg_24_12",
# ]
num_input_features = len(feat_dynamic_real)
num_classes = 200
print(num_input_features, num_classes)

In [None]:
# should not have any na features
any_na_values_mask = df[feat_dynamic_real].isna().any(axis=1)
print(any_na_values_mask.shape, any_na_values_mask[any_na_values_mask].shape)
assert any_na_values_mask[any_na_values_mask].shape[0] == 0

In [None]:
df[any_na_values_mask]

In [None]:
# Expand df to the complete stock_id x time_id grid (200 x 26455 pairs).
# Pairs that are absent from df become rows whose columns are filled with 0.0.
stock_id_list = np.arange(200, dtype=np.int64)
time_id_list = np.arange(26455, dtype=np.int64)
pd_multiindex = pd.MultiIndex.from_product(
    [stock_id_list, time_id_list],
    names=("stock_id", "time_id"),
)
df = (
    df.set_index(["stock_id", "time_id"])
    .reindex(pd_multiindex, fill_value=0.0)
    .reset_index()
)
display(df)

## Group by stock_id

TODO: Should the data also be grouped by `date_id`, or should `date_id` be fed through an embedding instead?

In [None]:
# df_grouped = df.groupby("date_id")
# num_classes = len(df_grouped)
# print(num_classes)
# print(df_grouped.size())

### Separate df_grouped into training and validation set by time-series k-fold index

In [None]:
# df_grouped_train_map = {}
# df_grouped_val_map = {}
# df_grouped_train_map_sizes_sum = 0
# df_grouped_val_map_sizes_sum = 0

# for item_id, gdf in df_grouped:
#     training_k_fold_idx_filter_mask = gdf.index.isin(k_fold_train_df_index)

#     training_df = gdf[training_k_fold_idx_filter_mask]
#     df_grouped_train_map[item_id] = training_df
#     df_grouped_train_map_sizes_sum += training_df.shape[0]

#     validation_df = gdf[~training_k_fold_idx_filter_mask]
#     df_grouped_val_map[item_id] = validation_df
#     df_grouped_val_map_sizes_sum += validation_df.shape[0]

#     assert training_df.shape[0] > 0 \
#         and validation_df.shape[0] > 0 \
#         and training_df.shape[0] + validation_df.shape[0] == gdf.shape[0], f"{item_id} invalid shape, training_df: {training_df.shape}, validation_df: {validation_df.shape}"

# print(df_grouped_train_map_sizes_sum, df_grouped_val_map_sizes_sum, k_fold_train_df_index.shape, k_fold_val_df_index.shape)
# print(df_grouped_train_map_sizes_sum + df_grouped_val_map_sizes_sum - k_fold_train_df_index.shape[0] - k_fold_val_df_index.shape[0])

## Stock-based feature engineering

In [None]:
# normalize_columns = set([
#     "imbalance_size",
#     "matched_size",
#     "bid_size",
#     "ask_size",
# ])
# normalize_columns = list(normalize_columns.intersection(set(feat_dynamic_real)))
# print(normalize_columns)

In [None]:
# stock_processors = [
#     StockNormalizeFeaturesPreprocessor(normalize_columns),
# ]
# stock_processor = CompositeGroupedDataPreprocessor(stock_processors)

In [None]:
# stock_processor.fit(df_grouped_train_map)

In [None]:
# df_grouped_train_map = stock_processor.apply(df_grouped_train_map)
# df_grouped_val_map = stock_processor.apply(df_grouped_val_map)

In [None]:
# display(df_grouped_train_map[0])
# display(df_grouped_val_map[0])

## Final features

In [None]:
# update feat_dynamic_real if needed after per-stock feature engineering
feat_dynamic_real = feat_dynamic_real

## Prepare Pytorch datasets

In [None]:
class OptiverDataset(torch.utils.data.Dataset):
    """Map-style dataset that returns all rows of one time index per item.

    Item ``idx`` is made of every row of ``df`` whose ``time_idx_col`` value
    equals ``time_idx_offset + idx``, in the row order of ``df``.

    Args:
        df: Source frame holding features, categories, targets and the time index.
        feature_names: Column name(s) selected as model features.
        category_cols: Column name(s) selected as categorical ids.
        target_col: Column name(s) selected as regression target.
        time_idx_col: Name of the column that holds the time index.
        num_time_pts: Number of items the dataset reports through ``len``.
        time_idx_offset: Time index value that corresponds to item 0.
    """

    def __init__(self, df, feature_names, category_cols, target_col, time_idx_col, num_time_pts, time_idx_offset):
        super().__init__()
        self.df = df
        self.features = df[feature_names]
        self.categories = df[category_cols]
        self.targets = df[target_col]
        self.time_idx = df[time_idx_col]
        self.num_time_pts = num_time_pts
        self.time_idx_offset = time_idx_offset
        # Row positions of every time index value, computed once. Looking an
        # item up is then proportional to the size of that item instead of
        # comparing the whole time index column on every access.
        self._positions_by_time = df.groupby(time_idx_col, sort=False).indices

    def __len__(self):
        """Return ``num_time_pts`` as given to the constructor."""
        return self.num_time_pts

    def __getitem__(self, idx):
        """Return ``(features, categories, targets)`` as numpy arrays for item ``idx``.

        A time index value without any row yields arrays with zero rows.
        """
        # the empty slice selects no rows when the time index value is unknown
        positions = self._positions_by_time.get(self.time_idx_offset + idx, slice(0, 0))
        features = self.features.iloc[positions]
        categories = self.categories.iloc[positions]
        targets = self.targets.iloc[positions]
        return features.values, categories.values, targets.values

In [None]:
# stock_training_datasets = []
# for item_id, gdf in df_grouped_train_map.items():
#     stock_training_datasets.append(StockTrainingDataset(gdf, feat_dynamic_real, "target", item_id, seq_len))

# stock_validation_datasets = []
# for item_id, gdf in df_grouped_val_map.items():
#     stock_validation_datasets.append(StockTrainingDataset(gdf, feat_dynamic_real, "target", item_id, seq_len))

# print(len(stock_training_datasets), len(stock_validation_datasets))

In [None]:
# full_training_dataset = torch.utils.data.ConcatDataset(stock_training_datasets)
# full_validation_dataset = torch.utils.data.ConcatDataset(stock_validation_datasets)
# print(len(full_training_dataset), len(full_validation_dataset))

In [None]:
# Chronological hold-out: time ids below the cut-off (90% of the 26455 time
# ids, rounded down) form the training set, the remaining ones the evaluation set.
val_cutoff_time_id = math.floor(26455 * 0.9)
df_train = df.loc[df["time_id"] < val_cutoff_time_id]
df_eval = df.loc[df["time_id"] >= val_cutoff_time_id]
print(df_train.shape, df_eval.shape, val_cutoff_time_id)

### DFS, normalize, and other fit-transform pipelines

In [None]:
# Fit/transform pipeline: the next cell fits it on df_train and then applies
# the fitted pipeline to df_eval. Steps run in the order listed here.
transform_pipeline = make_pipeline(
    # FeatureToolsDFSTransformer(
    #     group_by_stock=True,
    #     group_by_date=False,
    #     group_by_stock_date=False,
    # ),
    StockIdFeaturesDataTransformer(),
    # The second argument is presumably the tag that shows up in the names of
    # the generated columns (a later cell selects columns that contain
    # "closing_movements") - TODO confirm against NormalizationDataTransformer.
    NormalizationDataTransformer(
        [
            "imbalance_size",
            "matched_size",
            "bid_size",
            "ask_size",
        ],
        "closing_movements",
    ),
    # do not remove columns, use feat_dynamic_real white-list
    # NOTE(review): the step below is nevertheless active and lists "stock_id"
    # and "time_id", which later cells still read from df_train / df_eval -
    # confirm what this transformer actually does with the listed columns.
    RemoveIrrelevantFeaturesDataTransformer(['stock_id', 'date_id','time_id', 'row_id', "stock_date_id"]),
    verbose=True,
)
# transform_pipeline = None

In [None]:
print(f"before transform_pipeline - df_train: {get_df_summary_str(df_train)}, df_eval: {get_df_summary_str(df_eval)}")
if transform_pipeline is not None:
    df_train = transform_pipeline.fit_transform(df_train)
    print(f"fit_transform df_train - df_train: {get_df_summary_str(df_train)}")
    df_eval = transform_pipeline.transform(df_eval)
    print(f"transform df_eval - df_eval: {get_df_summary_str(df_eval)}")

In [None]:
display(df_train)
display(df_eval)

In [None]:
# temp solution to add dfs features to feat_dynamic_real, dfs feature columns will contain "closing_movements"
for col in df_train.columns.tolist():
    if col.find("closing_movements") != -1:
        feat_dynamic_real.append(col)
num_input_features = len(feat_dynamic_real)
print(num_input_features, feat_dynamic_real)

In [None]:
# temp solution to force cast data types for pytorch model
df_train = df_train.astype(np.float32)
df_eval = df_eval.astype(np.float32)
df_train = df_train.astype({
    "stock_id": np.int64,
    "time_id": np.int64
})
df_eval = df_eval.astype({
    "stock_id": np.int64,
    "time_id": np.int64
})

In [None]:
display(df_train)
display(df_eval)

### Create Pytorch datasets from final dfs

In [None]:
# One dataset item per time id: training covers the time ids below the
# cut-off, validation the ones from the cut-off up to 26455 (exclusive).
full_training_dataset = OptiverDataset(
    df_train,
    feature_names=feat_dynamic_real,
    category_cols="stock_id",
    target_col="target",
    time_idx_col="time_id",
    num_time_pts=val_cutoff_time_id,
    time_idx_offset=0,
)
full_validation_dataset = OptiverDataset(
    df_eval,
    feature_names=feat_dynamic_real,
    category_cols="stock_id",
    target_col="target",
    time_idx_col="time_id",
    num_time_pts=26455 - val_cutoff_time_id,
    time_idx_offset=val_cutoff_time_id,
)
print(len(full_training_dataset), len(full_validation_dataset))

In [None]:
training_sampler = torch.utils.data.RandomSampler(full_training_dataset)
validation_sampler = torch.utils.data.SequentialSampler(full_validation_dataset)

In [None]:
training_dataloader = torch.utils.data.DataLoader(
    full_training_dataset,
    batch_size=training_batch_size,
    sampler=training_sampler,
    # https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading
    # num_workers=4,
    # https://pytorch.org/docs/stable/data.html#memory-pinning
    # pin_memory=True,
)
validation_dataloader = torch.utils.data.DataLoader(
    full_validation_dataset,
    batch_size=validation_batch_size,
    sampler=validation_sampler,
    # num_workers=4,
    # pin_memory=True,
)

In [None]:
training_sample_batch = next(iter(training_dataloader))
print("training_sample_batch", training_sample_batch[0].size(), training_sample_batch[1].size())
print("training_sample_batch", training_sample_batch[0].type(), training_sample_batch[1].type())
validation_sample_batch = next(iter(validation_dataloader))
print("validation_sample_batch", validation_sample_batch[0].size(), validation_sample_batch[1].size())
print("validation_sample_batch", validation_sample_batch[0].type(), validation_sample_batch[1].type())

## Create model

In [None]:
model = TransformerModel(
    num_input_features=num_input_features,
    num_classes=num_classes,
    embedding_dim=embedding_dim,
    d_model=d_model,
    nhead=nhead,
    d_hid=d_hid,
    nlayers=nlayers,
    dropout=dropout,
)
model.apply(init_weights)

In [None]:
criterion = nn.L1Loss()
validation_criterion = nn.L1Loss()

## Tensorboard

Start TensorBoard with the command `tensorboard --logdir=lightning_logs/`.

In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir=lightning_logs/

## Module and trainer (lightning)

In [None]:
class TransformerModelModule(L.LightningModule):
    """Lightning wrapper that trains and validates ``model``.

    Args:
        model: Network called as ``model(features, item_id)``.
        criterion: Loss applied to training predictions.
        validation_criterion: Loss applied to validation predictions.
        lr: Learning rate handed to the AdamW optimizer.
    """

    def __init__(
        self,
        model: nn.Module,
        criterion: nn.Module,
        validation_criterion: nn.Module,
        lr,
    ):
        super().__init__()
        # must be called from __init__: stores the constructor arguments as hyper-parameters
        self.save_hyperparameters()
        self.model = model
        self.criterion = criterion
        self.validation_criterion = validation_criterion
        self.lr = lr
        # per-step validation results, collected for the manual epoch loss
        self.validation_step_outputs = []
        self.validation_step_actual_targets = []

    def _predict(self, batch):
        """Run the model on a ``(features, item_id, targets)`` batch.

        Returns the pair ``(prediction, targets)``.
        """
        features, item_id, targets = batch[0], batch[1], batch[2]
        return self.model(features, item_id), targets

    def training_step(self, batch, batch_idx):
        """Compute the training loss for one batch and log it as ``train_loss``."""
        prediction, actual_targets = self._predict(batch)
        loss = self.criterion(prediction, actual_targets)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` for one batch and keep its predictions and targets."""
        prediction, actual_targets = self._predict(batch)
        # TODO: is loss being averaged based on batch size
        loss = self.validation_criterion(prediction, actual_targets)
        # lightning will take weighted-average on loss per step based on batch size
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.validation_step_outputs.append(prediction)
        self.validation_step_actual_targets.append(actual_targets)

    def on_validation_epoch_end(self):
        """Log ``val_loss_manual`` over all collected steps, then reset the buffers."""
        # TODO: remove manual calculation of validation loss if we can confirm lightning will take weighted average
        # cat is used instead of stack, last step may have different batch size
        epoch_predictions = torch.cat(self.validation_step_outputs)
        epoch_targets = torch.cat(self.validation_step_actual_targets)
        self.log(
            "val_loss_manual",
            self.validation_criterion(epoch_predictions, epoch_targets),
            on_step=False,
            on_epoch=True,
            prog_bar=True,
        )
        # free memory
        self.validation_step_outputs.clear()
        self.validation_step_actual_targets.clear()

    def configure_optimizers(self):
        """Return a one-element list holding AdamW over the wrapped model's parameters."""
        return [
            torch.optim.AdamW(
                self.model.parameters(),
                lr=self.lr,
                betas=(0.9, 0.95),
                weight_decay=1e-1,
            )
        ]

In [None]:
model = TransformerModelModule(
    model,
    criterion,
    validation_criterion,
    learning_rate,
)

In [None]:
checkpoint_callback = ModelCheckpoint(
    dirpath=checkpoint_dir,
    save_top_k=2,
    monitor="val_loss",
    mode="min",
    filename="transformer-{epoch:02d}-{val_loss:.4f}"
)

In [None]:
trainer = L.Trainer(
    max_epochs=num_epochs,
    limit_train_batches=limit_train_batches,
    # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html#gradient-clipping
    gradient_clip_val=gradient_clip_val,
    callbacks=[
        # https://lightning.ai/docs/pytorch/stable/common/progress_bar.html#richprogressbar
        RichProgressBar(leave=True),
        checkpoint_callback,
    ],
    logger=tb_logger,
    # https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
    deterministic=True,
)
print(trainer.callbacks)

In [None]:
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

## Training

In [None]:
print(checkpoint_dir)
trainer.fit(
    model=model,
    train_dataloaders=training_dataloader,
    val_dataloaders=validation_dataloader,
)

In [None]:
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

In [None]:
model

## Load saved checkpoint

In [None]:
checkpoint_callback.best_model_path

In [None]:
checkpoint_path = checkpoint_callback.best_model_path
print(checkpoint_path)

In [None]:
saved_model = TransformerModelModule.load_from_checkpoint(checkpoint_path)
print(saved_model)

In [None]:
# The checkpoint holds more than tensors: the "hyper_parameters" entry printed
# below contains the pickled constructor arguments of the Lightning module.
# Such a file cannot be read in weights-only mode, which newer PyTorch releases
# use by default, so full unpickling is requested explicitly.
# Only do this for checkpoints you created yourself - unpickling can run arbitrary code.
torch_checkpoint = torch.load(checkpoint_path, weights_only=False)
print(torch_checkpoint.keys())
print(torch_checkpoint["state_dict"].keys())
print(torch_checkpoint["hyper_parameters"])