# Transformer encoder

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math

import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.optim import AdamW

import lightning as L
from lightning.pytorch import seed_everything
from lightning.pytorch.callbacks import RichProgressBar
from lightning.pytorch import loggers as pl_loggers
from lightning.pytorch.callbacks import ModelCheckpoint

In [3]:
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, CompositeGroupedDataPreprocessor, ReduceMemUsageDataPreprocessor, FillNaPreProcessor
from data_preprocessor.feature_engineering import (
    BasicFeaturesPreprocessor,
    DupletsTripletsPreprocessor,
    MovingAvgPreProcessor,
    RemoveIrrelevantFeaturesDataPreprocessor,
    DropTargetNADataPreprocessor,
    FarNearPriceFillNaPreprocessor,
    MovingAvgFillNaPreprocessor,
    RemoveRecordsByStockDateIdPreprocessor,
    RemoveIrrelevantFeaturesDataTransformer,
)
from data_preprocessor.stock_feature_engineering import (
    StockNormalizeFeaturesPreprocessor,
)
from data_preprocessor.deep_feature_synthesis import StockDateIdPreprocessor, FeatureToolsDFSTransformer
from data_preprocessor.normalization import NormalizationDataTransformer
from data_preprocessor.polynomial_features import PolynomialFeaturesPreProcessor
from data_preprocessor.stockid_features import StockIdFeaturesPreProcessor
from data_preprocessor.deep_feature_synthesis import DfsPreProcessor
from data_generator.data_generator import DefaultTrainEvalDataGenerator, ManualKFoldDataGenerator, TimeSeriesKFoldDataGenerator
from utils.dataframe_utils import get_df_summary_str

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fb389bd18a0>
Traceback (most recent call last):
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^^^^^

In [4]:
# https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
seed_everything(42, workers=True)

Seed set to 42


42

# Hyperparameters / Setup

In [5]:
checkpoint_dir = "transformer_encoder_by_time_checkpoints/20240426_transformer_stockid"

In [6]:
model_folder_version_name = None
# change to a string for a specific name
# model_folder_version_name = "test"

# https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.loggers.tensorboard.html
# https://lightning.ai/docs/pytorch/stable/extensions/logging.html
tb_logger = pl_loggers.TensorBoardLogger(".", version=model_folder_version_name)

## Train-validation split (align with LightGBM experiment/setup for fair comparison)

In [7]:
N_fold = 5
time_series_k_fold_data_generator = TimeSeriesKFoldDataGenerator(n_fold=N_fold, test_set_ratio=0.1)

## Data hyperparameters

In [8]:
prediction_length = 1
seq_len = 55 * 2

training_batch_size = 256
validation_batch_size = 256

## Model hyperparameters

In [9]:
embedding_dim = 4
d_model = 32
nhead = 4
d_hid = 32
nlayers = 2
dropout = 0.1

## Training hyperparameters

In [10]:
num_epochs = 10
limit_train_batches = 1.0
# limit_train_batches = 2
learning_rate = 1e-3
gradient_clip_val = 0.5

# Model

TODO:
1. use input_ff_sigmoid?
2. any further model enhancement?

In [11]:
class TransformerModel(nn.Module):
    def __init__(
        self,
        num_input_features: int,
        num_classes: int,
        embedding_dim: int,
        d_model: int,
        nhead: int,
        d_hid: int,
        nlayers: int,
        dropout: float,
    ):
        super().__init__()
        self.model_type = 'Transformer'
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.input_ff = nn.Linear(num_input_features + embedding_dim, d_model)
        self.input_ff_sigmoid = nn.Sigmoid()
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.d_model = d_model
        self.final_linear = nn.Linear(d_model, 1)

    def forward(
        self,
        src: Tensor,
        item_ids: Tensor,
        src_mask: Tensor = None,
    ) -> Tensor:
        # src: [batch_size b, seq_len k 200, features 27]
        # item_ids: [batch_size b, seq_len 200]
        batch_size = src.size(dim=0)
        seq_len = src.size(dim=1)
        num_input_features = src.size(dim=2)
        output = src
        embedded = self.embedding(item_ids)
        # embedded: [batch_size b, seq_len 200, embedding_dim]
        # idea from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py#L1290
        # embedded = embedded.unsqueeze(dim=1)
        # embedded: [batch_size b, 1, embedding_dim]
        # embedded = embedded.expand(-1, seq_len, -1)
        # embedded: [batch_size b, seq_len k 55, embedding_dim]

        output = torch.cat((src, embedded), dim=-1)
        # [batch_size b, seq_len k 55, features 27 + embedding_dim]

        output = self.input_ff(output)
        # [batch_size b, seq_len k 55, d_model]

        # TODO: do we need sigmoid?
        output = self.input_ff_sigmoid(output)

        # if src_mask is None:
        #     """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
        #     Unmasked positions are filled with float(0.0).
        #     """
        #     src_mask = nn.Transformer.generate_square_subsequent_mask(seq_len)
        #     # no "to device" for lightning
        #     # https://pytorch.org/docs/stable/generated/torch.Tensor.to.html#torch.Tensor.to
        #     src_mask = src_mask.to(output)
        #     # src_mask = squared (triangle matrix) matrix [seq_len k 55, seq_len k 55]

        output = self.transformer_encoder(output, src_mask)
        # [batch_size b, seq_len k 200, d_model]

        output = self.final_linear(output)
        # [batch_size b, seq_len k 200, 1]

        # # take the "last" prediction, which includes all previous information
        # output = output[:, -1, :]
        # # [batch_size b, 1]
        output = output.squeeze(dim=2)
        # [batch_size b, , seq_len k 200]

        return output

In [12]:
# https://stackoverflow.com/questions/49433936/how-do-i-initialize-weights-in-pytorch
# https://pytorch.org/docs/stable/nn.init.html#torch.nn.init.xavier_uniform_
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

## Load data

In [13]:
df = pd.read_csv(
    "../optiver-trading-at-the-close/train.csv",
    dtype={
        # stock_id should be int64 / long for embedding
        "date_id": np.float32,
        "seconds_in_bucket": np.float32,
        "imbalance_size": np.float32,
        "imbalance_buy_sell_flag": np.float32,
        "reference_price": np.float32,
        "matched_size": np.float32,
        "far_price": np.float32,
        "near_price": np.float32,
        "bid_price": np.float32,
        "bid_size": np.float32,
        "ask_price": np.float32,
        "ask_size": np.float32,
        "wap": np.float32,
        "target": np.float32,
        "time_id": np.int64,
    },
    usecols=[
        "stock_id",
        "date_id",
        "seconds_in_bucket",
        "imbalance_size",
        "imbalance_buy_sell_flag",
        "reference_price",
        "matched_size",
        "far_price",
        "near_price",
        "bid_price",
        "bid_size",
        "ask_price",
        "ask_size",
        "wap",
        "target",
        "time_id",
    ]
)
raw_df = df.copy(deep=True)

In [14]:
# deep copy for easy reference to raw df without reloading from csv
df = raw_df.copy(deep=True)

In [15]:
display(df)
print(df.dtypes)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id
0,0,0.0,0.0,3.180603e+06,1.0,0.999812,13380277.00,,,0.999812,60651.500000,1.000026,8493.030273,1.000000,-3.029704,0
1,1,0.0,0.0,1.666039e+05,-1.0,0.999896,1642214.25,,,0.999896,3233.040039,1.000660,20605.089844,1.000000,-5.519986,0
2,2,0.0,0.0,3.028799e+05,-1.0,0.999561,1819368.00,,,0.999403,37956.000000,1.000298,18995.000000,1.000000,-8.389950,0
3,3,0.0,0.0,1.191768e+07,-1.0,1.000171,18389746.00,,,0.999999,2324.899902,1.000214,479032.406250,1.000000,-4.010201,0
4,4,0.0,0.0,4.475500e+05,-1.0,0.999532,17860614.00,,,0.999394,16485.539062,1.000016,434.100006,1.000000,-7.349849,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480.0,540.0,2.440723e+06,-1.0,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,1.000434,319862.406250,1.000328,2.310276,26454
5237976,196,480.0,540.0,3.495105e+05,-1.0,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,1.000900,93393.070312,1.000819,-8.220077,26454
5237977,197,480.0,540.0,0.000000e+00,0.0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,0.995883,180038.312500,0.995797,1.169443,26454
5237978,198,480.0,540.0,1.000899e+06,1.0,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,0.999210,669893.000000,0.999008,-1.540184,26454


stock_id                     int64
date_id                    float32
seconds_in_bucket          float32
imbalance_size             float32
imbalance_buy_sell_flag    float32
reference_price            float32
matched_size               float32
far_price                  float32
near_price                 float32
bid_price                  float32
bid_size                   float32
ask_price                  float32
ask_size                   float32
wap                        float32
target                     float32
time_id                      int64
dtype: object


In [16]:
k_fold_train_dfs, k_fold_val_dfs, k_fold_num_train_val_sets = time_series_k_fold_data_generator.generate(df)
k_fold_train_df = k_fold_train_dfs[-1]
k_fold_val_df = k_fold_val_dfs[-1]
k_fold_train_df_index = k_fold_train_df.index
k_fold_val_df_index = k_fold_val_df.index
print(len(k_fold_train_dfs), len(k_fold_val_dfs), k_fold_num_train_val_sets, N_fold)
assert k_fold_num_train_val_sets == N_fold and len(k_fold_train_dfs) == N_fold and len(k_fold_val_dfs) == N_fold
print(k_fold_train_df.shape, k_fold_val_df.shape, df.shape)
assert k_fold_train_df.shape[0] + k_fold_val_df.shape[0] == df.shape[0]

5 5 5 5
(4714182, 16) (523798, 16) (5237980, 16)


## Data pre-processing and features

In [17]:
processors = [
    RemoveRecordsByStockDateIdPreprocessor([
        {"stock_id": 19, "date_id": 438},
        {"stock_id": 101, "date_id": 328},
        {"stock_id": 131, "date_id": 35},
        {"stock_id": 158, "date_id": 388},
    ]),
    FarNearPriceFillNaPreprocessor(),
    # ReduceMemUsageDataPreprocessor(verbose=True),
    BasicFeaturesPreprocessor(),
    # DupletsTripletsPreprocessor(enable_triplets=False),
    # MovingAvgPreProcessor("wap"),
    # MovingAvgFillNaPreprocessor("wap", 1.0),
    # StockIdFeaturesPreProcessor(),
    # DropTargetNADataPreprocessor(),
    # RemoveIrrelevantFeaturesDataPreprocessor(['stock_id', 'date_id','time_id', 'row_id']),
    # FillNaPreProcessor(),
    # PolynomialFeaturesPreProcessor(),
    StockDateIdPreprocessor(),
]
processor = CompositeDataPreprocessor(processors)

In [18]:
df = processor.apply(df)

CompositeDataPreprocessor - original df shape: (5237980, 16)
Processing RemoveRecordsByStockDateIdPreprocessor...
RemoveRecordsByStockDateIdPreprocessor - removing 220 records
RemoveRecordsByStockDateIdPreprocessor took 0.46s. New df shape: (5237760, 16).
Processing FarNearPriceFillNaPreprocessor...
FarNearPriceFillNaPreprocessor took 0.06s. New df shape: (5237760, 16).
Processing BasicFeaturesPreprocessor...
BasicFeaturesPreprocessor took 0.68s. New df shape: (5237760, 20).
Processing StockDateIdPreprocessor...
StockDateIdPreprocessor took 0.04s. New df shape: (5237760, 21).
CompositeDataPreprocessor - final df shape: (5237760, 21)


In [19]:
print(df.columns)
display(df)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'bid_ask_rr', 'shortage_s2',
       'pressure', 'shortage_s1', 'stock_date_id'],
      dtype='object')


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_price,ask_size,wap,target,time_id,bid_ask_rr,shortage_s2,pressure,shortage_s1,stock_date_id
0,0,0.0,0.0,3.180603e+06,1.0,0.999812,13380277.00,1.000000,1.000000,0.999812,...,1.000026,8493.030273,1.000000,-3.029704,0,0.754340,-0.615890,374.495636,0.237708,0.0
1,1,0.0,0.0,1.666039e+05,-1.0,0.999896,1642214.25,1.000000,1.000000,0.999896,...,1.000660,20605.089844,1.000000,-5.519986,0,-0.728751,-0.815787,51.531654,0.101451,1.0
2,2,0.0,0.0,3.028799e+05,-1.0,0.999561,1819368.00,1.000000,1.000000,0.999403,...,1.000298,18995.000000,1.000000,-8.389950,0,0.332935,-0.714567,7.979763,0.166475,2.0
3,3,0.0,0.0,1.191768e+07,-1.0,1.000171,18389746.00,1.000000,1.000000,0.999999,...,1.000214,479032.406250,1.000000,-4.010201,0,-0.990340,-0.213547,5126.105469,0.648061,3.0
4,4,0.0,0.0,4.475500e+05,-1.0,0.999532,17860614.00,1.000000,1.000000,0.999394,...,1.000016,434.100006,1.000000,-7.349849,0,0.948687,-0.951109,27.148033,0.025058,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480.0,540.0,2.440723e+06,-1.0,1.000317,28280362.00,0.999734,0.999734,1.000317,...,1.000434,319862.406250,1.000328,2.310276,26454,-0.816784,-0.841104,75.664818,0.086305,480195.0
5237976,196,480.0,540.0,3.495105e+05,-1.0,1.000643,9187699.00,1.000129,1.000386,1.000643,...,1.000900,93393.070312,1.000819,-8.220077,26454,0.374254,-0.926706,1.704028,0.038041,480196.0
5237977,197,480.0,540.0,0.000000e+00,0.0,0.995789,12725436.00,0.995789,0.995789,0.995789,...,0.995883,180038.312500,0.995797,1.169443,26454,-0.829388,-1.000000,0.000000,0.000000,480197.0
5237978,198,480.0,540.0,1.000899e+06,1.0,0.999210,94773272.00,0.999210,0.999210,0.998970,...,0.999210,669893.000000,0.999008,-1.540184,26454,-0.684154,-0.979099,1.494117,0.010561,480198.0


In [20]:
not_include_in_feat = ['stock_id', 'target', 'time_id']

In [21]:
feat_dynamic_real = [item for item in df.columns if item not in not_include_in_feat]
feat_dynamic_real

['date_id',
 'seconds_in_bucket',
 'imbalance_size',
 'imbalance_buy_sell_flag',
 'reference_price',
 'matched_size',
 'far_price',
 'near_price',
 'bid_price',
 'bid_size',
 'ask_price',
 'ask_size',
 'wap',
 'bid_ask_rr',
 'shortage_s2',
 'pressure',
 'shortage_s1',
 'stock_date_id']

In [22]:
# feat_dynamic_real = [
#     "date_id",
#     "seconds_in_bucket",
#     "imbalance_size",
#     "reference_price",
#     "matched_size",
#     "far_price",
#     "near_price",
#     "bid_price",
#     "bid_size",
#     "ask_price",
#     "ask_size",
#     "wap",
#     # "wap_mov_avg_3_1",
#     # "wap_mov_avg_6_3",
#     # "wap_mov_avg_12_6",
#     # "wap_mov_avg_24_12",
# ]
num_input_features = len(feat_dynamic_real)
num_classes = 200
print(num_input_features, num_classes)

18 200


In [23]:
# should not have any na features
any_na_values_mask = df[feat_dynamic_real].isna().any(axis=1)
print(any_na_values_mask.shape, any_na_values_mask[any_na_values_mask].shape)
assert any_na_values_mask[any_na_values_mask].shape[0] == 0

(5237760,) (0,)


In [24]:
df[any_na_values_mask]

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,ask_price,ask_size,wap,target,time_id,bid_ask_rr,shortage_s2,pressure,shortage_s1,stock_date_id


In [25]:
df = df.set_index(["stock_id", "time_id"])
stock_id_list = np.arange(200, dtype=np.int64)
time_id_list = np.arange(26455, dtype=np.int64)
pd_multiindex = pd.MultiIndex.from_product([stock_id_list, time_id_list], names=("stock_id", "time_id"))
df = df.reindex(pd_multiindex, fill_value=0.0)
df = df.reset_index()
display(df)

Unnamed: 0,stock_id,time_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,...,bid_size,ask_price,ask_size,wap,target,bid_ask_rr,shortage_s2,pressure,shortage_s1,stock_date_id
0,0,0,0.0,0.0,3180602.750,1.0,0.999812,13380277.0,1.000000,1.000000,...,60651.500000,1.000026,8493.030273,1.000000,-3.029704,0.754340,-0.615890,374.495636,0.237708,0.0
1,0,1,0.0,10.0,1299772.750,1.0,1.000026,15261107.0,1.000000,1.000000,...,13996.500000,1.000026,23519.160156,0.999892,0.389814,-0.253832,-0.843031,55.264420,0.085169,0.0
2,0,2,0.0,20.0,1299772.750,1.0,0.999919,15261107.0,1.000000,1.000000,...,4665.500000,0.999919,12131.599609,0.999842,4.220009,-0.444487,-0.843031,107.139435,0.085169,0.0
3,0,3,0.0,30.0,1299772.750,1.0,1.000133,15261107.0,1.000000,1.000000,...,55998.000000,1.000133,46203.300781,1.000085,5.450249,0.095837,-0.843031,28.131599,0.085169,0.0
4,0,4,0.0,40.0,1218204.375,1.0,1.000455,15342675.0,1.000000,1.000000,...,14655.950195,1.000455,26610.449219,1.000317,3.169775,-0.289691,-0.852882,45.779175,0.079400,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5290995,199,26450,480.0,500.0,2661783.500,-1.0,1.002764,23969216.0,1.000859,1.001494,...,122246.179688,1.003082,425296.156250,1.002835,-7.209778,-0.553473,-0.800099,21.773960,0.111050,480199.0
5290996,199,26451,480.0,510.0,2658917.500,-1.0,1.002764,23972082.0,1.000859,1.001494,...,677012.062500,1.003082,347268.875000,1.002974,-9.750128,0.321927,-0.800314,3.927430,0.110917,480199.0
5290997,199,26452,480.0,520.0,1352558.875,-1.0,1.001812,23978380.0,1.000859,1.001494,...,225361.656250,1.002129,194630.515625,1.001982,3.629923,0.073171,-0.893209,6.001726,0.056407,480199.0
5290998,199,26453,480.0,530.0,1352558.875,-1.0,1.001812,23978380.0,1.000859,1.001494,...,285559.062500,1.002129,214513.312500,1.001993,4.760027,0.142071,-0.893209,4.736529,0.056407,480199.0


## Group by stock_id

TODO: group by date_id too? or use embedding for date_id?

In [26]:
# df_grouped = df.groupby("date_id")
# num_classes = len(df_grouped)
# print(num_classes)
# print(df_grouped.size())

### Separate df_grouped into training and validation set by time-series k-fold index

In [27]:
# df_grouped_train_map = {}
# df_grouped_val_map = {}
# df_grouped_train_map_sizes_sum = 0
# df_grouped_val_map_sizes_sum = 0

# for item_id, gdf in df_grouped:
#     training_k_fold_idx_filter_mask = gdf.index.isin(k_fold_train_df_index)

#     training_df = gdf[training_k_fold_idx_filter_mask]
#     df_grouped_train_map[item_id] = training_df
#     df_grouped_train_map_sizes_sum += training_df.shape[0]

#     validation_df = gdf[~training_k_fold_idx_filter_mask]
#     df_grouped_val_map[item_id] = validation_df
#     df_grouped_val_map_sizes_sum += validation_df.shape[0]

#     assert training_df.shape[0] > 0 \
#         and validation_df.shape[0] > 0 \
#         and training_df.shape[0] + validation_df.shape[0] == gdf.shape[0], f"{item_id} invalid shape, training_df: {training_df.shape}, validation_df: {validation_df.shape}"

# print(df_grouped_train_map_sizes_sum, df_grouped_val_map_sizes_sum, k_fold_train_df_index.shape, k_fold_val_df_index.shape)
# print(df_grouped_train_map_sizes_sum + df_grouped_val_map_sizes_sum - k_fold_train_df_index.shape[0] - k_fold_val_df_index.shape[0])

## Stock-based feature engineering

In [28]:
# normalize_columns = set([
#     "imbalance_size",
#     "matched_size",
#     "bid_size",
#     "ask_size",
# ])
# normalize_columns = list(normalize_columns.intersection(set(feat_dynamic_real)))
# print(normalize_columns)

In [29]:
# stock_processors = [
#     StockNormalizeFeaturesPreprocessor(normalize_columns),
# ]
# stock_processor = CompositeGroupedDataPreprocessor(stock_processors)

In [30]:
# stock_processor.fit(df_grouped_train_map)

In [31]:
# df_grouped_train_map = stock_processor.apply(df_grouped_train_map)
# df_grouped_val_map = stock_processor.apply(df_grouped_val_map)

In [32]:
# display(df_grouped_train_map[0])
# display(df_grouped_val_map[0])

## Final features

In [33]:
# update feat_dynamic_real if needed after per-stock feature engineering
feat_dynamic_real = feat_dynamic_real

## Prepare Pytorch datasets

In [34]:
class OptiverDataset(torch.utils.data.Dataset):
    def __init__(self, df, feature_names, category_cols, target_col, time_idx_col, num_time_pts, time_idx_offset):
        super().__init__()
        self.df = df
        self.features = df[feature_names]
        self.categories = df[category_cols]
        self.targets = df[target_col]
        self.time_idx = df[time_idx_col]
        self.num_time_pts = num_time_pts
        self.time_idx_offset = time_idx_offset

    def __len__(self):
        return self.num_time_pts

    def __getitem__(self, idx):
        time_idx_mask = self.time_idx == self.time_idx_offset + idx
        features = self.features[time_idx_mask]
        categories = self.categories[time_idx_mask]
        targets = self.targets[time_idx_mask]
        return features.values, categories.values, targets.values

In [35]:
# stock_training_datasets = []
# for item_id, gdf in df_grouped_train_map.items():
#     stock_training_datasets.append(StockTrainingDataset(gdf, feat_dynamic_real, "target", item_id, seq_len))

# stock_validation_datasets = []
# for item_id, gdf in df_grouped_val_map.items():
#     stock_validation_datasets.append(StockTrainingDataset(gdf, feat_dynamic_real, "target", item_id, seq_len))

# print(len(stock_training_datasets), len(stock_validation_datasets))

In [36]:
# full_training_dataset = torch.utils.data.ConcatDataset(stock_training_datasets)
# full_validation_dataset = torch.utils.data.ConcatDataset(stock_validation_datasets)
# print(len(full_training_dataset), len(full_validation_dataset))

In [37]:
val_cutoff_time_id = math.floor(26455 * 0.9)
df_train = df[df["time_id"] < val_cutoff_time_id]
df_eval = df[df["time_id"] >= val_cutoff_time_id]
print(df_train.shape, df_eval.shape, val_cutoff_time_id)

(4761800, 21) (529200, 21) 23809


### DFS, normalize, and other fit-transform pipelines

In [38]:
transform_pipeline = make_pipeline(
    FeatureToolsDFSTransformer(
        group_by_stock=True,
        group_by_date=False,
        group_by_stock_date=False,
    ),
    NormalizationDataTransformer(
        [
            "imbalance_size",
            "matched_size",
            "bid_size",
            "ask_size",
        ],
        "closing_movements",
    ),
    # do not remove columns, use feat_dynamic_real white-list
    # RemoveIrrelevantFeaturesDataTransformer(['stock_id', 'date_id','time_id', 'row_id', "stock_date_id"]),
    verbose=True,
)
# transform_pipeline = None

In [39]:
print(f"before transform_pipeline - df_train: {get_df_summary_str(df_train)}, df_eval: {get_df_summary_str(df_eval)}")
if transform_pipeline is not None:
    df_train = transform_pipeline.fit_transform(df_train)
    print(f"fit_transform df_train - df_train: {get_df_summary_str(df_train)}")
    df_eval = transform_pipeline.transform(df_eval)
    print(f"transform df_eval - df_eval: {get_df_summary_str(df_eval)}")

before transform_pipeline - df_train: shape: (4761800, 21), mem: 472.28546142578125 MB, df_eval: shape: (529200, 21), mem: 52.4871826171875 MB




FeatureToolsDFSTransformer - normalize_dataframe - start
FeatureToolsDFSTransformer - group_by_stock - normalize_dataframe
FeatureToolsDFSTransformer - normalize_dataframe - end
FeatureToolsDFSTransformer - es: Entityset: closing_movements_data
  DataFrames:
    closing_movements [Rows: 4761800, Columns: 22]
    stocks [Rows: 200, Columns: 2]
  Relationships:
    closing_movements.stock_id -> stocks.stock_id
FeatureToolsDFSTransformer - es closing_movements schema:                         Logical Type              Semantic Tag(s)
Column                                                           
row_id                       Integer                    ['index']
stock_id                 Categorical  ['category', 'foreign_key']
time_id                      Integer    ['time_index', 'numeric']
date_id                  Categorical                 ['category']
seconds_in_bucket             Double                  ['numeric']
imbalance_size                Double                  ['numeric']
im

In [40]:
display(df_train)
display(df_eval)

Unnamed: 0,row_id,stock_id,time_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,...,SKEW(closing_movements.shortage_s1),SKEW(closing_movements.shortage_s2),STD(closing_movements.ask_price),STD(closing_movements.ask_size),STD(closing_movements.bid_ask_rr),STD(closing_movements.bid_size),STD(closing_movements.pressure),STD(closing_movements.seconds_in_bucket),STD(closing_movements.shortage_s1),STD(closing_movements.shortage_s2)
0,0,0,0,0.0,0.0,-0.118290,1.0,0.999812,-0.225990,1.000000,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
1,23809,1,0,0.0,0.0,-0.271574,-1.0,0.999896,-0.313429,1.000000,...,-0.415788,0.715967,-0.214267,-0.585612,0.880545,-0.677303,-0.852820,-0.181670,-0.302637,-0.038810
2,47618,2,0,0.0,0.0,-0.264643,-1.0,0.999561,-0.312109,1.000000,...,0.430402,0.211268,-0.216366,-0.554285,1.004648,-0.663667,-0.729002,-0.181670,0.040105,0.407337
3,71427,3,0,0.0,0.0,0.326054,-1.0,1.000171,-0.188674,1.000000,...,-0.630114,0.397449,-0.231076,-0.413335,0.307877,-0.444531,0.370229,-0.181670,-0.822291,-1.222011
4,95236,4,0,0.0,0.0,-0.257286,-1.0,0.999532,-0.192615,1.000000,...,-0.878069,-0.097522,-0.225445,-0.528982,0.463505,-0.496452,-0.214835,-0.181670,-0.640601,-0.549764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4761795,4666563,195,23808,432.0,480.0,-0.267436,1.0,1.001092,0.101067,1.001729,...,-0.059426,0.304206,-0.229943,-0.392253,-0.468020,-0.433651,-0.012014,-0.181670,-0.445850,-0.479921
4761796,4690372,196,23808,432.0,480.0,-0.267754,-1.0,1.000865,-0.281080,1.000624,...,-0.141509,0.448341,-0.227846,-0.473906,-0.879986,-0.613923,-0.423832,-0.181670,0.243293,0.534821
4761797,4714181,197,23808,432.0,480.0,-0.208348,1.0,0.999787,-0.244587,1.001675,...,0.840622,0.751667,-0.219607,-0.546332,0.473733,-0.610200,0.131537,-0.181670,1.388817,0.922801
4761798,4737990,198,23808,432.0,480.0,-0.280047,0.0,0.999857,0.059750,0.999857,...,-1.071903,0.072023,-0.229440,1.128630,-1.545880,1.602468,-0.573867,-0.181670,-0.900137,-1.315216


Unnamed: 0,stock_id,time_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,...,SKEW(closing_movements.shortage_s1),SKEW(closing_movements.shortage_s2),STD(closing_movements.ask_price),STD(closing_movements.ask_size),STD(closing_movements.bid_ask_rr),STD(closing_movements.bid_size),STD(closing_movements.pressure),STD(closing_movements.seconds_in_bucket),STD(closing_movements.shortage_s1),STD(closing_movements.shortage_s2)
0,0,23809,432.0,490.0,-0.170633,-1.0,1.000234,-0.151898,0.999489,0.999675,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
1,0,23810,432.0,500.0,-0.171910,-1.0,1.000234,-0.151711,0.999489,0.999675,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
2,0,23811,432.0,510.0,-0.172728,-1.0,1.000234,-0.151591,0.999489,0.999861,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
3,0,23812,432.0,520.0,-0.172728,-1.0,1.000234,-0.151591,0.999489,0.999861,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
4,0,23813,432.0,530.0,-0.208531,-1.0,1.000048,-0.146805,0.999489,0.999675,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529195,199,26450,480.0,500.0,-0.144676,-1.0,1.002764,-0.147112,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194
529196,199,26451,480.0,510.0,-0.144821,-1.0,1.002764,-0.147090,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194
529197,199,26452,480.0,520.0,-0.211259,-1.0,1.001812,-0.147043,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194
529198,199,26453,480.0,530.0,-0.211259,-1.0,1.001812,-0.147043,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194


In [41]:
# temp solution to add dfs features to feat_dynamic_real, dfs feature columns will contain "closing_movements"
for col in df_train.columns.tolist():
    if col.find("closing_movements") != -1:
        feat_dynamic_real.append(col)
num_input_features = len(feat_dynamic_real)
print(num_input_features, feat_dynamic_real)

60 ['date_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap', 'bid_ask_rr', 'shortage_s2', 'pressure', 'shortage_s1', 'stock_date_id', 'MAX(closing_movements.ask_price)', 'MAX(closing_movements.ask_size)', 'MAX(closing_movements.bid_ask_rr)', 'MAX(closing_movements.bid_size)', 'MAX(closing_movements.far_price)', 'MAX(closing_movements.imbalance_size)', 'MAX(closing_movements.matched_size)', 'MAX(closing_movements.near_price)', 'MAX(closing_movements.pressure)', 'MAX(closing_movements.shortage_s1)', 'MAX(closing_movements.shortage_s2)', 'MEAN(closing_movements.ask_price)', 'MEAN(closing_movements.ask_size)', 'MEAN(closing_movements.bid_ask_rr)', 'MEAN(closing_movements.imbalance_size)', 'MEAN(closing_movements.pressure)', 'MEAN(closing_movements.shortage_s1)', 'MEAN(closing_movements.shortage_s2)', 'MIN(closing_movements.ask_price)', 'MIN(closing_move

In [42]:
# temp solution to force cast data types for pytorch model
df_train = df_train.astype(np.float32)
df_eval = df_eval.astype(np.float32)
df_train = df_train.astype({
    "stock_id": np.int64,
    "time_id": np.int64
})
df_eval = df_eval.astype({
    "stock_id": np.int64,
    "time_id": np.int64
})

In [43]:
display(df_train)
display(df_eval)

Unnamed: 0,row_id,stock_id,time_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,...,SKEW(closing_movements.shortage_s1),SKEW(closing_movements.shortage_s2),STD(closing_movements.ask_price),STD(closing_movements.ask_size),STD(closing_movements.bid_ask_rr),STD(closing_movements.bid_size),STD(closing_movements.pressure),STD(closing_movements.seconds_in_bucket),STD(closing_movements.shortage_s1),STD(closing_movements.shortage_s2)
0,0.0,0,0,0.0,0.0,-0.118290,1.0,0.999812,-0.225990,1.000000,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
1,23809.0,1,0,0.0,0.0,-0.271574,-1.0,0.999896,-0.313429,1.000000,...,-0.415788,0.715967,-0.214267,-0.585612,0.880545,-0.677303,-0.852820,-0.181670,-0.302637,-0.038810
2,47618.0,2,0,0.0,0.0,-0.264643,-1.0,0.999561,-0.312109,1.000000,...,0.430402,0.211268,-0.216366,-0.554285,1.004648,-0.663667,-0.729002,-0.181670,0.040105,0.407337
3,71427.0,3,0,0.0,0.0,0.326054,-1.0,1.000171,-0.188674,1.000000,...,-0.630114,0.397449,-0.231076,-0.413335,0.307877,-0.444531,0.370229,-0.181670,-0.822291,-1.222011
4,95236.0,4,0,0.0,0.0,-0.257286,-1.0,0.999532,-0.192615,1.000000,...,-0.878069,-0.097522,-0.225445,-0.528982,0.463505,-0.496452,-0.214835,-0.181670,-0.640601,-0.549764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4761795,4666563.0,195,23808,432.0,480.0,-0.267436,1.0,1.001092,0.101067,1.001729,...,-0.059426,0.304206,-0.229943,-0.392253,-0.468020,-0.433651,-0.012014,-0.181670,-0.445850,-0.479921
4761796,4690372.0,196,23808,432.0,480.0,-0.267754,-1.0,1.000865,-0.281080,1.000624,...,-0.141509,0.448341,-0.227846,-0.473906,-0.879986,-0.613923,-0.423832,-0.181670,0.243293,0.534821
4761797,4714181.0,197,23808,432.0,480.0,-0.208348,1.0,0.999787,-0.244587,1.001675,...,0.840622,0.751667,-0.219607,-0.546332,0.473733,-0.610200,0.131537,-0.181670,1.388817,0.922801
4761798,4737990.0,198,23808,432.0,480.0,-0.280047,0.0,0.999857,0.059750,0.999857,...,-1.071903,0.072023,-0.229440,1.128630,-1.545880,1.602468,-0.573867,-0.181670,-0.900137,-1.315216


Unnamed: 0,stock_id,time_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,...,SKEW(closing_movements.shortage_s1),SKEW(closing_movements.shortage_s2),STD(closing_movements.ask_price),STD(closing_movements.ask_size),STD(closing_movements.bid_ask_rr),STD(closing_movements.bid_size),STD(closing_movements.pressure),STD(closing_movements.seconds_in_bucket),STD(closing_movements.shortage_s1),STD(closing_movements.shortage_s2)
0,0,23809,432.0,490.0,-0.170633,-1.0,1.000234,-0.151898,0.999489,0.999675,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
1,0,23810,432.0,500.0,-0.171910,-1.0,1.000234,-0.151711,0.999489,0.999675,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
2,0,23811,432.0,510.0,-0.172728,-1.0,1.000234,-0.151591,0.999489,0.999861,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
3,0,23812,432.0,520.0,-0.172728,-1.0,1.000234,-0.151591,0.999489,0.999861,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
4,0,23813,432.0,530.0,-0.208531,-1.0,1.000048,-0.146805,0.999489,0.999675,...,-0.499333,0.000995,-0.227967,-0.470616,-0.303347,-0.057324,-0.118523,-0.181670,-0.606148,-0.621215
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529195,199,26450,480.0,500.0,-0.144676,-1.0,1.002764,-0.147112,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194
529196,199,26451,480.0,510.0,-0.144821,-1.0,1.002764,-0.147090,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194
529197,199,26452,480.0,520.0,-0.211259,-1.0,1.001812,-0.147043,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194
529198,199,26453,480.0,530.0,-0.211259,-1.0,1.001812,-0.147043,1.000859,1.001494,...,-0.173776,-2.390451,4.894075,0.024029,-2.036538,0.488040,-0.782829,6.110255,-0.617465,1.202194


### Create Pytorch datasets from final dfs

In [44]:
full_training_dataset = OptiverDataset(df_train, feat_dynamic_real, "stock_id", "target", "time_id", val_cutoff_time_id, 0)
full_validation_dataset = OptiverDataset(df_eval, feat_dynamic_real, "stock_id", "target", "time_id", 26455 - val_cutoff_time_id, val_cutoff_time_id)
print(len(full_training_dataset), len(full_validation_dataset))

23809 2646


In [45]:
training_sampler = torch.utils.data.RandomSampler(full_training_dataset)
validation_sampler = torch.utils.data.SequentialSampler(full_validation_dataset)

In [46]:
training_dataloader = torch.utils.data.DataLoader(
    full_training_dataset,
    batch_size=training_batch_size,
    sampler=training_sampler,
    # https://pytorch.org/docs/stable/data.html#single-and-multi-process-data-loading
    # num_workers=4,
    # https://pytorch.org/docs/stable/data.html#memory-pinning
    # pin_memory=True,
)
validation_dataloader = torch.utils.data.DataLoader(
    full_validation_dataset,
    batch_size=validation_batch_size,
    sampler=validation_sampler,
    # num_workers=4,
    # pin_memory=True,
)

In [47]:
training_sample_batch = next(iter(training_dataloader))
print("training_sample_batch", training_sample_batch[0].size(), training_sample_batch[1].size())
print("training_sample_batch", training_sample_batch[0].type(), training_sample_batch[1].type())
validation_sample_batch = next(iter(validation_dataloader))
print("validation_sample_batch", validation_sample_batch[0].size(), validation_sample_batch[1].size())
print("validation_sample_batch", validation_sample_batch[0].type(), validation_sample_batch[1].type())

training_sample_batch torch.Size([256, 200, 60]) torch.Size([256, 200])
training_sample_batch torch.FloatTensor torch.LongTensor
validation_sample_batch torch.Size([256, 200, 60]) torch.Size([256, 200])
validation_sample_batch torch.FloatTensor torch.LongTensor


## Create model

In [48]:
model = TransformerModel(
    num_input_features=num_input_features,
    num_classes=num_classes,
    embedding_dim=embedding_dim,
    d_model=d_model,
    nhead=nhead,
    d_hid=d_hid,
    nlayers=nlayers,
    dropout=dropout,
)
model.apply(init_weights)

TransformerModel(
  (embedding): Embedding(200, 4)
  (input_ff): Linear(in_features=64, out_features=32, bias=True)
  (input_ff_sigmoid): Sigmoid()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=32, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=32, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (final_linear): Linear(in_features=32, out_features=1, bias=True)
)

In [49]:
criterion = nn.L1Loss()
validation_criterion = nn.L1Loss()

## Tensorboard

Start with `tensorboard --logdir=lightning_logs/` cmd.

In [50]:
# %reload_ext tensorboard
# %tensorboard --logdir=lightning_logs/

## Module and trainer (lightning)

In [51]:
class TransformerModelModule(L.LightningModule):
    def __init__(
        self,
        model: nn.Module,
        criterion: nn.Module,
        validation_criterion: nn.Module,
        lr,
    ):
        super().__init__()
        self.save_hyperparameters()
        self.model = model
        self.criterion = criterion
        self.validation_criterion = validation_criterion
        self.lr = lr
        self.validation_step_outputs = []
        self.validation_step_actual_targets = []

    def training_step(self, batch, batch_idx):
        features, item_id, targets = batch[0], batch[1], batch[2]
        actual_targets = targets
        output = self.model(features, item_id)
        loss = self.criterion(output, actual_targets)
        self.log("train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        features, item_id, targets = batch[0], batch[1], batch[2]
        actual_targets = targets
        output = self.model(features, item_id)
        # TODO: is loss being averaged based on batch size
        loss = self.validation_criterion(output, actual_targets)
        # lightning will take weighted-average on loss per step based on batch size
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        self.validation_step_outputs.append(output)
        self.validation_step_actual_targets.append(actual_targets)

    def on_validation_epoch_end(self):
        # TODO: remove manual calculation of validation loss if we can confirm lightning will take weighted average
        # cat is used instead of stack, last step may have different batch size
        all_preds = torch.cat(self.validation_step_outputs)
        all_actual_targets = torch.cat(self.validation_step_actual_targets)
        manual_loss = self.validation_criterion(all_preds, all_actual_targets)
        self.log("val_loss_manual", manual_loss, on_step=False, on_epoch=True, prog_bar=True)
        self.validation_step_outputs.clear()  # free memory
        self.validation_step_actual_targets.clear()  # free memory

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=self.lr,
            betas=(0.9, 0.95),
            weight_decay=1e-1,
        )
        return [optimizer]

In [52]:
model = TransformerModelModule(
    model,
    criterion,
    validation_criterion,
    learning_rate,
)

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'criterion' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['criterion'])`.
/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'validation_criterion' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['validation_criterion'])`.


In [53]:
checkpoint_callback = ModelCheckpoint(
    dirpath=checkpoint_dir,
    save_top_k=2,
    monitor="val_loss",
    mode="min",
    filename="transformer-{epoch:02d}-{val_loss:.4f}"
)

In [54]:
trainer = L.Trainer(
    max_epochs=num_epochs,
    limit_train_batches=limit_train_batches,
    # https://lightning.ai/docs/pytorch/stable/advanced/training_tricks.html#gradient-clipping
    gradient_clip_val=gradient_clip_val,
    callbacks=[
        # https://lightning.ai/docs/pytorch/stable/common/progress_bar.html#richprogressbar
        RichProgressBar(leave=True),
        checkpoint_callback,
    ],
    logger=tb_logger,
    # https://lightning.ai/docs/pytorch/stable/common/trainer.html#reproducibility
    deterministic=True,
)
print(trainer.callbacks)

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/si ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


[<lightning.pytorch.callbacks.progress.rich_progress.RichProgressBar object at 0x7fb373972b90>, <lightning.pytorch.callbacks.rich_model_summary.RichModelSummary object at 0x7fb387df08d0>, <lightning.pytorch.callbacks.model_checkpoint.ModelCheckpoint object at 0x7fb373995310>]


In [55]:
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/si ...
2024-04-28 18:29:52.146645: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


[{'val_loss': 6.0114006996154785, 'val_loss_manual': 6.011401176452637}]

## Training

In [56]:
print(checkpoint_dir)
trainer.fit(
    model=model,
    train_dataloaders=training_dataloader,
    val_dataloaders=validation_dataloader,
)

/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python /userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/si ...
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


transformer_encoder_by_time_checkpoints/20240426_transformer_stockid


Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

Output()

`Trainer.fit` stopped: `max_epochs=10` reached.


In [57]:
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'val_loss': 5.9592390060424805, 'val_loss_manual': 5.959239482879639}]

In [58]:
model

TransformerModelModule(
  (model): TransformerModel(
    (embedding): Embedding(200, 4)
    (input_ff): Linear(in_features=64, out_features=32, bias=True)
    (input_ff_sigmoid): Sigmoid()
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
          )
          (linear1): Linear(in_features=32, out_features=32, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=32, out_features=32, bias=True)
          (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (final_linear): Linear(in_features=32, out_features=1, bias=True)


## Load saved checkpoint

In [59]:
checkpoint_callback.best_model_path

'/userhome/cs2/tsangsyf/optiver2023/florence/transformer_encoder_by_time_checkpoints/20240426_transformer_stockid/transformer-epoch=08-val_loss=5.9584.ckpt'

In [60]:
checkpoint_path = checkpoint_callback.best_model_path
print(checkpoint_path)

/userhome/cs2/tsangsyf/optiver2023/florence/transformer_encoder_by_time_checkpoints/20240426_transformer_stockid/transformer-epoch=08-val_loss=5.9584.ckpt


In [61]:
saved_model = TransformerModelModule.load_from_checkpoint(checkpoint_path)
print(saved_model)

TransformerModelModule(
  (model): TransformerModel(
    (embedding): Embedding(200, 4)
    (input_ff): Linear(in_features=64, out_features=32, bias=True)
    (input_ff_sigmoid): Sigmoid()
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-1): 2 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
          )
          (linear1): Linear(in_features=32, out_features=32, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=32, out_features=32, bias=True)
          (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (final_linear): Linear(in_features=32, out_features=1, bias=True)


/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'criterion' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['criterion'])`.
/userhome/cs2/tsangsyf/anaconda3_2/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'validation_criterion' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['validation_criterion'])`.


In [62]:
torch_checkpoint = torch.load(checkpoint_path)
print(torch_checkpoint.keys())
print(torch_checkpoint["state_dict"].keys())
print(torch_checkpoint["hyper_parameters"])

dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers', 'hparams_name', 'hyper_parameters'])
odict_keys(['model.embedding.weight', 'model.input_ff.weight', 'model.input_ff.bias', 'model.transformer_encoder.layers.0.self_attn.in_proj_weight', 'model.transformer_encoder.layers.0.self_attn.in_proj_bias', 'model.transformer_encoder.layers.0.self_attn.out_proj.weight', 'model.transformer_encoder.layers.0.self_attn.out_proj.bias', 'model.transformer_encoder.layers.0.linear1.weight', 'model.transformer_encoder.layers.0.linear1.bias', 'model.transformer_encoder.layers.0.linear2.weight', 'model.transformer_encoder.layers.0.linear2.bias', 'model.transformer_encoder.layers.0.norm1.weight', 'model.transformer_encoder.layers.0.norm1.bias', 'model.transformer_encoder.layers.0.norm2.weight', 'model.transformer_encoder.layers.0.norm2.bias', 'model.transformer_encoder.layers.1.self_attn.in_proj_weight', 'model.transformer_

In [63]:
saved_model.eval()
trainer.validate(
    model=model,
    dataloaders=validation_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'val_loss': 5.9592390060424805, 'val_loss_manual': 5.959239482879639}]