```
Prediction task is (using the output of this method):
    For stock 0 day 0 (i.e. dim 0 = index 0),
    Input = Window size of 27 features (matrix of 10 * 27)
    Output = Vector of window size (vector of size 10, representing target of window_start_idx --> window_start_idx + 10 - 1)
    Target 0 --> only use window 0 - 0
    Target 1 --> only use window 0 - 1
    Target 9 --> use full window 0 - 9
    This is achieved by using a squared source additive mask of the following (e.g. window = 3)
    [0, -inf, -inf]
    [0, 0,    -inf]
    [0, 0,    0   ]
```

In [1]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import math
import os
import time
from tempfile import TemporaryDirectory

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn

import joblib 

from load_data import load_data_from_csv

In [4]:
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, TimeSeriesDataPreprocessor
from data_preprocessor.feature_engineering import EnrichDFDataPreprocessor, MovingAvgPreProcessor, RemoveIrrelevantFeaturesDataPreprocessor, AddStockDateIdxDataPreprocessor

In [5]:
from transformer.transformer_model import TransformerModel
from transformer.transformer_pipeline import TransformerPipeline

In [6]:
from utils.ml_utils import NoopModelLogger, BasicModelLogger

In [7]:
np.random.seed(42)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


## Data preprocessing pipeline

In [9]:
processors = [
    EnrichDFDataPreprocessor(),
    MovingAvgPreProcessor("wap"),
    RemoveIrrelevantFeaturesDataPreprocessor([
        'row_id'
    ]),
    AddStockDateIdxDataPreprocessor()
]
processor = CompositeDataPreprocessor(processors)

### Load data

In [10]:
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
df_train_raw = df_train
print(df_train.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


### Pre-process data

In [11]:
df_train = processor.apply(df_train)
print(df_train.shape)
print(df_train.columns)
print(df_train.dtypes)
display(df_train.tail())

(5237980, 29)
Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'imb_s1', 'imb_s2',
       'reference_price_ask_price_bid_price_imb2',
       'reference_price_ask_price_wap_imb2',
       'reference_price_bid_price_wap_imb2', 'ask_price_bid_price_wap_imb2',
       'pressure', 'inefficiency', 'wap_mov_avg_3_1', 'wap_mov_avg_6_3',
       'wap_mov_avg_12_6', 'wap_mov_avg_24_12', 'index_col_id'],
      dtype='object')
stock_id                                      int64
date_id                                       int64
seconds_in_bucket                             int64
imbalance_size                              float64
imbalance_buy_sell_flag                       int64
reference_price                             float64
matched_size                                float64
far_price   

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,reference_price_ask_price_wap_imb2,reference_price_bid_price_wap_imb2,ask_price_bid_price_wap_imb2,pressure,inefficiency,wap_mov_avg_3_1,wap_mov_avg_6_3,wap_mov_avg_12_6,wap_mov_avg_24_12,index_col_id
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,...,9.636364,,9.636364,75.664813,0.086305,1.000345,1.000305,1.000318,1.000202,480195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,...,0.4602273,792633500000.0,0.460227,1.704028,0.038041,1.000816,1.00071,1.00056,1.000506,480196
5237977,197,480,540,0.0,0,0.995789,12725436.1,0.995789,0.995789,0.995789,...,10.75,,10.75,0.0,0.0,0.995958,0.99607,0.996131,0.996436,480197
5237978,198,480,540,1000898.84,1,0.99921,94773271.05,0.99921,0.99921,0.99897,...,-1.099231e-12,5.315789,5.315789,1.494117,0.010561,0.999116,0.999217,0.999305,0.999313,480198
5237979,199,480,540,1884285.71,-1,1.002129,24073677.32,1.000859,1.001494,1.002129,...,1.193103,-653021900000.0,1.193103,7.534688,0.078272,1.002083,1.002337,1.002156,1.001945,480199


In [12]:
# temp nan handling
df_train = df_train.fillna(0.0)

In [13]:
time_series_data_preprocessor = TimeSeriesDataPreprocessor()
train_data_arr = time_series_data_preprocessor.apply(df_train)
print(train_data_arr.shape)

(95236, 55, 29)


In [14]:
time_series_irrelevant_features = ["index_col_id"]
time_series_irrelevant_features_col_idx = np.isin(df_train.columns, time_series_irrelevant_features)
train_data_arr = train_data_arr[:, :, ~time_series_irrelevant_features_col_idx]
print(train_data_arr.shape)

(95236, 55, 28)


In [15]:
target_col_idx = np.where(df_train.columns == "target")[0][0]
print(target_col_idx)

14


### Transformer

### Run transformer

In [187]:
batch_size = 20
eval_batch_size = 10
window_size = 55

input_features = train_data_arr.shape[2] - 1
d_model = 4  # attention dimension 32/64
d_hid = 4  # feed-forward layer dimension after attention layer 32/64
nlayers = 1  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention`` 4/8
dropout = 0.2  # dropout probability

epochs = 3
lr = 5.0  # learning rate

In [188]:
model = TransformerModel(input_features, d_model, nhead, d_hid, nlayers, dropout)

In [189]:
criterion = nn.L1Loss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [190]:
pipeline_logger = NoopModelLogger()
train_logger = NoopModelLogger()
eval_logger = NoopModelLogger()
test_logger = NoopModelLogger()
# test_logger = BasicModelLogger("transformer")

In [191]:
pipeline = TransformerPipeline(model, optimizer, criterion, pipeline_logger, train_logger, eval_logger, test_logger, device)

In [192]:
# first_n_batches_only = 1
first_n_batches_only = -1

In [193]:
with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch_idx in range(1, epochs + 1):
        epoch_start_time = time.time()

        train_loss = pipeline.train_transformer(train_data_arr, target_col_idx, batch_size, first_n_batches_only=first_n_batches_only)

        # TODO: change to validation set
        val_loss = pipeline.evaluate_transformer(train_data_arr, target_col_idx, batch_size)
        
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch_idx:3d} | time: {elapsed:5.2f}s | '
            f'train loss {train_loss:5.2f} | '
            f'valid loss {val_loss:5.2f}')
        print('-' * 89)

        # if val_loss < best_val_loss:
        #     best_val_loss = val_loss
        #     torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    # model.load_state_dict(torch.load(best_model_params_path)) # load best model states

-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 32.00s | train loss  6.41 | valid loss  6.43
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 31.81s | train loss  6.41 | valid loss  6.43
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 32.94s | train loss  6.41 | valid loss  6.44
-----------------------------------------------------------------------------------------


In [194]:
test_input = np.ones((1, 10, 27), dtype=np.float32)
test_output = pipeline.test_transformer(test_input)
print(test_output)

0.5715


In [97]:
test_input = np.ones((1, 10, 27), dtype=np.float32)
test_input = torch.from_numpy(test_input).to(device)
exp_output = np.ones((1, 10, 4), dtype=np.float32) * 0.1
exp_output = torch.from_numpy(exp_output).to(device)
print(test_input.shape)
print(test_input.device)
layer = nn.Linear(27, 4).to(device)
initrange = 0.1
layer.bias.data.zero_()
layer.weight.data.uniform_(-initrange, initrange)
criterion = nn.L1Loss()
optimizer = torch.optim.SGD(layer.parameters(), lr=0.05)
output = layer(test_input)
print(output.device)
output_arr = output.detach().cpu().numpy()
print(output_arr.shape)
print(output_arr)
loss = criterion(output, exp_output)
print(loss.item())
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(layer.parameters(), 0.5)
print(layer.bias)
print(layer.bias.grad)
optimizer.step()
print(layer.bias)
print(layer.bias.grad)

torch.Size([1, 10, 27])
cuda:0
cuda:0
(1, 10, 4)
[[[-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]
  [-0.08349273 -0.04848591 -0.44402292 -0.18492313]]]
0.29023119807243347
Parameter containing:
tensor([0., 0., 0., 0.], device='cuda:0', requires_grad=True)
tensor([-0.0472, -0.0472, -0.0472, -0.0472], device='cuda:0')
Parameter containing:
tensor([0.0024, 0.0024, 0.0024, 0.0024], device='cuda:0', requires_grad=True)
tensor([-0.0472, -0.0472, -0.0472, -0.0472], device='cuda:0')


In [25]:
test_input = np.ones((1, 10, 27), dtype=np.float32)
test_input = torch.from_numpy(test_input).to(device)
print(test_input.shape)
print(test_input.device)
from torch.nn import TransformerEncoder, TransformerEncoderLayer
encoder_layers = TransformerEncoderLayer(27, 1, 4, 0.1, batch_first=True)
transformer_encoder = TransformerEncoder(encoder_layers, 1).to(device)
output = transformer_encoder(test_input)
print(output.device)
output = output.detach().cpu().numpy()
print(output.shape)
print(output)

torch.Size([1, 10, 27])
cuda:0
cuda:0
(1, 10, 27)
[[[-1.0531951   0.26237547 -0.2205062  -0.09366804  0.8443889
   -0.40777266 -0.5517922   1.8366932  -0.6029663  -0.00811083
   -0.43278176  0.6012799  -0.05290572 -1.003386    1.643272
    0.6413467  -1.5264015  -0.8164899  -0.5777112  -0.7960871
    2.4972708  -0.5085025  -1.9703547   0.8720963   0.2825181
    0.98318124  0.15820837]
  [-1.0899979   0.48622718 -0.37410682  0.45900217  0.84799457
   -0.51746345 -0.8156284   1.7392398   0.18157476 -0.7832901
   -0.18742183  0.87744224 -0.39637813 -0.7242037   0.55665255
    0.9680532  -1.4000664  -0.5899497  -0.24402978 -1.1308447
    2.7416294  -1.0448083   0.23980479  1.1667427  -1.7359328
    0.688326    0.08143301]
  [-0.6490619   0.33597106 -0.21184339 -0.3084784   0.5654775
   -0.4097036  -0.6132796   1.7852457   0.37223202 -0.74656236
   -0.18406135  0.9443145  -0.41657957 -0.8498585   1.4588904
    0.9307838  -1.2867061  -1.1388993  -0.22840178 -0.2200712
    2.568572   -0.80192

