```
Prediction task is (using the output of this method):
    For stock 0 day 0 (i.e. dim 0 = index 0),
    Input = Window size of 27 features (matrix of 10 * 27)
    Output = Vector of window size (vector of size 10, representing target of window_start_idx --> window_start_idx + 10 - 1)
    Target 0 --> only use window 0 - 0
    Target 1 --> only use window 0 - 1
    Target 9 --> use full window 0 - 9
    This is achieved by using a squared source additive mask of the following (e.g. window = 3)
    [0, -inf, -inf]
    [0, 0,    -inf]
    [0, 0,    0   ]
```

In [1]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
import math
import os
import time
from tempfile import TemporaryDirectory

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn

import joblib 

from load_data import load_data_from_csv

In [3]:
from data_preprocessor.data_preprocessor import CompositeDataPreprocessor, TimeSeriesDataPreprocessor
from data_preprocessor.feature_engineering import EnrichDFDataPreprocessor, MovingAvgPreProcessor, RemoveIrrelevantFeaturesDataPreprocessor, AddStockDateIdxDataPreprocessor

In [4]:
from transformer.transformer_model import TransformerModel
from transformer.transformer_pipeline import TransformerPipeline

In [5]:
from utils.ml_utils import NoopModelLogger

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data preprocessing pipeline

In [7]:
processors = [
    EnrichDFDataPreprocessor(),
    MovingAvgPreProcessor("wap"),
    RemoveIrrelevantFeaturesDataPreprocessor([
        'row_id'
    ]),
    AddStockDateIdxDataPreprocessor()
]
processor = CompositeDataPreprocessor(processors)

### Load data

In [8]:
# DATA_PATH = '/kaggle/input'
DATA_PATH = '..'
df_train, df_test, revealed_targets, sample_submission = load_data_from_csv(DATA_PATH)
print(df_train.columns)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


### Pre-process data

In [9]:
df_train = processor.apply(df_train)
print(df_train.shape[0])
print(df_train.columns)
display(df_train.tail())

5237980
Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'imb_s1', 'imb_s2',
       'reference_price_ask_price_bid_price_imb2',
       'reference_price_ask_price_wap_imb2',
       'reference_price_bid_price_wap_imb2', 'ask_price_bid_price_wap_imb2',
       'pressure', 'inefficiency', 'wap_mov_avg_3_1', 'wap_mov_avg_6_3',
       'wap_mov_avg_12_6', 'wap_mov_avg_24_12', 'index_col_id'],
      dtype='object')


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,reference_price_ask_price_wap_imb2,reference_price_bid_price_wap_imb2,ask_price_bid_price_wap_imb2,pressure,inefficiency,wap_mov_avg_3_1,wap_mov_avg_6_3,wap_mov_avg_12_6,wap_mov_avg_24_12,index_col_id
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,...,9.636364,,9.636364,75.664813,0.086305,1.000345,1.000305,1.000318,1.000202,480195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,...,0.4602273,792633500000.0,0.460227,1.704028,0.038041,1.000816,1.00071,1.00056,1.000506,480196
5237977,197,480,540,0.0,0,0.995789,12725436.1,0.995789,0.995789,0.995789,...,10.75,,10.75,0.0,0.0,0.995958,0.99607,0.996131,0.996436,480197
5237978,198,480,540,1000898.84,1,0.99921,94773271.05,0.99921,0.99921,0.99897,...,-1.099231e-12,5.315789,5.315789,1.494117,0.010561,0.999116,0.999217,0.999305,0.999313,480198
5237979,199,480,540,1884285.71,-1,1.002129,24073677.32,1.000859,1.001494,1.002129,...,1.193103,-653021900000.0,1.193103,7.534688,0.078272,1.002083,1.002337,1.002156,1.001945,480199


In [10]:
time_series_data_preprocessor = TimeSeriesDataPreprocessor()
train_data_arr = time_series_data_preprocessor.apply(df_train)
print(train_data_arr.shape)

(95236, 55, 29)


In [11]:
time_series_irrelevant_features = ["index_col_id"]
time_series_irrelevant_features_col_idx = np.isin(df_train.columns, time_series_irrelevant_features)
train_data_arr = train_data_arr[:, :, ~time_series_irrelevant_features_col_idx]
print(train_data_arr.shape)

(95236, 55, 28)


In [12]:
target_col_idx = np.where(df_train.columns == "target")[0][0]
print(target_col_idx)

14


### Transformer

### Run transformer

In [23]:
batch_size = 20
eval_batch_size = 10
window_size = 55

input_features = train_data_arr.shape[2] - 1
d_model = 4  # attention dimension
d_hid = 4  # feed-forward layer dimension after attention layer
nlayers = 1  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability

epochs = 1
lr = 5.0  # learning rate

In [24]:
model = TransformerModel(input_features, d_model, nhead, d_hid, nlayers, dropout).to(device)

In [25]:
criterion = nn.L1Loss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [26]:
train_logger = NoopModelLogger()
eval_logger = NoopModelLogger()
test_logger = NoopModelLogger()

In [27]:
pipeline = TransformerPipeline(model, optimizer, criterion, train_logger, eval_logger, test_logger)

In [28]:
with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch_idx in range(1, epochs + 1):
        epoch_start_time = time.time()

        train_loss = pipeline.train_transformer(train_data_arr, target_col_idx, 10, batch_size)

        # TODO: change to validation set
        val_loss = pipeline.evaluate_transformer(train_data_arr, target_col_idx, batch_size)
        
        val_ppl = math.exp(val_loss)
        elapsed = time.time() - epoch_start_time
        print('-' * 89)
        print(f'| end of epoch {epoch_idx:3d} | time: {elapsed:5.2f}s | '
            f'train loss {train_loss:5.2f} | '
            f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
        print('-' * 89)

        # if val_loss < best_val_loss:
        #     best_val_loss = val_loss
        #     torch.save(model.state_dict(), best_model_params_path)

        scheduler.step()
    # model.load_state_dict(torch.load(best_model_params_path)) # load best model states

-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  2.64s | train loss   nan | valid loss   nan | valid ppl      nan
-----------------------------------------------------------------------------------------


In [29]:
test_input = np.ones((1, 10, 27), dtype=np.float32)
test_output = pipeline.test_transformer(test_input)
print(test_output)

nan
