# Section 1: Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

In [2]:
import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# import lightgbm as lgb
import gc
import sys

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Section 2: Data Preprocessing

In [3]:
train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
# revealed_targets = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
test = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv')
# sample_submission = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv')

In [4]:
train.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')

In [5]:
train.shape

(5237980, 17)

In [6]:
train.head(5)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [7]:
train = train.fillna(1) # write nan to 1
train.shape

(5237980, 17)

## Section 2A: Feature Engineering
> TODO: Feature Engin.


In [8]:
def feat_eng(df):
    cols = [c for c in df.columns if c not in ['row_id', 'time_id']] # compatible for training, test and inference
    df = df[cols]
    # feature_engineering
    df.drop(columns=[
        'date_id', 
#         'reference_price_far_price_imb',
#         'reference_price_minus_near_price',
#         'reference_price_near_price_imb',
#         'far_price_near_price_imb',
#         'far_price_ask_price_imb',
#         'far_price_bid_price_imb',
#         'far_price_minus_wap',
#         'std_size',
#         'bid_size_over_ask_size',
#         'ask_price_bid_price_imb',
#         'near_price_times_wap'
    ], inplace=True)
        
    gc.collect()
    
    return df

## Section 2B: Ready X, y

> TODO: train_test_split

In [9]:
%%time

y = train['target'].values
X = feat_eng(train.drop(columns='target'))

# normalization of data
mean = X.mean(axis=0)
std = X.std(axis=0)
std[std == 0] = 1 # Avoid division by zero
X = (X - mean) / std


# y = y[:50000]
# X = X[:50000]



# prices = [c for c in train.columns if 'price' in c]
# pca_prices = PCA(n_components=1)
# X['pca_prices'] = pca_prices.fit_transform(X[prices].fillna(1))

CPU times: user 1.28 s, sys: 907 ms, total: 2.19 s
Wall time: 1.87 s


In [10]:
display(y.shape)
display(X.shape)

(5237980,)

(5237980, 13)

In [11]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y=None, time_steps=1):
        self.X = X
        self.y = y
        self.time_steps = time_steps

    def __len__(self):
        return len(self.X) - self.time_steps

    def __getitem__(self, idx): # compatible for both train, test and inferencing
        X_sample = self.X[idx:idx+self.time_steps]
        if self.y is not None:
            y_sample = self.y[idx:idx+self.time_steps]
            return X_sample, y_sample
        else:
            return X_sample

# Section 3: Train Model

> TODO: Hyperparameters Tuning (CV),
Optimizer,
Loss Function,
Learning Rate Scheduler,

## Section 3a-1: Model Architecture 

In [12]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.fc = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x.permute(2, 0, 1, 3)

    def forward(self, query, key, value):
        batch_size = query.size(0)
        query = self.split_heads(self.wq(query), batch_size)
        key = self.split_heads(self.wk(key), batch_size)
        value = self.split_heads(self.wv(value), batch_size)

        matmul_qk = torch.matmul(query, key.permute(0, 1, 3, 2))
        d_k = key.size(-1)
        scaled_attention_logits = matmul_qk / d_k**0.5
        attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, value)
        output = output.permute(1, 2, 0, 3).contiguous().view(batch_size, -1, self.d_model)
        return self.fc(output)

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        attn_output = self.mha(x, x, x)
        out1 = self.norm1(x + attn_output)
        fc_output = self.fc(out1)
        return self.norm2(out1 + fc_output)

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads):
        super(DecoderLayer, self).__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_output):
        attn1 = self.mha1(x, x, x)
        out1 = self.norm1(attn1 + x)
        attn2 = self.mha2(out1, enc_output, enc_output)
        out2 = self.norm2(attn2 + out1)
        fc_output = self.fc(out2)
        return self.norm3(fc_output + out2)

class Transformer(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, input_seq_len, output_seq_len, num_variables):
        super(Transformer, self).__init__()
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads) for _ in range(num_layers)])

        self.fc_in = nn.Linear(num_variables, d_model)
        self.fc_out = nn.Linear(d_model, 1)

    def forward(self, src, tgt):
        src = self.fc_in(src)
        tgt = self.fc_in(tgt)

        for layer in self.encoder_layers:
            src = layer(src)

        for layer in self.decoder_layers:
            tgt = layer(tgt, src)

        return self.fc_out(tgt)

In [13]:
# Ready DataSet from X, y
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # determined the device
display(device)
X_tensor = torch.tensor(X.values, dtype=torch.float32).to(device) 
y_tensor = torch.tensor(y, dtype=torch.float32).to(device)
# Create Dataset and DataLoader
batch_size = 512
time_steps = 50
dataset = TimeSeriesDataset(X_tensor, y_tensor, time_steps)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=False)

device(type='cuda')

In [14]:
%%time
# Transformer
input_seq_len = time_steps
output_seq_len = time_steps
d_model = 64
num_heads = 4
num_layers = 2
num_variables = X.shape[1]

model = Transformer(
    num_layers, d_model, num_heads, input_seq_len, output_seq_len, num_variables
).to(device)

criterion = torch.nn.L1Loss()  
optimizer = optim.Adam(model.parameters(), lr=0.01)

CPU times: user 9.48 ms, sys: 2.14 ms, total: 11.6 ms
Wall time: 36.9 ms


In [15]:
IS_TRAIN = False # set it to train model or load model

In [16]:
%%time
if IS_TRAIN:
    num_epochs = 5
    print_every_n_batches = 10  # Adjust this to your preference

    # Training loop
    model.train()
    losses = []
    # accuracies = []
    for epoch in range(num_epochs):
        epoch_losses = []
    #     epoch_accuracies = []
        for i, (batch_X, batch_y) in enumerate(dataloader):
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            outputs = model(batch_X, batch_X) 
            outputs = outputs.squeeze(-1)  # Remove the last dimension of size 1
            loss = criterion(outputs, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_losses.append(loss.item())
            # Print loss every n batches
            if (i + 1) % print_every_n_batches == 0:
                sys.stdout.write('\r' + f"Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(dataloader)}, Loss: {loss.item()}")
                sys.stdout.flush()


        print(f"\n => Epoch {epoch+1}/{num_epochs}, Loss: {np.mean(epoch_losses)}")
        losses.append(np.mean(epoch_losses))
    
    print(losses)
else:
    model_save_path = "/kaggle/input/finance-transformer-1/transformer_model.pth"
    model.load_state_dict(torch.load(model_save_path))

CPU times: user 10.1 ms, sys: 1.89 ms, total: 12 ms
Wall time: 28.8 ms


>Epoch 1/5, Batch 10230/10231, Loss: 3.8339190483093263
 => Epoch 1/5, Loss: 6.410258663887968
Epoch 2/5, Batch 10230/10231, Loss: 3.8925738334655761
 => Epoch 2/5, Loss: 6.410010247341921
Epoch 3/5, Batch 10230/10231, Loss: 3.8768408298492435
 => Epoch 3/5, Loss: 6.4099562952984765
Epoch 4/5, Batch 10230/10231, Loss: 3.8611757755279544
 => Epoch 4/5, Loss: 6.410132407978965
Epoch 5/5, Batch 10230/10231, Loss: 3.9083080291748047
 => Epoch 5/5, Loss: 6.4099540124831105
[6.410258663887968, 6.410010247341921, 6.4099562952984765, 6.410132407978965, 6.4099540124831105]
CPU times: user 31min 5s, sys: 5.43 s, total: 31min 11s
Wall time: 31min 17s

## Save Model: Training in GPU is expensive

In [17]:
if IS_TRAIN:
    model_save_path = "transformer_model.pth"
    torch.save(model.state_dict(), model_save_path)

## Section 3a: Inspect Model
> TODO: Inspect Training results ( Overfit/underfit)

In [18]:
model

Transformer(
  (encoder_layers): ModuleList(
    (0-1): 2 x EncoderLayer(
      (mha): MultiHeadAttention(
        (wq): Linear(in_features=64, out_features=64, bias=True)
        (wk): Linear(in_features=64, out_features=64, bias=True)
        (wv): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (fc): Sequential(
        (0): Linear(in_features=64, out_features=64, bias=True)
        (1): ReLU()
        (2): Linear(in_features=64, out_features=64, bias=True)
      )
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
  )
  (decoder_layers): ModuleList(
    (0-1): 2 x DecoderLayer(
      (mha1): MultiHeadAttention(
        (wq): Linear(in_features=64, out_features=64, bias=True)
        (wk): Linear(in_features=64, out_features=64, bias=True)
        (wv): Linear(in_features=64, out_features=64, bias=True)
        (

In [19]:
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
display(pytorch_total_params)
display(pytorch_trainable_params)

135361

135361

## Section 3b: Test Model

In [20]:
def test_iterator(df, test_size=200):
    """Yields batches of rows from the dataframe."""
    num_batches = len(df) // test_size + (1 if len(df) % test_size else 0)
    
    for i in range(num_batches):
        start_idx = i * test_size
        end_idx = start_idx + test_size
        yield df[start_idx:end_idx]

# inference helper
def inference_model_from_df(df, m):
    test_df = t
    feat = feat_eng(test_df)
    feat = feat.fillna(1)
    feat = (feat - mean) / std # normalize
    
    # Convert to tensor and move to device
    feat_tensor = torch.tensor(feat.values, dtype=torch.float32).to(device)
    
    # Ensure you're in eval mode
    model.eval()
    
    # Initialize the full predictions list to store the entire sequence of predictions
    full_predictions = []
    
    # Predict the 20 future steps for every `time_steps` chunk
    for start_idx in range(0, len(test_df) - time_steps + 1, 20):
        sequence = feat_tensor[start_idx:start_idx+time_steps].unsqueeze(0)
        
        with torch.no_grad():
            predictions = model(sequence, sequence).squeeze().cpu().numpy()
            
            # Add the predicted values to the full_predictions list
            extra = len(full_predictions) + len(predictions) - len(test_df)
            if extra > 0:
                predictions = predictions[extra:]
            full_predictions.extend(predictions.tolist())

    
    # Make sure the predictions size matches the size of sample_prediction
    assert len(full_predictions) == len(t), "Mismatch between prediction size and expected size"
    print(len(full_predictions), np.mean(full_predictions), np.std(full_predictions))
    return full_predictions
    
test_df_iterator = test_iterator(test, test_size=200)
for t in test_df_iterator:
    full_predictions = inference_model_from_df(df=t, m=model)


200 0.49637895822525024 0.0
200 0.4963789615035057 1.3588682550404067e-08
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637896060943604 1.1680077279964341e-08
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.4963789728283882 2.5635194805893768e-08
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.4963789653778076 1.93692169299983e-08
200 0.4963789623975754 1.520794728061327e-08
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637897312641144 2.5809568279517847e-08
200 0.4963789615035057 1.3588682550404067e-08
200 0.49637895822525024 0.0
200 0.4963789594173431 8.344650268554688e-09
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.4963789668679237 2.098684536392481e-08
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
2

# Section 4: Inference Model

## Section 4a: Helper Function

In [21]:
def zero_sum(prices, volumes): 
#    I got this idea from https://github.com/gotoConversion/goto_conversion/
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    
    return out

In [22]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [23]:
counter = 0
for (test_df, revealed_targets, sample_prediction) in iter_test:
    full_predictions = inference_model_from_df(df=test_df, m=model)
    
    # Fill the 'target' field with the model's outputs
    sample_prediction['target'] = full_predictions
    
    # Apply any other necessary post-processing
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test_df.loc[:, 'bid_size'] + test_df.loc[:, 'ask_size'])
    
    env.predict(sample_prediction)
    counter += 1 


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.49637895822525024 0.0
200 0.4