In [1]:
import glob

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

from pytorch_lightning import LightningModule

In [2]:
class VolatilityClassifier(LightningModule):
    """1-D convolutional regressor over a 10-channel series plus one scalar statistic.

    The series goes through batch normalisation and three ``Conv1d`` layers
    (each followed by a leaky ReLU), is flattened and reduced to a single
    value by ``dense``. That value is concatenated with ``stats`` and mapped
    to the final output by ``linear``.

    Args:
        input_width: Length of the series along its last axis. It determines
            the input size of ``dense``. The default (600) reproduces the
            layer shapes of the original hard-coded model.
    """

    def __init__(self, input_width=600):

        super().__init__()

        in_channels = 10

        self.batchnorm = nn.BatchNorm1d(in_channels)

        self.conv1 = nn.Conv1d(in_channels, in_channels*8, kernel_size=3, padding=1, bias=True)
        self.conv2 = nn.Conv1d(in_channels*8, in_channels*8, kernel_size=3, padding=1, bias=True)
        self.conv3 = nn.Conv1d(in_channels*8, in_channels*4, kernel_size=3, padding=1, bias=True)

        # kernel_size=3 with padding=1 keeps the series length unchanged, so
        # the flattened conv3 output has in_channels*4*input_width features.
        self.dense = nn.Linear(in_channels*4*input_width, 1)

        # Combines the dense output (1 value) with the extra statistic (1 value).
        self.linear = nn.Linear(2, 1)

        self.loss = nn.MSELoss()

    def forward(self, series, stats):
        """Compute one prediction per sample.

        Args:
            series: Float tensor of shape ``(batch, 10, input_width)``.
            stats: Float tensor of shape ``(batch, 1)`` that is appended to
                the convolutional feature before the last linear layer.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """

        x = self.batchnorm(series)

        x = self.conv1(x)
        x = F.leaky_relu(x)

        x = self.conv2(x)
        x = F.leaky_relu(x)

        x = self.conv3(x)
        x = F.leaky_relu(x)

        # (batch, channels, width) -> (batch, channels*width)
        x = torch.flatten(x, start_dim=1, end_dim=2)
        x = self.dense(x)

        x = torch.hstack((x,stats))

        return self.linear(x)

In [3]:
def get_tensors(file_path, n_seconds=600):
    """Yield one dense ``float32`` array per ``time_id`` found in a parquet partition.

    Every bucket is expanded to exactly ``n_seconds`` rows, one per value of
    ``seconds_in_bucket`` in ``range(n_seconds)``. Seconds that are absent
    from the file are filled with the most recent earlier row (forward fill);
    seconds before the first observed row stay NaN in every column except
    ``time_id``, ``seconds_in_bucket`` and ``stock_id``.

    Args:
        file_path: Path of a partition whose last component ends in
            ``=<stock_id>`` (for example ``.../stock_id=0``). A trailing path
            separator is tolerated.
        n_seconds: Number of rows per bucket. Rows whose
            ``seconds_in_bucket`` lies outside ``range(n_seconds)`` are dropped.

    Yields:
        ``numpy.ndarray`` of shape ``(n_columns + 1, n_seconds)`` and dtype
        ``float32``. Rows follow the column order of the file, with
        ``stock_id`` appended as the last row. Buckets are yielded in
        ascending ``time_id`` order.

    Raises:
        ValueError: If the stock id cannot be parsed from ``file_path`` or a
            bucket contains duplicate ``seconds_in_bucket`` values.
    """
    import os

    df = pd.read_parquet(file_path, engine='pyarrow')

    # normpath strips a trailing separator, so ".../stock_id=0/" still parses.
    partition_name = os.path.basename(os.path.normpath(file_path))
    stock_id = int(partition_name.split('=')[-1])

    # Remember the file's column order: set_index/reset_index would otherwise
    # move seconds_in_bucket to the front and change the row layout of the output.
    columns = list(df.columns)

    # A single groupby pass replaces re-scanning the full frame per time_id.
    for time_id, df_time in df.groupby('time_id', sort=True):

        df_time = (
            df_time
            .set_index('seconds_in_bucket')
            .reindex(range(n_seconds))          # one row per second, NaN where missing
            .rename_axis('seconds_in_bucket')
            .reset_index()[columns]
        )

        df_time['time_id'] = time_id
        df_time['stock_id'] = stock_id

        df_time = df_time.ffill(axis=0)

        yield df_time.T.to_numpy(dtype=np.float32)

In [4]:
# Restore the trained network from its checkpoint and keep it on the CPU
# (Module.cpu() returns the module itself, so the call can be chained).
model = VolatilityClassifier.load_from_checkpoint('../input/optiver-best/optiver_best.ckpt').cpu()

# Switch to evaluation mode; as the last expression it also displays the layer summary.
model.eval()

VolatilityClassifier(
  (batchnorm): BatchNorm1d(10, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv1): Conv1d(10, 80, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(80, 80, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(80, 40, kernel_size=(3,), stride=(1,), padding=(1,))
  (dense): Linear(in_features=24000, out_features=1, bias=True)
  (linear): Linear(in_features=2, out_features=1, bias=True)
  (loss): MSELoss()
)

In [5]:
# glob returns matches in arbitrary order; sort them so the order in which
# stocks are processed (and therefore the row order of the output) is reproducible.
book_test_files = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'))

In [6]:
# Gather one array per (stock, time_id) bucket from every partition.
tensors = []

for f in book_test_files:
    print(f)
    tensors.extend(get_tensors(f))

# Stack to (n_buckets, n_rows, n_seconds) and zero out any remaining NaNs in place.
tensors = np.nan_to_num(np.stack(tensors, axis=0), copy=False)

# The last row of every bucket holds stock_id and the first row holds time_id;
# both are constant along the bucket, so the first second is enough.
stockids = tensors[:, -1, 0]
timeids = tensors[:, 0, 0]

stock_part = pd.Series(stockids).astype('int').astype('str')
time_part = pd.Series(timeids).astype('int').astype('str')

# Submission frame: "<stock_id>-<time_id>" keys with a placeholder target.
df = pd.DataFrame({
    'row_id': stock_part + '-' + time_part,
    'target': np.nan,
})

/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/stock_id=0


In [7]:
# Overwrite row 0 with a size-weighted average of rows 2 and 3, weighting each
# by the opposite side's row (7 and 6).
# NOTE(review): presumably rows 2/3 are bid/ask price and 6/7 bid/ask size, which
# makes this the weighted average price - confirm against the parquet schema.
tensors[:,0] = (tensors[:,2]*tensors[:,7] + tensors[:,3]*tensors[:,6]) / (tensors[:,6] + tensors[:,7])

# Overwrite row 1 with the first difference of log(row 0) along the time axis.
# With prepend=0 the first element is log(row0[0]) itself rather than a
# difference; the original TODO is kept because changing it alters the model input.
tensors[:,1] = np.diff(np.log(tensors[:,0]), prepend=0)  #TODO

# Per-bucket root of the summed squared log-differences, shape (n_buckets, 1).
# Vectorised reduction instead of np.apply_along_axis with a Python lambda.
stats = np.sqrt(np.sum(np.square(tensors[:,1]), axis=1, keepdims=True))

In [8]:
# Predict in fixed-size batches with autograd disabled: a single forward pass
# over every bucket keeps all convolution activations (and the autograd graph)
# in memory at once, which grows with the size of the test set.
batch_size = 1024
predictions = []

with torch.no_grad():
    for start in range(0, len(tensors), batch_size):
        stop = start + batch_size
        # Drop the last row (stock_id) so the series has the 10 channels the model expects.
        series_batch = torch.as_tensor(tensors[start:stop, :-1], dtype=torch.float32)
        stats_batch = torch.as_tensor(stats[start:stop], dtype=torch.float32)
        # reshape(-1) keeps the result 1-D even when a batch holds a single row,
        # whereas squeeze() would collapse it to a 0-d value.
        predictions.append(model(series_batch, stats_batch).reshape(-1))

df['target'] = torch.cat(predictions).numpy()

In [9]:
# Write the submission file; the frame's index is not part of the output.
df.to_csv(path_or_buf='submission.csv', index=False)