In [1]:
!pip install rtdl_num_embeddings -q --no-index --find-links=/kaggle/input/jane-street-import/rtdl_num_embeddings

In [2]:
# imports
import os
import glob
import numpy as np
import pandas as pd
import polars as pl
import pickle
import gc
import warnings
import math
from collections import OrderedDict

# plotting and progress bar
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# PyTorch Lightning
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Timer

# Scikit-learn
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor

# Gradient Boosting Models
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Miscellaneous
import sys
from tanm_reference import Model, make_parameter_groups
import joblib
import dill

# Kaggle Evaluation
import kaggle_evaluation.jane_street_inference_server

# Warnings and Polars table settings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation warnings from the ML libraries loaded below.
warnings.filterwarnings("ignore")
# Do not truncate the column list when pandas frames are displayed.
pd.options.display.max_columns = None
# Polars display limits: rows, columns and list-cell length shown per table.
pl.Config.set_tbl_rows(100)
pl.Config.set_tbl_cols(400)
pl.Config.set_fmt_table_cell_list_len(5)

polars.config.Config

In [3]:
class CONFIG:
    """Static notebook settings: column groups, artifact paths and run flags."""

    # Shared building blocks for the column lists below. They are deleted at
    # the end of the class body so they never become CONFIG attributes.
    _features = list(map("feature_{:02d}".format, range(79)))
    _lags = list(map("responder_{}_lag_1".format, range(9)))
    _responders = list(map("responder_{}".format, range(9)))

    seed = 42
    target_col = "responder_6"
    # Alternative kept for reference: ["symbol_id", "time_id"] + features + lags
    # (that variant is available as `feature_colss` below).
    feature_cols = _features + _lags

    model_paths = [
        # "/kaggle/input/js24-train-gbdt-model-with-lags-singlemodel/result.pkl",
        # "/kaggle/input/js24-trained-gbdt-model/result.pkl",
        "/kaggle/input/js-xs-nn-trained-model",
    ]
    debug = False

    # "responder_N_lag_1" -> "responder_N"
    lag_cols_rename = dict(zip(_lags, _responders))
    lag_target_cols_name = list(_responders)
    lag_cols_original = ["date_id", "time_id", "symbol_id"] + _responders

    model_path = "/kaggle/input/janestreet-public-model/xgb_001.pkl"
    lag_ndays = 4

    all_cols = ["date_id", "symbol_id", "time_id", "weight"] + _features + _lags + [target_col]
    test_cols = ["row_id", "date_id", "symbol_id", "time_id"] + _features + _lags + [target_col]
    # Reminder: the online retrain was switched from `feature_cols` to `feature_colss`.
    feature_colss = ["symbol_id", "time_id"] + _features + _lags
    only_features = ["row_id", "date_id", "symbol_id", "time_id"] + _features
    only_lags = ["row_id", "date_id", "symbol_id", "time_id"] + _lags

    data_paths = ["/kaggle/input/lgbm-model-training/lgbm_model_0.json","/kaggle/input/js24-preprocessing-create-lags/validation.parquet/"]
    retrain = True
    EVAL = False

    del _features, _lags, _responders

In [4]:
def create_agg_list(day, columns):
    """Build polars aggregation expressions for a `day`-day window.

    For every statistic in (mean, std, max, last) and every column, one
    expression is produced whose output name is ``<column>_<stat>_<day>d``.
    The result is grouped by statistic: all means first, then all stds,
    all maxes, and finally all lasts.
    """
    expressions = []
    for stat in ("mean", "std", "max", "last"):
        for name in columns:
            aggregated = getattr(pl.col(name), stat)()
            expressions.append(aggregated.name.suffix(f"_{stat}_{day}d"))
    return expressions

In [5]:
# Custom R2 metric for validation
def r2_val(y_true, y_pred, sample_weight):
    """Weighted zero-mean R²: 1 - avg_w((pred - true)²) / avg_w(true²).

    A tiny constant is added to the denominator to guard against an all-zero target.
    """
    weighted_sq_error = np.average(np.square(y_pred - y_true), weights=sample_weight)
    weighted_sq_target = np.average(np.square(y_true), weights=sample_weight)
    return 1 - weighted_sq_error / (weighted_sq_target + 1e-38)


class NN(LightningModule):
    """MLP regressor trained with a sample-weighted MSE loss.

    Each hidden block is BatchNorm -> (SiLU, skipped for the first block) ->
    (Dropout, only while `dropouts` has an entry for the block) -> Linear.
    The head is Linear -> Tanh, and `forward` scales the result by 5, so
    predictions lie in (-5, 5).

    NOTE: the order in which layers are appended determines the indices inside
    `self.model` (and therefore the parameter names); keep it unchanged when
    loading existing checkpoints.
    """
    def __init__(self, input_dim, hidden_dims, dropouts, lr, weight_decay):
        super().__init__()
        # Record the constructor arguments as Lightning hyperparameters.
        self.save_hyperparameters()
        layers = []
        in_dim = input_dim
        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(in_dim))
            # No activation in front of the first Linear layer.
            if i > 0:
                layers.append(nn.SiLU())
            # Blocks beyond len(dropouts) get no dropout.
            if i < len(dropouts):
                layers.append(nn.Dropout(dropouts[i]))
            layers.append(nn.Linear(in_dim, hidden_dim))
            # layers.append(nn.ReLU())
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, 1))  # output layer
        layers.append(nn.Tanh())
        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay
        # (y_hat, y, w) tuples collected per validation batch; cleared every epoch.
        self.validation_step_outputs = []

    def forward(self, x):
        """Return 5 * tanh(head(x)) with the trailing singleton dimension removed."""
        return 5 * self.model(x).squeeze(-1)  # output is a 1-D tensor

    def training_step(self, batch):
        """Compute and log the weighted MSE for one `(x, y, w)` training batch."""
        x, y, w = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='none') * w  # apply per-sample weights
        # Plain mean of the weighted errors (not divided by the sum of weights).
        loss = loss.mean()
        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        return loss

    def validation_step(self, batch):
        """Compute and log the weighted MSE and stash the batch for the epoch-level R²."""
        x, y, w = batch
        y_hat = self(x)
        loss = F.mse_loss(y_hat, y, reduction='none') * w
        loss = loss.mean()
        self.log('val_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))
        self.validation_step_outputs.append((y_hat, y, w))
        return loss

    def on_validation_epoch_end(self):
        """Log the weighted R² (`val_r_square`) over all validation batches.

        Nothing is logged during Lightning's sanity check; the collected
        batches are cleared in every case.
        """
        y = torch.cat([x[1] for x in self.validation_step_outputs]).cpu().numpy()
        if self.trainer.sanity_checking:
            # Computed but unused: no metric is logged during the sanity check.
            prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
        else:
            prob = torch.cat([x[0] for x in self.validation_step_outputs]).cpu().numpy()
            weights = torch.cat([x[2] for x in self.validation_step_outputs]).cpu().numpy()
            # r2_val
            val_r_square = r2_val(y, prob, weights)
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """Adam with weight decay, plus ReduceLROnPlateau (halve LR, patience 5) on `val_loss`."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5,
                                                               verbose=True)
        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'monitor': 'val_loss',
            }
        }

    def on_train_epoch_end(self):
        """Print every metric currently logged on the trainer, formatted to 5 decimals."""
        if self.trainer.sanity_checking:
            return
        epoch = self.trainer.current_epoch
        # Convert tensors to Python scalars so they can be formatted.
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")

In [6]:
def calculate_r2(y_true, y_pred, weights):
    """Weighted zero-mean R²: 1 - sum(w * (y - y_hat)²) / sum(w * y²).

    Args:
        y_true: array of targets.
        y_pred: array of predictions, same shape as `y_true`.
        weights: array of per-sample weights, same shape as `y_true`.

    Returns:
        The score as a scalar. A tiny constant is added to the denominator
        (the same guard `r2_val` uses) so an all-zero target or all-zero
        weights no longer produce a division by zero / NaN.
    """
    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * (y_true ** 2))
    # Local name deliberately differs from sklearn's `r2_score`, which this
    # notebook imports at module level.
    return 1 - numerator / (denominator + 1e-38)

In [7]:
# Target column and the feature sets of the CatBoost / LightGBM models.
TARGET = 'responder_6'
# CatBoost: the 79 raw features only.
FEAT_COLS_CAT = ["feature_%02d" % n for n in range(79)]
# LightGBM: the raw features followed by the nine lagged responders.
FEAT_COLS_LGB = FEAT_COLS_CAT + ["responder_%d_lag_1" % n for n in range(9)]

In [8]:
# This is for the CatBoost case (the LightGBM models live in the same dataset).

# NOTE: `model_path` is a module-level name that later cells overwrite.
model_path = '/kaggle/input/jsmodel-chan-lgbupdate'
cat_file_name = 'catboost_models'
lgb_file_name = 'lgb_models'

# NOTE(review): joblib files are pickles - only load artifacts you trust.
lgb_models = joblib.load(f'{model_path}/{lgb_file_name}.pkl')
catboost_models = joblib.load(f'{model_path}/{cat_file_name}.pkl')

# Separate CatBoost model stored in its own dataset.
catboost_holdout_model = joblib.load(f'/kaggle/input/jsmodel-chan-catholdout/catboost_holdout_model.pkl')

print(f"Loaded model from the saved file.")

Loaded model from the saved file.


In [9]:
# Load the pickled result dict referenced by CONFIG.model_path; it holds the
# model object and the list of feature names it uses.
# NOTE(review): pickle.load executes arbitrary code - only load trusted files.
with open( CONFIG.model_path, "rb") as fp:
    result = pickle.load(fp)
    
model = result["model"]
features = result["features"]
# Sanity check: number of features the model was saved with.
print(len(features))

116


In [10]:
# Column groups for the TabM model.
target_col = "responder_6"
batch_size = 8192

# Features handled as categorical inputs.
feature_cat = ["feature_09", "feature_10", "feature_11"]

# All raw features except feature_61.
feature_list = ["feature_%02d" % n for n in range(79) if n != 61]

# Model inputs: raw features followed by the lagged responders.
feature_test = feature_list + ["responder_%d_lag_1" % n for n in range(9)]

# Continuous inputs = everything that is not categorical.
feature_cont = list(filter(lambda name: name not in feature_cat, feature_test))

# Columns to standardize: same members and order as `feature_cont`, own list object.
std_feature = list(feature_cont)

# Per-column statistics consumed by `standardize` below.
# NOTE(review): presumably computed on the training data - confirm provenance.
data_stats = joblib.load("/kaggle/input/my-own-js/data_stats.pkl")
means = data_stats['mean']
stds = data_stats['std']

def standardize(df, feature_cols, means, stds):
    """Return `df` with each column in `feature_cols` replaced by (x - mean) / std.

    `means` and `stds` are looked up by column name; columns not listed in
    `feature_cols` are left untouched.
    """
    scaled_columns = []
    for name in feature_cols:
        z_score = (pl.col(name) - means[name]) / stds[name]
        scaled_columns.append(z_score.alias(name))
    return df.with_columns(scaled_columns)

In [11]:
# Raw category value -> contiguous integer code, per categorical column.
# Codes follow the order in which the raw values are listed.
category_mappings = {
    'feature_09': {raw: code for code, raw in enumerate(
        (2, 4, 9, 11, 12, 14, 15, 25, 26, 30, 34, 42, 44, 46, 49, 50, 57, 64, 68, 70, 81, 82))},
    'feature_10': {raw: code for code, raw in enumerate(
        (1, 2, 3, 4, 5, 6, 7, 10, 12))},
    'feature_11': {raw: code for code, raw in enumerate(
        (9, 11, 13, 16, 24, 25, 34, 40, 48, 50, 59, 62, 63, 66, 76, 150, 158, 159, 171, 195,
         214, 230, 261, 297, 336, 376, 388, 410, 522, 534, 539))},
    # symbol_id and time_id map to themselves.
    'symbol_id': {value: value for value in range(39)},
    'time_id': {value: value for value in range(968)},
}

def encode_column(df, column, mapping):
    """Replace `column` in `df` by its integer code from `mapping`.

    Values missing from `mapping` all receive one shared code, one above the
    largest code in `mapping`.
    """
    highest_code = max(mapping.values())
    encoded = pl.col(column).map_elements(
        lambda raw_value: mapping.get(raw_value, highest_code + 1)
    )
    return df.with_columns(encoded.alias(column))

In [12]:
class R2Loss(nn.Module):
    """Loss equal to 1 - (zero-mean R²): sum((pred - true)²) / sum(true²)."""

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        squared_error = (y_pred - y_true).pow(2).sum()
        squared_target = y_true.pow(2).sum()
        # The tiny constant keeps the ratio defined when the target is all zeros.
        return squared_error / (squared_target + 1e-38)

class TAB(LightningModule):
    """Lightning wrapper around the `tabm`-type `Model` with k=16 ensemble members.

    The network is trained with `R2Loss` on every member's prediction; the
    epoch-level metrics use the mean over members. Batches are tuples
    `(x_cont, x_cat, y, w, w_y)`; `w_y` is currently unused.

    Changes relative to the previous revision (constructor and layer layout
    are untouched, so existing checkpoints still load):
      * predictions stored for the epoch-level train R² are detached, so the
        autograd graph of every batch is no longer kept alive until the end
        of the epoch;
      * validation runs on the clean inputs - the Gaussian input noise is a
        training-time augmentation and is no longer applied in
        `validation_step`;
      * the validation hook is documented as what it computes (weighted R²).
    """

    def __init__(self, n_cont_features, cat_cardinalities, n_classes, lr, weight_decay):
        super().__init__()
        # Record the constructor arguments as Lightning hyperparameters.
        self.save_hyperparameters()
        self.k = 16
        self.model = Model(
                n_num_features=n_cont_features,
                cat_cardinalities=cat_cardinalities,
                n_classes=n_classes,
                backbone={
                    'type': 'MLP',
                    'n_blocks': 3 ,
                    'd_block': 512,
                    'dropout': 0.25,
                },
                bins=None,
                num_embeddings= None,
                arch_type='tabm',
                k=self.k,
            )
        self.lr = lr
        self.weight_decay = weight_decay
        # (prediction, target, weight) tuples per batch; cleared every epoch.
        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.loss_fn = R2Loss()
        # self.loss_fn = weighted_mse_loss

    def forward(self, x_cont, x_cat):
        """Run the backbone and drop the trailing singleton dimension."""
        # presumably the result is (batch, k) - one prediction per ensemble member
        return self.model(x_cont, x_cat).squeeze(-1)

    def training_step(self, batch):
        """One optimisation step on a noise-augmented batch."""
        x_cont, x_cat, y, w, w_y = batch
        # Training-only augmentation: small Gaussian noise on the continuous inputs.
        x_cont = x_cont + torch.randn_like(x_cont) * 0.02
        y_hat = self(x_cont, x_cat)
        # Each member is scored against the same target, so repeat y k times.
        loss = self.loss_fn(y_hat.flatten(0, 1), y.repeat_interleave(self.k))
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True, batch_size=x_cont.size(0))
        # Detach before storing: only the values are needed for the epoch metric.
        self.training_step_outputs.append((y_hat.detach().mean(1), y, w))
        return loss

    def validation_step(self, batch):
        """Score one validation batch on the unperturbed inputs."""
        x_cont, x_cat, y, w, w_y = batch
        y_hat = self(x_cont, x_cat)
        loss = self.loss_fn(y_hat.flatten(0, 1), y.repeat_interleave(self.k))
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True, batch_size=x_cont.size(0))
        self.validation_step_outputs.append((y_hat.mean(1), y, w))
        return loss

    def on_validation_epoch_end(self):
        """Log the weighted R² (`val_r_square`) over all validation batches.

        Nothing is logged during Lightning's sanity check; the collected
        batches are cleared in every case.
        """
        if not self.trainer.sanity_checking:
            prob = torch.cat([out[0] for out in self.validation_step_outputs]).cpu().numpy()
            y = torch.cat([out[1] for out in self.validation_step_outputs]).cpu().numpy()
            weights = torch.cat([out[2] for out in self.validation_step_outputs]).cpu().numpy()
            val_r_square = r2_val(y, prob, weights)
            self.log("val_r_square", val_r_square, prog_bar=True, on_step=False, on_epoch=True)
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """AdamW over the parameter groups from `make_parameter_groups`; no LR scheduler."""
        optimizer = torch.optim.AdamW(make_parameter_groups(self.model), lr=self.lr, weight_decay=self.weight_decay)
        # A ReduceLROnPlateau scheduler monitoring 'val_r_square' (mode='max',
        # factor=0.5, patience=5) was tried previously and is disabled.
        return {
            'optimizer': optimizer,
        }

    def on_train_epoch_end(self):
        """Log the weighted train R² and print every metric logged on the trainer."""
        if self.trainer.sanity_checking:
            return

        prob = torch.cat([out[0] for out in self.training_step_outputs]).cpu().numpy()
        y = torch.cat([out[1] for out in self.training_step_outputs]).cpu().numpy()
        weights = torch.cat([out[2] for out in self.training_step_outputs]).cpu().numpy()
        train_r_square = r2_val(y, prob, weights)
        self.log("train_r_square", train_r_square, prog_bar=True, on_step=False, on_epoch=True)
        self.training_step_outputs.clear()

        epoch = self.trainer.current_epoch
        # Convert tensors to Python scalars so they can be formatted.
        metrics = {k: v.item() if isinstance(v, torch.Tensor) else v for k, v in self.trainer.logged_metrics.items()}
        formatted_metrics = {k: f"{v:.5f}" for k, v in metrics.items()}
        print(f"Epoch {epoch}: {formatted_metrics}")
        
class custom_args():
    """Plain attribute bag with the TabM training/run settings."""

    def __init__(self):
        settings = {
            # runtime
            "usegpu": True,
            "gpuid": 0,
            "seed": 42,
            "model": 'nn',
            "use_wandb": False,
            "project": 'js-tabm-with-lags',
            "dname": "./input_df/",
            "loader_workers": 10,
            # optimisation
            "bs": 8192,
            "lr": 1e-3,
            "weight_decay": 8e-4,
            # model shape
            "n_cont_features": 84,
            "n_cat_features": 5,
            "n_classes": None,
            "cat_cardinalities": [23, 10, 32, 40, 969],
            # training schedule
            "patience": 7,
            "max_epochs": 10,
            "N_fold": 5,
        }
        for attribute, value in settings.items():
            setattr(self, attribute, value)


my_args = custom_args()

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Restore the trained TabM module from its checkpoint and move it to `device`.
# NOTE(review): the module is not switched to eval() here - confirm that the
# inference code does so before predicting (dropout is configured in the backbone).
tab_model = TAB.load_from_checkpoint('/kaggle/input/my-own-js/tabm_epochepoch03.ckpt').to(device)

# NN

In [13]:
N_folds = 5
# Load the best checkpoint of every fold.
# The models are mapped to the notebook-wide `device` (GPU when available,
# otherwise CPU) instead of a hard-coded "cuda:0", so the cell also runs on
# CPU-only hosts and agrees with how the other torch models are placed.
# NOTE(review): the modules are not switched to eval() here - confirm that the
# inference code does so before predicting (the network uses BatchNorm/Dropout).
nn_models = []
for fold in range(N_folds):
    checkpoint_path = f"{CONFIG.model_paths[0]}/nn_{fold}.model"
    nn_model = NN.load_from_checkpoint(checkpoint_path, map_location=device)
    nn_models.append(nn_model.to(device))
nn_models[0]

NN(
  (model): Sequential(
    (0): BatchNorm1d(88, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Dropout(p=0.1, inplace=False)
    (2): Linear(in_features=88, out_features=512, bias=True)
    (3): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): SiLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): SiLU()
    (9): Linear(in_features=512, out_features=256, bias=True)
    (10): Linear(in_features=256, out_features=1, bias=True)
    (11): Tanh()
  )
)

In [14]:
# Lazily read the responders of the last CONFIG.lag_ndays training days
# that precede date_id 1698.
history = pl.scan_parquet(
    "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
).select(['date_id','time_id','symbol_id'] + [f"responder_{idx}" for idx in range(9)]).filter(
    (pl.col("date_id")>=(1698 - CONFIG.lag_ndays))&(pl.col("date_id")<1698)
)

# Re-index the historical date_id so that it runs from -N to -1. The assumption
# is that test date_id=0 directly follows train date_id=1698: the lags delivered
# with the first batch should then be the responders of date_id=1698 (although
# they are labelled with date_id 0). With this shift the last historical day,
# date_id=1697, becomes -1 and lines up with the lags provided at inference time.
history = history.with_columns(
    date_id = (pl.col("date_id") - pl.lit(1698)).cast(pl.Int16)
)
history = history.collect()

# Unify the column dtypes (polars raises on concat when the dtypes do not match).
history_column_types = {
    'date_id': pl.Int16,
    'time_id': pl.Int16,
    'symbol_id': pl.Int16
}
# Built for later use; not applied to `history`, which has no feature columns.
feature_column_types = {}
for f in [f"feature_{idx:02d}" for idx in range(79)]:
    feature_column_types[f] = pl.Float32

responder_column_types = {}
for f in [f"responder_{idx}" for idx in range(9)]:
    responder_column_types[f] = pl.Float32

history = history.cast(history_column_types)
history = history.cast(responder_column_types)
history.tail()

date_id,time_id,symbol_id,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8
i16,i16,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32
-1,967,34,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
-1,967,35,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
-1,967,36,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
-1,967,37,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708
-1,967,38,0.542563,0.513193,0.814393,0.032767,0.025435,0.311465,-0.044797,0.011133,-0.0793


In [15]:
# Load the fitted Ridge regressor used by `predict_ridge`.
# NOTE(review): dill.load executes arbitrary code - only load trusted files.
with open("/kaggle/input/jsridgev01011635/Ridge.dill", "rb") as file_handle:
    rdg = dill.load(file_handle)

def predict_ridge(test, lags):
    """Score a test batch with the module-level Ridge model `rdg`.

    Args:
        test: frame holding the columns feature_00 .. feature_78; it must
            support `test[cols].to_pandas()` (e.g. a polars DataFrame).
        lags: unused; kept so the signature matches the other predictors.

    Returns:
        Whatever `rdg.predict` returns for the batch (one prediction per row).

    The previous revision also built a throw-away `row_id`/`responder_6`
    frame on every call; that dead work has been removed.
    """
    cols = [f'feature_{i:02}' for i in range(79)]
    # Missing feature values are imputed with the constant 3 before scoring.
    # NOTE(review): presumably the same fill value was used when fitting - confirm.
    return rdg.predict(test[cols].to_pandas().fillna(3).values)

In [16]:
xgb_feature_cols = ["symbol_id", "time_id"] + [f"feature_{idx:02d}" for idx in range(79)] + [f"responder_{idx}_lag_1" for idx in range(9)]

# Parameters re-applied to every unpickled XGBoost model.
XGB_OVERRIDE_PARAMS = dict(
    early_stopping_rounds=50,
    gamma=0.4,
    tree_method="hist",
    max_depth=5,
    eval_metric='rmse',
    learning_rate=0.05,
)


def _load_xgb_result(path, params=None):
    """Unpickle the result dict at `path` and apply `params` to its "model" entry.

    Args:
        path: location of a pickle file containing a dict with a "model" key.
        params: keyword arguments for `model.set_params`; defaults to
            XGB_OVERRIDE_PARAMS.

    Returns:
        The unpickled result dict (its "model" entry already updated).
    """
    # NOTE(review): pickle.load executes arbitrary code - only load trusted files.
    with open(path, "rb") as fp:
        loaded = pickle.load(fp)
    loaded["model"].set_params(**(XGB_OVERRIDE_PARAMS if params is None else params))
    return loaded


# Five fold models (result0.pkl .. result4.pkl) from the same dataset.
_XGB_FOLD_DIR = "/kaggle/input/als-e-106-pp0-40-xgb-5fold"
xgb_model, xgb_model1, xgb_model2, xgb_model3, xgb_model4 = (
    _load_xgb_result(f"{_XGB_FOLD_DIR}/result{fold}.pkl")["model"] for fold in range(5)
)

# Sixth model, trained with lag features and stored in its own dataset.
# `model_path` and `result` keep the values they had after the original cell ran.
model_path = "/kaggle/input/js-with-lags-trained-xgb/result.pkl"
result = _load_xgb_result(model_path)
xgb_model5 = result["model"]

In [17]:
# Show the configuration of the last-loaded XGBoost model (rendered by the notebook).
display(xgb_model5)

In [18]:
class GaussianNoise(nn.Module):
    """Adds zero-mean Gaussian noise with standard deviation `std` in training mode.

    In eval mode the input is returned unchanged.
    """

    def __init__(self, std=0.1):
        super().__init__()
        self.std = std

    def forward(self, x):
        # Inference path: identity.
        if not self.training:
            return x
        return x + torch.randn_like(x) * self.std

In [19]:
class LSTM(nn.Module):
    """LSTM regressor that treats every input row as a sequence of length one.

    Args:
        input_size: number of features per row.
        hidden_dim: LSTM hidden size.
        output_size: width of the final linear layer.
        num_layers: number of stacked LSTM layers.

    `forward` takes a `(batch, input_size)` tensor and returns `(batch,)` when
    `output_size == 1`, otherwise `(batch, output_size)`.

    Changes relative to the previous revision (parameter names are untouched,
    so existing state dicts still load):
      * only the last dimension is squeezed; the former bare `squeeze()` also
        removed the batch dimension for a batch of one row, yielding a 0-d tensor;
      * the zero initial states are created directly on the input's device and
        with its dtype instead of on the CPU followed by a transfer.
    """

    def __init__(self, input_size, hidden_dim, output_size, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # Input noise; GaussianNoise is only active in training mode.
        self.noise = GaussianNoise(std = .1)
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        # (batch, features) -> (batch, 1, features): a one-step sequence.
        x = x.unsqueeze(1)
        x = self.noise(x)
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))
        # Use the hidden state of the last (only) time step.
        out = self.fc(out[:, -1, :])
        # Drop the output dimension only when it is a singleton; keep the batch axis.
        return out.squeeze(-1)

In [20]:
# Build the LSTM with the architecture of the saved weights and restore them.
lstm_model = LSTM(input_size=79, hidden_dim=512, output_size = 1, num_layers = 1).to(device)
# weights_only=True restricts unpickling to tensors/containers; map_location
# places the weights on `device`.
lstm_model.load_state_dict(torch.load('/kaggle/input/jsmodel-chan-lstm/torchlstm.pth', map_location=device,weights_only= True))
# Inference mode: disables the GaussianNoise layer.
lstm_model.eval()
# Input columns of the LSTM: the 79 raw features.
sel_cols  = [f"feature_{i:02d}" for i in range(79)]

In [21]:
# Per-feature mean (`means_`) and standard deviation (`stds_`) passed to
# `normalize_dataframe` for the LSTM inputs.
# NOTE(review): presumably computed on the LSTM's training data - confirm provenance.
means_ = {'feature_00': 0.640198826789856, 'feature_01': 0.03755598142743111, 'feature_02': 0.6368075609207153, 'feature_03': 0.6365063786506653, 'feature_04': 0.013741530478000641, 'feature_05': -0.02173694409430027, 'feature_06': -0.006415014620870352, 'feature_07': -0.010971736162900925, 'feature_08': -0.04653771221637726, 'feature_09': 32.596106194690265, 'feature_10': 4.95929203539823, 'feature_11': 167.6541592920354, 'feature_12': -0.13415881991386414, 'feature_13': -0.07573335617780685, 'feature_14': -0.12015637010335922, 'feature_15': -0.7470195889472961, 'feature_16': -0.6257441639900208, 'feature_17': -0.7294047474861145, 'feature_18': -0.042215555906295776, 'feature_19': -0.08798160403966904, 'feature_20': -0.15741558372974396, 'feature_21': 0.10528526455163956, 'feature_22': 0.018054703250527382, 'feature_23': 0.03165541961789131, 'feature_24': 2.733017921447754, 'feature_25': 0.39958420395851135, 'feature_26': -0.11045943945646286, 'feature_27': -0.5332594513893127, 'feature_28': -0.4522790312767029, 'feature_29': -0.5739678144454956, 'feature_30': -0.7905704975128174, 'feature_31': 0.10600688308477402, 'feature_32': 0.40044134855270386, 'feature_33': -0.021725023165345192, 'feature_34': 0.4226262867450714, 'feature_35': 0.42143046855926514, 'feature_36': -0.00023802756913937628, 'feature_37': 0.027961043640971184, 'feature_38': 0.010258913040161133, 'feature_39': 0.005768273025751114, 'feature_40': 0.017485467717051506, 'feature_41': 0.038347117602825165, 'feature_42': -0.06123563274741173, 'feature_43': -0.11644423753023148, 'feature_44': -0.12342483550310135, 'feature_45': -0.028769943863153458, 'feature_46': -0.015200662426650524, 'feature_47': 0.015717582777142525, 'feature_48': -0.0033910537604242563, 'feature_49': -0.0052393232472240925, 'feature_50': -0.2285808026790619, 'feature_51': -0.3548349440097809, 'feature_52': -0.358092725276947, 'feature_53': 0.2607136368751526, 'feature_54': 0.18796788156032562, 'feature_55': 0.3154229521751404, 
'feature_56': -0.1471923440694809, 'feature_57': 0.15730056166648865, 'feature_58': -0.021774644032120705, 'feature_59': -0.0037768862675875425, 'feature_60': -0.010220836848020554, 'feature_61': -0.03178725391626358, 'feature_62': -0.3769100308418274, 'feature_63': -0.3229374587535858, 'feature_64': -0.3718394339084625, 'feature_65': -0.10233989357948303, 'feature_66': -0.13688170909881592, 'feature_67': -0.14402112364768982, 'feature_68': -0.06875362992286682, 'feature_69': -0.11862917989492416, 'feature_70': -0.11789549142122269, 'feature_71': -0.06013699993491173, 'feature_72': -0.10766122490167618, 'feature_73': -0.09921672940254211, 'feature_74': -0.10233042389154434, 'feature_75': -0.05991339311003685, 'feature_76': -0.06349952518939972, 'feature_77': -0.07424316555261612, 'feature_78': -0.07759837061166763}
stds_ = {'feature_00': 1.027751088142395, 'feature_01': 1.0967519283294678, 'feature_02': 1.0156300067901611, 'feature_03': 1.0170334577560425, 'feature_04': 1.0726385116577148, 'feature_05': 0.9639211297035217, 'feature_06': 1.0963259935379028, 'feature_07': 1.0789952278137207, 'feature_08': 0.7962697148323059, 'feature_09': 23.72976726545254, 'feature_10': 3.1867162933797224, 'feature_11': 163.44513161352285, 'feature_12': 0.6700984835624695, 'feature_13': 0.5805172920227051, 'feature_14': 0.664044201374054, 'feature_15': 0.37517768144607544, 'feature_16': 0.3393096327781677, 'feature_17': 0.3603287935256958, 'feature_18': 0.9911752939224243, 'feature_19': 1.0550744533538818, 'feature_20': 0.6643751263618469, 'feature_21': 0.38239365816116333, 'feature_22': 0.950261116027832, 'feature_23': 0.8119344711303711, 'feature_24': 1.4362775087356567, 'feature_25': 1.0947270393371582, 'feature_26': 1.077124834060669, 'feature_27': 1.0645726919174194, 'feature_28': 1.0676648616790771, 'feature_29': 0.2640742361545563, 'feature_30': 0.19689509272575378, 'feature_31': 0.3815343976020813, 'feature_32': 1.2996565103530884, 'feature_33': 0.9989405870437622, 'feature_34': 1.3409572839736938, 'feature_35': 1.3365675210952759, 'feature_36': 0.8695492148399353, 'feature_37': 0.7334080934524536, 'feature_38': 0.698810338973999, 'feature_39': 0.7965824604034424, 'feature_40': 0.518515944480896, 'feature_41': 0.6384949088096619, 'feature_42': 0.8168442249298096, 'feature_43': 0.5228385925292969, 'feature_44': 0.6521403193473816, 'feature_45': 0.8666537404060364, 'feature_46': 0.9039222002029419, 'feature_47': 3.2711963653564453, 'feature_48': 0.6570901274681091, 'feature_49': 0.7083076238632202, 'feature_50': 1.0132617950439453, 'feature_51': 0.6081287860870361, 'feature_52': 0.9250587224960327, 'feature_53': 1.0421689748764038, 'feature_54': 0.5859629511833191, 'feature_55': 0.9191848039627075, 'feature_56': 0.9549097418785095, 'feature_57': 1.0204777717590332, 'feature_58': 
0.8327276110649109, 'feature_59': 0.8309783339500427, 'feature_60': 0.8389413356781006, 'feature_61': 1.192766547203064, 'feature_62': 1.388945460319519, 'feature_63': 0.09957146644592285, 'feature_64': 0.3396177291870117, 'feature_65': 1.01683509349823, 'feature_66': 1.0824761390686035, 'feature_67': 0.642227828502655, 'feature_68': 0.5312599539756775, 'feature_69': 0.6208390593528748, 'feature_70': 0.6724499464035034, 'feature_71': 0.5356909036636353, 'feature_72': 0.6534596681594849, 'feature_73': 1.0855497121810913, 'feature_74': 1.0880277156829834, 'feature_75': 1.2321789264678955, 'feature_76': 1.2345560789108276, 'feature_77': 1.0921478271484375, 'feature_78': 1.0924347639083862}
def normalize_dataframe(df: pl.DataFrame, means: dict, stds: dict) -> pl.DataFrame:
    """
    Z-score the columns of a Polars DataFrame using precomputed statistics.

    Args:
    df (pl.DataFrame): frame to normalize
    means (dict): column name -> mean
    stds (dict): column name -> standard deviation

    Returns:
    pl.DataFrame: frame with the same columns in the same order; a column is
    scaled as (x - mean) / std when it has both statistics, only centered when
    its std is 0, and passed through unchanged when a statistic is missing.
    """

    def _expression_for(name):
        column = pl.col(name)
        # No statistics available: keep the column untouched.
        if name not in means or name not in stds:
            return column
        centered = column - means[name]
        # Zero spread: centering only, to avoid dividing by zero.
        if stds[name] == 0:
            return centered.alias(name)
        return (centered / stds[name]).alias(name)

    return df.select([_expression_for(name) for name in df.columns])

In [22]:
# Pre-trained LightGBM booster restored from the model file in CONFIG.data_paths[0].
lgbm_original = lgb.Booster(model_file=CONFIG.data_paths[0])

In [23]:
# Params used to retrain
input_params = dict(num_leaves=31, feature_fraction=0.9, n_estimators=100, learning_rate=0.1)

# Full LightGBM parameter set: fixed objective/metric/booster (RMSE-scored
# regression with gradient-boosted trees) plus the tunables from `input_params`.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    **{
        key: input_params[key]
        for key in ('num_leaves', 'learning_rate', 'feature_fraction', 'n_estimators')
    },
}

In [24]:
# Module-level state shared across calls of `predict`.

# Most recent non-None `lags` frame handed to `predict`.
lags_ : pl.DataFrame | None = None

lags_history = None

# lgb online retrain global variables start

# Initialize global vars
cache = None
cache_list = []
# Total number of days seen so far; incremented whenever `predict` receives lags.
day_count = 0
# Training counter, to be reset after each retrain; incremented together with day_count.
train_counter = 0
lgbm_retrained = None

# Accumulated ground truth: responder_6 values taken from the lag frames.
labels : pl.DataFrame | None = None
# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.

# end

# Online re-training schedule used by _online_lgb_predictions().
_RETRAIN_MIN_DAYS = 60     # no re-training before this many lag deliveries
_RETRAIN_PERIOD = 4        # re-train when train_counter is a multiple of this
_CACHE_KEEP_DAYS = 40      # distinct date_ids kept in `cache` after a re-train

_LAG_JOIN_KEYS = ["date_id", "symbol_id"]
_LAG_FEATURE_COLS = [f"responder_{idx}_lag_1" for idx in range(9)]


def _record_lag_labels(lags: pl.DataFrame) -> None:
    """Store the newest lags frame and bank its responder_6 values as labels.

    Runs once per batch that carries lags. Advances ``day_count`` and
    ``train_counter`` and appends ``responder_6_lag_1`` (renamed to
    ``responder_6``) to the global ``labels`` frame.
    """
    global lags_, day_count, train_counter, labels

    lags_ = lags
    day_count += 1
    train_counter += 1

    # The lag responders are the ground truth of the previous day.
    update_labels = lags_.select(
        ["date_id", "symbol_id", "time_id", "responder_6_lag_1"]
    ).rename({"responder_6_lag_1": "responder_6"})
    if labels is None:
        labels = update_labels
    else:
        labels = pl.concat([labels, update_labels], rechunk=True)


def _lstm_predictions(test: pl.DataFrame) -> np.ndarray:
    """Run the LSTM model on the selected, imputed and normalized features.

    Raises:
        ValueError: if any column of ``sel_cols`` is absent from ``test``.
    """
    missing_cols = set(sel_cols) - set(test.columns)
    if missing_cols:
        raise ValueError(f"Missing columns in test data: {missing_cols}")

    # Forward-fill within the batch, then zero-fill whatever is still missing.
    features_lstm = test.select(sel_cols).fill_null(strategy='forward').fill_null(0)
    features_lstm = normalize_dataframe(features_lstm, means_, stds_)
    X_test_tensor_lstm = torch.tensor(features_lstm.to_numpy(), dtype=torch.float32).to(device)

    with torch.no_grad():
        outputs_lstm = lstm_model(X_test_tensor_lstm)
        # assumes the model outputs a tensor of shape (batch_size, 1) — TODO confirm
        return outputs_lstm.squeeze().cpu().numpy()


def _online_lgb_predictions(test: pl.DataFrame) -> np.ndarray:
    """Predict with the original LightGBM model, blended with an online re-train.

    Every batch is cached (when ``CONFIG.retrain`` is set). Once at least
    ``_RETRAIN_MIN_DAYS`` lag deliveries have been seen and ``train_counter``
    is a multiple of ``_RETRAIN_PERIOD``, a new booster is fitted on the
    cached rows joined with the banked labels. Because the counter restarts
    at 1, later re-trains fire on every third lag delivery.
    """
    global cache, cache_list, lgbm_retrained, train_counter

    if lags_ is not None:
        # Pick up the last record of the previous date for each symbol.
        daily_lags = (
            lags_.group_by(_LAG_JOIN_KEYS, maintain_order=True).last().drop(["time_id"])
        )
        test_lgb_re = test.join(daily_lags, on=_LAG_JOIN_KEYS, how="left")
    else:
        # No lags seen yet: fall back to zero lag features.
        test_lgb_re = test.with_columns(
            [pl.lit(0.0).alias(col) for col in _LAG_FEATURE_COLS]
        )

    if CONFIG.retrain:
        # Store the data of each batch for later re-training.
        cache_list.append(test_lgb_re)

    X_lgb_re = test_lgb_re[CONFIG.feature_colss].to_numpy()

    if (
        CONFIG.retrain
        and train_counter % _RETRAIN_PERIOD == 0
        and day_count >= _RETRAIN_MIN_DAYS
    ):
        cache_update = pl.concat(cache_list, rechunk=True)
        if cache is None:
            cache = cache_update
        else:
            cache = pl.concat([cache, cache_update], rechunk=True)

        # Lags arrive on the current day but are the ground truth of the
        # previous one, so move the label dates back by one day.
        df = labels.with_columns((pl.col("date_id") - 1).alias("date_id"))
        df = df.filter(pl.col("date_id") >= np.min(cache["date_id"].to_numpy()))
        train = cache.join(df, on=["date_id", "symbol_id", "time_id"], how="left")

        # Drop rows without a usable label (normally the last day): the
        # predicate is null for unmatched rows and filter() discards those too.
        train_cleaned = train.filter(pl.col(CONFIG.target_col).is_not_nan())

        X_train = train_cleaned[CONFIG.feature_colss].to_numpy()
        y_train = train_cleaned[CONFIG.target_col].to_numpy().flatten()
        train_data = lgb.Dataset(X_train, label=y_train)

        # NOTE(review): `params` carries 'n_estimators'; LightGBM documents it
        # as an alias for the number of boosting rounds, so it presumably
        # takes precedence over num_boost_round=40 — confirm which is intended.
        lgbm_retrained = lgb.train(
            params,
            train_data,
            num_boost_round=40
        )
        # Reset the counter, otherwise every time_id of the same day re-trains.
        train_counter = 1
        cache_list = []

        # Keep only the newest _CACHE_KEEP_DAYS distinct days in the cache.
        days = np.unique(cache["date_id"].to_numpy())
        min_day = np.min(days[-_CACHE_KEEP_DAYS:])
        cache = cache.filter(pl.col("date_id") >= min_day)

    # NOTE(review): the re-trained booster is also limited by
    # lgbm_original.best_iteration — confirm this is intended.
    pred_lgbm_original = lgbm_original.predict(
        X_lgb_re, num_iteration=lgbm_original.best_iteration
    )
    if lgbm_retrained is None:
        return pred_lgbm_original

    pred_lgbm_retrained = lgbm_retrained.predict(
        X_lgb_re, num_iteration=lgbm_original.best_iteration
    )
    # Weight the model re-trained on the new data more than the original one.
    return 0.6 * pred_lgbm_retrained + 0.4 * pred_lgbm_original


def _tabm_predictions(test: pl.DataFrame, lags: pl.DataFrame | None) -> np.ndarray:
    """Run the tabular model on encoded, lag-joined and standardized features.

    On the ``time_id == 0`` batch the lags frame is stored in
    ``lags_history``; later batches of the day filter that stored frame by
    their own ``time_id``. ``lags`` must therefore be provided whenever the
    batch's first ``time_id`` is 0.
    """
    global lags_history

    # clone() kept because encode_column is defined elsewhere and may mutate.
    test_tab = test.clone()
    for col in feature_cat + ['symbol_id', 'time_id']:
        test_tab = encode_column(test_tab, col, category_mappings[col])

    # Raw (un-encoded) identifiers are fed to the model as extra categoricals.
    symbol_ids = test.select('symbol_id').to_numpy()[:, 0]
    time_id_array = test.select("time_id").to_numpy()[:, 0]
    # Only the first row is consulted: the batch is assumed to hold one time_id.
    time_id = int(time_id_array[0])

    if time_id == 0:
        lagsss = lags.with_columns(
            pl.col('time_id').cast(pl.Int64),
            pl.col('symbol_id').cast(pl.Int64),
        )
        lags_history = lagsss
        lagsss = lagsss.filter(pl.col("time_id") == 0)
    else:
        lagsss = lags_history.filter(pl.col("time_id") == time_id)
    test_tab = test_tab.join(lagsss, on=["time_id", "symbol_id"], how="left")

    test_tab = test_tab.with_columns([
        pl.col(col).fill_null(0) for col in feature_list + _LAG_FEATURE_COLS
    ])
    test_tab = standardize(test_tab, std_feature, means, stds)

    X_test_tensor = torch.tensor(test_tab[feature_test].to_numpy(), dtype=torch.float32).to(device)
    symbol_tensor = torch.tensor(symbol_ids, dtype=torch.float32).to(device)
    time_tensor = torch.tensor(time_id_array, dtype=torch.float32).to(device)

    # Columns 9-11 of feature_test are treated as categorical, the rest as continuous.
    cat_positions = [9, 10, 11]
    X_cat = X_test_tensor[:, cat_positions]
    X_cont = X_test_tensor[:, [i for i in range(X_test_tensor.shape[1]) if i not in cat_positions]]
    X_cat = (torch.concat([X_cat, symbol_tensor.unsqueeze(-1), time_tensor.unsqueeze(-1)], axis=1)).to(torch.int64)

    tab_model.eval()
    with torch.no_grad():
        outputs = tab_model(X_cont, X_cat)
        # Drop the trailing axis, then average over axis 1
        # (presumably the ensemble members — TODO confirm).
        preds_tab = outputs.squeeze(-1).cpu().numpy()
        return preds_tab.mean(1)


def _ridge_predictions(test: pl.DataFrame) -> np.ndarray:
    """Join the stored lags (last record per date/symbol) and call predict_ridge."""
    lagsrdg = lags_.group_by(_LAG_JOIN_KEYS, maintain_order=True).last()
    testrdg = test.join(lagsrdg, on=_LAG_JOIN_KEYS, how="left")
    return predict_ridge(testrdg, lagsrdg)


def _xgb_predictions(test: pl.DataFrame, lags: pl.DataFrame | None) -> np.ndarray:
    """Predict with the XGB model using per-day aggregated lag statistics.

    When lags are supplied, they are appended to ``history`` and the
    aggregated frame ``lags_infer`` for the current date is rebuilt; batches
    without lags reuse the ``lags_infer`` built earlier.
    """
    global history, lags_infer

    current_date = test.select("date_id").to_numpy()[:, 0][0]

    if lags is not None:
        lagss = lags.rename(CONFIG.lag_cols_rename)
        lagss = lagss.cast(history_column_types)
        lagss = lagss.cast(responder_column_types)

        history = pl.concat([history, lagss])
        # Keep only the most recent N days of history.
        history = history.filter(pl.col("date_id") > (current_date - CONFIG.lag_ndays))

        # The XGB model used here only relies on statistics shifted by 1 day.
        agg_list = create_agg_list(1, CONFIG.lag_target_cols_name)
        shift_n_data = history.filter(pl.col("date_id") == current_date)
        lags_infer = shift_n_data.group_by(_LAG_JOIN_KEYS, maintain_order=True).agg(agg_list)

    test_ = test.cast(history_column_types)
    test_ = test_.cast(feature_column_types)
    # All batches within one date_id use the same lags_infer; such statistical
    # features are built at time_id=0 of each date_id.
    X_test = test_.join(lags_infer, on=_LAG_JOIN_KEYS, how="left")

    preds_xgb = np.zeros((X_test.shape[0],))
    preds_xgb += model.predict(X_test[features].to_pandas().values)
    return preds_xgb


def _attach_batch_lags(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame:
    """Add the lag responder columns used by the offline LightGBM and NN models.

    NOTE(review): this uses the ``lags`` argument rather than the stored
    ``lags_``, so batches that arrive without lags get all-zero lag
    features — confirm this matches how the models were trained.
    """
    if lags is not None:
        lags = lags.group_by(_LAG_JOIN_KEYS, maintain_order=True).last()
        return test.join(lags, on=_LAG_JOIN_KEYS, how="left")
    return test.with_columns([pl.lit(0.0).alias(col) for col in _LAG_FEATURE_COLS])


def _catboost_predictions(test: pl.DataFrame) -> np.ndarray:
    """Predict with the hold-out CatBoost model on stringified features."""
    feat_cat = test[FEAT_COLS_CAT + ['symbol_id', 'weight', 'time_id']].to_pandas()
    feat_cat = feat_cat.fillna('NaN').astype(str)
    return catboost_holdout_model.predict(feat_cat)


def _offline_lgb_predictions(test_nn: pl.DataFrame) -> np.ndarray:
    """Average the predictions of the pre-trained LightGBM models."""
    feat_lgb = test_nn[FEAT_COLS_LGB + ['weight', 'symbol_id', 'time_id']].to_pandas()
    return np.mean([lgb_model.predict(feat_lgb) for lgb_model in lgb_models], axis=0)


def _nn_predictions(test_nn: pl.DataFrame) -> np.ndarray:
    """Average the predictions of the neural-network models."""
    preds_nn = np.zeros((test_nn.shape[0],))
    test_input = test_nn[CONFIG.feature_cols].to_pandas()
    # .ffill() replaces the deprecated fillna(method='ffill').
    test_input = test_input.ffill().fillna(0)
    # NOTE(review): hard-coded "cuda:0" while the other branches use `device`;
    # left as is because the models' placement is not visible here.
    test_input = torch.FloatTensor(test_input.values).to("cuda:0")
    with torch.no_grad():
        for nn_model in tqdm(nn_models):
            nn_model.eval()
            preds_nn += nn_model(test_input).cpu().numpy() / len(nn_models)
    print("predict> nn_preds.shape =", preds_nn.shape)
    return preds_nn


def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction for one batch by blending all model branches.

    Args:
        test: Feature rows of the current batch. Only the first row's
            ``date_id``/``time_id`` are consulted, so the batch is assumed to
            hold a single date and time step.
        lags: Lagged responders, or ``None``. Must be provided on the
            ``time_id == 0`` batch, and on the very first call so the stored
            lag state exists.

    Returns:
        A frame with columns ``row_id`` and ``responder_6``; predictions are
        clipped to [-5, 5].

    Raises:
        ValueError: if the LSTM feature columns are missing from ``test``.
    """
    if lags is not None:
        _record_lag_labels(lags)

    pred_lstm = _lstm_predictions(test)
    pred_online_retrain = _online_lgb_predictions(test)
    preds_tab = _tabm_predictions(test, lags)
    predsrdg = _ridge_predictions(test)
    preds_xgb = _xgb_predictions(test, lags)

    test_nn = _attach_batch_lags(test, lags)
    pred_cat2 = _catboost_predictions(test)
    pred_lgb = _offline_lgb_predictions(test_nn)
    preds_nn = _nn_predictions(test_nn)

    # Final blend.
    # NOTE(review): the weights sum to 1.2, not 1.0 — confirm this is intentional.
    pred = pred_lgb * 0.1 + preds_nn * 0.3 + preds_tab * 0.2 + pred_cat2 * 0.15 + preds_xgb * 0.2 + predsrdg * 0.1  + pred_lstm * 0.05 + pred_online_retrain * 0.1

    # Clip predictions to the range [-5, 5]
    predictions = test.select('row_id').with_columns(
        pl.Series(
            name='responder_6',
            values=np.clip(pred, a_min=-5, a_max=5),
            dtype=pl.Float64
        )
    )

    print(predictions)

    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    assert list(predictions.columns) == ['row_id', 'responder_6']
    assert len(predictions) == len(test)

    return predictions

In [25]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# When the KAGGLE_IS_COMPETITION_RERUN environment variable is set, serve
# requests; otherwise run the local gateway against the test and lags
# parquet files listed below.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )

  0%|          | 0/5 [00:00<?, ?it/s]

predict> nn_preds.shape = (39,)
shape: (39, 2)
┌────────┬─────────────┐
│ row_id ┆ responder_6 │
│ ---    ┆ ---         │
│ i64    ┆ f64         │
╞════════╪═════════════╡
│ 0      ┆ 0.029505    │
│ 1      ┆ 0.048384    │
│ 2      ┆ -0.029644   │
│ 3      ┆ -0.007676   │
│ 4      ┆ 0.026923    │
│ 5      ┆ 0.044062    │
│ 6      ┆ 0.017532    │
│ 7      ┆ 0.059194    │
│ 8      ┆ 0.071777    │
│ 9      ┆ 0.025686    │
│ 10     ┆ 0.016923    │
│ 11     ┆ 0.065242    │
│ 12     ┆ -0.032249   │
│ 13     ┆ 0.031587    │
│ 14     ┆ 0.052508    │
│ 15     ┆ -0.01222    │
│ 16     ┆ 0.030412    │
│ 17     ┆ -0.02157    │
│ 18     ┆ -0.016674   │
│ 19     ┆ 0.013744    │
│ 20     ┆ -0.000098   │
│ 21     ┆ 0.053155    │
│ 22     ┆ 0.018272    │
│ 23     ┆ 0.030464    │
│ 24     ┆ -0.00867    │
│ 25     ┆ -0.167841   │
│ 26     ┆ -0.044406   │
│ 27     ┆ 0.038685    │
│ 28     ┆ -0.029133   │
│ 29     ┆ -0.138711   │
│ 30     ┆ 0.033299    │
│ 31     ┆ -0.044333   │
│ 32     ┆ 0.059728    │
│ 3

In [26]:
# import time

# 1. make valid data

In [27]:
# valid_from = 1577 # for private you should change to 1455 (1 year)
# alltraindata = pl.scan_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet")
# valid_df = alltraindata.filter(pl.col("date_id")>=valid_from).collect()
# valid_df = valid_df.with_columns(pl.Series(range(len(valid_df))).alias("row_id"),
#                                 pl.lit(True).alias("is_scored"))
# valid_df.write_parquet("valid_df.parquet")
# test_sample = pl.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet")
# valid_df = valid_df.select(test_sample.columns)

# 2. make lag function

In [28]:
# lag_sample = pl.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet/date_id=0/part-0.parquet")
# train_sample = pl.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet",n_rows=1)
# responder_cols = [s for s in train_sample.columns if "responder" in s]

# def makelag(date_id):
#     """
#     Build the lag frame from the previous day

#     Args:
#     date_id (int): date_id of the previous day
    
#     Returns:
#     pl.dataframe
#     """
    
#     lag = alltraindata.filter(pl.col("date_id")==date_id).select(["date_id","time_id","symbol_id"] + responder_cols).collect()
#     lag.columns = lag_sample.columns
    
#     return lag

# 3. make the test and lag data for debug

In [29]:
# os.makedirs("./debug/test.parquet",exist_ok=True)
# os.makedirs("./debug/lags.parquet",exist_ok=True)

In [30]:
# total_iterations = len(valid_df["date_id"].unique())
# total_iterations

In [31]:
# for num_days, df_per_day in tqdm(valid_df.group_by("date_id",maintain_order=True),total=total_iterations,desc="Processing"):
    
       
#     day = num_days[0] - valid_from # date_id must start from 0.
    
#     os.makedirs(f"./debug/test.parquet/date_id={day}",exist_ok=True)
#     os.makedirs(f"./debug/lags.parquet/date_id={day}",exist_ok=True)
    
#     lag = makelag(num_days[0] - 1)
    
#     df_per_day.write_parquet(f"./debug/test.parquet/date_id={day}/part-0.parquet")
#     lag.write_parquet(f"./debug/lags.parquet/date_id={day}/part-0.parquet")

# 5. check submission using the evaluation API

In [32]:
# %%time

# inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     inference_server.serve()
# else:
#     inference_server.run_local_gateway(
#         (
#             './debug/test.parquet',
#             './debug/lags.parquet',
#         )
#     )

In [33]:
# all_submission_dataframe = []
# all_inference_times = []
# timeout = 60
# total_iterations = len(valid_df["date_id"].unique())

In [34]:
# ## Step 1 The data is split by day using group_by.

# for num_days, df_per_day in tqdm(valid_df.group_by("date_id",maintain_order=True),total=total_iterations,desc="Processing"):
    
#     ## Step 2 The data is split by time_id using group_by, and the lag is generated (for time_id == 0).
    
#     for time_id, test in df_per_day.group_by("time_id",maintain_order=True):
        
#         ## when time_id == 0, makelags
        
#         start_time = time.time()
        
#         if time_id[0] == 0:
#             lag = makelag(num_days[0] - 1)
#         else:
#             lag = None
        
#         submission_dataframe = predict(test, lag)
        
#         all_submission_dataframe.append(submission_dataframe)
        
#         end_time = time.time()
        
#         diff = end_time - start_time
        
#         all_inference_times.append(diff)
        
#      #   print(f"{num_days[0]=},{time_id[0]=}{diff=}")
        
#         if diff > timeout:
#             print(f"{num_days[0]=},{time_id[0]=}{diff=}")
#             assert elapsed_time < timeout, f"process over {timeout/60} mins. cancelled"
        
# all_submission_dataframe = pl.concat(all_submission_dataframe)
# all_submission_dataframe

In [35]:
# def weighted_zero_mean_r2(y_true, y_pred, weights):
#     """
#     Calculate the sample weighted zero-mean R-squared score.

#     Parameters:
#     y_true (numpy.ndarray): Ground-truth values for responder_6.
#     y_pred (numpy.ndarray): Predicted values for responder_6.
#     weights (numpy.ndarray): Sample weight vector.

#     Returns:
#     float: The weighted zero-mean R-squared score.
#     """
#     numerator = np.sum(weights * (y_true - y_pred)**2)
#     denominator = np.sum(weights * y_true**2)
    
#     r2_score = 1 - numerator / denominator
#     return r2_score

In [36]:
# valid_df = pl.read_parquet("valid_df.parquet")
# y_true = valid_df.select("responder_6").to_numpy().reshape(-1)
# y_pred = all_submission_dataframe.select("responder_6").to_numpy().reshape(-1)
# weights = valid_df.select("weight").to_numpy().reshape(-1)

In [37]:
# weighted_zero_mean_r2(y_true, y_pred, weights)