---

## 0. requirements

In [1]:
# !pip install --target=/home/<user_name>/<venv_name>/lib/python3.10/site-packages <package_name>

## 1. config 설정

#### 1.1. init config

In [2]:
# Run mode of the notebook; the mode cell further down derives the model/data
# directories and the train_mode / infer_mode switches from it.
MODE = "inference"  # train, inference, both
# Kaggle dataset name; used to build the model directory when MODE == "inference".
KAGGLE_DATASET_NAME = "model-version-70"

# Stage switches. They are set here independently of MODE.
# NOTE(review): is_infer and is_pre_test are not read in the visible cells — confirm usage.
is_train = True
is_infer = True
is_pre_test = False

# True: split rows into train/validation by date_id; False: use every row for training.
is_offline = False

# Per-model toggles.
# NOTE(review): only commented-out code references these in the visible cells;
# presumably LightGBM / XGBoost / a neural-network model — confirm.
LGB = True
XGB = False
TF = True

In [3]:
import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter
import functools
import time
from numba import njit, prange
import numba
import pyarrow.parquet as pq
from tqdm import tqdm
import glob
import polars as pl

import joblib
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from functools import partial
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder

from typing import Dict, List, Optional, Tuple
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler,PowerTransformer, FunctionTransformer,minmax_scale,QuantileTransformer
from sklearn.decomposition import PCA,TruncatedSVD,LatentDirichletAllocation
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors
from sklearn.impute import KNNImputer
import traceback
from contextlib import contextmanager
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Subset, random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau

pd.set_option('display.max_rows', None)

warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

#### 1.2. train / inference config

In [4]:
# Show the installed LightGBM and XGBoost versions as the cell output.
lgb.__version__, xgb.__version__

('3.3.2', '2.0.2')

In [5]:
# Small positive constant.
# NOTE(review): not referenced in the visible cells; presumably a division guard — confirm.
EPS = 1e-10

In [6]:
# Reject unknown modes up front so the branches below only handle valid ones.
if MODE not in ("train", "inference", "both"):
    raise ValueError("Invalid mode")

# "both" enables training and inference; the other two modes enable exactly one.
train_mode = MODE in ("train", "both")
infer_mode = MODE in ("inference", "both")

if MODE == "train":
    print("You are in train mode")
    # Local run: models go into a directory named after the current time
    # shifted by +9 hours.
    model_directory = "./models/" + time.strftime("%Y%m%d_%H:%M:%S", time.localtime(time.time() + 9 * 60 * 60))
    data_directory = "./data"
elif MODE == "inference":
    print("You are in inference mode")
    # Models are read from the attached Kaggle dataset.
    model_directory = f'/kaggle/input/{KAGGLE_DATASET_NAME}'
    data_directory = "/kaggle/input/optiver-trading-at-the-close"
else:
    print("You are in both mode")
    # Models are written to (and read back from) the Kaggle working directory.
    model_directory = '/kaggle/working/'
    data_directory = "/kaggle/input/optiver-trading-at-the-close"

You are in inference mode


#### 1.3. model config

In [7]:
# Central configuration dictionary. Only the directory and mode entries are active;
# the remaining keys are kept as commented-out settings.
config = {
    ### default config
    "data_dir": data_directory,
    "model_dir": model_directory,
    "train_mode": train_mode,  # True : train, False : not train
    "infer_mode": infer_mode,  # True : inference, False : not inference

    # ### model config
    # "model_name": ["lgb_b"],  # model name
    # "stacking_mode": False,  # stacking mode or not (a single model is also stacked across its splits)
    # "stacking_algorithm": None,  # "optuna",  # or None
    # 
    # "target": "target",
    # 
    # ### model hyperparameter
    # "optuna_random_state": 42,
    # 
    # ### cv hyperparameter
    # "split_method": "purged",  # time_series, rolling, blocking, holdout
    # "n_splits": 5,  # number of splits
    # "correct": True,  # correct boundary
    # "gap": 0.05,  # gap between train and test (0.05 = 5% of train size)
    # "initial_fold_size_ratio": 0.4,  # initial fold size ratio
    # "train_test_ratio": 0.95,  # train, test ratio
}

#### 1.4. model hyperparameter config

In [8]:
if MODE == "train":
    if not os.path.exists(config["model_dir"]):
        os.makedirs(config["model_dir"])
    if not os.path.exists(config["data_dir"]):
        os.makedirs(config["data_dir"])
    !kaggle competitions download optiver-trading-at-the-close -p {config["data_dir"]} --force
    !unzip -o {config["data_dir"]} /optiver-trading-at-the-close.zip -d {config["data_dir"]}
    !rm {config["data_dir"]} /optiver-trading-at-the-close.zip

In [9]:
def weighted_average(a):
    """Return one weight per element of ``a``; later positions get larger weights.

    The weight at 1-based position ``p`` is ``1 / 2 ** (n + 1 - p)`` with
    ``n = len(a)``, except that position 1 reuses the exponent of position 2.
    Only the length of ``a`` is used, not its values.
    """
    n = len(a)
    # max(p, 2) makes the first position share the exponent of the second one.
    exponents = (n + 1 - max(position, 2) for position in range(1, n + 1))
    return [1 / (2 ** exponent) for exponent in exponents]

In [10]:
def reduce_mem_usage(df, verbose=0):
    """Downcast the numeric columns of ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are overwritten with the downcast versions.
    verbose : int or bool, default 0
        When truthy, print the memory usage before and after and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Notes
    -----
    * Columns whose dtype string starts with ``"int"`` are cast to the first of
      int8/int16/int32/int64 whose limits lie strictly outside the column's min and max.
    * Every other non-object column is cast to float32 (float16 is never used).
    * Object columns are left untouched.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            continue

        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Strict comparisons: a value sitting exactly on a limit moves
                # to the next wider type.
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            df[col] = df[col].astype(np.float32)

    if verbose:
        # Plain print: no logger object is defined in this notebook.
        end_mem = df.memory_usage().sum() / 1024 ** 2
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        print(f"Decreased by {decrease:.2f}%")
    return df


In [11]:
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args


# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time-series cross-validator over groups, with a gap between train and test.

    Groups are ordered by first appearance in ``groups``. The last
    ``n_splits * group_test_size`` groups are cut into consecutive test folds;
    each fold trains on the groups that precede its test groups, minus the
    ``group_gap`` groups directly before the test fold. The gap limits leakage
    when features are built from windows or lags.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set (the most recent
        groups before the gap are kept).
    max_test_group_size : int, default=Inf
        Maximum number of groups in a single test set.
    group_gap : int, default=None
        Number of groups discarded from the end of each training set, directly
        before the test fold. ``None`` means no gap. The same number of
        leading *samples* is also removed from each test set.
    verbose : bool, default=False
        Stored on the instance; not used by ``split``.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    @staticmethod
    def _gather_indices(group_dict, group_labels):
        """Return the sorted sample positions of all ``group_labels`` as an int array."""
        if len(group_labels) == 0:
            return np.array([], dtype=int)
        return np.unique(np.concatenate([group_dict[label] for label in group_labels]))

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group label of every sample; required.

        Yields
        ------
        train : list of int
            Training sample positions for the split, sorted.
        test : list of int
            Test sample positions for the split, sorted.

        Raises
        ------
        ValueError
            If ``groups`` is None, if there are fewer groups than folds, or if
            ``group_gap`` leaves no group to train on for a fold.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        # Work on a plain array so lookups are positional even when a pandas
        # Series with a non-default index is passed.
        groups = np.asarray(groups)

        n_splits = self.n_splits
        group_gap = 0 if self.group_gap is None else self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        # Unique groups in order of first appearance.
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_groups = _num_samples(unique_groups)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # Sample positions of every group.
        group_dict = {}
        for idx, label in enumerate(groups):
            group_dict.setdefault(label, []).append(idx)

        group_test_size = n_groups // n_folds
        if not np.isinf(max_test_group_size):
            group_test_size = min(group_test_size, int(max_test_group_size))
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)

        for group_test_start in group_test_starts:
            train_stop = group_test_start - group_gap
            if train_stop <= 0:
                # A non-positive stop would become an empty or negative slice;
                # a negative one would silently select groups from the future.
                raise ValueError(
                    ("group_gap={0} leaves no training groups before the test"
                     " fold starting at group {1}").format(group_gap,
                                                           group_test_start))
            if np.isinf(max_train_group_size):
                train_start = 0
            else:
                train_start = max(0, train_stop - int(max_train_group_size))

            train_array = self._gather_indices(
                group_dict, unique_groups[train_start:train_stop])
            test_array = self._gather_indices(
                group_dict,
                unique_groups[group_test_start:group_test_start + group_test_size])

            # Kept from the original implementation: this drops the first
            # `group_gap` sample positions (not groups) of the test fold.
            test_array = test_array[group_gap:]

            yield train_array.tolist(), test_array.tolist()

In [12]:
# Load the training data from the configured data directory, so that local
# "train" runs read the downloaded copy instead of a fixed Kaggle path.
df = pd.read_csv(f"{config['data_dir']}/train.csv")
# Rows without a target cannot be used for training; renumber the index afterwards.
df = df.dropna(subset=["target"])
df.reset_index(drop=True, inplace=True)
df_shape = df.shape

In [13]:
from numba import njit, prange


@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Compute ``(max - mid) / (mid - min)`` for every row and column triplet.

    ``df_values`` is indexed as ``[row, column]`` and ``comb_indices`` holds one
    ``(a, b, c)`` column-index triplet per output column. The result has one row
    per input row and one column per triplet; an entry is NaN when the middle
    value equals the minimum.
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    result = np.empty((n_rows, n_triplets))
    # Triplets are independent, so the outer loop is the parallel one.
    for k in prange(n_triplets):
        col_a, col_b, col_c = comb_indices[k]
        for row in range(n_rows):
            x = df_values[row, col_a]
            y = df_values[row, col_b]
            z = df_values[row, col_c]
            highest = max(x, y, z)
            lowest = min(x, y, z)
            # The middle value is whatever remains after removing min and max.
            middle = x + y + z - lowest - highest

            if middle == lowest:
                result[row, k] = np.nan
            else:
                result[row, k] = (highest - middle) / (middle - lowest)

    return result


def calculate_triplet_imbalance_numba(price, df):
    """Build the triplet-imbalance features for every 3-combination of ``price``.

    ``price`` is a list of column names of ``df``. Returns a new DataFrame with
    one ``"{a}_{b}_{c}_imb2"`` column per combination, in ``combinations`` order,
    and a default integer index.
    """
    triplets = list(combinations(price, 3))
    # Positions are taken within `price`, which is also the column order of the value matrix.
    comb_indices = [tuple(price.index(name) for name in triplet) for triplet in triplets]
    features_array = compute_triplet_imbalance(df[price].values, comb_indices)
    column_names = [f"{a}_{b}_{c}_imb2" for a, b, c in triplets]
    return pd.DataFrame(features_array, columns=column_names)


In [14]:
def imbalance_features(df):
    """Add the price/size imbalance, lag, diff, rolling and order-flow features.

    Reads the module-level ``weights`` (stock_id -> weight mapping) and ``MODE``.
    The input frame is modified in place up to the Polars round-trip; after that
    ``df`` is a new pandas frame, and that new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the price columns, the size columns, ``stock_id`` and
        ``imbalance_buy_sell_flag`` referenced below.

    Returns
    -------
    pandas.DataFrame
        Frame with all feature columns added; +/-inf values are replaced by 0.

    Notes
    -----
    The order in which columns are created determines the output column order.
    ``mid_price_movement`` and the final ``*_diff`` / ``OFI`` columns use a plain
    ``.diff()`` on the whole frame (no groupby), unlike the per-stock features.
    """
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # Basic row-wise combinations of the book columns.
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Pairwise normalised difference for every pair of price columns.
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet imbalance over four of the price columns and over the size columns.
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # Weighted wap and its 6-row percentage change per stock.
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)

    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])

    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    # Sign (-1 / 0 / 1) of the 5-row mid_price change; the diff runs over the whole
    # frame in row order, not per stock. NaN differences map to 0.
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))

    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (
            df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']

    # Calculate various statistical aggregation features
    # (row-wise statistics across all price columns and across all size columns)
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # Per-stock lagged values and percentage changes for several window lengths.
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)

    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap', 'price_spread']:
        for window in [1, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    # V4 features: bid-minus-ask difference of the per-window diffs created above.
    for window in [3, 5, 10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    # V5 features: rolling mean/std of the diff columns, computed with Polars.
    # Convert from pandas to Polars
    pl_df = pl.from_pandas(df)

    # Define the windows and columns for which you want to calculate the rolling statistics
    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    # prepare the operations for each column and window
    group = ["stock_id"]
    expressions = []

    # Loop over each window and column to create the rolling mean and std expressions
    # (each expression is evaluated per stock via .over(group))
    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    # Run the operations using Polars' lazy API
    lazy_df = pl_df.lazy().with_columns(expressions)

    # Execute the lazy expressions and overwrite the pl_df variable
    pl_df = lazy_df.collect()

    # Convert back to pandas; from here on `df` is a new object, so the caller's
    # frame no longer receives the columns added below.
    df = pl_df.to_pandas()
    gc.collect()

    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')

    # Both MODE branches are currently no-ops; the revealed-target features are disabled.
    if MODE == "train":
        ## revealed_target
        # df.sort_values(by=['stock_id', 'date_id', 'seconds_in_bucket'], inplace=True)
        # grouped = df.groupby(['stock_id', 'seconds_in_bucket'])
        # df['revealed_target'] = grouped['target'].shift(1)
        # df.sort_index(inplace=True)
        # 
        # df = revealed_features(df)
        pass

    elif MODE == "inference":
        ## revealed_target (just add at the inference time, it will already be there)

        pass

    # One-row differences over the whole frame in row order (no groupby by stock).
    bid_price_diff = df['bid_price'].diff()
    ask_price_diff = df['ask_price'].diff()
    bid_size_diff = df['bid_size'].diff()
    ask_size_diff = df['ask_size'].diff()

    df['bid_price_diff'] = bid_price_diff
    df['ask_price_diff'] = ask_price_diff
    df['bid_size_diff'] = bid_size_diff
    df['ask_size_diff'] = ask_size_diff

    # OFI: bid size change counted when the bid rose or the ask fell, minus the
    # ask size change counted when the bid fell or the ask rose.
    ofi = ((bid_price_diff > 0) | (ask_price_diff < 0)) * bid_size_diff \
          - ((bid_price_diff < 0) | (ask_price_diff > 0)) * ask_size_diff

    df['OFI'] = ofi.fillna(0)

    # Ratios above can produce +/-inf (division by zero); replace them with 0 in every column.
    for col in df.columns:
        df[col] = df[col].replace([np.inf, -np.inf], 0)

    return df


def revealed_features(df):
    """Add ``stock_return``, ``revealed_stock_return`` and ``index_return`` to ``df``.

    * ``stock_return``: within each (stock_id, date_id) group, the wap 6 rows
      ahead divided by the current wap, minus 1, times 10,000.
    * ``revealed_stock_return``: ``stock_return`` of the previous row that shares
      the same stock_id and seconds_in_bucket, after ordering by date_id.
    * ``index_return``: ``revealed_stock_return`` minus ``revealed_target``.

    ``df`` must already contain a ``revealed_target`` column. It is modified in
    place and returned.
    """
    day_keys = ["stock_id", "date_id"]

    ## stock_return
    forward_return = df.groupby(day_keys).apply(
        lambda group: ((group["wap"] / group["wap"].shift(6)).shift(-6) - 1) * 10_000
    )
    # Drop the two group-key levels so the values align with df's own index.
    df['stock_return'] = forward_return.reset_index(level=[0, 1], drop=True)

    ## revealed_stock_return
    df.sort_values(by=['stock_id', 'date_id', 'seconds_in_bucket'], inplace=True)
    df['revealed_stock_return'] = df.groupby(['stock_id', 'seconds_in_bucket'])['stock_return'].shift(1)
    df.sort_index(inplace=True)

    ## index_return
    return_gap = df.groupby(day_keys).apply(
        lambda group: group["revealed_stock_return"] - group["revealed_target"]
    )
    df['index_return'] = return_gap.reset_index(level=[0, 1], drop=True)
    return df


def other_features(df):
    """Add calendar/time columns and the per-stock global statistics to ``df``.

    Reads the module-level ``global_stock_id_feats`` mapping (feature name ->
    Series indexed by stock_id). ``df`` is modified in place and returned.
    """
    bucket_seconds = df["seconds_in_bucket"]

    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = bucket_seconds % 60
    df["minute"] = bucket_seconds // 60
    df['time_to_market_close'] = 540 - bucket_seconds

    # One "global_<name>" column per statistic, looked up by stock_id.
    for feat_name in global_stock_id_feats:
        lookup = global_stock_id_feats[feat_name].to_dict()
        df[f"global_{feat_name}"] = df["stock_id"].map(lookup)

    return df


def generate_all_features(df):
    """Run the full feature pipeline and return only the model input columns.

    Drops ``row_id`` and ``time_id`` from the input, applies
    ``imbalance_features`` followed by ``other_features``, and returns every
    resulting column except ``row_id``, ``target`` and ``time_id``.
    """
    # Identifier columns that must not enter feature generation.
    input_excluded = ["row_id", "time_id"]
    df = df[[name for name in df.columns if name not in input_excluded]]

    df = imbalance_features(df)
    gc.collect()
    df = other_features(df)
    gc.collect()

    # The target (and any leftover identifiers) are not model inputs.
    output_excluded = ["row_id", "target", "time_id"]
    return df[[name for name in df.columns if name not in output_excluded]]


In [15]:
# Per-stock weights, listed by position; position i is the weight of stock_id i.
# NOTE(review): presumably the index weights published for the competition — confirm the source.
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04, 0.002, 0.002, 0.004, 0.04, 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02, 0.004, 0.006, 0.002, 0.02, 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04, 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04, 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02, 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
# Convert the list into a {stock_id: weight} dict so it can be used with Series.map.
weights = {int(k): v for k, v in enumerate(weights)}

In [16]:
if is_offline:
    # Offline validation needs a cut-off date: rows up to and including
    # `split_day` train, later rows validate. No cell in this notebook defines
    # it, so fail with an actionable message instead of a bare NameError.
    if "split_day" not in globals():
        raise NameError(
            "name 'split_day' is not defined: set split_day (last date_id used "
            "for training) before running with is_offline=True"
        )

    df_train = df[df["date_id"] <= split_day]
    df_valid = df[df["date_id"] > split_day]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")

else:
    # Online mode: every row is used for training; no validation frame is created.
    df_train = df
    print("Online mode")


Online mode


In [17]:
if is_train:
    # Per-stock statistics of the training rows; other_features() maps them onto
    # each row by stock_id. Group once and reuse the column selections.
    per_stock = df_train.groupby("stock_id")
    bid_size, ask_size = per_stock["bid_size"], per_stock["ask_size"]
    bid_price, ask_price = per_stock["bid_price"], per_stock["ask_price"]

    global_stock_id_feats = {
        "median_size": bid_size.median() + ask_size.median(),
        "std_size": bid_size.std() + ask_size.std(),
        "ptp_size": bid_size.max() - bid_size.min(),
        "median_price": bid_price.median() + ask_price.median(),
        "std_price": bid_price.std() + ask_price.std(),
        # Note: uses the bid maximum minus the ask minimum.
        "ptp_price": bid_price.max() - ask_price.min(),
    }

    # Training features are built in both modes; only the offline mode also
    # builds (and shrinks) validation features.
    df_train_feats = generate_all_features(df_train)
    if is_offline:
        print("Build Train Feats Finished.")
        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)
    else:
        print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)


Build Online Train Feats Finished.


In [18]:
# if LGB:
#     import lightgbm as lgb
# 
#     # {'max_depth': 11, 'n_estimators': 8400, 'num_leaves': 512, 'subsample': 0.7, 'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.002530954722224875, 'reg_alpha': 0.0006747799093531522, 'reg_lambda': 0.03147134291571846}
# 
#     # {'max_depth': 11, 'n_estimators': 7600, 'num_leaves': 512, 'subsample': 0.8,
#     #  'colsample_bytree': 0.30000000000000004, 'learning_rate': 0.0025882973604023853,
#     #  'reg_alpha': 0.0001034092513169626, 'reg_lambda': 0.10112056628774735}
# 
#     lgb_params = {
#         "objective": "mae",
#         # "n_estimators": 6000,
#         'n_estimators': 7600,
#         # "num_leaves": 256,
#         'num_leaves': 512,
#         # "subsample": 0.6,
#         'subsample': 0.8,
#         # "colsample_bytree": 0.8,
#         'colsample_bytree': 0.3,
#         # "learning_rate": 0.01,
#         'learning_rate': 0.0025882973604023853,
#         'max_depth': 11,
#         "n_jobs": 4,
#         "device": "gpu",
#         "verbosity": -1,
#         "importance_type": "gain",
#         # "reg_alpha": 0.2,
#         'reg_alpha': 0.0001034092513169626,
#         # "reg_lambda": 3.25
#         'reg_lambda': 0.10112056628774735,
#     }
# 
#     gkf = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
# 
#     X = df_train_feats.drop(['date_id'], axis=1)
#     y = df_train['target']
#     X_date_id = df_train['date_id']
#     for fold_i, (train_idx, valid_idx) in enumerate(gkf.split(X, y, X_date_id)):
#         print(f"Fold {fold_i + 1}")
#         tr_X, tr_y = df_train_feats.iloc[train_idx], y[train_idx]
#         val_X, val_y = df_train_feats.iloc[valid_idx], y[valid_idx]
#         print(tr_X.shape, tr_y.shape, val_X.shape, val_y.shape)
#         lgb_model = lgb.LGBMRegressor(**lgb_params)
#         lgb_model.fit(
#             tr_X,
#             tr_y,
#             eval_set=[(val_X, val_y)],
#             callbacks=[
#                 lgb.callback.early_stopping(stopping_rounds=100),
#                 lgb.callback.log_evaluation(period=100),
#             ],
#         )
#         # Save the model to a file
#         joblib.dump(lgb_model, f"{config['model_dir']}/lgb_models_{fold_i}.pkl")

In [19]:
# # whole data train
# if LGB:
#     import lightgbm as lgb
# 
#     lgb_model = lgb.LGBMRegressor(**lgb_params)
#     lgb_model.fit(
#         df_train_feats,
#         df_train['target'],
#         verbose=True,
#     )
#     # Save the model to a file
#     joblib.dump(lgb_model, f"{config['model_dir']}/lgb_models.pkl")

In [20]:
# Column names of the generated training features, in frame order.
feature_columns = df_train_feats.columns.tolist()
print(f"Features = {len(feature_columns)}")

Features = 166


In [21]:
# lgb_model_weights = weighted_average(config["n_splits"] + 1, equal_weight=False)
# model_pipeline = ModelPipeline(lgb_model_weights)
# if config["train_mode"]:
#     # 데이터 불러오기
# 
#     df = pd.read_csv(f"{config['data_dir']}/train.csv")
# 
#     # 데이터 전처리
#     data_processor = DataPreprocessor(data=df)
#     df = data_processor.transform()
# 
#     # 사용할 피쳐 엔지니어링 함수 선택
#     feature_engineer = FeatureEngineer(data=df, feature_versions=[
#         'feature_version_time',
#         'feature_version_imbalance_1',
#         'feature_version_imbalance_2_0',
#         'feature_version_imbalance_2_1',
#         'feature_version_imbalance_3',
#         'feature_version_imbalance_6_0',
#         'feature_version_imbalance_6_1',
#         'feature_version_imbalance_7',
#         'feature_version_imbalance_8',
#         'feature_version_imbalance_9',
#         # 'feature_version_imbalance_10',
#         # 'feature_version_imbalance_11',
#         # 'feature_version_imbalance_12',
#         # 'feature_version_custom_weight',
#         # 'feature_version_order_flow',
#     ],
#                                        dependencies=dependencies)
#     feature_engineer.generate_global_features(data=df)
#     df = feature_engineer.transform(save=True)  # 맨 처음에는 save=True 돌렸으면, 다음부턴 transform(load=True)로 바꾸면된
#     df_copy = df.copy()
#     splitter = Splitter(method=config["split_method"], n_splits=config["n_splits"], correct=config["correct"],
#                         initial_fold_size_ratio=config["initial_fold_size_ratio"],
#                         train_test_ratio=config["train_test_ratio"], gap=config["gap"])
#     for idx, (X_train, y_train, X_test, y_test) in enumerate(splitter.split(data=df, p_gap=5)):
#         print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
#         model_pipeline.train(idx=idx, X_train=X_train, y_train=y_train, X_valid=X_test, y_valid=y_test)
#         model_pipeline.predict(idx=idx, X_test=X_test)
#         if config["stacking_mode"] and len(config["model_name"]) > 1:  # 각 폴드마다 stacking
#             model_pipeline.stacking(idx=idx, y_test=y_test)
#     if config["stacking_mode"] and len(config["model_name"]) == 1:  # single model 에 대한 stacking
#         model_pipeline.stacking(idx=-1, y_test=y_test,
#                                 X_test=X_test)  # stacking with last fold. if you want you can stacking with all folds
#     # print(df_copy.shape, df_copy["target"].shape)
#     model_pipeline.train(idx=config["n_splits"] + 1, X_train=df_copy.drop(columns=['target', 'date_id_copy']), y_train=df_copy["target"], X_valid=None, y_valid=None)
#     model_pipeline.save_models()
#     model_pipeline.save_optuna_weights()
#     splitter.visualize_splits()

In [22]:
def imputer(df):
    """Fill missing ``far_price`` and ``near_price`` values with the column means.

    ``df`` is modified in place. Returns three results: the frame, the
    ``far_price`` mean and the ``near_price`` mean (both computed before filling).
    """
    column_means = {}
    for column in ('far_price', 'near_price'):
        column_means[column] = df[column].mean()
        df[column] = df[column].fillna(column_means[column])

    return df, column_means['far_price'], column_means['near_price']

def add_missing_data(df, n_stocks=200):
    """Insert rows for stocks absent from a ``time_id`` and back-fill their values.

    For every ``time_id``, each stock id in ``range(n_stocks)`` without a row
    gets a new row carrying that time step's ``date_id`` and
    ``seconds_in_bucket`` (taken from the last row of the time step). All other
    columns of the new rows start as NaN and are back-filled per stock from the
    next available row; values with no later row stay NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``stock_id``, ``date_id``, ``seconds_in_bucket`` and ``time_id``.
    n_stocks : int, default 200
        Size of the stock universe; stock ids are ``0 .. n_stocks - 1``.

    Returns
    -------
    pandas.DataFrame
        New frame ordered by ``stock_id`` then ``time_id`` with a fresh index.
    """
    all_stock_ids = set(range(n_stocks))
    key_cols = ['stock_id', 'date_id', 'seconds_in_bucket', 'time_id']

    # Collect the missing (stock, time) keys as plain tuples and build a single
    # frame at the end instead of one small DataFrame per time_id.
    missed_rows = []
    for t, group in df.groupby('time_id'):
        date_id = group['date_id'].iloc[-1]
        seconds_in_bucket = group['seconds_in_bucket'].iloc[-1]
        present_stock_ids = set(group['stock_id'].tolist())
        for stock_id in sorted(all_stock_ids - present_stock_ids):
            missed_rows.append((stock_id, date_id, seconds_in_bucket, t))

    all_missed_data = pd.DataFrame(missed_rows, columns=key_cols).astype(int)

    df = pd.concat([df, all_missed_data], axis=0)
    df = df.sort_values(by=['time_id', 'stock_id']).reset_index(drop=True)

    # Back-fill inside each stock. GroupBy.bfill keeps the row order and every
    # column, unlike groupby.apply whose output layout depends on the pandas
    # version, and it replaces the deprecated fillna(method='bfill').
    value_cols = [col for col in df.columns if col != 'stock_id']
    df[value_cols] = df.groupby('stock_id')[value_cols].bfill()

    # Rows are already time-ordered, so a stable sort on stock_id yields
    # (stock_id, time_id) order.
    return df.sort_values(by='stock_id', kind='stable').reset_index(drop=True)

def sizesum_and_pricestd(df):
    """Add 10 rolling features per stock: size sums and price standard deviations.

    For each stock_id, over a window of up to 8 rows (``min_periods=1``):
    ``<col>_rolled_sum`` for the four size columns and ``<col>_rolled_std`` for
    the six price columns (undefined std values are stored as 0). ``df`` is
    modified in place and returned.
    """
    price_ftrs = ['reference_price', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap']  # std
    size_ftrs = ['imbalance_size', 'matched_size', 'bid_size', 'ask_size']  # sum

    # (source columns, output suffix, reduction); sums first, then stds, which
    # fixes the order of the new columns.
    feature_specs = (
        (size_ftrs, 'rolled_sum', lambda roller: roller.sum()),
        (price_ftrs, 'rolled_std', lambda roller: roller.std().fillna(0)),
    )
    for source_cols, suffix, reduce_window in feature_specs:
        roller = df[['stock_id'] + source_cols].groupby('stock_id').rolling(window=8, min_periods=1)
        # Drop the stock_id index level so the result aligns with df's index.
        stats = reduce_window(roller).reset_index(level=0, drop=True)
        for col in source_cols:
            df[f'{col}_{suffix}'] = stats[col]

    return df

def remove_element(input_list, drop_list):
    """Return a new list with the items of ``input_list`` that are not in ``drop_list``.

    Order and duplicates of the kept items are preserved.
    """
    kept = []
    for item in input_list:
        if item in drop_list:
            continue
        kept.append(item)
    return kept

In [23]:
# Reload the raw training data and keep only the rows that have a target.
train = pd.read_csv(f"{config['data_dir']}/train.csv")
train = train[train['target'].notna()]

# Fill far/near price gaps with the column means (the means are kept in
# far_price_mean / near_price_mean), then add rows for missing stocks.
train, far_price_mean, near_price_mean = imputer(train)
train = add_missing_data(train)
print('결측치：', train.isnull().sum().sum())

# Rolling size sums and price standard deviations per stock.
train = sizesum_and_pricestd(train)

# Every column that is not an identifier or the target is a model feature.
target_col = 'target'
no_feature_cols = ['date_id', 'row_id', 'time_id', 'target', 'currently_scored']
feature_cols = remove_element(train.columns, no_feature_cols)

print('피처 수：', len(feature_cols))

# Scaling: map each feature onto an approximately normal distribution.
scaler = QuantileTransformer(output_distribution='normal', n_quantiles=30000, subsample=500000)
scaled_data = scaler.fit_transform(train[feature_cols])
train[feature_cols] = pd.DataFrame(scaled_data, columns=feature_cols)

# Convert the whole frame to float32.
train = train.astype('float32')

# Length of each input sequence.
seq_len = 8

# One group per stock_id (the variable name says "time", but the grouping key
# is the stock); rows are ordered by time_id inside generate_data.
grouped_by_time = train.groupby('stock_id')

def generate_data(grouped_by_time, seq_len, feature_columns=None):
    """Yield ``(windows, targets)`` for every group of a grouped DataFrame.

    Each group is sorted by ``time_id`` and every row becomes one window of
    ``seq_len`` consecutive rows ending at that row. Rows that have fewer than
    ``seq_len - 1`` predecessors are left-padded with copies of the group's
    first row. With ``seq_len == 5`` the first three windows of a group are
    ``11111``, ``11112``, ``11123`` (numbers are row positions).

    Groups shorter than ``seq_len`` are handled the same way, so the number of
    windows always equals the number of rows (and therefore of targets).

    Args:
        grouped_by_time: A pandas GroupBy object (despite the name, the caller
            in this notebook groups by ``stock_id``).
        seq_len: Window length in rows; expected to be >= 1.
        feature_columns: Columns to use as features. Defaults to the
            module-level ``feature_cols``.

    Yields:
        Tuple ``(features_array, target)`` where ``features_array`` has shape
        ``(n_rows, seq_len, n_features)`` and ``target`` holds the group's
        ``target`` values in ``time_id`` order.
    """
    cols = feature_cols if feature_columns is None else list(feature_columns)

    for _, group in grouped_by_time:
        # Sort by time_id so that every window runs forward in time.
        group_sorted = group.sort_values(by='time_id')
        features = group_sorted[cols].values
        n_rows = features.shape[0]

        # Left-pad with seq_len - 1 copies of the first row, then slide a
        # window of seq_len rows: window i ends exactly at original row i.
        padding = np.repeat(features[:1], seq_len - 1, axis=0)
        padded = np.concatenate([padding, features], axis=0)
        windows = [padded[start:start + seq_len] for start in range(n_rows)]

        if windows:
            features_array = np.stack(windows)
        else:
            # Empty group: keep the 3-D layout so callers can still concatenate.
            features_array = np.empty((0, seq_len, features.shape[1]), dtype=features.dtype)

        target = group_sorted['target'].values
        # Yield per group so that the caller decides when to materialise everything.
        yield features_array, target

# Use generator to iterate over data
data_generator = generate_data(grouped_by_time, seq_len=seq_len)

# Materialise one (windows, targets) pair per stock.
datas, labels = zip(*data_generator)

print(len(datas),datas[0].shape,datas[1].shape)

# Concatenate along the sample axis. Unlike np.array(...).reshape(...), this
# does not require every stock to contribute the same number of rows.
data = np.concatenate(datas, axis=0)
label = np.concatenate(labels, axis=0)
print('data_seq_to_reshaped', data.shape, 'label_shape', label.shape)
#del train, datas, labels, grouped_by_time

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device：', device)


# The whole dataset is moved to the device up front, so the loaders below
# yield batches that already live on `device`.
data = torch.tensor(data, dtype=torch.float32).to(device)
label = torch.tensor(label, dtype=torch.float32).to(device)

print('데이터 형태', data.shape)
print('라벨 형태', label.shape)

# Split into train / validation sets
torch.manual_seed(42)

dataset = TensorDataset(data, label)

train_ratio = 0.8
train_size = int(train_ratio * len(dataset))
valid_size = len(dataset) - train_size
# NOTE(review): random_split assigns windows at random, and neighbouring
# windows share rows, so near-identical samples land on both sides of the
# split. Validation MAE is probably optimistic; consider a time-based split.
train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

batch_size = 4096

train_loader = DataLoader(train_dataset, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)

print('batch size：', next(iter(train_loader))[0].shape)

결측치： 0
피처 수： 23
200 (26455, 8, 23) (26455, 8, 23)
data_seq_to_reshaped (5291000, 8, 23) label_shape (5291000,)
device： cpu
데이터 형태 torch.Size([5291000, 8, 23])
라벨 형태 torch.Size([5291000])
batch size： torch.Size([4096, 8, 23])


In [24]:
class MyModel(nn.Module):
    """Transformer-encoder regressor producing one value per input sequence.

    ``forward`` only uses ``embedding``, ``tf1.encoder``, ``dropout``,
    ``tf2.encoder`` and ``decoder``. The remaining submodules (``tf0``, ``fc``,
    ``decoder_0``, ``decoder_1``, ``ffnn`` and the decoder halves of the
    ``nn.Transformer`` objects) are created but not used by ``forward``; they
    are kept, in the same creation order, so attribute names and parameter
    initialisation stay unchanged.

    Args:
        feature_num: Number of input features per time step.
        d_model: Embedding / transformer width.
        nhead: Number of attention heads.
        num_layers: Number of encoder layers in ``tf1`` and ``tf2``.
    """

    def __init__(self, feature_num, d_model, nhead, num_layers):
        super().__init__()

        def make_transformer(encoder_layers):
            # nn.Transformer is configured for [batch_size, seq_len, d_model] inputs.
            return nn.Transformer(
                d_model=d_model,
                nhead=nhead,
                num_encoder_layers=encoder_layers,
                batch_first=True,
            )

        half_width = d_model // 2
        wide_width = 2 * d_model

        self.embedding = nn.Linear(feature_num, d_model)
        self.tf0 = make_transformer(2)
        self.tf1 = make_transformer(num_layers)
        self.fc = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(0.5)
        self.tf2 = make_transformer(num_layers)
        self.decoder_0 = nn.Linear(d_model, half_width)
        self.decoder_1 = nn.Linear(half_width, 1)
        self.decoder = nn.Linear(d_model, 1)

        self.ffnn = nn.Sequential(
            nn.Linear(d_model, wide_width),
            nn.ReLU(),
            nn.Linear(wide_width, d_model),
        )

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, feature_num] to [batch_size, 1]."""
        embedded = self.embedding(x)
        encoded = self.tf1.encoder(embedded)
        # Keep only the representation of the last time step: [batch_size, d_model].
        last_step = encoded[:, -1, :]
        # NOTE(review): tf2.encoder receives a 2-D tensor here. PyTorch treats a
        # 2-D input as one unbatched sequence, so this encoder appears to attend
        # across the rows of the batch rather than across time - confirm intent.
        mixed = self.tf2.encoder(self.dropout(last_step))
        return self.decoder(mixed)

In [25]:
import datetime

In [26]:
# if is_train:
#     input_size = data.shape[-1]
#     print(input_size)
#     n_epochs = 50
#     lr = 1e-03
# 
#     # pre mae init
#     pre_epoch_valid_mae = np.inf
# 
#     # When the validation MAE has worsened in two epochs (not necessarily consecutive), multiply the learning rate by 0.75.
#     patience_counter = 0
# 
#     model = MyModel(feature_num=input_size, d_model=64, nhead=8, num_layers=1).to(device)
#     
#     # optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
#     optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
#     loss = nn.L1Loss().to(device)
# 
#     # out_path = "model/"
#     # if not os.path.exists(out_path):
#     #     os.makedirs(out_path)
#     best_mae = np.inf
# 
#     print(f'Train start...')
#     for epoch in range(n_epochs):
#         model.train()
#         train_maes = []
#         batch_num = len(train_loader)
# 
#         # 훈련
#         for X, y in train_loader:
#             optimizer.zero_grad()
#             outputs = model(X).squeeze()
#             l = loss(outputs, y)
#             l.backward()
#             nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
#             optimizer.step()
#             mae = l.item()
#             train_maes.append(mae)
#         epoch_train_mae = np.mean(train_maes)
#         print(f'Epoch [{epoch+1}/{n_epochs}] Training average MAE: {epoch_train_mae:.4f}')
#         train_maes = []
# 
#         # 검증
#         model.eval()
#         with torch.no_grad():
#             valid_maes = []
#             for X_v, y_v in valid_loader:
#                 preds = model(X_v).squeeze()
#                 mae = torch.abs(preds - y_v).mean().item()
#                 valid_maes.append(mae)
#             epoch_valid_mae = np.mean(valid_maes)
#             print(f'Epoch [{epoch+1}/{n_epochs}] Validation average MAE: {epoch_valid_mae:.4f}')
#             
#             if epoch_valid_mae < best_mae:
#                 best_mae = epoch_valid_mae
#                 # torch.save(model, os.path.join(out_path, f"model_epoch_{epoch+1}.pt"))
#     
#                 now = datetime.datetime.now()
#                 time_string = now.strftime("%Y%m%d_%H%M")
#                 torch.save(model, f"{config['model_dir']}/model_epoch_{epoch+1}.pt")
#                 # torch.save(model, f"transformer_model.pt")
#                 
#         # Count epochs whose validation MAE is worse than the previous epoch's; on every second such epoch the learning rate is multiplied by 0.75.
#         if epoch_valid_mae - pre_epoch_valid_mae > 0:
#             patience_counter += 1
# 
#             if patience_counter == 2:
#                 lr = lr * 0.75
#                 patience_counter = 0
#                 for param_group in optimizer.param_groups:
#                     param_group['lr'] = lr  # 학습률 업데이트 
#                     print(f'renew lr to {lr}')
# 
#         # MAE 업데이트
#         pre_epoch_valid_mae = epoch_valid_mae
# 
#         # 분기가 0.03을 초과하거나 학습률이 1e-7보다 낮을 때 훈련을 중단
#         if (epoch_valid_mae - epoch_train_mae > 0.03) or (lr <1e-7):
#             print('Early stop now.')
#             break
#     print(f'Train over.')

### upload kaggle dataset

In [27]:
# # want to see feature importance plot for each fold
# for idx, models in enumerate(model_pipeline.models_list):
#     for model_name, model in zip(config["model_name"], models):
#         if "lgb" in model_name:
#             lgb.plot_importance(model, importance_type="gain", figsize=(20, 20))
#         elif "xgb" in model_name:
#             xgb.plot_importance(model, importance_type="gain", figsize=(20, 20))
#         else:
#             raise ValueError("Invalid model name")
#         plt.title(f"Feature Importance ({model_name})")
#         # plt.savefig(f"{config['model_dir']}/{idx}_{model_name}_feature_importance.png")
#         plt.show()



#### dataset init
! /home/username/.local/bin/kaggle datasets init -p {config['model_dir']}
#### dataset create 
! /home/username/.local/bin/kaggle datasets create -p {config['model_dir']}

In [28]:
# KAGGLE_DATASET_NAME = "model-version-31"

In [29]:
# if MODE == "train":
#     ! /usr/local/bin/kaggle datasets init -p {config['model_dir']}
#     import json

#     with open(f"{config['model_dir']}/dataset-metadata.json", "r") as file:
#         data = json.load(file)

#     data["title"] = data["title"].replace("INSERT_TITLE_HERE", f"{KAGGLE_DATASET_NAME}")
#     data["id"] = data["id"].replace("INSERT_SLUG_HERE", f"{KAGGLE_DATASET_NAME}")

#     with open(f"{config['model_dir']}/dataset-metadata.json", "w") as file:
#         json.dump(data, file, indent=2)

#     ! /usr/local/bin/kaggle datasets create -p {config['model_dir']}

#     # !/usr/local/bin/kaggle datasets version -p {config['model_dir']} -m 'Updated data'

In [30]:
class TestStack:
    """Rolling cache of the most recent test batches, tagged with ``time_id``.

    Every call to :meth:`test_stack` adds one batch (one time step) and returns
    all cached batches stacked into a single DataFrame.
    """

    def __init__(self, window_size=6):
        # The cache holds twice the requested window.
        # NOTE(review): presumably so that rolling features computed on the
        # stacked frame see a full window for each of the last `window_size`
        # rows - confirm.
        self.window_size = window_size * 2
        self.stock_cache = []  # list of per-time-step DataFrames, oldest first

    def test_stack(self, test, time_id):
        """Add one batch to the cache and return the stacked history.

        On the first call the cache is filled with ``window_size`` copies of the
        batch whose ``time_id`` values count up to, and end at, the given
        ``time_id``. On later calls the batch is appended and only the newest
        ``window_size`` batches are kept.

        Args:
            test: DataFrame (or list of dicts) for one time step; must contain
                ``stock_id``. The caller's object is not modified.
            time_id: Identifier of this time step; must increase between calls.

        Returns:
            DataFrame of all cached rows sorted by ``['time_id', 'stock_id']``.
        """
        # Convert batch_data to DataFrame if it's a list of dicts
        if isinstance(test, list):
            test = pd.DataFrame(test)

        # assign() returns a copy, so the caller's frame is left untouched.
        current = test.assign(time_id=time_id)

        self.stock_cache.append(current)

        if len(self.stock_cache) > self.window_size:
            # More than window_size batches cached: keep only the newest ones.
            self.stock_cache = self.stock_cache[-self.window_size:]
        else:
            # First batch: there is no history yet, so pad the cache with
            # copies of this batch at time_id - (window_size - 1) ... time_id.
            self.stock_cache = [
                current.assign(time_id=time_id - lag)
                for lag in reversed(range(self.window_size))
            ]

        stacked = pd.concat(self.stock_cache, axis=0).reset_index(drop=True)
        return stacked.sort_values(['time_id', 'stock_id'])

test_cols = None
def df_to_seq(test, seq_len, feature_columns=None):
    """Turn a stacked test frame into one feature sequence per stock.

    For every ``stock_id`` the rows are sorted by ``time_id`` and the last
    ``seq_len`` rows are kept. A stock with fewer than ``seq_len`` rows is
    left-padded with copies of its oldest row, so every sequence has the same
    length and the result can always be stacked.

    Args:
        test: DataFrame containing ``stock_id``, ``time_id`` and the features.
        seq_len: Number of time steps per sequence.
        feature_columns: Columns to use as features. Defaults to every column
            of ``test`` that is not listed in the module-level
            ``no_feature_cols``.

    Returns:
        Array of shape ``(n_stocks, seq_len, n_features)``; stocks appear in
        ascending ``stock_id`` order.
    """
    # The column list does not depend on the group, so compute it once.
    if feature_columns is None:
        cols = remove_element(test.columns, no_feature_cols)
    else:
        cols = list(feature_columns)

    datas = []
    for _, group in test.groupby('stock_id'):
        group_sorted = group.sort_values(by='time_id')
        features = group_sorted[cols].values[-seq_len:]

        missing = seq_len - features.shape[0]
        if missing > 0:
            # Short history: repeat the oldest available row at the front.
            padding = np.repeat(features[:1], missing, axis=0)
            features = np.concatenate([padding, features], axis=0)

        datas.append(features)

    if not datas:
        # No stocks at all: return an empty array with the usual 3-D layout.
        return np.empty((0, seq_len, len(cols)))
    return np.stack(datas)

def zero_sum(prices, volumes):
    """Shift ``prices`` so that they sum to zero, weighting by sqrt(volume).

    Each element is reduced in proportion to the square root of its volume. If
    the weights sum to zero (e.g. every volume is 0) the weights fall back to
    all ones, so the same amount is subtracted from every element instead of
    dividing by zero.

    Args:
        prices: Array-like of values to adjust.
        volumes: Array-like of non-negative weights, same length as ``prices``.

    Returns:
        The adjusted values; the container type follows normal numpy / pandas
        arithmetic on the inputs (a pandas Series if either input is one).
    """
    weights = np.sqrt(volumes)
    total_weight = np.sum(weights)
    if total_weight == 0:
        # `* 0 + 1` keeps the container type (ndarray or Series) of the weights.
        weights = weights * 0 + 1
        total_weight = np.sum(weights)

    step = np.sum(prices) / total_weight
    return prices - weights * step

In [31]:
if is_infer:
    # Checkpoints are whole pickled modules, so the MyModel class defined above
    # must be available when they are loaded.
    # NOTE(review): torch.load unpickles arbitrary objects - only load
    # checkpoint files from a trusted source.
    model_names = ['model_epoch_50_1.pt']

    models = []
    for model_name in model_names:
        loaded_model = torch.load(f"/kaggle/input/model-epoch-50-1/{model_name}", map_location=device)
        # Force inference mode so dropout is disabled no matter which mode the
        # model was in when it was saved.
        models.append(loaded_model.eval())

In [32]:
if is_pre_test:
    # Pre-submission dry run: replay the example test file one time step at a
    # time through the same steps as the submission loop.
    main_dir = '/kaggle/input/optiver-trading-at-the-close/'

    test_df = pd.read_csv(main_dir + 'example_test_files/test.csv')
    #test_df = test_df.drop(columns=['target'])
    test_group = test_df.groupby(['time_id'])
    tdp = TestStack(window_size=seq_len)

    counter = 0
    for _time_id, test in test_group:
        test = test.drop(columns=['time_id'])

        # Weights for the zero-sum adjustment
        volumes = test.loc[:,'bid_size'] + test.loc[:,'ask_size']

        # Fill missing far/near prices with the values returned by imputer()
        test['far_price'] = test['far_price'].fillna(far_price_mean)
        test['near_price'] = test['near_price'].fillna(near_price_mean)

        # Stack the batch onto the cached history
        test_stack = tdp.test_stack(test, counter)

        # Feature engineering
        test = sizesum_and_pricestd(test_stack)

        # Scaling: use the scaler fitted on the training data, exactly as the
        # submission loop does, and assign the result positionally.
        test_cols = remove_element(test.columns, no_feature_cols)
        test[test_cols] = scaler.transform(test[test_cols])

        # Convert to sequences: (n_stocks, seq_len, n_features)
        test = df_to_seq(test, seq_len)
    #     print(test.shape)

        # Predict: build the input tensor once and average over the models.
        predictions = np.zeros((test.shape[0],))
        test_tensor = torch.tensor(test, dtype=torch.float32).to(device)
        with torch.no_grad():
            for model in models:
                predictions_tmp = model(test_tensor).reshape(-1).cpu().numpy()
                predictions += predictions_tmp

        predictions /= len(models)
        # Zero-sum adjustment
        predictions = zero_sum(predictions, volumes)

        print(predictions.shape)

        counter += 1

In [33]:
if config["infer_mode"]:
    # The blend below needs at least one model family; fail early with a clear
    # message instead of an IndexError inside the loop.
    if not (TF or LGB):
        raise ValueError("At least one of TF or LGB must be enabled to produce predictions")

    if LGB:
        lgb_models = []
        for i in range(5):
            lgb_models.append(joblib.load(f"{config['model_dir']}/lgb_models_{i}.pkl"))
        last_lgb_model = joblib.load(f"{config['model_dir']}/lgb_models.pkl")
        lgb_models.append(last_lgb_model)
        lgb_model_weights = weighted_average(lgb_models)
        print("lgb_models loaded")

    import optiver2023

    # Reset the API's "already called" flag so make_env() can be called again
    # in the same kernel.
    optiver2023.make_env.func_dict['__called__'] = False
    env = optiver2023.make_env()
    iter_test = env.iter_test()

    y_min, y_max = -64, 64
    qps = []
    counter = 0
    cache = pd.DataFrame()


    tdp = TestStack(window_size=seq_len)

    for (test, revealed_targets, sample_prediction) in iter_test:

        now_time = time.time()

        # if not test.currently_scored.iloc[0]:
        #     sample_prediction['target'] = 0
        #     env.predict(sample_prediction)
        #     counter += 1
        #     qps.append(time.time() - now_time)
        #     if counter % 10 == 0:
        #         print(counter, 'qps:', np.mean(qps))
        #     continue
        #
        # Per-model predictions for this batch and their blend weights
        # (parallel lists; a model that is switched off simply does not appear).
        clipped_predictionss = []
        blend_weights = []

        # Untouched copy of the batch for the LightGBM branch; the transformer
        # branch below modifies and rebinds `test`.
        test_ = test.copy()

        if TF:
            volumes = test.loc[:,'bid_size'] + test.loc[:,'ask_size']

            # Fill missing far/near prices with the values returned by imputer()
            test['far_price'] = test['far_price'].fillna(far_price_mean)
            test['near_price'] = test['near_price'].fillna(near_price_mean)

            test_stack = tdp.test_stack(test, counter)
            # print(counter, test_stack.shape)

            test = sizesum_and_pricestd(test_stack)

            test_cols = remove_element(test.columns, no_feature_cols)

            scaled_test_cols = scaler.transform(test[test_cols])
            # Assign positionally: going through a new DataFrame would align on
            # index labels and could silently reorder rows.
            test[test_cols] = scaled_test_cols

            # NOTE(review): df_to_seq returns one sequence per stock in
            # ascending stock_id order; this assumes the batch rows (and hence
            # `volumes` / `sample_prediction`) use the same order - confirm.
            testseq = df_to_seq(test, seq_len)

            predictions = np.zeros((testseq.shape[0],))
            # print('predictions shape', predictions.shape, 'test shape', test.shape)
            # Build the input tensor once; no squeeze, so a batch containing a
            # single stock keeps its batch dimension.
            test2 = torch.tensor(testseq, dtype=torch.float32).to(device)
            with torch.no_grad():
                for model in models:
                    predictions_tmp = model(test2).reshape(-1).cpu().numpy()
                    predictions += predictions_tmp
            predictions /= len(models)

            predictions = zero_sum(predictions, volumes)

            # Despite the name, the transformer predictions are not clipped.
            clipped_predictions = np.asarray(predictions)

            clipped_predictionss.append(clipped_predictions)
            blend_weights.append(0.15)

        if LGB:
            test = test_

            cache = pd.concat([cache, test], ignore_index=True, axis=0)

            if counter > 0:
                # Keep only the 21 most recent rows per stock.
                cache = cache.groupby(['stock_id']).tail(21).sort_values(
                    by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
            feat = generate_all_features(cache)[-len(test):]
            feat = feat.drop(columns=["currently_scored"])
            print(f"Feat Shape is: {feat.shape}")

            lgb_predictions = np.zeros(len(test))
            for model, weight in zip(lgb_models, lgb_model_weights):
                lgb_predictions += weight * model.predict(feat[feature_columns])

            clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
            clipped_predictionss.append(clipped_predictions)
            blend_weights.append(0.85)

        # Weighted blend of whichever models are enabled. With both enabled
        # this is 0.15 * transformer + 0.85 * LightGBM; with only one enabled
        # the weights are renormalised so that model is used on its own.
        total_weight = sum(blend_weights)
        sample_prediction['target'] = sum(
            weight * preds for weight, preds in zip(blend_weights, clipped_predictionss)
        ) / total_weight


        print(counter)
        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    # NOTE(review): 1.146 presumably converts mean seconds per iteration into
    # hours for the full hidden test run (about 4125 iterations / 3600 s) - confirm.
    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")

lgb_models loaded
This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Feat Shape is: (200, 166)
0
Feat Shape is: (200, 166)
1
Feat Shape is: (200, 166)
2
Feat Shape is: (200, 166)
3
Feat Shape is: (200, 166)
4
Feat Shape is: (200, 166)
5
Feat Shape is: (200, 166)
6
Feat Shape is: (200, 166)
7
Feat Shape is: (200, 166)
8
Feat Shape is: (200, 166)
9
10 qps: 2.7927133798599244
Feat Shape is: (200, 166)
10
Feat Shape is: (200, 166)
11
Feat Shape is: (200, 166)
12
Feat Shape is: (200, 166)
13
Feat Shape is: (200, 166)
14
Feat Shape is: (200, 166)
15
Feat Shape is: (200, 166)
16
Feat Shape is: (200, 166)
17
Feat Shape is: (200, 166)
18
Feat Shape is: (200, 166)
19
20 qps: 2.7677748441696166
Feat Shape is: (200, 166)
20
Feat Shape is: (200, 166)
21
Feat Shape is: (200, 166)
22
Feat Shape is: (200, 166)
23
Feat Shape is: (200, 166)
24
Feat Shape is: (200, 166)
25
Feat Shape is: (200, 166)
26
Feat Shape is: (200, 166)
27
F