In [1]:
import joblib

import pandas as pd
import numpy as np
import polars as pl
from sklearn.preprocessing import StandardScaler
from numba import njit, prange
from itertools import combinations

from utils import reduce_memory, config

In [2]:
# Project-wide settings object. Cells in this notebook read CONFIG.main,
# CONFIG.skip_dates, CONFIG.feature_names, CONFIG.exogeneous_features and
# CONFIG.target from it.
CONFIG = config.CONFIG

# Raw responder columns: responder_0 ... responder_8.
lag_cols_original = ["responder_" + str(idx) for idx in range(9)]

# Raw responder name -> name of its 1-day-lagged counterpart.
lag_cols_rename = {raw: raw + "_lag_1" for raw in lag_cols_original}

# The nine lagged responders, followed by their rolling-skew variants
# (window sizes 3, 4 and 5 for each lagged responder, in responder order).
lag_features = list(lag_cols_rename.values()) + [
    f"{lagged}_skew_{window}"
    for lagged in lag_cols_rename.values()
    for window in (3, 4, 5)
]

# Columns that are combined three at a time into interaction features.
interaction_features = ["feature_07", "feature_61", "feature_04", "feature_60"]
interaction_features.append("cos_time_id")

# Full model input: the 79 base features, the cyclical time encodings and
# the lag-derived features, in that order.
feature_names = [
    *("feature_%02d" % i for i in range(79)),
    "sin_time_id",
    "cos_time_id",
    "sin_time_id_halfday",
    "cos_time_id_halfday",
    *lag_features,
]

In [3]:
def read_data():
    """Load the ten training partitions and add cyclical time encodings.

    Each partition is read from ``CONFIG.main``, has its missing values
    replaced by 0 and is passed through ``reduce_memory.reduce_mem_usage``
    before everything is concatenated into one frame.

    Returns:
        pandas.DataFrame: all partitions stacked, with ``sin_time_id``,
        ``cos_time_id``, ``sin_time_id_halfday`` and ``cos_time_id_halfday``
        added, restricted to rows with ``date_id >= CONFIG.skip_dates``.
    """
    partitions = []
    for i in range(10):
        part = pd.read_parquet(
            f"{CONFIG.main}/data/train.parquet/partition_id={i}/part-0.parquet"
        ).fillna(0)
        partitions.append(reduce_memory.reduce_mem_usage(part))

    # ignore_index gives the stacked frame unique row labels; the partitions
    # are read independently and presumably each start at label 0 — without
    # this the result carries duplicated labels.
    full = pd.concat(partitions, ignore_index=True)
    del partitions

    # Sine/cosine encodings of time_id with periods 967 and 483.
    # NOTE(review): 967 looks like the number of time steps per day and 483
    # roughly half of it — confirm against the data.
    full["sin_time_id"] = np.sin(2 * np.pi * full["time_id"] / 967)
    full["cos_time_id"] = np.cos(2 * np.pi * full["time_id"] / 967)
    full["sin_time_id_halfday"] = np.sin(2 * np.pi * full["time_id"] / 483)
    full["cos_time_id_halfday"] = np.cos(2 * np.pi * full["time_id"] / 483)

    # Skip the earliest dates. The previous version compared the row index
    # with CONFIG.skip_dates, which drops rows by position instead of by date;
    # date_id is a regular column (it is read as full["date_id"] elsewhere in
    # this notebook), so filter on it directly.
    full = full[full["date_id"] >= CONFIG.skip_dates]
    return full

In [7]:
def feature_engineering(df):
    """Build lagged responder features keyed by (date_id, symbol_id).

    Args:
        df: polars DataFrame holding ``date_id``, ``symbol_id`` and the raw
            responder columns listed in ``lag_cols_rename``.

    Returns:
        pandas.DataFrame: one row per (date_id, symbol_id) with the renamed
        ``*_lag_1`` columns and their rolling-skew columns for windows 3-5,
        nulls replaced by 0, passed through ``reduce_mem_usage``.
    """
    lagged_names = list(lag_cols_rename.values())

    # Rename responders to their *_lag_1 names and move every row to the next
    # date_id, so that joining on date_id attaches the previous day's values.
    shifted = df.rename(lag_cols_rename).with_columns(
        date_id=pl.col("date_id") + 1,
    )

    # Keep the last row of each (date_id, symbol_id) group, in first-seen order.
    daily_last = shifted.group_by(["date_id", "symbol_id"], maintain_order=True).last()

    # Rolling skew of every lagged responder, computed within each symbol.
    skew_exprs = [
        pl.col(name)
        .rolling_skew(window_size=window)
        .over("symbol_id")
        .alias(f"{name}_skew_{window}")
        for name in lagged_names
        for window in range(3, 6)
    ]

    features = daily_last.with_columns(*skew_exprs).fill_null(0)

    return reduce_memory.reduce_mem_usage(features.to_pandas())


# 📊 Numba kernel: sum / product / difference / division for every column triplet
@njit(parallel=True)
def compute_triplet_operations(df_values, comb_indices):
    """Evaluate four arithmetic combinations for each triplet of columns.

    Args:
        df_values: 2-D array indexed as ``[row, column]``.
        comb_indices: sequence of ``(a, b, c)`` column-index triplets.

    Returns:
        Array of shape ``(rows, 4 * len(comb_indices))``. Triplet ``t`` fills
        columns ``4t .. 4t+3`` with ``a+b+c``, ``a*b*c``, ``a-b-c`` and
        ``a/b/c``; the division slot is NaN when ``b`` or ``c`` equals zero.
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)

    # Four output columns per triplet.
    out = np.empty((n_rows, n_triplets * 4))

    # Triplets are processed in parallel; each one writes only its own columns.
    for t in prange(n_triplets):
        a, b, c = comb_indices[t]
        base = t * 4

        for r in range(n_rows):
            x = df_values[r, a]
            y = df_values[r, b]
            z = df_values[r, c]

            out[r, base] = x + y + z
            out[r, base + 1] = x * y * z
            out[r, base + 2] = x - y - z

            # Guard the chained division against a zero divisor.
            if y == 0 or z == 0:
                out[r, base + 3] = np.nan
            else:
                out[r, base + 3] = x / y / z

    return out


# 📈 Function to calculate triplet features for given columns and DataFrame
def calculate_triplet_operations(feature, arr):
    """Build a DataFrame of triplet interaction features.

    Args:
        feature: column names; position ``k`` names column ``k`` of ``arr``.
        arr: 2-D numeric data (NumPy array or pandas DataFrame) with at least
            ``len(feature)`` columns.

    Returns:
        pandas.DataFrame with four columns per 3-combination of ``feature``
        (``<a>_<b>_<c>_sum``, ``_prod``, ``_diff``, ``_div``) and one row per
        row of ``arr``. With fewer than three names the frame has no columns.

    Raises:
        ValueError: if ``arr`` is not 2-D or has fewer columns than names.
    """
    names = list(feature)

    # The numba kernel cannot type a pandas DataFrame, so always hand it a
    # plain ndarray (a no-op when the caller already passed one).
    values = np.asarray(arr)
    if values.ndim != 2:
        raise ValueError(f"arr must be 2-D, got {values.ndim} dimension(s)")
    if values.shape[1] < len(names):
        # The compiled kernel would otherwise index past the last column.
        raise ValueError(
            f"arr has {values.shape[1]} columns but {len(names)} feature names were given"
        )

    # Positional triplets; identical to looking each name up with
    # feature.index() when the names are unique.
    triplets = list(combinations(range(len(names)), 3))
    if not triplets:
        # Nothing to combine, and the kernel cannot type an empty list.
        return pd.DataFrame(index=pd.RangeIndex(values.shape[0]))

    # Calculate the triplet operations (sum, product, difference, division)
    features_array = compute_triplet_operations(values, triplets)

    # Column order matches the kernel's output layout: four ops per triplet.
    columns = [
        f"{names[a]}_{names[b]}_{names[c]}_{op}"
        for a, b, c in triplets
        for op in ("sum", "prod", "diff", "div")
    ]

    return pd.DataFrame(features_array, columns=columns)

In [None]:
full = read_data()

# feature_engineering() shifts date_id and groups by (date_id, symbol_id), so
# those key columns must accompany the responder columns. Selecting columns
# before resetting the index avoids copying the whole frame.
# NOTE: this rebinds `lag_features` (until now the list of lag feature names)
# to a polars DataFrame; `feature_names` was already built from the list.
lag_features = pl.from_pandas(
    full[["date_id", "symbol_id"] + lag_cols_original].reset_index(drop=True)
)

Memory usage of dataframe is 654.51 MB
Memory usage after optimization is: 335.60 MB
Decreased by 48.73%
Memory usage of dataframe is 944.04 MB
Memory usage after optimization is: 484.06 MB
Decreased by 48.73%
Memory usage of dataframe is 1022.35 MB
Memory usage after optimization is: 524.21 MB
Decreased by 48.73%
Memory usage of dataframe is 1352.24 MB
Memory usage after optimization is: 693.36 MB
Decreased by 48.73%
Memory usage of dataframe is 1690.96 MB
Memory usage after optimization is: 867.04 MB
Decreased by 48.73%
Memory usage of dataframe is 1800.46 MB
Memory usage after optimization is: 923.18 MB
Decreased by 48.73%
Memory usage of dataframe is 2088.53 MB
Memory usage after optimization is: 1070.89 MB
Decreased by 48.73%
Memory usage of dataframe is 2132.85 MB
Memory usage after optimization is: 1093.61 MB
Decreased by 48.73%
Memory usage of dataframe is 2067.02 MB
Memory usage after optimization is: 1059.86 MB
Decreased by 48.73%
Memory usage of dataframe is 2112.32 MB
Memor

In [6]:
# Number of validation dates to use
num_valid_dates = 100

# Number of folds for cross-validation
N_fold = 5

# Sorted so that "last" means "latest" regardless of the row order in `full`.
dates = np.sort(full["date_id"].unique())

# Split position computed explicitly: negative slicing (`dates[-n:]` /
# `dates[:-n]`) swaps the two sets when n is 0.
split_at = max(len(dates) - num_valid_dates, 0)

# Define validation dates as the last `num_valid_dates` dates
valid_dates = dates[split_at:]

# Define training dates as all dates except the last `num_valid_dates` dates
train_dates = dates[:split_at]

In [None]:
# Model inputs plus the two key columns needed later to join the lag features.
input_columns = (
    CONFIG.feature_names + CONFIG.exogeneous_features + ["symbol_id", "date_id"]
)
X = full[input_columns]

# Target and sample weight each keep date_id so they can be split by date.
Y = full[[CONFIG.target, "date_id"]]
weight = full[["weight", "date_id"]]

# Drop the reference to the full frame now that its pieces are extracted.
del full

# Per-(date_id, symbol_id) lagged responder features.
lags = feature_engineering(lag_features)

In [8]:
def _export_split(dates, suffix):
    """Write X/y/w parquet files for the rows whose date_id is in ``dates``.

    Files are written to ``{CONFIG.main}/data/training_data_features`` as
    ``X_<suffix>.parquet``, ``y_<suffix>.parquet`` and ``w_<suffix>.parquet``.
    """
    out_dir = f"{CONFIG.main}/data/training_data_features"

    X_split = (
        X[X["date_id"].isin(dates)]
        .merge(
            lags,
            how="left",
            on=["date_id", "symbol_id"],
            # Fail loudly if lags has duplicate keys: that would duplicate
            # rows in X and silently misalign it with y and w.
            validate="many_to_one",
        )
        .drop(columns=["symbol_id", "date_id"])
    )
    X_split.to_parquet(f"{out_dir}/X_{suffix}.parquet", index=False)
    del X_split  # release the largest frame before building the next ones

    y_split = Y[Y["date_id"].isin(dates)].drop(columns="date_id")
    y_split.to_parquet(f"{out_dir}/y_{suffix}.parquet", index=False)

    w_split = weight[weight["date_id"].isin(dates)].drop(columns="date_id")
    w_split.to_parquet(f"{out_dir}/w_{suffix}.parquet", index=False)


def export_train():
    """Export the validation split and the ``N_fold`` training splits to parquet.

    The validation split uses ``valid_dates``. Training split ``fold`` uses
    every date in ``train_dates`` whose position is not congruent to ``fold``
    modulo ``N_fold``. Reads the notebook globals ``X``, ``Y``, ``weight`` and
    ``lags``; returns nothing.
    """
    _export_split(valid_dates, "valid")

    for fold in range(N_fold):
        selected_dates = [
            date for ii, date in enumerate(train_dates) if ii % N_fold != fold
        ]
        _export_split(selected_dates, f"train_{fold}")

In [9]:
# Write the validation and per-fold training parquet files to disk.
export_train()

In [95]:
X_valid = pd.read_parquet(f"{CONFIG.main}/data/training_data_features/X_valid.parquet")

# The numba-compiled kernel needs a plain ndarray, not a DataFrame.
interaction = calculate_triplet_operations(
    interaction_features, X_valid[interaction_features].to_numpy()
)
# Same downcast as the training folds so both sets share dtypes.
interaction = reduce_memory.reduce_mem_usage(interaction)
out = pd.concat([X_valid[feature_names], interaction], axis=1)
out.to_parquet(f"{CONFIG.main}/data/training_data_impt/X_valid.parquet", index=False)


In [None]:
# One pass per fold written by export_train(): add the triplet interaction
# features to the selected model inputs and save the result.
for fold in range(N_fold):
    X_train = pd.read_parquet(
        f"{CONFIG.main}/data/training_data_features/X_train_{fold}.parquet"
    )

    # Take the interaction inputs from the frame already in memory instead of
    # reading the same parquet file a second time.
    triplet_features = calculate_triplet_operations(
        interaction_features, X_train[interaction_features].to_numpy()
    )
    triplet_features = reduce_memory.reduce_mem_usage(triplet_features)

    interaction = pd.concat([X_train[feature_names], triplet_features], axis=1)
    print(interaction.shape)
    interaction.to_parquet(
        f"{CONFIG.main}/data/training_data_impt/X_train_{fold}.parquet", index=False
    )


159


In [18]:
import joblib
import pandas as pd

# Let pandas display up to 500 rows so long tables are not truncated.
pd.set_option("display.max_rows", 500)


def r2_xgb(y_true, y_pred, sample_weight):
    """Return the negated weighted R² of ``y_pred`` against ``y_true``.

    The score is ``1 - avg_w((y_pred - y_true)^2) / (avg_w(y_true^2) + 1e-38)``,
    where ``avg_w`` is the ``sample_weight``-weighted mean and the small
    constant keeps the denominator non-zero. The sign is flipped, so a perfect
    prediction yields ``-1``.
    """
    weighted_sq_error = np.average((y_pred - y_true) ** 2, weights=sample_weight)
    weighted_sq_target = np.average((y_true) ** 2, weights=sample_weight)
    score = 1 - weighted_sq_error / (weighted_sq_target + 1e-38)
    return -score

In [28]:
# NOTE(review): absolute, machine-specific path — consider moving it into the
# project config so the notebook runs on other machines.
MODELS_DIR = "C:/Users/edmun/OneDrive/Desktop/Personal-Projects/Kaggle/Jane Street Real Time Market Data Forecasting/Models_impt"


def _load_fold_models(family):
    """Load ``<family>_1.pkl`` ... ``<family>_5.pkl`` from ``MODELS_DIR/<family>``.

    joblib.load unpickles the files, which can execute arbitrary code — only
    point this at model files you produced yourself.
    """
    return [joblib.load(f"{MODELS_DIR}/{family}/{family}_{i}.pkl") for i in range(1, 6)]


# Five fitted models per family, in file-number order.
lgb = _load_fold_models("LGB")
xgb = _load_fold_models("XGB")
cbt = _load_fold_models("CBT")

In [32]:
# One importance table per fold, stacked vertically.
# NOTE(review): feature names come from the LightGBM model of the same fold —
# assumes both models were trained on the same column order; confirm.
xgb_full = pd.concat(
    [
        pd.DataFrame(
            {"imp": xgb[fold].feature_importances_, "name": lgb[fold].feature_name_}
        )
        for fold in range(5)
    ],
    axis=0,
)

In [33]:
# One importance table per fold, stacked vertically.
lgb_full = pd.concat(
    [
        pd.DataFrame(
            {"imp": lgb[fold].feature_importances_, "name": lgb[fold].feature_name_}
        )
        for fold in range(5)
    ],
    axis=0,
)

In [35]:
# One importance table per fold, stacked vertically.
# NOTE(review): feature names come from the LightGBM model of the same fold —
# assumes both models were trained on the same column order; confirm.
cbt_full = pd.concat(
    [
        pd.DataFrame(
            {"imp": cbt[fold].get_feature_importance(), "name": lgb[fold].feature_name_}
        )
        for fold in range(5)
    ],
    axis=0,
)

In [41]:
def _normalized_importance(stacked):
    """Sum ``imp`` per feature name, sort descending and scale to sum to 1."""
    totals = stacked.groupby("name")["imp"].sum().to_frame()
    ranked = totals.sort_values(by="imp", ascending=False)
    return ranked / ranked.sum()


xgb_full = _normalized_importance(xgb_full)
cbt_full = _normalized_importance(cbt_full)
lgb_full = _normalized_importance(lgb_full)

# Combined ranking: add the three normalized importance shares per feature.
(xgb_full + lgb_full + cbt_full).sort_values(by="imp", ascending=False)

name,imp
feature_06,0.090838
cos_time_id,0.087923
feature_36,0.076117
feature_52,0.065419
feature_58,0.0538
feature_07_feature_60_cos_time_id_diff,0.052873
feature_61,0.05076
feature_59,0.047446
feature_30,0.045649
feature_38,0.045243
