In [1]:
import polars as pl
import numpy as np
from itertools import combinations
from numba import njit, prange

import xgboost as xgb
# import lightgbm as lgb
# import catboost as cb


from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

# import cuml
# from cuml.explainer import TreeExplainer

In [None]:
class CONFIG:
    """Class-level configuration shared by every cell of the notebook.

    All values are evaluated once, when the class body executes. When
    ``TRAIN`` is False the body also reads the stability and SHAP summary
    parquet files, so those files must exist at ``STABILITY_PATH`` and
    ``SHAP_PATH`` at that point.
    """

    # --- Environment switches -------------------------------------------
    WSL = False
    LOCAL = True
    TRAIN = False

    RANDOM_STATE = 42

    DATE_COL = "__index_level_0__"
    TARGET_COL = "label"

    # --- File locations --------------------------------------------------
    if LOCAL:
        TRAIN_PATH = "data/train.parquet"
        TEST_PATH = "data/test.parquet"
        SUBMISSION_PATH = "submission.csv"
        SHAP_PATH = "data/shap_summary.parquet"
        STABILITY_PATH = "data/feature_stability_summary.parquet"
    else:
        TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
        TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
        # NOTE(review): this is a write target inside /kaggle/input, which is
        # presumably read-only on Kaggle - confirm, /kaggle/working may be intended.
        SUBMISSION_PATH = "/kaggle/input/drw-crypto-market-prediction/submission.csv"
        SHAP_PATH = "/kaggle/input/drw-crypto-market-prediction/shap_summary.parquet"
        STABILITY_PATH = "/kaggle/input/drw-crypto-market-prediction/feature_stability_summary.parquet"

    if WSL:
        # Re-root every path under the Windows drive mounted inside WSL.
        mount = "/mnt/"
        _wsl_root = f"{mount}c/Users/Admin/Desktop/Personal-Projects/Kaggle/DRW - Crypto Market Prediction/"
        TRAIN_PATH = f"{_wsl_root}{TRAIN_PATH}"
        TEST_PATH = f"{_wsl_root}{TEST_PATH}"
        SUBMISSION_PATH = f"{_wsl_root}{SUBMISSION_PATH}"
        SHAP_PATH = f"{_wsl_root}{SHAP_PATH}"
        STABILITY_PATH = f"{_wsl_root}{STABILITY_PATH}"

    # --- Feature name groups ---------------------------------------------
    CORE_FEATURES = [
        "X363",
        "X321",
        "X405",
        "X730",
        "X523",
        "X756",
        "X589",
        "X462",
        "X779",
        "X25",
        "X532",
        "X520",
        "X329",
        "X383",
        "X752",
        "X287",
        "X298",
        "X759",
        "X302",
        "X55",
        "X56",
        "X52",
        "X303",
        "X51",
        "X598",
        "X385",
        "X603",
        "X674",
        "X415",
        "X345",
        "X174",
        "X178",
        "X168",
        "X612",
        "bid_qty",
        "ask_qty",
        "buy_qty",
        "sell_qty",
        "volume",
    ]

    KNOWN_FEATURES = [
        "bid_qty",
        "ask_qty",
        "buy_qty",
        "sell_qty",
        "volume",
    ]

    ANONYMOUS_FEATURES = [f"X{i}" for i in range(1, 781)]
    OG_FEATURES = ANONYMOUS_FEATURES + KNOWN_FEATURES

    # BID-ASK QUANTITY FEATURES
    BID_ASK_QTY_FEATURES = ["bid_ask_qty_ratio", "bid_ask_qty_spread"]

    # BUY-SELL QUANTITY FEATURES
    BUY_SELL_QTY_FEATURES = ["buy_sell_qty_ratio", "buy_sell_qty_spread"]

    # VOLUME FEATURES
    VOLUME_FEATURES = [
        "bid_ask_volume",
        "buy_sell_volume",
        "bid_ask_to_volume_ratio",
        "buy_sell_to_volume_ratio",
    ]

    # CROSS FEATURES
    CROSS_FEATURES = ["bid_ask_to_buy_sell_qty_ratio", "buy_sell_imbalance"]

    # BID-ASK FEATURES
    BID_ASK_FEATURES = [
        "bid_ask_spread",
        "bid_ask_ratio",
        "buy_sell_ratio",
        "order_flow_imbalance",
    ]

    # PRESSURE INDICATORS
    PRESSURE_INDICATORS = ["buying_pressure", "selling_pressure", "net_pressure"]

    # LIQUIDITY FEATURES
    LIQUIDITY_FEATURES = ["total_liquidity", "liquidity_imbalance", "liquidity_ratio"]

    # MARKET MICROSTRUCTURE FEATURES
    MARKET_MICROSTRUCTURE_FEATURES = ["kyle_lambda", "vpin"]

    # ADDITIONAL FEATURES
    ADDITIONAL_FEATURES = [
        "effective_spread",
        "realized_spread",
        "price_impact",
        "trade_intensity",
    ]

    # --- Precomputed feature-importance summaries ------------------------
    if not TRAIN:
        # Keep only features with a positive "sum" score, best first.
        stability = (
            pl.read_parquet(STABILITY_PATH)
            .filter(pl.col("sum") > 0)
            .sort(pl.col("sum"), descending=True)
        )
        shap = pl.read_parquet(SHAP_PATH)

        # After the transpose each row is a feature and each remaining column
        # holds that feature's rank within one row of the SHAP summary
        # (rank 1 = largest value, because of descending=True).
        shap_rank = shap.transpose(include_header=True).with_columns(pl.all().exclude("column").rank("ordinal", descending=True))
        _rank_matrix = shap_rank.select(pl.all().exclude("column")).to_numpy()
        shap_rank = pl.concat(
            [
                shap_rank.select("column"),
                pl.DataFrame(_rank_matrix.mean(axis=1)).rename({"column_0": "mean_rank"}),
                pl.DataFrame(_rank_matrix.std(axis=1)).rename({"column_0": "std_rank"}),
            ],
            how="horizontal",
        ).sort(by="mean_rank", descending=False)
        del _rank_matrix

        # Independent copy: feature selection extends this list later and must
        # not silently grow CORE_FEATURES through a shared reference.
        EVAL_CORE_FEATURES = list(CORE_FEATURES)

    # --- Feature selection -----------------------------------------------
    FEATURE_SELECTION_SAMPLE_SIZE = 750000
    TARGET_FEATURES = 120
    COMBINED_SCORES = None  # filled in by FeatureEngineering.select_core_features

    # CLIP_LOWER / CLIP_UPPER are placeholders overwritten by data_loader with
    # target percentiles derived from CLIP_PERCENTILE.
    CLIP_PERCENTILE, CLIP_LOWER, CLIP_UPPER = 99, 0, 0

    # --- Model -----------------------------------------------------------
    XGB_PARAMS = {
        "tree_method": "hist",
        "device": "gpu",
        "colsample_bylevel": 0.4778,
        "colsample_bynode": 0.3628,
        "colsample_bytree": 0.7107,
        "gamma": 1.7095,
        "learning_rate": 0.02213,
        "max_depth": 20,
        "max_leaves": 12,
        "min_child_weight": 16,
        "n_estimators": 1667,
        "subsample": 0.06567,
        "reg_alpha": 39.3524,
        "reg_lambda": 75.4484,
        "verbosity": 0,
        "random_state": RANDOM_STATE,
        "early_stopping_rounds": 100,
        "n_jobs": -1,
    }

    N_FOLDS = 5

    # Each entry: display name, estimator class, constructor keyword arguments.
    MODELS = [
        {"name": "xgb", "Estimator": xgb.XGBRegressor, "params": XGB_PARAMS},
    ]

In [None]:
class StandardScalerWrapper:
    """Adapter that lets a scikit-learn ``StandardScaler`` work on polars frames.

    The column names seen by ``fit`` are stored in ``columns_`` and reused as
    the schema of every frame returned by ``transform``.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.columns_ = None  # populated by fit()

    def fit(self, df: pl.DataFrame):
        """Fit the wrapped scaler on ``df`` and remember its columns; returns ``self``."""
        self.scaler.fit(df.to_numpy())
        self.columns_ = df.columns
        return self

    def transform(self, df: pl.DataFrame) -> pl.DataFrame:
        """Scale ``df`` and return a new frame labelled with the fitted column names."""
        scaled_values = self.scaler.transform(df.to_numpy())
        return pl.DataFrame(scaled_values, schema=self.columns_)

In [None]:
class FEATURE_SELECTION:
    """Scores candidate features and extends ``CONFIG.EVAL_CORE_FEATURES``.

    ``x`` is used through the pandas API (``tail``, ``iloc``, ``isna``), so it
    must be a pandas DataFrame. ``y`` must support ``tail`` and hold one target
    value per row of ``x`` (a pandas Series or a single-column DataFrame).
    """

    def __init__(self, x, y) -> None:
        self.x = x
        self.y = y

    def smart_feature_selection(self, sample_size=500_000, top_k=150, min_score=50):
        """Blend several relevance signals into one score per feature.

        Signals: absolute Pearson correlation with the target, mutual
        information, absolute LassoCV coefficients, and the precomputed
        ``CONFIG.stability`` / ``CONFIG.shap_rank`` tables.

        Args:
            sample_size: Maximum number of trailing rows of ``x`` to use.
            top_k: Maximum number of top-scoring features to consider.
            min_score: Minimum combined score a feature needs to be selected.

        Returns:
            dict mapping every feature that passed the variance/missing filter
            to its combined score.

        Side effects:
            Rebinds ``CONFIG.EVAL_CORE_FEATURES`` to a new list holding the
            previous entries followed by the newly selected features, without
            duplicates.

        Raises:
            ValueError: if the target does not have one value per sampled row.
        """
        print(f"Starting smart feature selection with {len(self.x)} samples...")

        # Use the trailing rows only (the most recent ones if x is time-ordered).
        # No defensive copy is taken because the sample is never mutated.
        recent_sample_size = min(sample_size, len(self.x))
        recent_df = self.x.tail(recent_sample_size)
        print(f"Using {len(recent_df)} recent samples for feature selection")

        # Get all feature columns (excluding label)
        feature_cols = [col for col in recent_df.columns if col != CONFIG.TARGET_COL]
        print(f"Total features before selection: {len(feature_cols)}")

        # Flatten the target to 1-D: pearsonr requires 1-D input, and a 1-D
        # target is also what the scikit-learn estimators below expect.
        y_sample = np.ravel(self.y.tail(recent_sample_size))
        if len(y_sample) != len(recent_df):
            raise ValueError(f"Target has {len(y_sample)} values but the feature sample has {len(recent_df)} rows")

        # Remove features with (near-)zero variance or too many missing values
        print("Removing low-variance and high-missing features...")
        n_rows = len(recent_df)
        valid_features = []
        for col in feature_cols:
            column = recent_df[col]
            if column.var() > 1e-8 and column.isna().sum() / n_rows < 0.95:
                valid_features.append(col)

        X_sample = recent_df[valid_features]
        print(f"Features after variance/missing filter: {len(valid_features)}")

        # Method 1: Correlation with target (fast)
        print("Computing correlations...")
        correlations = {}
        for col in valid_features:
            try:
                corr = np.abs(pearsonr(X_sample[col].to_numpy(), y_sample)[0])
            except (ValueError, TypeError) as exc:
                # Best effort: report and skip the column instead of hiding the error.
                print(f"  Skipping correlation for {col}: {exc}")
                continue
            if not np.isnan(corr):
                correlations[col] = corr

        # Method 2: Mutual Information (sub-sample for speed).
        # A seeded generator makes the sub-sample, and therefore the selected
        # feature set, reproducible between runs.
        print("Computing mutual information...")
        mi_sample_size = min(100_000, len(X_sample))
        rng = np.random.default_rng(CONFIG.RANDOM_STATE)
        sample_idx = rng.choice(len(X_sample), mi_sample_size, replace=False)
        X_mi = X_sample.iloc[sample_idx]
        y_mi = y_sample[sample_idx]

        try:
            mi_scores = mutual_info_regression(X_mi, y_mi, random_state=CONFIG.RANDOM_STATE, n_jobs=-1)
            mi_dict = dict(zip(X_mi.columns, mi_scores))
        except Exception as exc:  # best effort: this signal then contributes 0
            print(f"  Mutual information failed, ignoring this signal: {exc!r}")
            mi_dict = {}

        # Method 3: L1 regularization feature importance (fast)
        print("Computing L1 regularization scores...")
        try:
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_mi)
            lasso = LassoCV(cv=CONFIG.N_FOLDS, random_state=CONFIG.RANDOM_STATE, max_iter=1000)
            lasso.fit(X_scaled, y_mi)
            l1_dict = dict(zip(X_mi.columns, np.abs(lasso.coef_)))
        except Exception as exc:  # best effort: this signal then contributes 0
            print(f"  L1 scoring failed, ignoring this signal: {exc!r}")
            l1_dict = {}

        # Method 4: precomputed stability and SHAP-rank summaries from CONFIG
        print("Computing tree-based importance...")
        stab = dict(zip(CONFIG.stability["column"].to_list(), CONFIG.stability["sum"].to_list()))
        shap = dict(zip(CONFIG.shap_rank["column"].to_list(), CONFIG.shap_rank["mean_rank"].to_list()))

        # Combine scores with weights
        # NOTE(review): mean_rank is an ordinal rank in which 1 is the largest
        # SHAP value (CONFIG ranks with descending=True), so adding it positively
        # rewards features with LOW SHAP importance, and its magnitude dwarfs
        # the other terms. min_score's default of 50 looks tuned to this scale -
        # confirm the intent before changing either.
        print("Combining feature scores...")
        combined_scores = {}
        for col in valid_features:
            score = 0
            score += correlations.get(col, 0) * 0.3  # Correlation weight
            score += mi_dict.get(col, 0) * 0.25  # MI weight
            score += l1_dict.get(col, 0) * 0.25  # L1 weight
            score += stab.get(col, 0) * 0.1  # stability weight
            score += shap.get(col, 0) * 0.1  # SHAP mean-rank weight
            combined_scores[col] = score

        # Select top features
        selected_features = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
        final_features = [feat for feat, score in selected_features[:top_k] if score >= min_score]

        print(f"Selected {len(final_features)} features")
        print("Top 10 selected features:")
        for i, (feat, score) in enumerate(selected_features[:10]):
            print(f"  {i + 1:2d}. {feat:30s} - Score: {score:.4f}")

        # Build a fresh, de-duplicated list rather than using "+=": an in-place
        # extend would also mutate any other list that shares the same object.
        CONFIG.EVAL_CORE_FEATURES = list(dict.fromkeys([*CONFIG.EVAL_CORE_FEATURES, *final_features]))

        return combined_scores


In [None]:
class FeatureEngineering:
    """Builds the model feature matrix from the raw order-book columns.

    ``df`` is expected to be a polars LazyFrame: ``select_core_features``
    calls ``.collect()`` on it. ``data_set`` is ``"train"`` or ``"test"`` and
    controls whether feature selection runs.
    """

    def __init__(self, df, data_set="train"):
        self.df = df
        self.data_set = data_set

        self.scaler = StandardScalerWrapper()

    def handle_inf_nan(self, df):
        """Replace +/-inf, null and NaN with 0 in every non-date, non-target column."""
        df = df.with_columns(
            pl.when(pl.col(col).is_infinite()).then(0.0).otherwise(pl.col(col)).alias(col)
            for col in df.collect_schema().names()
            if col not in [CONFIG.DATE_COL, CONFIG.TARGET_COL]
        )
        df = df.fill_null(0).fill_nan(0)
        return df

    def feature_engineering(self, df):
        """Add ratio/spread/pressure style features derived from the five known columns.

        Requires the columns ``bid_qty``, ``ask_qty``, ``buy_qty``, ``sell_qty``
        and ``volume``. Divisions without the ``1e-8`` guard can produce
        inf/NaN; ``select_core_features`` cleans those up afterwards via
        ``handle_inf_nan``.
        """
        return df.with_columns(
            # Bid-Ask Quantity Features
            bid_ask_qty_ratio=pl.col("bid_qty") / pl.col("ask_qty"),
            log_bid_qty=pl.col("bid_qty").log(),
            log_ask_qty=pl.col("ask_qty").log(),
            bid_ask_qty_spread=(pl.col("bid_qty") - pl.col("ask_qty")),
            # Buy-Sell Quantity Features
            buy_sell_qty_ratio=(pl.col("buy_qty") / pl.col("sell_qty")),
            log_buy_qty=pl.col("buy_qty").log(),
            log_sell_qty=pl.col("sell_qty").log(),
            buy_sell_qty_spread=(pl.col("buy_qty") - pl.col("sell_qty")),
            # Volume Features
            bid_ask_volume=(pl.col("bid_qty") + pl.col("ask_qty")),
            buy_sell_volume=(pl.col("buy_qty") + pl.col("sell_qty")),
            bid_ask_to_volume_ratio=((pl.col("bid_qty") + pl.col("ask_qty")) / pl.col("volume")),
            buy_sell_to_volume_ratio=((pl.col("buy_qty") + pl.col("sell_qty")) / pl.col("volume")),
            # Cross Features
            bid_ask_to_buy_sell_qty_ratio=((pl.col("bid_qty") + pl.col("ask_qty")) / (pl.col("buy_qty") + pl.col("sell_qty"))),
            buy_sell_imbalance=((pl.col("buy_qty") - pl.col("sell_qty")) / (pl.col("buy_qty") + pl.col("sell_qty"))),
            # Bid-Ask Features
            bid_ask_spread=(pl.col("ask_qty") - pl.col("bid_qty")),
            bid_ask_ratio=(pl.col("bid_qty") / (pl.col("ask_qty") + 1e-8)),
            buy_sell_ratio=(pl.col("buy_qty") / (pl.col("sell_qty") + 1e-8)),
            order_flow_imbalance=((pl.col("buy_qty") - pl.col("sell_qty")) / (pl.col("volume") + 1e-8)),
            # Pressure Indicators
            buying_pressure=(pl.col("buy_qty") / (pl.col("volume") + 1e-8)),
            selling_pressure=(pl.col("sell_qty") / (pl.col("volume") + 1e-8)),
            net_pressure=((pl.col("buy_qty") - pl.col("sell_qty")) / (pl.col("volume") + 1e-8)),
            # Liquidity Features
            total_liquidity=(pl.col("bid_qty") + pl.col("ask_qty")),
            liquidity_imbalance=((pl.col("bid_qty") - pl.col("ask_qty")) / (pl.col("bid_qty") + pl.col("ask_qty") + 1e-8)),
            liquidity_ratio=((pl.col("bid_qty") + pl.col("ask_qty")) / (pl.col("volume") + 1e-8)),
            # Volume Transformations
            log_volume=pl.col("volume").log1p(),
            sqrt_volume=pl.col("volume").sqrt(),
            # Market Microstructure
            kyle_lambda=((pl.col("buy_qty") - pl.col("sell_qty")) / (pl.col("volume") + 1e-8) / (pl.col("volume").sqrt() + 1e-8)),
            vpin=(pl.col("buy_qty") - pl.col("sell_qty")).abs() / (pl.col("buy_qty") + pl.col("sell_qty") + 1e-8),
            # Additional Features
            effective_spread=(
                2 * ((pl.col("buy_qty") - pl.col("sell_qty")) / (pl.col("volume") + 1e-8)).abs() * (pl.col("ask_qty") - pl.col("bid_qty"))
            ),
            realized_spread=(
                (pl.col("ask_qty") - pl.col("bid_qty"))
                * ((pl.col("buy_qty") - pl.col("sell_qty")).abs() / (pl.col("buy_qty") + pl.col("sell_qty") + 1e-8))
            ),
            price_impact=(
                ((pl.col("buy_qty") - pl.col("sell_qty")) / (pl.col("volume") + 1e-8)) / (pl.col("volume").sqrt() + 1e-8) * pl.col("volume")
            ),
            trade_intensity=(pl.col("volume") / ((pl.col("bid_qty") + pl.col("ask_qty")) + 1e-8)),
        )

    # --- Step 1: compiled kernel ---
    # For every row and every index triple (a, b, c) this computes
    # (max - mid) / (mid - min) over the three referenced columns, or NaN when
    # mid == min (the denominator would be zero). Output shape is
    # (num_rows, num_combinations). Explanation is kept in comments, not a
    # docstring, so the compiled function body stays untouched.
    @staticmethod
    @njit(parallel=True)
    def compute_triplet_imbalance(df_values, comb_indices):
        num_rows = df_values.shape[0]
        num_combinations = len(comb_indices)
        imbalance_features = np.empty((num_rows, num_combinations))

        for i in prange(num_combinations):
            a, b, c = comb_indices[i]
            for j in range(num_rows):
                max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
                min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
                # The middle value is whatever remains after removing min and max.
                mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val

                if mid_val == min_val:
                    imbalance_features[j, i] = np.nan
                else:
                    imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

        return imbalance_features

    # --- Step 2: Adapter function using Polars ---
    def calculate_triplet_imbalance_numba(self, price, df: pl.DataFrame) -> pl.DataFrame:
        """Compute the triplet imbalance for every 3-combination of the ``price`` columns.

        Args:
            price: List of column names present in ``df``.
            df: Frame holding those columns.

        Returns:
            Frame with one ``"{a}_{b}_{c}_imb2"`` column per combination.
        """
        df_values = df.select(price).to_numpy()
        comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

        features_array = self.compute_triplet_imbalance(df_values, comb_indices)
        columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]

        return pl.DataFrame(features_array, schema=columns)

    # --- Step 3: Main feature generator ---
    def generate_imb_features(self, train: pl.DataFrame) -> pl.DataFrame:
        """Add pairwise and triplet imbalance features within each CONFIG feature group.

        For every pair (a, b) inside a group two columns are added:
        ``"{a}_{b}_imb"`` = (a - b) / (a + b) and ``"{a}_{b}_imb3"`` =
        (a / sum(a)) / (b / sum(b)). Groups with more than two members also
        get the ``_imb2`` triplet columns. Float64 columns are downcast to
        Float32 after each group.
        """
        feature_groups = [
            CONFIG.KNOWN_FEATURES,
            CONFIG.BID_ASK_QTY_FEATURES,
            CONFIG.BUY_SELL_QTY_FEATURES,
            CONFIG.VOLUME_FEATURES,
            CONFIG.CROSS_FEATURES,
            CONFIG.BID_ASK_FEATURES,
            CONFIG.PRESSURE_INDICATORS,
            CONFIG.LIQUIDITY_FEATURES,
            CONFIG.MARKET_MICROSTRUCTURE_FEATURES,
            CONFIG.ADDITIONAL_FEATURES,
        ]

        for groups in feature_groups:
            # 2-wise combinations
            for a, b in combinations(groups, 2):
                train = train.with_columns(((pl.col(a) - pl.col(b)) / (pl.col(a) + pl.col(b))).alias(f"{a}_{b}_imb"))
                sum_a = train[a].sum()
                sum_b = train[b].sum()
                train = train.with_columns(((pl.col(a) / sum_a) / (pl.col(b) / sum_b)).alias(f"{a}_{b}_imb3"))

            # 3-wise combinations
            if len(groups) > 2:
                triplet_feature = self.calculate_triplet_imbalance_numba(groups, train)
                train = train.hstack(triplet_feature)

            train = train.with_columns(*[pl.col(col).cast(pl.Float32) if train[col].dtype == pl.Float64 else pl.col(col) for col in train.columns])

        return train

    def add_statistical_features(self, df):
        """
        Adds row-wise aggregation features across CONFIG.ANONYMOUS_FEATURES:
        - Mean
        - Std Dev
        - Range (max - min)
        - Median
        - 25th percentile
        - 75th percentile
        - Count of values above row mean
        - Zero-based position (within CONFIG.ANONYMOUS_FEATURES) of the max and
          min column; this is the position, not the numeric suffix of the name

        NaN/inf values are not handled here; select_core_features runs
        handle_inf_nan afterwards.
        """

        x_data = df.select(CONFIG.ANONYMOUS_FEATURES).to_numpy()

        # Core stats (Mean, Std, Range, Median, Percentiles)
        x_stat_mean = x_data.mean(axis=1)
        x_stat_std = x_data.std(axis=1)
        x_stat_range = x_data.max(axis=1) - x_data.min(axis=1)
        x_stat_median = np.median(x_data, axis=1)
        x_stat_p25 = np.percentile(x_data, 25, axis=1)
        x_stat_p75 = np.percentile(x_data, 75, axis=1)

        # Count of values above row mean (row-wise comparison)
        x_stat_above_mean_count = (x_data > x_stat_mean.reshape(-1, 1)).sum(axis=1)

        # Zero-based position of the max and min column
        x_stat_idx_max = x_data.argmax(axis=1)
        x_stat_idx_min = x_data.argmin(axis=1)

        # Create new columns in DataFrame
        df = df.with_columns(
            pl.Series(name="x_stat_mean", values=x_stat_mean),
            pl.Series(name="x_stat_std", values=x_stat_std),
            pl.Series(name="x_stat_range", values=x_stat_range),
            pl.Series(name="x_stat_median", values=x_stat_median),
            pl.Series(name="x_stat_p25", values=x_stat_p25),
            pl.Series(name="x_stat_p75", values=x_stat_p75),
            pl.Series(name="x_stat_above_mean_count", values=x_stat_above_mean_count),
            pl.Series(name="x_stat_idx_max", values=x_stat_idx_max),
            pl.Series(name="x_stat_idx_min", values=x_stat_idx_min),
        )

        return df

    def select_core_features(self, y=None):
        """Run the full feature pipeline on ``self.df`` and return the model matrix.

        The original columns are kept untouched (no inf/NaN replacement) and
        concatenated with the cleaned engineered columns. When ``CONFIG.TRAIN``
        is False the result is reduced to ``CONFIG.EVAL_CORE_FEATURES``; for
        the ``"train"`` data set feature selection runs first and may extend
        that list.

        Args:
            y: Target values forwarded to FEATURE_SELECTION; only needed when
               ``data_set == "train"`` and ``CONFIG.TRAIN`` is False.

        Returns:
            polars DataFrame of features.
        """
        core_features_df = self.df.select(CONFIG.OG_FEATURES).collect()
        df = self.feature_engineering(self.df)
        df = self.generate_imb_features(df.collect())
        df = self.add_statistical_features(df)
        df = self.handle_inf_nan(df)

        df = pl.concat([core_features_df, df.drop(CONFIG.OG_FEATURES)], how="horizontal").with_columns(pl.all().shrink_dtype())

        if not CONFIG.TRAIN:
            if self.data_set == "train":
                feature_selection = FEATURE_SELECTION(x=df.to_pandas(), y=y)
                CONFIG.COMBINED_SCORES = feature_selection.smart_feature_selection(
                    sample_size=CONFIG.FEATURE_SELECTION_SAMPLE_SIZE,
                    top_k=CONFIG.TARGET_FEATURES,
                )
            # De-duplicate while keeping list order. Selecting through set()
            # made the column order depend on string hashing, so the order -
            # and the column sub-sampling of the model - changed between runs.
            selected_columns = list(dict.fromkeys(CONFIG.EVAL_CORE_FEATURES))
            df = df.select(selected_columns)
        return df

In [None]:
class data_loader:
    """Loads the train/test parquet files and builds the model inputs.

    Constructing an instance scans both files lazily, materialises the target,
    stores the target clipping bounds on ``CONFIG`` and runs the feature
    pipeline for train and then test.

    Attributes:
        train, test: Lazy frames (date column dropped from train, target
            column dropped from test).
        train_y: Collected single-column target frame.
        train_X, test_X: Feature frames returned by FeatureEngineering.
    """

    def __init__(self):
        train_scan = pl.scan_parquet(CONFIG.TRAIN_PATH).with_columns(pl.all().shrink_dtype())
        self.train = train_scan.drop(CONFIG.DATE_COL)

        test_scan = pl.scan_parquet(CONFIG.TEST_PATH).with_columns(pl.all().shrink_dtype())
        self.test = test_scan.drop(CONFIG.TARGET_COL)

        self.train_y = self.train.select(CONFIG.TARGET_COL).collect()

        # Target percentiles used later to clip the final test predictions.
        target_values = self.train_y.to_numpy().flatten()
        upper_q = CONFIG.CLIP_PERCENTILE
        lower_q = 100 - CONFIG.CLIP_PERCENTILE
        CONFIG.CLIP_UPPER = np.percentile(target_values, upper_q)  # type: ignore
        CONFIG.CLIP_LOWER = np.percentile(target_values, lower_q)  # type: ignore

        train_features = self.train.drop(CONFIG.TARGET_COL)
        eng = FeatureEngineering(train_features, data_set="train")
        self.train_X = eng.select_core_features(y=self.train_y.to_pandas())

        print("Train data loaded with shape:", self.train_X.shape)

        # Reuse the same engineering object for the test frame; in "test" mode
        # it only applies the feature list chosen during the train pass.
        eng.df = self.test
        eng.data_set = "test"
        self.test_X = eng.select_core_features()

        print("Test data loaded with shape:", self.test_X.shape)

In [None]:
class FEATURE_STABILITY:
    """Measures how stable XGBoost feature importances are across CV runs.

    ``fit`` trains one small XGBoost model per fold for ``num_runs`` shuffled
    K-fold splits, records each model's importance scores and mean absolute
    SHAP values, and builds ``summary_table``.

    Attributes set by ``fit``:
        runs: One row per trained model, one column per feature (importance).
        summary_table: Per-feature mean, std and a normalised share column.
        shap_df: One row per trained model with mean |SHAP| per feature.
    """

    def __init__(
        self,
        train_X: pl.DataFrame,
        train_y: pl.DataFrame,
        num_runs=5,
        importance_type="gain",
    ):
        self.feature_stability = {}
        self.train_X = train_X.to_pandas()
        self.train_y = train_y.to_pandas()
        self.num_runs = num_runs
        self.importance_type = importance_type

        self.feature_names = train_X.columns
        self.shap_df = pl.DataFrame()

    def compute_shap(self, model, x):
        """Append the mean absolute SHAP value per feature for ``model`` on ``x`` to ``shap_df``."""
        # NOTE(review): TreeExplainer is not imported anywhere in the visible
        # notebook (the cuml import in the first cell is commented out), so
        # this call raises NameError unless that import is restored.
        explainers = TreeExplainer(model=model)
        shap_values = explainers.shap_values(x)
        # .get() presumably copies a GPU array back to host memory - verify
        # against the explainer that is actually used.
        shap_df = pl.DataFrame(np.abs(shap_values.get()), schema=self.feature_names).mean()
        self.shap_df = pl.concat(
            [self.shap_df, shap_df],
            how="vertical",
        ).with_columns(pl.all().shrink_dtype())

    def fit(
        self,
    ):
        """Train the per-fold models and build ``runs`` / ``summary_table``; returns ``self``."""
        importances_list = []

        params = {
            "n_estimators": 100,
            "max_depth": 6,
            "learning_rate": 0.1,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "random_state": CONFIG.RANDOM_STATE,
            "device": "gpu",
            "verbosity": 0,
        }

        # A fresh model is built inside the loop for every fold, so no model is
        # instantiated up front.
        for run in range(self.num_runs):
            rs = CONFIG.RANDOM_STATE + run  # different seed for each run
            kf = KFold(n_splits=CONFIG.N_FOLDS, shuffle=True, random_state=rs)
            for fold, (train_idx, valid_idx) in enumerate(kf.split(self.train_X), start=1):
                print(f"\n--- Fold {fold}/{CONFIG.N_FOLDS} ---")
                X_train = self.train_X.iloc[train_idx, :]
                y_train = self.train_y.iloc[train_idx, :]

                X_valid = self.train_X.iloc[valid_idx, :]
                y_valid = self.train_y.iloc[valid_idx, :]

                model = xgb.XGBRegressor(**params)

                model.fit(
                    X_train,
                    y_train,
                    eval_set=[(X_valid, y_valid)],
                    verbose=False,
                )

                # get_score omits features the booster never split on; record
                # those with an importance of 0.0.
                raw_score = model.get_booster().get_score(importance_type=self.importance_type)
                importance_dict = {f: raw_score.get(f, 0.0) for f in self.feature_names}
                importances_list.append(importance_dict)
                self.compute_shap(model, X_train)

                # Drop references to the fold's data before the next iteration.
                del model, X_train, X_valid, y_train, y_valid

        self.runs = pl.DataFrame(importances_list)
        runs_t = self.runs.transpose(include_header=True)
        self.summary_table = pl.concat(
            [
                self.runs.mean().transpose(include_header=True).rename({"column_0": "mean"}),
                self.runs.std().transpose(include_header=True).rename({"column_0": "std"}).drop("column"),
                # Normalise each run's importances to sum to 1, add them up per
                # feature, then normalise that total to sum to 1 as well.
                runs_t.with_columns(*[pl.col(col) / pl.col(col).sum() for col in runs_t.columns[1:]])
                .drop("column")
                .sum_horizontal()
                .to_frame()
                .with_columns(pl.all() / pl.all().sum()),
            ],
            how="horizontal",
        )

        return self

In [None]:
class evaluate:
    """Cross-validated training over several "recent data" slices.

    For every fold and every slice returned by ``get_model_slices`` each
    learner in ``CONFIG.MODELS`` is trained with time-decayed,
    outlier-adjusted sample weights, producing out-of-fold and averaged test
    predictions.
    """

    def __init__(
        self,
    ):
        pass

    def get_model_slices(self, n_samples: int):
        """Return the training slices as dicts with a ``name`` and a start ``cutoff`` row."""
        return [
            {"name": "full_data", "cutoff": 0},
            {
                "name": "recent_95pct",
                "cutoff": int(0.05 * n_samples),
            },  # Most recent 95%
            {
                "name": "recent_90pct",
                "cutoff": int(0.10 * n_samples),
            },  # Most recent 90%
            {
                "name": "recent_85pct",
                "cutoff": int(0.15 * n_samples),
            },  # Most recent 85%
            {
                "name": "recent_80pct",
                "cutoff": int(0.20 * n_samples),
            },  # Most recent 80%
        ]

    def create_time_decay_weights(self, n: int, decay: float = 0.95) -> np.ndarray:
        """Return ``n`` increasing weights with mean 1.

        The first weight is proportional to ``decay`` and the last to 1, so
        later rows weigh more when ``decay < 1``.

        Args:
            n: Number of weights.
            decay: Relative weight of the first row compared with the last.

        Returns:
            Float array of length ``n`` summing to ``n``. For ``n <= 1`` the
            weights are all ones, because a single row has no relative
            position and ``positions / (n - 1)`` would otherwise be 0 / 0.
        """
        if n <= 1:
            return np.ones(max(n, 0))
        positions = np.arange(n)
        normalized = positions / (n - 1)
        weights = decay ** (1.0 - normalized)
        return weights * n / weights.sum()

    def adjust_weights_for_outliers(self, X, y, base_weights, outlier_fraction=0.001):
        """Down-weight the rows a quick model fits worst.

        Args:
            X, y: Training features and 1-D target.
            base_weights: Starting sample weights (not modified).
            outlier_fraction: Share of rows, by largest absolute residual,
                treated as outliers (at least one row).

        Returns:
            Copy of ``base_weights`` in which outlier rows are multiplied by a
            factor between 0.8 (smallest outlier residual) and 0.2 (largest).
        """
        # Train quick model to estimate residuals
        model = xgb.XGBRegressor(
            n_estimators=50,
            max_depth=10,
            random_state=CONFIG.RANDOM_STATE,
            device="gpu",
        )
        model.fit(X, y, sample_weight=base_weights)
        preds = model.predict(X)
        residuals = np.abs(y - preds)

        # Top N residuals = outliers
        n_outliers = max(1, int(outlier_fraction * len(residuals)))
        threshold = np.partition(residuals, -n_outliers)[-n_outliers]
        outlier_mask = residuals >= threshold

        # Downweight outliers (linear scale: 0.2-0.8 of base weight)
        adjusted_weights = base_weights.copy()
        if outlier_mask.any():
            res_out = residuals[outlier_mask]
            res_norm = (res_out - np.min(res_out)) / (np.ptp(res_out) + 1e-8)
            weight_factors = 0.8 - 0.6 * res_norm
            adjusted_weights[outlier_mask] *= weight_factors

        return adjusted_weights

    def train_and_evaluate(self, train_X, train_y, test_X):
        """Train every learner on every slice for every fold.

        Args:
            train_X, train_y, test_X: Objects exposing ``to_numpy()``; rows of
                ``train_X`` and ``train_y`` must be aligned.

        Returns:
            ``(oof_preds, test_preds, model_slices)`` where both prediction
            dicts are keyed ``[learner name][slice name]``; test predictions
            are averaged over the folds.
        """
        train_X = train_X.to_numpy()
        train_y = train_y.to_numpy().flatten()

        test_X = test_X.to_numpy()
        n_samples = train_X.shape[0]

        model_slices = self.get_model_slices(n_samples)

        oof_preds = {learner["name"]: {s["name"]: np.zeros(n_samples) for s in model_slices} for learner in CONFIG.MODELS}
        test_preds = {learner["name"]: {s["name"]: np.zeros(test_X.shape[0]) for s in model_slices} for learner in CONFIG.MODELS}

        # The decay weights depend only on the slice length, so compute them
        # once per slice instead of once per fold and slice.
        slice_weights = {s["name"]: self.create_time_decay_weights(n_samples - s["cutoff"]) for s in model_slices}
        kf = KFold(n_splits=CONFIG.N_FOLDS, shuffle=False)

        for fold, (train_idx, valid_idx) in enumerate(kf.split(train_X), start=1):
            print(f"\n--- Fold {fold}/{CONFIG.N_FOLDS} ---")
            X_valid = train_X[valid_idx]
            y_valid = train_y[valid_idx]

            for s in model_slices:
                cutoff = s["cutoff"]
                slice_name = s["name"]
                subset_x = train_X[cutoff:]
                subset_y = train_y[cutoff:]
                # Training rows of this fold that fall inside the slice,
                # re-based to slice-local positions.
                rel_idx = train_idx[train_idx >= cutoff] - cutoff

                X_train = subset_x[rel_idx]
                y_train = subset_y[rel_idx]
                sw = slice_weights[slice_name][rel_idx]

                sw = self.adjust_weights_for_outliers(X_train, y_train, sw, outlier_fraction=0.001)

                print(f"  Training slice: {slice_name}, samples: {len(X_train)}")

                for learner in CONFIG.MODELS:
                    model = learner["Estimator"](**learner["params"])
                    model.fit(
                        X_train,
                        y_train,
                        sample_weight=sw,
                        eval_set=[(X_valid, y_valid)],
                        verbose=False,
                    )

                    mask = valid_idx >= cutoff
                    if mask.any():
                        idxs = valid_idx[mask]
                        oof_preds[learner["name"]][slice_name][idxs] = model.predict(train_X[idxs])
                    # Validation rows older than the slice borrow the full-data
                    # prediction; this relies on "full_data" being the first
                    # slice so it is already filled for this fold.
                    if cutoff > 0 and (~mask).any():
                        oof_preds[learner["name"]][slice_name][valid_idx[~mask]] = oof_preds[learner["name"]]["full_data"][valid_idx[~mask]]

                    test_preds[learner["name"]][slice_name] += model.predict(test_X)

        # Normalize test predictions
        for learner_name in test_preds:
            for slice_name in test_preds[learner_name]:
                test_preds[learner_name][slice_name] /= CONFIG.N_FOLDS

        return oof_preds, test_preds, model_slices

In [None]:
def ensemble_and_submit(train_df, oof_preds, test_preds, submission_df):
    """Blend per-slice predictions, report OOF Pearson scores and write the submission.

    For every learner the per-slice out-of-fold (OOF) and test predictions are
    averaged with equal weights (the "simple" ensemble). A score-weighted OOF
    blend is also evaluated, but only its Pearson score is printed; it does not
    feed into the submission. The learners' simple ensembles are then averaged,
    the test blend is clipped to ``[CONFIG.CLIP_LOWER, CONFIG.CLIP_UPPER]`` and
    written to ``CONFIG.SUBMISSION_PATH``.

    Args:
        train_df: Target values aligned row-for-row with the OOF predictions.
            Despite the name it is handed straight to ``pearsonr``, so it must
            be array-like; it is flattened to 1-D before use.
        oof_preds: Nested mapping ``{learner_name: {slice_name: array}}`` of
            out-of-fold predictions.
        test_preds: Mapping with the same nesting holding test predictions.
        submission_df: Frame providing ``with_columns`` and ``write_csv``
            (a polars DataFrame at the notebook's call site); a ``prediction``
            column is added to it.

    Returns:
        None. Scores are printed and the submission file is written.

    Raises:
        ValueError: If ``oof_preds`` is empty or a learner has no slices.
    """
    if not oof_preds:
        raise ValueError("oof_preds is empty; nothing to ensemble")

    y_true = np.asarray(train_df).ravel()

    learner_ensembles = {}
    for learner_name, slice_oof in oof_preds.items():
        if not slice_oof:
            raise ValueError(f"no slice predictions for learner '{learner_name}'")
        slice_test = test_preds[learner_name]

        scores = {s: pearsonr(y_true, preds)[0] for s, preds in slice_oof.items()}

        oof_simple = np.mean(list(slice_oof.values()), axis=0)
        test_simple = np.mean(list(slice_test.values()), axis=0)
        score_simple = pearsonr(y_true, oof_simple)[0]

        # Raw Pearson scores are not safe blend weights: a negative or NaN score
        # (e.g. a constant prediction) or a total close to zero would flip signs,
        # blow the weights up, or divide by zero. Only positive, finite scores
        # contribute; if none qualifies, fall back to equal weights.
        usable = {
            s: (float(sc) if np.isfinite(sc) and sc > 0 else 0.0)
            for s, sc in scores.items()
        }
        total_score = sum(usable.values())
        if total_score > 0:
            weights = {s: w / total_score for s, w in usable.items()}
        else:
            weights = {s: 1.0 / len(usable) for s in usable}

        # The weighted blend is diagnostic only, so no weighted test blend is built.
        oof_weighted = sum(weights[s] * slice_oof[s] for s in scores)
        score_weighted = pearsonr(y_true, oof_weighted)[0]

        print(f"\n{learner_name.upper()} Simple Ensemble Pearson:   {score_simple:.4f}")
        print(f"{learner_name.upper()} Weighted Ensemble Pearson: {score_weighted:.4f}")

        learner_ensembles[learner_name] = {
            "oof_simple": oof_simple,
            "test_simple": test_simple,
        }

    # Equal-weight average across learners, using each learner's simple ensemble.
    final_oof = np.mean([le["oof_simple"] for le in learner_ensembles.values()], axis=0)
    final_test = np.mean([le["test_simple"] for le in learner_ensembles.values()], axis=0)
    final_test = np.clip(final_test, CONFIG.CLIP_LOWER, CONFIG.CLIP_UPPER)
    final_score = pearsonr(y_true, final_oof)[0]

    print(f"\nFINAL ensemble across learners Pearson: {final_score:.4f}")
    submission_df = submission_df.with_columns(pl.Series(name="prediction", values=final_test))
    submission_df.write_csv(CONFIG.SUBMISSION_PATH)
    # Report the path actually written rather than a hard-coded file name.
    print(f"Saved: {CONFIG.SUBMISSION_PATH}")

In [10]:
# Run the loader once, keep only the train/test pieces the later cells use,
# then release the reference to the loader object itself.
loader = data_loader()
train_X = loader.train_X
train_y = loader.train_y
test = loader.test_X
del loader

Starting smart feature selection with 525886 samples...
Using 525886 recent samples for feature selection
Total features before selection: 926
Removing low-variance and high-missing features...
Features after variance/missing filter: 919
Computing correlations...
Computing mutual information...


  y = column_or_1d(y, warn=True)


Computing L1 regularization scores...


  y = column_or_1d(y, warn=True)
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model

Computing tree-based importance...
Combining feature scores...
Selected 120 features
Top 10 selected features:
   1. price_impact_trade_intensity_imb3 - Score: 92.2757
   2. realized_spread_trade_intensity_imb3 - Score: 92.0876
   3. effective_spread_trade_intensity_imb3 - Score: 91.7004
   4. effective_spread_price_impact_imb3 - Score: 91.5065
   5. kyle_lambda_vpin_imb3          - Score: 91.1313
   6. realized_spread_trade_intensity_imb - Score: 90.9759
   7. realized_spread_price_impact_imb - Score: 90.9257
   8. realized_spread_price_impact_imb3 - Score: 90.7300
   9. total_liquidity_liquidity_ratio_imb3 - Score: 90.6755
  10. total_liquidity_liquidity_ratio_imb - Score: 90.5756
Train data loaded with shape: (525886, 158)
Test data loaded with shape: (538150, 158)


In [None]:
# Feature-stability analysis: only executed when CONFIG.TRAIN is enabled
# (it is False in the CONFIG cell at the top of the notebook).
if CONFIG.TRAIN:
    run_stability = FEATURE_STABILITY(train_X, train_y, num_runs=5, importance_type="gain")
    run_stability.fit()
    # Persist both result tables to the parquet paths configured in CONFIG.
    run_stability.summary_table.write_parquet(CONFIG.STABILITY_PATH)
    run_stability.shap_df.write_parquet(CONFIG.SHAP_PATH)

In [12]:
# Cross-validated training over every data slice and learner. Returns the
# per-learner/per-slice out-of-fold predictions, the fold-averaged test
# predictions, and the slice definitions used.
# NOTE(review): `eval` shadows Python's built-in eval() for the rest of the
# session — consider renaming once it is confirmed no other cell relies on it.
eval = evaluate()
oof_preds, test_preds, model_slices = eval.train_and_evaluate(train_X, train_y, test)


--- Fold 1/5 ---


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




  Training slice: full_data, samples: 420708
  Training slice: recent_95pct, samples: 420708
  Training slice: recent_90pct, samples: 420708
  Training slice: recent_85pct, samples: 420708
  Training slice: recent_80pct, samples: 420708

--- Fold 2/5 ---
  Training slice: full_data, samples: 420709
  Training slice: recent_95pct, samples: 394415
  Training slice: recent_90pct, samples: 368121
  Training slice: recent_85pct, samples: 341827
  Training slice: recent_80pct, samples: 315532

--- Fold 3/5 ---
  Training slice: full_data, samples: 420709
  Training slice: recent_95pct, samples: 394415
  Training slice: recent_90pct, samples: 368121
  Training slice: recent_85pct, samples: 341827
  Training slice: recent_80pct, samples: 315532

--- Fold 4/5 ---
  Training slice: full_data, samples: 420709
  Training slice: recent_95pct, samples: 394415
  Training slice: recent_90pct, samples: 368121
  Training slice: recent_85pct, samples: 341827
  Training slice: recent_80pct, samples: 31553

In [13]:
# Score the blends against the flattened target and write the submission;
# submission ids are 1-based and there is one per test row.
ensemble_and_submit(
    train_y.to_numpy().flatten(),
    oof_preds,
    test_preds,
    pl.DataFrame({"id": list(range(1, test.shape[0] + 1))}),
)


XGB Simple Ensemble Pearson:   0.1253
XGB Weighted Ensemble Pearson: 0.1254

FINAL ensemble across learners Pearson: 0.1253
Saved: submission.csv


In [14]:
# Display the combined feature scores in ascending order (lowest score first).
{
    feature: score
    for feature, score in sorted(CONFIG.COMBINED_SCORES.items(), key=lambda pair: pair[1])
}

{'X752': 0.13623877634277026,
 'X757': 0.2997224471093961,
 'X344': 0.3190095388067619,
 'X759': 0.48866681888397734,
 'X425': 0.575830740339469,
 'X758': 0.6514685906339077,
 'x_stat_median': 0.7804755071910973,
 'X415': 1.0260118925728854,
 'X508': 1.0482662609232827,
 'X614': 1.3399310714035977,
 'X444': 1.4136470736263924,
 'X137': 1.777507570099907,
 'X767': 1.8028083137708755,
 'X766': 1.8594598319691076,
 'X756': 1.8908012868520947,
 'X751': 1.9402391779308832,
 'X333': 2.0370406305211155,
 'X570': 2.124013815188575,
 'X608': 2.151968150568311,
 'X27': 2.3103860849875324,
 'X177': 2.4131049252443235,
 'X421': 2.5918045435265915,
 'X198': 2.861057463293814,
 'X501': 3.00957472442429,
 'X283': 3.1894961350318947,
 'X191': 3.3148973062014497,
 'X376': 3.3465444867325314,
 'X343': 3.4826279560096642,
 'X287': 3.6640958735957647,
 'X419': 3.9382883445558567,
 'X28': 4.169674604967346,
 'X342': 4.327963175567005,
 'X338': 4.471787847807335,
 'X769': 4.752782542948813,
 'X97': 4.767433