<a href="https://colab.research.google.com/github/fyenne/salaryshowoff/blob/master/sub1_hull_train_short_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle first so the downloads in the next cell can reach
# private sources (presumably interactive -- confirm against kagglehub docs).
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Three sources are fetched: the competition data, a feature-selection
# dataset, and the output of a notebook that ships TA-Lib wheels.
# The returned values are kept in *_path variables (presumably local
# download locations -- confirm against kagglehub docs); nothing else in
# this cell uses them.
hull_tactical_market_prediction_path = kagglehub.competition_download('hull-tactical-market-prediction')
fyenneyenn_hull_feature_select_dict_path = kagglehub.dataset_download('fyenneyenn/hull-feature-select-dict')
fyenneyenn_download_talib_path = kagglehub.notebook_output_download('fyenneyenn/download-talib')

print('Data source import complete.')


In [None]:
# Offline install of TA-Lib: --no-index disables the package index and
# --find-links points at the wheel directory under /kaggle/input, so this
# only works where that path is mounted. --no-deps skips dependency resolution.
!uv pip install --system --no-index -U --no-deps --find-links='/kaggle/input/download-talib'  ta_lib

In [None]:
import polars as pl
from argparse import Namespace
from pathlib import Path
import os
import numpy as np
import pandas as pd
import warnings
# Optional project-local helpers. When they are not installed the notebook
# keeps going; `hull_score` and `load_data` are defined again in a later cell.
# Only ImportError is swallowed so that Ctrl-C or a genuine bug inside one of
# these modules is not silently hidden.
try:
    from metrics_hull import hull_score
    from samoyan_pack.stats import xi_correlation
    from threeway_tssplit import load_data
except ImportError:
    print('not finding error metric hull')
from scipy.stats import spearmanr, pearsonr
from sklearn.feature_selection import mutual_info_regression, f_regression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import plotly.express as px

warnings.filterwarnings("ignore")

# --- BLOCK 1: IMPORTS & SYSTEM SETUP ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import os

# 1. Device Configuration
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# `device` is a module-level global; the chosen device is printed for the log.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# 2. Reproducibility (Crucial for Finance)
# Financial data is noisy. We need to ensure that if we run the code twice,
# we get the exact same result, otherwise we can't trust our improvements.
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to each generator. Defaults to 42.
    """
    # Written to the environment as a string; NOTE(review): the hash seed of
    # the already-running interpreter is presumably unaffected -- this mostly
    # matters for processes started afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Deterministic cuDNN kernels, no auto-tuner picking algorithms per run.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


def calculate_optimal_target(df: pl.DataFrame, vol_span: int = 21 * 6) -> pl.Series:
    """
    Build a volatility-scaled excess-return series named "optimal_position".

    Steps:
    1. Excess return = ``forward_returns`` - ``risk_free_rate``.
    2. Risk = exponentially weighted std of the excess return over
       ``vol_span`` (``adjust=False``); nulls become 0 and 1e-6 is added so
       the divisor is never exactly zero.
    3. Position = excess return / risk, with NaN replaced by 0.

    The result is NOT clipped: the clip to [0, 2] is disabled in the code.

    Args:
        df: Frame holding ``forward_returns`` and ``risk_free_rate``.
        vol_span: Span of the exponentially weighted std.

    Returns:
        One value per row of ``df``.
    """
    excess = pl.col("forward_returns") - pl.col("risk_free_rate")
    risk = excess.ewm_std(span=vol_span, adjust=False).fill_null(0) + 1e-6
    # Clipping intentionally left out here: .clip(0, 2)
    position_expr = (excess / risk).fill_nan(0).alias("optimal_position")

    # Evaluate the expression against the caller's frame and hand back the column.
    return df.select(position_expr).to_series()


def create_targets(df: pl.DataFrame, roll_var_window: int = 21):
    """
    Add ``cls_label``, ``reg_label`` and ``realized_var`` columns to ``df``.

    - ``cls_label``: 1 where ``forward_returns`` > 0, else 0 (Int8).
    - ``reg_label``: ``forward_returns`` where positive, else 0.0.
    - ``realized_var``: rolling variance of ``forward_returns`` over
      ``roll_var_window`` rows, back-filled and floored at 1e-8.

    Args:
        df: Frame containing a ``forward_returns`` column.
        roll_var_window: Window length of the rolling variance.

    Returns:
        A new polars DataFrame (round-tripped through pandas) with the three
        extra columns.
    """
    df = df.with_columns(
        [
            (pl.col("forward_returns") > 0).cast(pl.Int8).alias("cls_label"),
            pl.when(pl.col("forward_returns") > 0)
            .then(pl.col("forward_returns"))
            .otherwise(0.0)
            .alias("reg_label"),
        ]
    )
    # compute realized variance proxy via pandas rolling on forward_returns
    pdf = df.to_pandas()
    pdf["realized_var"] = (
        pdf["forward_returns"]
        .rolling(window=roll_var_window, min_periods=1)
        .var()
        # With min_periods=1 the first window holds a single observation, so
        # its variance is NaN; back-fill it from the next valid value.
        # `.bfill()` replaces the deprecated `fillna(method="bfill")`.
        .bfill()
        .clip(lower=1e-8)
    )
    return pl.from_pandas(pdf)



# Seed all RNGs once at import time so reruns of the notebook are repeatable.
set_seed(42)


print("Block 1: Setup Complete.")
# Calendar constants used by later cells (252 trading days per year,
# 21 per month).
trading_days_per_yr = 252
trading_days_per_mt = 21
# import kaggle_evaluation.default_inference_server

# Choose where the competition CSVs live: the working directory when running
# as a "samoyan*" user (the author's machine), the Kaggle input mount
# otherwise.
# `os.environ.get` covers an unset USER without a bare `except`, and the
# explicit `else` guarantees `args` is always defined -- previously a USER that
# was set but did not match left `args` undefined.
if os.environ.get("USER", "").startswith("samoyan"):
    args = Namespace(
        data_dir=Path("./"),
    )
else:
    args = Namespace(
        data_dir=Path("/kaggle/input/hull-tactical-market-prediction"),
    )



In [None]:
import numpy as np
import pandas as pd
import pandas.api.types
import talib
import polars as pl
import matplotlib.pyplot as plt

# Module-level bounds for a position; hull_score below defines locals with
# the same names and values, so these globals are not what it reads.
MIN_INVESTMENT = 0
MAX_INVESTMENT = 2
#  trading_days_per_yr = 252


class ParticipantVisibleError(Exception):
    """Exception type declared next to the scoring code; adds nothing to ``Exception``."""


# ---------- 4. Evaluation: adapted hull_score ----------
def hull_score(
    solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str
) -> float:
    """
    Annualised Sharpe ratio of a position series, divided by a volatility
    penalty and a return-shortfall penalty.

    Args:
        solution: pandas frame with columns ``forward_returns``,
            ``risk_free_rate`` and ``market_forward_excess_returns``.
        submission: pandas frame with a ``prediction`` column (the position
            per row). If it also carries ``row_id_column_name`` that column
            becomes its index; either way it is reindexed to
            ``solution.index``.
        row_id_column_name: Name of the optional row-id column in
            ``submission``.

    Returns:
        The adjusted Sharpe ratio, capped at 1_000_000. 0.0 is returned when
        the strategy std or the market volatility is exactly zero.

    Raises:
        ValueError: If any position lies outside [0, 2].

    Side effects:
        Prints both penalties and both volatilities.
    """
    import numpy as np
    import pandas as pd

    lowest_allowed, highest_allowed = 0, 2
    days_per_year = 252
    n_rows = len(solution)

    # Line the submission up with the solution rows.
    aligned = submission
    if row_id_column_name in aligned.columns:
        aligned = aligned.set_index(row_id_column_name)
    aligned = aligned.reindex(solution.index)

    position = pd.Series(aligned["prediction"].values, index=solution.index)
    if position.max() > highest_allowed or position.min() < lowest_allowed:
        # penalty or error, but here raise
        raise ValueError("position out of bounds")

    # Daily return of holding `position` in the market and the rest at the
    # risk-free rate.
    risk_free = solution["risk_free_rate"]
    forward = solution["forward_returns"]
    strategy_returns = risk_free * (1 - position) + position * forward

    # Geometric mean of the strategy's excess return.
    strategy_excess = strategy_returns - risk_free
    strategy_mean = (1 + strategy_excess).prod() ** (1 / n_rows) - 1

    strategy_std = strategy_returns.std()
    if strategy_std == 0:
        return 0.0

    annual_factor = np.sqrt(days_per_year)
    sharpe = strategy_mean / strategy_std * annual_factor
    strategy_volatility = float(strategy_std * annual_factor * 100)

    # Same statistics for the market benchmark.
    market_excess = solution["market_forward_excess_returns"]
    market_mean = (1 + market_excess).prod() ** (1 / n_rows) - 1
    market_volatility = float(forward.std() * annual_factor * 100)
    if market_volatility == 0:
        return 0.0

    # Penalise volatility above 1.2x the market's.
    vol_penalty = 1 + max(0, strategy_volatility / market_volatility - 1.2)

    # Penalise (quadratically) any annualised return shortfall vs the market.
    return_gap = max(0, (market_mean - strategy_mean) * 100 * days_per_year)
    return_penalty = 1 + (return_gap**2) / 100

    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    print(
        "return_penalty: ",
        return_penalty,
        "\nvol_penalty: ",
        vol_penalty,
        "\nstrategy_volatility: ",
        strategy_volatility,
        "\nmarket_volatility: ",
        market_volatility,
    )
    print("=" * 30)
    return min(float(adjusted_sharpe), 1_000_000)


class RollingWindowMinMaxTrainSplit:
    """
    A custom time series cross-validation splitter that generates rolling windows
    with explicit standardization, training, and test sets.

    Each fold is laid out, in units of unique groups, as::

        [standardization][train][purge][test]

    The training window holds between `min_train_size` and `max_train_size`
    groups, and the test window is moved forward by `test_size` groups for
    each fold.

    It includes:
    - A 'standardization' set immediately preceding the training set, of a
      fixed `standardization_size`.
    - A 'purge' gap between the training and test sets.

    This splitter assumes 'groups' represent ordered time steps (e.g., days, weeks)
    and that all rows sharing a group value belong to that time step. Groups
    are ordered by their sorted unique values.
    """

    def __init__(
        self,
        min_train_size=252,
        max_train_size=None,
        test_size=180,
        purge=1,
        standardization_size=None,
    ):
        """
        Initialize the splitter with training, test, purge, and standardization sizes.

        Parameters:
        - min_train_size (int): Minimum number of unique groups in the training set.
                                Must be positive.
        - max_train_size (int | None): Maximum number of unique groups in the training
                                set. Must be >= `min_train_size`. ``None`` (the
                                default) means "same as `min_train_size`".
        - test_size (int): Number of unique groups in each test set. Must be positive.
        - purge (int): Number of unique groups skipped between the end of the training
                       set and the start of the test set. Must be non-negative.
        - standardization_size (int | None): Number of unique groups in the
                       standardization set. Must be non-negative; 0 yields an empty
                       standardization index array. ``None`` (the default) means
                       "same as the resolved `max_train_size`".

        Raises:
            ValueError: If any size parameter is invalid.
        """
        if not isinstance(min_train_size, int) or min_train_size <= 0:
            raise ValueError("min_train_size must be a positive integer.")
        # None is the documented default and must pass validation; only an
        # explicitly supplied value is checked.
        if max_train_size is not None and (
            not isinstance(max_train_size, int) or max_train_size < min_train_size
        ):
            raise ValueError(
                "max_train_size must be an integer greater than or equal to min_train_size."
            )
        if not isinstance(test_size, int) or test_size <= 0:
            raise ValueError("test_size must be a positive integer.")
        if not isinstance(purge, int) or purge < 0:
            raise ValueError("purge must be a non-negative integer.")
        if standardization_size is not None and (
            not isinstance(standardization_size, int) or standardization_size < 0
        ):
            raise ValueError("standardization_size must be a non-negative integer.")

        self.min_train_size = min_train_size
        self.max_train_size = (
            min_train_size if max_train_size is None else max_train_size
        )
        self.test_size = test_size
        self.purge = purge
        self.standardization_size = (
            self.max_train_size
            if standardization_size is None
            else standardization_size
        )

    def _fold_bounds(self, n_unique_groups):
        """
        Yield the window boundaries of every fold.

        Each item is ``(std_start, train_start, train_end, test_start, test_end)``,
        all positions in the sorted unique-group array (starts inclusive, ends
        exclusive; the standardization window ends at ``train_start``).
        Shared by `split` and `get_n_splits` so the two cannot drift apart.
        """
        # The first test window starts once standardization, the minimum
        # training window and the purge gap all fit in front of it.
        test_start = self.standardization_size + self.min_train_size + self.purge

        while test_start + self.test_size <= n_unique_groups:
            train_end = test_start - self.purge
            # Look back at most max_train_size groups, but never into the
            # space reserved for the standardization window.
            train_start = max(
                self.standardization_size, train_end - self.max_train_size
            )
            if train_end - train_start < self.min_train_size:
                break

            std_start = train_start - self.standardization_size
            if std_start < 0:
                break

            yield std_start, train_start, train_end, test_start, test_start + self.test_size

            # Move the window forward by one test block.
            test_start += self.test_size

    def split(self, X, y=None, groups=None):
        """
        Generate row indices for the standardization, training, and test sets.

        Parameters:
        - X: Input data. Not used for the splitting logic; kept for
             scikit-learn-style call compatibility.
        - y: Target variable. Not used.
        - groups: Array-like of time-based group identifiers, one per row.
                  Rows with the same identifier belong to the same time step.
                  Must be provided.

        Yields:
        - standardization_idx (np.ndarray): Row indices of the standardization set
                                            (empty if `standardization_size` is 0).
        - train_idx (np.ndarray): Row indices of the training set.
        - test_idx (np.ndarray): Row indices of the test set.

        Within each array, rows are ordered by group and, inside a group, by
        their original position. Nothing is yielded when there are too few
        groups for a single fold.

        Raises:
            ValueError: If the 'groups' parameter is not provided.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter must be provided with time-based identifiers."
            )

        groups_arr = np.asarray(groups)
        unique_groups, inverse = np.unique(groups_arr, return_inverse=True)
        inverse = inverse.reshape(-1)

        # Stable sort of rows by group rank: rows of group k occupy the
        # contiguous slice order[offsets[k]:offsets[k + 1]], in original order.
        # A window of consecutive groups is therefore a single slice, which
        # also makes an empty window (standardization_size == 0) an empty array.
        order = np.argsort(inverse, kind="stable")
        counts = np.bincount(inverse, minlength=len(unique_groups))
        offsets = np.concatenate(([0], np.cumsum(counts)))

        for std_start, train_start, train_end, test_start, test_end in self._fold_bounds(
            len(unique_groups)
        ):
            # Copies, so callers may modify (e.g. shuffle) a fold in place
            # without corrupting later folds.
            standardization_idx = order[offsets[std_start] : offsets[train_start]].copy()
            train_idx = order[offsets[train_start] : offsets[train_end]].copy()
            test_idx = order[offsets[test_start] : offsets[test_end]].copy()
            yield standardization_idx, train_idx, test_idx

    def get_n_splits(self, X=None, y=None, groups=None):
        """
        Return the number of folds `split` would generate for `groups`.

        Parameters:
        - X: Ignored.
        - y: Ignored.
        - groups: Array-like of time-based group identifiers. Must be provided.

        Returns:
        - n_splits (int): The total number of folds.

        Raises:
            ValueError: If the 'groups' parameter is not provided.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter must be provided.")

        n_unique_groups = len(np.unique(np.asarray(groups)))
        return sum(1 for _ in self._fold_bounds(n_unique_groups))


def load_data(df, showplot=False, target_col: str = "target"):
    """
    Append rolling-statistic, lag, return and TA-Lib indicator features, all
    derived from the single column ``target_col``.

    Args:
        df: polars DataFrame containing ``target_col``.
        showplot: When true, draws a matplotlib figure after the features are
            built. NOTE(review): that branch reads the columns "stateDate",
            "value", "talib_EMA" and "talib_SMA", none of which this function
            creates, so it only works if ``df`` already carries them.
        target_col: Name of the source column for every feature.

    Returns:
        ``df`` with the feature columns appended, in the order they are
        defined below.

    NOTE(review): several column names do not match the window actually used:
    the ``*_5`` rolling columns use a window of 4, the ``*_50`` columns use 63,
    and ``value_lag_5`` is ``shift(4)``. The names are left untouched here
    because downstream code may select features by name.
    """
    # print(df.schema)
    # Core rolling features (keeping essential ones, removing redundant)
    df_fe = df.with_columns(
        [
            # Rolling means - keep key timeframes
            pl.col(target_col).rolling_mean(4).alias("rolling_mean_5"),
            pl.col(target_col).rolling_mean(21).alias("rolling_mean_21"),  # Monthly
            pl.col(target_col)
            .rolling_mean(63)
            .alias("rolling_mean_50"),  # Key technical level
            pl.col(target_col)
            .rolling_mean(252)
            .alias("rolling_mean_252"),  # Long-term trend
            # Rolling std - focus on key periods
            pl.col(target_col).rolling_std(4).alias("rolling_std_5"),
            pl.col(target_col).rolling_std(21).alias("rolling_std_21"),
            pl.col(target_col).rolling_std(63).alias("rolling_std_50"),
            # Rolling min/max - key support/resistance levels
            pl.col(target_col).rolling_min(4).alias("rolling_min_5"),
            pl.col(target_col).rolling_min(21).alias("rolling_min_21"),
            pl.col(target_col).rolling_min(63).alias("rolling_min_50"),
            pl.col(target_col).rolling_max(4).alias("rolling_max_5"),
            pl.col(target_col).rolling_max(21).alias("rolling_max_21"),
            pl.col(target_col).rolling_max(63).alias("rolling_max_50"),
            pl.col(target_col).rolling_min(252).alias("rolling_min_252"),
            pl.col(target_col).rolling_max(252).alias("rolling_max_252"),
            pl.col(target_col).rolling_skew(4).alias("rolling_skew_5"),
            pl.col(target_col).rolling_skew(21).alias("rolling_skew_21"),
            pl.col(target_col).rolling_skew(63).alias("rolling_skew_50"),
            # pl.col(target_col).kurtosis(4).alias("rolling_kurtosis_5"),
            # pl.col(target_col).kurtosis(21).alias("rolling_kurtosis_21"),
            # pl.col(target_col).kurtosis(63).alias("rolling_kurtosis_50"),
            # Price lags - keep recent history
            pl.col(target_col).shift(1).alias("value_lag_1"),
            pl.col(target_col).shift(2).alias("value_lag_2"),
            pl.col(target_col).shift(4).alias("value_lag_5"),
            # Date features
        ]
    )

    # Price-based features
    # (second pass: these expressions reference the columns created above)
    df_fe = df_fe.with_columns(
        [
            # Returns
            (pl.col(target_col) / pl.col("value_lag_1") - 1).alias("return_1d"),
            (pl.col(target_col) / pl.col("value_lag_2") - 1).alias("return_2d"),
            (pl.col(target_col) / pl.col("value_lag_5") - 1).alias("return_5d"),
            # Log returns (better for modeling)
            (pl.col(target_col) / pl.col("value_lag_1")).log().alias("log_return_1d"),
            (pl.col(target_col) / pl.col("value_lag_5")).log().alias("log_return_5d"),
            # Volatility measures
            # NOTE(review): volatility_21d is the same expression as rolling_std_21.
            pl.col(target_col).rolling_std(21).alias("volatility_21d"),
            (
                pl.col(target_col).rolling_std(4) / pl.col(target_col).rolling_std(21)
            ).alias("vol_ratio_5_21"),
            # Price position within recent range
            (
                (pl.col(target_col) - pl.col("rolling_min_21"))
                / (pl.col("rolling_max_21") - pl.col("rolling_min_21"))
            ).alias("price_position_21d"),
            # Distance from moving averages (normalized)
            (
                (pl.col(target_col) - pl.col("rolling_mean_21"))
                / pl.col("rolling_std_21")
            ).alias("zscore_21d"),
            (
                (pl.col(target_col) - pl.col("rolling_mean_50"))
                / pl.col("rolling_std_50")
            ).alias("zscore_50d"),
            # Momentum indicators
            (pl.col(target_col) - pl.col("rolling_mean_5")).alias("momentum_5d"),
            (pl.col(target_col) - pl.col("rolling_mean_21")).alias("momentum_21d"),
            (
                (pl.col(target_col) - pl.col("value_lag_5")) / pl.col("value_lag_5")
            ).alias("roc_5d"),
            (pl.col("rolling_mean_5") > pl.col("rolling_mean_21")).alias(
                "short_above_medium"
            ),
            (pl.col("rolling_mean_21") > pl.col("rolling_mean_50")).alias(
                "medium_above_long"
            ),
            (pl.col("rolling_mean_50") > pl.col("rolling_mean_252")).alias(
                "trend_bullish"
            ),
        ]
    )
    # Advanced technical features
    df_fe = df_fe.with_columns(
        [
            # Relative strength vs different timeframes
            (pl.col(target_col) / pl.col("rolling_mean_5")).alias(
                "relative_strength_5d"
            ),
            (pl.col(target_col) / pl.col("rolling_mean_21")).alias(
                "relative_strength_21d"
            ),
            (pl.col(target_col) / pl.col("rolling_mean_50")).alias(
                "relative_strength_50d"
            ),
            # Bollinger Band position
            (
                (pl.col(target_col) - pl.col("rolling_mean_21"))
                / (2 * pl.col("rolling_std_21"))
            ).alias("bb_position"),
            # Support/Resistance levels
            # NOTE(review): the tolerance is an absolute 3 in the units of
            # target_col -- confirm it suits the scale of the column passed in.
            ((pl.col(target_col) - pl.col("rolling_max_5")).abs() <= 3).alias(
                "at_5d_high"
            ),
            ((pl.col(target_col) - pl.col("rolling_min_5")).abs() <= 3).alias(
                "at_5d_low"
            ),
            ((pl.col(target_col) - pl.col("rolling_max_21")).abs() <= 3).alias(
                "at_21d_high"
            ),
            ((pl.col(target_col) - pl.col("rolling_min_21")).abs() <= 3).alias(
                "at_21d_low"
            ),
            # Trend strength
            (pl.col("rolling_mean_5").diff() > 0).alias("ma5_trending_up"),
            (pl.col("rolling_mean_21").diff() > 0).alias("ma21_trending_up"),
            # Volume-like proxy (using price volatility)
            (pl.col("rolling_std_5") / pl.col("rolling_std_21")).alias(
                "volatility_ratio"
            ),
        ]
    )

    # TA-Lib works on NumPy arrays, so pull the source column out of polars.
    value_scaled = df_fe[target_col].to_numpy()

    # Hilbert Transform
    hts1, hts2 = talib.HT_SINE(value_scaled)

    # Bollinger Bands
    bb_upper, bb_middle, bb_lower = talib.BBANDS(value_scaled, timeperiod=20)

    # MACD
    macd_line, macd_signal, macd_hist = talib.MACD(value_scaled)

    # Additional technical indicators
    rsi = talib.RSI(value_scaled, timeperiod=14)
    # The same series is passed for all three price inputs of STOCH, WILLR
    # and CCI, since only one column is available.
    stoch_k, stoch_d = talib.STOCH(value_scaled, value_scaled, value_scaled)
    williams_r = talib.WILLR(value_scaled, value_scaled, value_scaled)
    cci = talib.CCI(value_scaled, value_scaled, value_scaled)

    # Add all technical indicators
    # (bb_middle is computed above but not stored)
    df_fe = df_fe.with_columns(
        [
            pl.Series(name="ht_sine", values=hts1),
            pl.Series(name="ht_leadsine", values=hts2),
            pl.Series(name="bb_upper", values=bb_upper),
            pl.Series(name="bb_lower", values=bb_lower),
            pl.Series(name="macd_line", values=macd_line),
            pl.Series(name="macd_signal", values=macd_signal),
            pl.Series(name="macd_histogram", values=macd_hist),
            pl.Series(name="rsi", values=rsi),
            pl.Series(name="stoch_k", values=stoch_k),
            pl.Series(name="stoch_d", values=stoch_d),
            pl.Series(name="williams_r", values=williams_r),
            pl.Series(name="cci", values=cci),
        ]
    )

    # Derived features from technical indicators
    df_fe = df_fe.with_columns(
        [
            # Bollinger Band signals
            (pl.col(target_col) > pl.col("bb_upper")).alias("bb_breakout_upper"),
            (pl.col(target_col) < pl.col("bb_lower")).alias("bb_breakout_lower"),
            (
                (pl.col(target_col) - pl.col("bb_lower"))
                / (pl.col("bb_upper") - pl.col("bb_lower"))
            ).alias("bb_percent"),
            # MACD signals
            (pl.col("macd_line") > pl.col("macd_signal")).alias("macd_bullish"),
            (pl.col("macd_histogram") > 0).alias("macd_hist_positive"),
            # RSI levels
            (pl.col("rsi") > 70).alias("rsi_overbought"),
            (pl.col("rsi") < 30).alias("rsi_oversold"),
            # Stochastic signals
            (pl.col("stoch_k") > pl.col("stoch_d")).alias("stoch_bullish"),
            ((pl.col("stoch_k") > 80) & (pl.col("stoch_d") > 80)).alias(
                "stoch_overbought"
            ),
            ((pl.col("stoch_k") < 20) & (pl.col("stoch_d") < 20)).alias(
                "stoch_oversold"
            ),
            # Multi-timeframe confirmation
            ((pl.col("ma5_trending_up")) & (pl.col("ma21_trending_up"))).alias(
                "multi_ma_bullish"
            ),
            (
                (pl.col("rsi") > 50) & (pl.col("macd_line") > pl.col("macd_signal"))
            ).alias("momentum_confluence"),
            (
                pl.col("log_return_1d").rolling_std(4)
                / pl.col("log_return_1d").rolling_std(21)
            ).alias("volatility_spread"),
            pl.col("momentum_5d").diff().alias("momentum_acceleration_5d"),
        ]
    )

    if showplot:
        # NOTE(review): none of the four columns read below are produced by
        # this function, and the figure is neither shown nor saved here.
        plt.figure(figsize=(12, 6))
        plt.plot(df_fe["stateDate"], df_fe["value"], label="Original Data")
        plt.plot(df_fe["stateDate"], df_fe["talib_EMA"], label="hts1")
        plt.plot(df_fe["stateDate"], df_fe["talib_SMA"], label="hts2")
        plt.title(
            f"next month contract Scaled {target_col} history close Value with EMA and SMA"
        )
        plt.xlabel("State Date")

    return df_fe


In [None]:
# Load the competition CSVs from the directory chosen in `args`.
train = pl.read_csv(args.data_dir / "train.csv")
test = pl.read_csv(args.data_dir / "test.csv")

# Any column that was read as a string is cast to Float64.
train = train.with_columns(pl.selectors.string().cast(pl.Float64))
# Hand-made features U1/U2 plus one-row lags of the three return columns.
train = train.with_columns(
    (pl.col("I2") - pl.col("I1")).alias("U1"),
    (pl.col("M11") / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3)).alias("U2"),
    pl.col("forward_returns").shift(1).alias("lagged_forward_returns"),
    pl.col("risk_free_rate").shift(1).alias("lagged_risk_free_rate"),
    pl.col("market_forward_excess_returns")
    .shift(1)
    .alias("lagged_market_forward_excess_returns"),
    # ((pl.col("forward_returns") - pl.col("risk_free_rate")) > 0).alias("if_pos_r"),
    # pl.col("forward_returns").log().alias("log_forward_returns"),
)
# Calendar-style features derived from date_id.
# NOTE(review): `*` and `%` share precedence and associate left to right, so
# `np.pi * 2 * date_id % trading_days_per_yr` is `(2*pi*date_id) % 252`, not
# `2*pi * (date_id % 252) / 252`. If a yearly cycle was intended these four
# columns do not encode it. Left unchanged because any inference code must
# compute identical features -- confirm before fixing.
# NOTE(review): `pl.count()` is deprecated in recent polars in favour of
# `pl.len()` -- confirm against the installed version.
train = train.with_columns(
    (pl.col("date_id") // trading_days_per_yr).alias("yrno"),
    np.sin(np.pi * 2 * pl.col("date_id") % trading_days_per_yr).alias("day_of_yr_sin"),
    np.cos(np.pi * 2 * pl.col("date_id") % trading_days_per_yr).alias("day_of_yr_cos"),
    np.sin(np.pi / trading_days_per_yr * pl.col("date_id") % trading_days_per_yr).alias(
        "day_of_yr_sin2"
    ),
    np.cos(np.pi / trading_days_per_yr * pl.col("date_id") % trading_days_per_yr).alias(
        "day_of_yr_cos2"
    ),
).with_columns(
    (pl.col("day_of_yr_cos2") / 100 + (1 - (1 / (pl.int_range(0, pl.count()) + 1e-6))) * 100).alias(
        "magic_up"
    )
)
# Labels, the regression target, then the technical features. The features
# are built from the *lagged* market excess return, not from the target.
train = create_targets(train)
train = train.with_columns(target=calculate_optimal_target(train))
train = load_data(train, target_col="lagged_market_forward_excess_returns")

# ------------------------------------- x ------------------------------------ #

# Columns that must never be used as model inputs.
# "if_pos_r" is only created by a commented-out line above; listing it here is
# harmless.
exclude_cols = [
    "date_id",
    "forward_returns",
    "risk_free_rate",
    "market_forward_excess_returns",
    "if_pos_r",
    "target",
    "cls_label",
    "reg_label",
    "realized_var",
]
# NOTE(review): computed before the null-based column filter further down, so
# this list can contain columns that are dropped from `train` afterwards.
feature_cols = [c for c in train.columns if c not in exclude_cols]

#
# *---------------------------------------------------------------------------- #
# *                               for null check                                #
# *---------------------------------------------------------------------------- #
# Turn NaN into null so the null counts below see both.
train = train.fill_nan(None)

print(f"\nTotal features available: {len(feature_cols)}")

# `display` is the IPython/Jupyter builtin; this cell is notebook-only.
display(train.select(feature_cols).null_count())
# px.histogram(train.select(feature_cols).null_count().transpose(), nbins=60)
# Preview: largest remaining null count after the filter applied just below.
display(
    train.select([col for col in train.columns if train[col].null_count() < 1205])[
        1200:, :
    ]
    .null_count()
    .transpose()
    .max()
)
# samo, skip first 1200 rows to remove some null val cols.
# Keep columns with fewer than 1205 nulls, drop the first 1200 rows, then
# forward-fill what is left (leading nulls, if any, stay null).
train = train.select([col for col in train.columns if train[col].null_count() < 1205])[
    1200:, :
]
train = train.fill_null(strategy="forward")


# 3. Hyperparameter Configuration
# We define all our settings here. This makes tuning the model much easier later.
class CFG:
    # Shared settings container. Besides the attributes declared here, other
    # cells attach `CFG.cat_cols` and `CFG.model_params` at runtime.
    # Data Settings
    input_dim = len(feature_cols)  # Number of entries in feature_cols when this class body runs
    output_dim = 1  # Regression: Predicting 1 value (e.g., Return)
    # Model Architecture (Shallow & Robust)
    hidden_layers = [64, 32, 16]  # The "Funnel" shape
    dropout_prob = 0.4  # High dropout to prevent overfitting
    # Training Settings
    learning_rate = 0.001
    batch_size = 128
    epochs = 20
    # Loss Settings
    huber_delta = 1.0  # Threshold for Huber Loss
    # samo: trees
    # Settings read by the lightgbm/xgboost/catboost training helpers.
    verbose = 33  # Logging period passed to the boosters
    num_boost_round = 3000  # Upper bound on boosting rounds (lgb.train / xgb.train)
    early_stopping_rounds = 50
    exclude_cols = exclude_cols  # Copy of the module-level non-feature column list

In [None]:
import joblib
import lightgbm as lgb
import xgboost as xgb


# Recompute the feature list: `train` lost some columns during the null
# filtering above, so the earlier list may be stale.
feature_cols = [c for c in train.columns if c not in CFG.exclude_cols]
target_col = "target"

# Extract features and target as NumPy arrays
X = train.select(feature_cols).to_numpy()
y = train.select(target_col).to_numpy().flatten()
print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")

# The pre-computed feature-selection dictionary is required. The previous
# fallback (feature_selection_create_df() followed by joblib.dump) sat after an
# unconditional raise and could never run, so the failure is made explicit and
# the original cause is chained instead of being swallowed by a bare except.
try:
    feature_sets = joblib.load("/kaggle/input/hull-feature-select-dict/feature_select_dict.py")
except Exception as err:
    raise SystemError(
        "could not load /kaggle/input/hull-feature-select-dict/feature_select_dict.py"
    ) from err


# Integer and boolean feature columns are treated as categorical downstream.
CFG.cat_cols = (
    train[feature_cols].select(pl.selectors.by_dtype([pl.Int64, pl.Boolean])).columns
)

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import sys
import pickle

# ----------------------------------- limix ---------------------------------- #
# import os

# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# sys.path.append("/mnt/c/d2/Limix")
# from inference.predictor import LimiXPredictor

# MAX_CONTEXT = 1024  # Reduced slightly to be safe for CUDA kernels
# EVAL_BATCH_SIZE = 16  # <--- NEW: Predict in chunks of 128 to avoid CUDA crash
# model_file = "/mnt/c/d2/Limix/model_weight/LimiX-2M.ckpt"
# inference_config = "/mnt/c/d2/Limix/config/reg_default_noretrieval.json"
# ----------------------------------- limix ---------------------------------- #


def build_models(random_state: int = 42):
    """Return the model zoo keyed by display name.

    ``"ElasticNet"`` maps to a ready-to-fit estimator. Every other entry maps
    to a plain parameter dict that the matching ``*_training`` helper consumes
    through ``CFG.model_params``. Names carrying the ``1`` suffix are the
    deeper (depth 6), more heavily regularised variants of the depth-2 models.
    """
    linear = ElasticNet(
        alpha=0.01,
        l1_ratio=0.1,
        max_iter=1_000_000,
        random_state=random_state,
        selection="random",
    )
    return {
        "ElasticNet": linear,
        "XGBoost": {
            "n_estimators": 10_000,
            "learning_rate": 0.05,
            "max_depth": 2,
            "objective": "reg:pseudohubererror",
            "n_jobs": -1,
            "random_state": random_state,
            "early_stopping_rounds": 50,
        },
        "XGBoost1": {
            "n_estimators": 1_000,
            "learning_rate": 0.05,
            "max_depth": 6,
            "objective": "reg:pseudohubererror",
            "n_jobs": -1,
            "random_state": random_state,
            "early_stopping_rounds": 50,
            "colsample_bytree": 0.8,  # Use only 80% of features per tree
            "subsample": 0.8,  # Use only 80% of rows per tree
            "reg_alpha": 0.1,  # L1 Regularization
            "reg_lambda": 1.0,  # L2 Regularization
            "gamma": 0.1,  # Min loss reduction required to make a split
        },
        "LightGBM": {
            "n_estimators": 10_000,
            "learning_rate": 0.05,
            "max_depth": 2,
            "objective": "huber",
            "n_jobs": -1,
            "random_state": random_state,
            "verbose": -1,
        },
        "LightGBM1": {
            "n_estimators": 1_000,
            "learning_rate": 0.05,
            "max_depth": 6,
            "objective": "huber",
            "n_jobs": -1,
            "random_state": random_state,
            "verbose": -1,
            "colsample_bytree": 0.8,
            "subsample": 0.8,
            "reg_alpha": 0.5,
            "reg_lambda": 1,
        },
        "CatBoost": {
            "iterations": 10_000,
            "learning_rate": 0.05,
            "depth": 2,
            "loss_function": "Huber:delta=1.0",
            "random_seed": random_state,
            "early_stopping_rounds": 50,
        },
        "CatBoost1": {
            "iterations": 10_000,
            "learning_rate": 0.05,
            "depth": 6,
            "loss_function": "Huber:delta=1.0",
            "random_seed": random_state,
            "early_stopping_rounds": 50,
            "rsm": 0.8,
            "l2_leaf_reg": 0.8,
            "subsample": 0.8,
        },
        # The "nn" (FinancialRegressor) and "limix" (LimiXPredictor) entries are
        # intentionally disabled; train_models_reg still has a branch for them.
    }


# trainingfunctions
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    cat_cols: "list | None" = None,
):
    """Train a LightGBM booster with early stopping and predict the validation set.

    Parameters come from ``CFG.model_params``; the round limit, early-stopping
    patience and logging period come from ``CFG``.

    Args:
        x_train, y_train: training features and target.
        x_valid, y_valid: validation features and target (used for early stopping).
        cat_cols: names of categorical feature columns; ``None`` means none.

    Returns:
        ``(model, valid_pred)`` - the trained booster and its predictions on
        ``x_valid``.
    """
    # A None default avoids sharing one mutable list across calls.
    cat_cols = [] if cat_cols is None else cat_cols

    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_cols)
    lgb_valid = lgb.Dataset(x_valid, y_valid, categorical_feature=cat_cols)

    model = lgb.train(
        params=CFG.model_params,
        train_set=lgb_train,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "val"],
        callbacks=[
            lgb.early_stopping(
                stopping_rounds=CFG.early_stopping_rounds, verbose=CFG.verbose
            ),
            lgb.log_evaluation(CFG.verbose),
        ],
    )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred


def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
):
    """Train an XGBoost booster with early stopping and predict the validation set.

    Booster parameters are read from ``CFG.model_params``; round limit,
    patience and logging period are read from ``CFG``.

    Returns:
        ``(model, valid_pred)`` - the trained booster and its predictions on
        ``x_valid``.
    """
    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dvalid = xgb.DMatrix(data=x_valid, label=y_valid)
    watchlist = [(dtrain, "train"), (dvalid, "eval")]

    booster = xgb.train(
        CFG.model_params,
        dtrain=dtrain,
        num_boost_round=CFG.num_boost_round,
        evals=watchlist,
        early_stopping_rounds=CFG.early_stopping_rounds,
        verbose_eval=CFG.verbose,
    )
    # Validation predictions use a fresh, label-free DMatrix.
    return booster, booster.predict(xgb.DMatrix(x_valid))


def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    cat_cols: "list | None" = None,
):
    """Fit a CatBoost regressor with early stopping and predict the validation set.

    The regressor is built from ``CFG.model_params``; early-stopping patience
    and logging period come from ``CFG``.

    Args:
        x_train, y_train: training features and target.
        x_valid, y_valid: validation features and target (used as eval set).
        cat_cols: categorical feature columns; ``None`` means none.

    Returns:
        ``(model, valid_pred)`` - the fitted regressor and its predictions on
        ``x_valid``.
    """
    # A None default avoids sharing one mutable list across calls.
    cat_cols = [] if cat_cols is None else cat_cols

    cat_train = cat.Pool(data=x_train, label=y_train, cat_features=cat_cols)
    cat_valid = cat.Pool(data=x_valid, label=y_valid, cat_features=cat_cols)
    model = cat.CatBoostRegressor(**CFG.model_params)
    model.fit(
        cat_train,
        eval_set=[cat_valid],
        early_stopping_rounds=CFG.early_stopping_rounds,
        verbose=CFG.verbose,
        use_best_model=True,
    )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred


# ==========================================
# HELPER: BATCHED PREDICTION
# ==========================================
def limix_batched_predict(
    model,
    X_train,
    y_train,
    X_valid,
    batch_size=512,
):
    """Predict ``X_valid`` in consecutive chunks and concatenate the results.

    Each chunk of at most ``batch_size`` rows is passed to
    ``model.predict(X_train, y_train, chunk, task_type="Regression")`` with
    gradient tracking disabled; the per-chunk outputs are joined along axis 0.
    """

    def _predict_chunk(start):
        # One forward call per slice, without autograd bookkeeping.
        with torch.no_grad():
            return model.predict(
                X_train,
                y_train,
                X_valid[start : start + batch_size],
                task_type="Regression",
            )

    chunk_starts = range(0, len(X_valid), batch_size)
    return np.concatenate([_predict_chunk(s) for s in chunk_starts], axis=0)


def train_models_reg(
    std_df: pl.DataFrame,
    train: pl.DataFrame,
    valid: pl.DataFrame,
    FEATURES: list,
    _cat_cols: list,
    target_col: str,
):
    """Fit every model from ``build_models`` on one fold and score it on ``valid``.

    Numeric features are ``FEATURES`` minus ``_cat_cols`` in the sorted order
    produced by ``np.setdiff1d``; they are standardised with a scaler fitted on
    ``std_df``. Categorical features are ordinal-encoded with an encoder fitted
    on ``train`` (unknown/missing values map to -1). The model input is the
    scaled numeric block followed by the encoded categorical block, so any
    inference code must rebuild columns in exactly that order.

    Args:
        std_df: rows used only to fit the ``StandardScaler``.
        train: training rows.
        valid: validation rows; also passed to ``hull_score``.
        FEATURES: feature column names.
        _cat_cols: subset of ``FEATURES`` treated as categorical.
        target_col: name of the regression target column.

    Returns:
        ``(trained_models, scaler, cat_encoder, valid_scores)`` where
        ``trained_models`` and ``valid_scores`` are keyed by model name and
        ``cat_encoder`` is ``None`` when there are no categorical columns.
    """
    _num_cols = [str(c) for c in np.setdiff1d(FEATURES, _cat_cols)]
    _cat_cols = [str(c) for c in _cat_cols]

    scaler = StandardScaler()
    scaler.fit(std_df[_num_cols].to_numpy())

    def _row_index(df):
        # A polars frame converts to pandas with a default RangeIndex, so build
        # that index directly instead of converting the whole frame.
        return pd.RangeIndex(len(df)) if isinstance(df, pl.DataFrame) else df.index

    def scale_num(df):
        return pd.DataFrame(
            scaler.transform(df[_num_cols].to_numpy()),
            columns=_num_cols,
            index=_row_index(df),
        )

    X_train_num = scale_num(train)
    X_valid_num = scale_num(valid)

    y_train = train[target_col].to_numpy()
    y_valid = valid[target_col].to_numpy()

    # ===============================
    # Encode categorical features (NN-safe)
    # ===============================
    if _cat_cols:
        cat_encoder = OrdinalEncoder(
            handle_unknown="use_encoded_value",
            unknown_value=-1,
            encoded_missing_value=-1,
            dtype=np.int64,
        )
        X_train_cat = pd.DataFrame(
            cat_encoder.fit_transform(train.select(_cat_cols).to_pandas().astype(str)),
            columns=_cat_cols,
            index=X_train_num.index,
        )
        X_valid_cat = pd.DataFrame(
            cat_encoder.transform(valid.select(_cat_cols).to_pandas().astype(str)),
            columns=_cat_cols,
            index=X_valid_num.index,
        )

        X_train = pd.concat([X_train_num, X_train_cat], axis=1)
        X_valid = pd.concat([X_valid_num, X_valid_cat], axis=1)

        # Positions of the categorical columns inside the concatenated frame.
        num_dim = len(_num_cols)
        categorical_idx = list(range(num_dim, num_dim + len(_cat_cols)))
    else:
        X_train = X_train_num
        X_valid = X_valid_num
        categorical_idx = None
        cat_encoder = None

    # ===============================
    # Safety checks (critical for CUDA)
    # ===============================
    if cat_encoder is not None:
        for i, c in enumerate(_cat_cols):
            max_allowed = len(cat_encoder.categories_[i])
            assert X_train[c].max() <= max_allowed
            assert X_valid[c].max() <= max_allowed

    # hull_score consumes pandas; convert once instead of once per model.
    valid_pd = valid.to_pandas() if isinstance(valid, pl.DataFrame) else valid

    trained_models = {}
    valid_scores = {}
    model_dict = build_models()
    print(f"{'='*10} Starting Training Loop {'='*10}\n")

    for name, model in model_dict.items():
        print(f"Training {name} model...")
        # The *_training helpers read their parameters from CFG.model_params.
        CFG.model_params = model
        if "XGB" in name:
            model, position = xgboost_training(X_train, y_train, X_valid, y_valid)

        elif "LightGBM" in name:
            model, position = lightgbm_training(
                X_train, y_train, X_valid, y_valid, _cat_cols
            )

        elif "CatBoost" in name:
            model, position = catboost_training(
                X_train, y_train, X_valid, y_valid, _cat_cols
            )

        elif "limix" in name or "nn" in name:
            # NN models consume:
            # - float32 numericals
            # - int64 categoricals
            # NOTE(review): LimiXPredictor, model_file and inference_config are
            # only defined in a commented-out section of this file; this branch
            # is unreachable with the current build_models() output.
            import os

            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            model = LimiXPredictor(
                device=torch.device("cpu"),
                model_path=model_file,
                inference_config=inference_config,
                categorical_features_indices=categorical_idx,
            )
            position = limix_batched_predict(
                model,
                X_train.values,
                y_train.astype("float32"),
                X_valid.values,
                batch_size=512,  # start small
            )
        else:
            # scikit-learn style estimator (ElasticNet).
            model.fit(X_train, y_train)
            position = model.predict(X_valid)

        # Positions are clipped to the [0, 2] range before scoring.
        position = np.clip(position, 0.0, 2.0)
        # Submission-shaped frame keyed by the validation row index.
        valid_sub = pd.DataFrame(
            position, index=valid_pd.index, columns=["prediction"]
        ).reset_index()
        score_ = hull_score(valid_pd, valid_sub, "index")
        valid_scores[name] = score_
        print(f"{name} Validation Hull Score: {score_:.4f}")
        print("=" * 30)
        print("-" * 30)

        trained_models[name] = model
        print()
    return trained_models, scaler, cat_encoder, valid_scores

In [None]:
trading_days_per_yr = 252
# from threeway_tssplit import RollingWindowMinMaxTrainSplit

# Every feature set stored in the selection dictionary is a candidate.
feature_sets_to_test = list(feature_sets.keys())

# Accumulators filled by the training loop below.
all_results = []
train_output = []
num_fold_trained = 0

# https://arxiv.org/html/2502.17493v2  A Novel Loss Function for Deep Learning Based Daily Stock Trading System
# Fixed-length rolling window: 6 "years" of training rows, 6 * 21 validation
# rows, plus a separate one-"year" window used only to fit the scaler.
ts = RollingWindowMinMaxTrainSplit(
    min_train_size=trading_days_per_yr * 6,
    max_train_size=trading_days_per_yr * 6,
    test_size=21 * 6,
    purge=0,
    standardization_size=trading_days_per_yr * 1,
)


TRAIN = False
if TRAIN:
    for i in feature_sets_to_test:
        print("\n" + "=" * 80)
        print(" TRAINING MODELS")
        print("=" * 80)
        print("\nConfiguration:")
        print(f"  Feature sets: {i}")
        print()
        FEATURES = feature_sets[i]
        _cat_cols = list(np.intersect1d(FEATURES, CFG.cat_cols))

        splits = ts.split(train, groups=range(len(train)))
        for ind, (std_i, tra_i, val_i) in enumerate(splits):
            # Only every fifth fold is trained.
            if ind % 5:
                continue

            print("=" * 20, f"fold {ind}", "=" * 20)
            print(
                "=" * 15,
                f"tra idx: {tra_i.min(), tra_i.max()}; val idx: {val_i.max()}; std: {std_i.max()}",
                "=" * 20,
            )
            m, _scaler, _cat_encoder, _scores = train_models_reg(
                std_df=train[std_i],
                train=train[tra_i],
                valid=train[val_i],
                FEATURES=FEATURES,
                _cat_cols=_cat_cols,
                target_col=target_col,
            )
            # Fold metadata lives next to the per-model scores in the same dict.
            _scores.update({"fold": ind, "feature_nums": i})
            all_results.append(_scores)

            print("\n\n\n")
            # NOTE(review): "feature_list" stores the feature-set key `i`, not
            # the column list FEATURES, while predict() reads this entry as the
            # list of columns - confirm which one the saved artifact holds.
            fold_record = {
                "feature_list": i,
                "fold": ind,
                "tra_idx": f"{tra_i.min(), tra_i.max()}",
                "models": m,
                "scaler": _scaler,
                "encoder": _cat_encoder,
                "score": _scores,
            }
            train_output.append(fold_record)
            num_fold_trained += 1
    print("\nAll models trained successfully!")

In [None]:
# Load the saved per-fold training artifacts, replacing the `train_output` list
# built by the training cell. predict() reads the keys "feature_list",
# "models", "scaler", "encoder" and "score" from every element.
# NOTE(review): joblib.load unpickles arbitrary objects - only load trusted files.
train_output = joblib.load( "/kaggle/input/hull-feature-select-dict/tr_reg_enc_1214.py")

In [None]:
# Offline inference frame: the training CSV (with return columns replaced by
# their one-row lags and is_scored forced to False) stacked on top of test.csv.
# String columns are cast to Float64 in both files before stacking.
test = (
    pl.read_csv(args.data_dir / "train.csv")
    .with_columns(pl.selectors.string().cast(pl.Float64))
    .with_columns(
        pl.lit(False).alias("is_scored"),
        pl.col("forward_returns").shift(1).alias("lagged_forward_returns"),
        pl.col("risk_free_rate").shift(1).alias("lagged_risk_free_rate"),
        pl.col("market_forward_excess_returns")
        .shift(1)
        .alias("lagged_market_forward_excess_returns"),
    )
    .drop(["forward_returns", "risk_free_rate", "market_forward_excess_returns"])
    .vstack(
        pl.read_csv(args.data_dir / "test.csv").with_columns(
            pl.selectors.string().cast(pl.Float64)
        )
    )
)
# Derived features; these expressions mirror the ones applied to `train` and
# must stay in sync with them.
# NOTE(review): `*` and `%` share precedence and associate left-to-right, so
# the sin/cos arguments are (constant * date_id) % trading_days_per_yr -
# confirm that is the intended seasonal encoding.
# "magic_up" depends on the row position (pl.int_range), hence on row order.
test = (
    test.with_columns(
        (pl.col("I2") - pl.col("I1")).alias("U1"),
        (pl.col("M11") / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3)).alias(
            "U2"
        ),
    )
    .with_columns(
        (pl.col("date_id") // trading_days_per_yr).alias("yrno"),
        np.sin(np.pi * 2 * pl.col("date_id") % trading_days_per_yr).alias(
            "day_of_yr_sin"
        ),
        np.cos(np.pi * 2 * pl.col("date_id") % trading_days_per_yr).alias(
            "day_of_yr_cos"
        ),
        np.sin(
            np.pi / trading_days_per_yr * pl.col("date_id") % trading_days_per_yr
        ).alias("day_of_yr_sin2"),
        np.cos(
            np.pi / trading_days_per_yr * pl.col("date_id") % trading_days_per_yr
        ).alias("day_of_yr_cos2"),
    )
    .with_columns(
        (
            pl.col("day_of_yr_cos2") / 100
            + (1 - (1 / (pl.int_range(0, pl.count()) + 1e-6))) * 100
        ).alias("magic_up")
    )
).fill_nan(None)
# Same post-processing as the training frame: load_data, then forward-fill.
test = load_data(test, target_col="lagged_market_forward_excess_returns")
test = test.fill_null(strategy="forward")

In [None]:
def preprocess_test(
    test_df: pl.DataFrame,
    FEATURES: list[str],
    cat_cols: list[str],
    scaler,
    encoder=None,
):
    """Rebuild the model input matrix for inference rows.

    The numeric block is scaled with the fold's fitted ``scaler`` and the
    categorical block is encoded with the fold's fitted ``encoder``; the result
    is ``[numeric columns, categorical columns]``.

    Args:
        test_df: rows to transform; must contain every column in ``FEATURES``.
        FEATURES: feature column names used by the fold.
        cat_cols: subset of ``FEATURES`` treated as categorical.
        scaler: fitted scaler exposing ``transform``.
        encoder: fitted categorical encoder, required when ``cat_cols`` is
            non-empty.

    Returns:
        A pandas DataFrame with a default ``RangeIndex``.

    Raises:
        ValueError: if ``cat_cols`` is non-empty but ``encoder`` is ``None``.
    """
    FEATURES = [str(c) for c in FEATURES]
    cat_cols = [str(c) for c in cat_cols]
    # train_models_reg fits the scaler and the models on
    # np.setdiff1d(FEATURES, cat_cols), i.e. sorted unique names. Using the raw
    # FEATURES order here would feed columns to the scaler/models in a
    # different order whenever FEATURES is not already sorted.
    num_cols = [str(c) for c in np.setdiff1d(FEATURES, cat_cols)]
    # -------------------------------
    # 1. Numeric
    # -------------------------------
    X_num = pd.DataFrame(
        scaler.transform(test_df[num_cols].to_numpy()),
        columns=num_cols,
        index=test_df.to_pandas().index,
    )
    # -------------------------------
    # 2. Categorical (optional)
    # -------------------------------
    if not cat_cols:
        return X_num
    if encoder is None:
        raise ValueError("Categorical columns exist but encoder is None")
    X_cat = encoder.transform(test_df.select(cat_cols).to_pandas().astype(str))
    # shift for NN-safe / embedding-safe
    # NOTE(review): train_models_reg does not apply this +1 shift to the
    # encoded categoricals - confirm which convention the saved models were
    # trained with and align both sides.
    X_cat = X_cat + 1
    X_cat = pd.DataFrame(
        X_cat,
        columns=cat_cols,
        index=X_num.index,
    )
    return pd.concat([X_num, X_cat], axis=1)


def predict_model(model, model_name, X):
    """Return ``model``'s predictions for ``X``.

    Models whose name contains ``"XGB"`` are native boosters and need the input
    wrapped in an ``xgboost.DMatrix``; every other model (LightGBM, CatBoost,
    scikit-learn estimators) is called with ``X`` directly.
    """
    if "XGB" not in model_name:
        return model.predict(X)

    import xgboost as xgb

    return model.predict(xgb.DMatrix(X))


def compute_weights(score_dict, eps=1e-6):
    """Turn per-model scores into normalised inverse-score weights.

    Args:
        score_dict: ``{model_name: float_score, ...}``. Non-float entries (for
            example the integer ``"fold"`` or the ``"feature_nums"`` label
            stored alongside the scores) are ignored.
        eps: added to each score before inversion to avoid division by zero.

    Returns:
        ``{model_name: weight}`` with weights summing to 1, or an empty dict
        when ``score_dict`` holds no float scores.
    """
    # Collect every float-valued entry. The previous loop rebound `raw` on each
    # iteration, so only the last model kept a weight (always 1.0), and `raw`
    # was undefined when no float was present.
    # NOTE(review): weights are proportional to 1 / score, so a higher score
    # gets a lower weight - confirm this is intended for the metric in use.
    raw = {
        name: 1.0 / (score + eps)
        for name, score in score_dict.items()
        if isinstance(score, (float, np.floating))
    }
    total = sum(raw.values())
    return {name: value / total for name, value in raw.items()}




In [None]:

def predict(test: pl.DataFrame) -> float:
    """Return the ensemble prediction for the scored row in ``test``.

    The incoming batch is appended to the training history so the same feature
    pipeline used for training can be applied, then every saved fold in
    ``train_output`` predicts the scored rows. Within a fold the models are
    blended with ``compute_weights``; folds are averaged with equal weight.

    Each batch of predictions (except the very first) must be returned within
    5 minutes of the batch features being provided.

    Args:
        test: the batch supplied by the evaluation gateway.

    Returns:
        The prediction for the first scored row, or ``0.0`` if anything in the
        pipeline raises (the error is printed and the raw batch displayed).
    """
    test_raw = test.clone()
    try:
        test = (
            pl.read_csv(args.data_dir / "train.csv")
            .with_columns(pl.selectors.string().cast(pl.Float64))
            .with_columns(
                pl.lit(False).alias("is_scored"),
                pl.col("forward_returns").shift(1).alias("lagged_forward_returns"),
                pl.col("risk_free_rate").shift(1).alias("lagged_risk_free_rate"),
                pl.col("market_forward_excess_returns")
                .shift(1)
                .alias("lagged_market_forward_excess_returns"),
            )
            .drop(["forward_returns", "risk_free_rate", "market_forward_excess_returns"])
            .vstack(test.with_columns(pl.selectors.string().cast(pl.Float64)))
            # Scored rows first, then keep the FIRST row per date_id so a scored
            # row wins over a historical duplicate. The default keep="any"
            # gives no such guarantee.
            .sort("is_scored", descending=True)
            .unique("date_id", keep="first", maintain_order=True)
            # Restore chronological order before any feature is computed:
            # "magic_up" uses the row position and load_data runs on this frame.
            .sort("date_id")
        )
        test = (
            test.with_columns(
                (pl.col("I2") - pl.col("I1")).alias("U1"),
                (pl.col("M11") / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3)).alias(
                    "U2"
                ),
            )
            .with_columns(
                (pl.col("date_id") // trading_days_per_yr).alias("yrno"),
                np.sin(np.pi * 2 * pl.col("date_id") % trading_days_per_yr).alias(
                    "day_of_yr_sin"
                ),
                np.cos(np.pi * 2 * pl.col("date_id") % trading_days_per_yr).alias(
                    "day_of_yr_cos"
                ),
                np.sin(
                    np.pi / trading_days_per_yr * pl.col("date_id") % trading_days_per_yr
                ).alias("day_of_yr_sin2"),
                np.cos(
                    np.pi / trading_days_per_yr * pl.col("date_id") % trading_days_per_yr
                ).alias("day_of_yr_cos2"),
            )
            .with_columns(
                (
                    pl.col("day_of_yr_cos2") / 100
                    + (1 - (1 / (pl.int_range(0, pl.count()) + 1e-6))) * 100
                ).alias("magic_up")
            )
        ).fill_nan(None)
        test = load_data(test, target_col="lagged_market_forward_excess_returns")
        test = test.fill_null(strategy="forward").sort('date_id')

        # The scored rows are the same for every fold; filter once.
        scored = test.filter(pl.col("is_scored"))
        all_test_preds = []

        for fold_obj in train_output:
            FEATURES = fold_obj["feature_list"]
            # The training cell stores the feature-set key under "feature_list";
            # resolve it to the actual column list when that is the case.
            if not isinstance(FEATURES, (list, tuple, np.ndarray)):
                FEATURES = feature_sets[FEATURES]
            FEATURES = [str(c) for c in FEATURES]

            models = fold_obj["models"]
            cat_cols = list(np.intersect1d(FEATURES, CFG.cat_cols))
            X_test = preprocess_test(
                test_df=scored.select(FEATURES),
                FEATURES=FEATURES,
                cat_cols=cat_cols,
                scaler=fold_obj["scaler"],
                encoder=fold_obj.get("encoder", None),
            )
            weights = compute_weights(fold_obj["score"])
            fold_pred = np.zeros(len(X_test), dtype=np.float32)

            for name, model in models.items():
                if name not in weights:
                    continue
                # Same [0, 2] clipping as during validation scoring.
                p = np.clip(predict_model(model, name, X_test), 0.0, 2.0)
                fold_pred += weights[name] * p

            all_test_preds.append(fold_pred)

        final_test_pred = np.mean(np.vstack(all_test_preds), axis=0)
        print(final_test_pred[0])
        return float(final_test_pred[0])
    except Exception as e:
        # Deliberate best-effort: a failed batch must still return a valid
        # position, so report the error and fall back to 0.0.
        print("=" * 10)
        print(e)
        display(test_raw)

        return 0.0

In [None]:


import kaggle_evaluation.default_inference_server


# When your notebook is run on the hidden test set, inference_server.serve must be called within 15 minutes of the notebook starting
# or the gateway will throw an error. If you need more than 15 minutes to load your model you can do so during the very
# first `predict` call, which does not have the usual 1 minute response deadline.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Outside a competition rerun, exercise predict() through the local gateway
# against the downloaded competition files; otherwise serve the real gateway.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()

In [None]:


# def predict(test: pl.DataFrame) -> float:
#     """Replace this function with your inference code.
#     You can return either a Pandas or Polars dataframe, though Polars is recommended for performance.
#     Each batch of predictions (except the very first) must be returned within 5 minutes of the batch features being provided.
#     """
#     return 0.0


# # When your notebook is run on the hidden test set, inference_server.serve must be called within 15 minutes of the notebook starting
# # or the gateway will throw an error. If you need more than 15 minutes to load your model you can do so during the very
# # first `predict` call, which does not have the usual 1 minute response deadline.
# inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     inference_server.serve()
# else:
#     inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))