In [1]:
import polars as pl

import numpy as np
import re
import os
import json
from tqdm.auto import tqdm
from scipy.stats import rankdata

from sklearn.model_selection import TimeSeriesSplit

import xgboost as xgb
import torch
import pickle

from CONFIG import CONFIG
from PREPROCESSOR_V2 import PREPROCESSOR
from FEATURE_ENGINEERING_V2 import FEATURE_ENGINEERING
import time


def timer(func):
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print(f"{func.__name__} took {end - start:.4f} seconds")
        return result

    return wrapper

In [2]:
folder = "xgb_models/v1"
os.makedirs(folder, exist_ok=True)
with open(f"{folder}/features.json", "w") as f:
    json.dump(CONFIG.IMPT_COL, f)

with open(f"{folder}/features.json", "r") as f:
    json.load(f)

In [3]:
def rank_correlation_sharpe(targets, predictions) -> float:
    """
    Calculates the rank correlation between predictions and target values,
    and returns its Sharpe ratio (mean / standard deviation).

    :param merged_df: DataFrame containing prediction columns (starting with 'prediction_')
                    and target columns (starting with 'target_')
    :return: Sharpe ratio of the rank correlation
    :raises ZeroDivisionError: If the standard deviation is zero
    """
    correlations = []
    targets = targets.reshape(-1, CONFIG.NUM_TARGET_COLUMNS)
    targets = np.where(targets == 0, np.nan, targets)
    for i, (pred_row, target_row) in enumerate(zip(predictions, targets)):
        # Find valid (non-NaN) assets for this timestep

        valid_mask = ~np.isnan(target_row)
        valid_pred = pred_row[valid_mask]
        valid_target = target_row[valid_mask]

        if np.std(pred_row) == 0 or np.std(target_row) == 0:
            raise ZeroDivisionError("Zero standard deviation in a row.")

        rho = np.corrcoef(rankdata(valid_pred, method="average"), rankdata(valid_target, method="average"))[0, 1]
        correlations.append(rho)

    daily_rank_corrs = np.array(correlations)
    std_dev = daily_rank_corrs.std(ddof=0)
    if std_dev == 0:
        raise ZeroDivisionError("Denominator is zero, unable to compute Sharpe ratio.")

    sharpe_ratio = daily_rank_corrs.mean() / std_dev
    return -float(sharpe_ratio)

In [4]:
class FeatureSelector:
    def __init__(self, train_x, train_y, n_targets: int = 424):
        self.train_x = train_x  # drop date
        self.train_y = train_y
        self.n_targets = n_targets
        self.keep_features = None
        self.total_features = train_x.columns.__len__()

    @timer
    def basic_filters(self):
        train_x_filter_1 = self.train_x.select([col for col in self.train_x.columns if self.train_x[col].var() > 1e-3])
        train_x_filter_2 = train_x_filter_1.select(
            [col for col in train_x_filter_1.columns if train_x_filter_1[col].value_counts()["count"].max() / len(train_x_filter_1) < 0.80]
        )
        train_x_filter_3 = train_x_filter_2.select([col for col in train_x_filter_2.columns if train_x_filter_2[col].n_unique() > 2])

        print(f"After Basic Filter: {train_x_filter_3.columns.__len__()} / {self.total_features}")

        return train_x_filter_3

    @timer
    def run_correlation(self, x: torch.Tensor, y: torch.Tensor, names: list) -> list:
        """
        Memory-optimized version using chunked processing
        Best for when you have enough GPU memory

        Args:
            x: Tensor
            y: Tensor

        Returns:
            correlations: Tensor
        """
        N, D1 = x.shape
        N2, D2 = y.shape
        assert N == N2

        device = x.device

        # Handle NaNs by masking
        x_valid = ~torch.isnan(x)
        y_valid = ~torch.isnan(y)

        # Convert NaNs to 0 for computation
        x_clean = torch.where(x_valid, x, 0.0)
        y_clean = torch.where(y_valid, y, 0.0)

        # Compute valid sample counts for each pair efficiently
        # This is the memory bottleneck, so we chunk it
        chunk_size = 500  # Adjust based on GPU memory
        correlations = torch.zeros(D1, D2, device=device)

        for i in range(0, D1, chunk_size):
            end_i = min(i + chunk_size, D1)

            # Get chunk
            x_chunk = x_clean[:, i:end_i]  # (N, chunk_size)
            x_valid_chunk = x_valid[:, i:end_i]  # (N, chunk_size)

            # Compute valid sample matrix for this chunk
            valid_matrix = x_valid_chunk.unsqueeze(2) & y_valid.unsqueeze(1)  # (N, chunk_size, D2)
            n_valid = valid_matrix.sum(dim=0).float()  # (chunk_size, D2)

            # Sufficient samples mask
            sufficient = n_valid >= 10

            if sufficient.any():
                # Compute means over valid samples
                x_sum = (x_chunk.unsqueeze(2) * valid_matrix).sum(dim=0)  # (chunk_size, D2)
                y_sum = (y_clean.unsqueeze(1) * valid_matrix).sum(dim=0)  # (chunk_size, D2)

                x_mean = x_sum / (n_valid + 1e-10)
                y_mean = y_sum / (n_valid + 1e-10)

                # Center data
                x_centered = (x_chunk.unsqueeze(2) - x_mean.unsqueeze(0)) * valid_matrix  # (N, chunk_size, D2)
                y_centered = (y_clean.unsqueeze(1) - y_mean.unsqueeze(0)) * valid_matrix  # (N, chunk_size, D2)

                # Compute correlation
                numerator = (x_centered * y_centered).sum(dim=0)
                x_var = (x_centered**2).sum(dim=0)
                y_var = (y_centered**2).sum(dim=0)

                denominator = torch.sqrt(x_var * y_var) + 1e-10
                chunk_corr = numerator / denominator

                # Apply sufficient samples mask
                chunk_corr = torch.where(sufficient, chunk_corr, 0.0)
                correlations[i:end_i] = torch.abs(chunk_corr)

        self.correlations = correlations.cpu().numpy()

    def run_selection(self):
        filtered = self.basic_filters()
        train_x_arr = filtered.drop(CONFIG.DATE_COL).to_numpy()
        train_y_arr = self.train_y.drop(CONFIG.DATE_COL).to_numpy()
        self.run_correlation(
            x=torch.tensor(train_x_arr, device="cuda"), y=torch.tensor(train_y_arr, device="cuda"), names=filtered.drop(CONFIG.DATE_COL).columns
        )

        return self.correlations, filtered.drop(CONFIG.DATE_COL).columns

In [5]:
# --- Prepare DataLoader ---
# Create the dataset

train_x = pl.scan_csv(CONFIG.TRAIN_X_PATH)
train_x = PREPROCESSOR(df=train_x)
train_x = train_x.clean()

features = FEATURE_ENGINEERING(df=train_x)
train_x: pl.DataFrame = features.create_market_features()

train_y = pl.scan_csv(CONFIG.TRAIN_Y_PATH)

curr_y = (
    train_y.with_columns([pl.col(CONFIG.LAGS[f"lag{i}"]).exclude(CONFIG.DATE_COL).shift(i + 1) for i in range(1, 5)])
    .with_columns(pl.all().exclude(CONFIG.DATE_COL).shift())
    .filter((pl.col(CONFIG.DATE_COL).is_in(train_x.select(CONFIG.DATE_COL).to_series())))
    .collect()
    .fill_null(0)
    .lazy()
)

y_feat = FEATURE_ENGINEERING(df=curr_y)
lags = y_feat._compute_lag_returns(df=curr_y)
market = y_feat._compute_market_stats(df=curr_y)
skew = y_feat._compute_return_skew(df=curr_y)
auto_corr = y_feat._compute_autocorr_torch(df=curr_y)

train_x = (
    train_x.join(curr_y.collect(), on=CONFIG.DATE_COL)
    .join(lags.collect(), on=CONFIG.DATE_COL)
    .join(market.collect(), on=CONFIG.DATE_COL)
    .join(skew.collect(), on=CONFIG.DATE_COL)
    .join(auto_corr.collect(), on=CONFIG.DATE_COL)
)


train_y = train_y.filter((pl.col(CONFIG.DATE_COL).is_in(train_x.select(CONFIG.DATE_COL).to_series()))).collect()
train_x = (
    train_x.with_columns([pl.when(pl.col(col).is_infinite()).then(0.0).otherwise(pl.col(col)).alias(col) for col in train_x.columns])
    .with_columns(pl.all().shrink_dtype())
    .filter(pl.col(CONFIG.DATE_COL).is_in(train_y.select(CONFIG.DATE_COL).to_series()))
    .with_columns(pl.col(CONFIG.DATE_COL).cast(pl.Int64))
    .select([CONFIG.DATE_COL] + CONFIG.IMPT_COL)
)

retrain_x = train_x.with_columns(pl.all().exclude(CONFIG.DATE_COL).shift(5))
retrain_y = train_y.filter((pl.col(CONFIG.DATE_COL).is_in(train_x.select(CONFIG.DATE_COL).to_series()))).with_columns(
    pl.all().exclude(CONFIG.DATE_COL).shift(5)
)

train_y_arr = train_y.drop(CONFIG.DATE_COL).to_numpy()

train_y = (
    pl.DataFrame(
        (train_y_arr - np.nanmean(train_y_arr, axis=1).reshape(train_y_arr.shape[0], -1))
        / np.nanstd(train_y_arr, axis=1).reshape(train_y_arr.shape[0], -1),
        schema=train_y.drop(CONFIG.DATE_COL).columns,
    )
    .insert_column(0, train_y.select(CONFIG.DATE_COL).to_series())
    .fill_nan(0)
)

retrain_y_arr = retrain_y.drop(CONFIG.DATE_COL).to_numpy()
retrain_y = (
    pl.DataFrame(
        (retrain_y_arr - np.nanmean(retrain_y_arr, axis=1).reshape(retrain_y_arr.shape[0], -1))
        / np.nanstd(retrain_y_arr, axis=1).reshape(retrain_y_arr.shape[0], -1),
        schema=train_y.drop(CONFIG.DATE_COL).columns,
    )
    .insert_column(0, train_y.select(CONFIG.DATE_COL).to_series())
    .fill_nan(0)
)


_compute_lag_returns took 0.0130 seconds
_compute_autocorr_torch took 0.3945 seconds
_compute_obv took 0.0085 seconds
_compute_return_skew took 0.0030 seconds
_compute_volume_z took 0.0382 seconds
_compute_market_stats took 0.1398 seconds
_compute_atr took 0.0081 seconds
_compute_rolling took 0.0134 seconds
create_market_features took 2.2632 seconds
_compute_lag_returns took 0.0095 seconds
_compute_market_stats took 0.0131 seconds
_compute_return_skew took 0.0030 seconds
_compute_autocorr_torch took 0.0268 seconds


  (retrain_y_arr - np.nanmean(retrain_y_arr, axis=1).reshape(retrain_y_arr.shape[0], -1))
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [6]:
params = {
    # "objective": RankCorrelationLoss().custom_loss,
    "eval_metric": rank_correlation_sharpe,
    "max_depth": 16,
    "learning_rate": 0.05,
    "n_estimators": 500,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "random_state": CONFIG.RANDOM_STATE,
    "device": "gpu",
    "tree_method": "hist",
    "early_stopping_rounds": 5,
}

In [7]:
dates_unique = train_x.filter(pl.col(CONFIG.DATE_COL) <= CONFIG.MAX_TRAIN_DATE).select(pl.col(CONFIG.DATE_COL).unique().sort()).to_series().to_numpy()
real_dates_unique = (
    train_x.filter(pl.col(CONFIG.DATE_COL) > CONFIG.MAX_TRAIN_DATE).select(pl.col(CONFIG.DATE_COL).unique().sort()).to_series().to_numpy()
)

if CONFIG.VERBOSE:
    print(f"Train dates from {dates_unique.min()} to {dates_unique.max()}")
    print(f"Valid dates from {real_dates_unique.min()} to {real_dates_unique.max()}")

dates_train = dates_unique
dates_valid = real_dates_unique

df_train = train_x.filter(pl.col(CONFIG.DATE_COL).is_in(dates_train)).drop(CONFIG.DATE_COL)
true_y = train_y.filter(pl.col(CONFIG.DATE_COL).is_in(dates_train)).drop(CONFIG.DATE_COL)

valid_period = range(min(dates_valid), max(dates_valid) + 1)
df_valid = train_x.filter(pl.col(CONFIG.DATE_COL).is_in(valid_period)).drop(CONFIG.DATE_COL)
df_valid_current_y = train_y.filter(pl.col(CONFIG.DATE_COL).is_in(valid_period)).drop(CONFIG.DATE_COL)

df_valid_retrain = retrain_x.filter(pl.col(CONFIG.DATE_COL).is_in(valid_period)).drop(CONFIG.DATE_COL)
df_valid_current_y_retrain = retrain_y.filter(pl.col(CONFIG.DATE_COL).is_in(valid_period)).drop(CONFIG.DATE_COL)

model = xgb.XGBRegressor(**params)
# Train with custom objective
model.fit(
    df_train,
    true_y,
    eval_set=[(df_valid, df_valid_current_y)],
)
model.save_model(f"{folder}/xgb_full.json")
print("model saved")


Train dates from 2 to 1826
Valid dates from 1827 to 1916
[0]	validation_0-rmse:0.94374	validation_0-rank_correlation_sharpe:-0.06801
[1]	validation_0-rmse:0.94410	validation_0-rank_correlation_sharpe:-0.20558
[2]	validation_0-rmse:0.94453	validation_0-rank_correlation_sharpe:-0.23522
[3]	validation_0-rmse:0.94558	validation_0-rank_correlation_sharpe:-0.21142
[4]	validation_0-rmse:0.94611	validation_0-rank_correlation_sharpe:-0.20633
[5]	validation_0-rmse:0.94655	validation_0-rank_correlation_sharpe:-0.21293
[6]	validation_0-rmse:0.94660	validation_0-rank_correlation_sharpe:-0.25092
[7]	validation_0-rmse:0.94710	validation_0-rank_correlation_sharpe:-0.25607
[8]	validation_0-rmse:0.94735	validation_0-rank_correlation_sharpe:-0.26706
[9]	validation_0-rmse:0.94838	validation_0-rank_correlation_sharpe:-0.24774
[10]	validation_0-rmse:0.94879	validation_0-rank_correlation_sharpe:-0.24301
[11]	validation_0-rmse:0.94923	validation_0-rank_correlation_sharpe:-0.23637
[12]	validation_0-rmse:0.9499

In [8]:
retrain_params = {
    # "objective": RankCorrelationLoss().custom_loss,
    "eval_metric": rank_correlation_sharpe,
    "max_depth": 16,
    "learning_rate": 0.01,
    "n_estimators": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "random_state": CONFIG.RANDOM_STATE,
    "device": "gpu",
    "tree_method": "hist",
}

retrain_model = xgb.XGBRegressor(**retrain_params)
retrain_model.load_model(f"{folder}/xgb_full.json")
preds = []
cnt_dates = 0
for date_id in tqdm(real_dates_unique):
    period = range(date_id, date_id + 1)

    df_valid_date = train_x.filter(pl.col(CONFIG.DATE_COL).is_in(period)).drop(CONFIG.DATE_COL)

    df_upd = retrain_x.filter(pl.col(CONFIG.DATE_COL).is_in(date_id)).drop(CONFIG.DATE_COL)
    df_upd_current_y = retrain_y.filter(pl.col(CONFIG.DATE_COL).is_in(date_id)).drop(CONFIG.DATE_COL)

    if len(df_upd) > 0:
        retrain_model.fit(df_upd, df_upd_current_y)

    preds_i = retrain_model.predict(df_valid_date)

    preds += list(preds_i[-1].reshape(-1, CONFIG.NUM_TARGET_COLUMNS))

    cnt_dates += 1

preds = np.array(preds)

score = rank_correlation_sharpe(
    train_y.filter(pl.col(CONFIG.DATE_COL).is_in(real_dates_unique)).drop(CONFIG.DATE_COL).to_numpy().astype(np.float64),
    preds,
)
print(f"REAL Sharpe: {score:.5f}")


  0%|          | 0/90 [00:00<?, ?it/s]

REAL Sharpe: 0.18294
