In [None]:
# =========================================================
# Library Imports
# =========================================================

# ===== Basic Libraries =====
import numpy as np               # Numerical computations
import pandas as pd              # Data manipulation and analysis
import re                        # Regular expressions for text processing
import warnings                  # To manage and suppress warnings
from datetime import date        # Working with date objects
from tqdm import tqdm            # Progress bars for loops

# ===== Visualization =====
import matplotlib.pyplot as plt  # Core plotting library
import seaborn as sns            # Statistical data visualization

# ===== Statistics & Evaluation Metrics =====
from scipy.stats import spearmanr             # Spearman rank correlation (for IC calculation)
from sklearn.metrics import mean_squared_error, r2_score  # Model evaluation metrics

# ===== Feature Processing & Model Pipeline =====
from sklearn.impute import SimpleImputer               # Handle missing values
from sklearn.preprocessing import MaxAbsScaler, StandardScaler  # Data normalization
from sklearn.compose import ColumnTransformer          # Combine multiple preprocessing steps
from sklearn.pipeline import Pipeline                  # Build modeling pipelines
from sklearn.base import BaseEstimator, TransformerMixin  # Create custom transformers
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV  # Time-based CV and hyperparameter tuning

# ===== Modeling =====
import lightgbm as lgb              # LightGBM framework
from lightgbm import LGBMRegressor  # Core LightGBM model class

# ===== Optional Settings =====
warnings.filterwarnings("ignore")   # Suppress unnecessary warnings

1. Data Import & Preprocessing  
Ensure data cleaning, feature engineering, and time alignment with other models.

In [None]:
# ===== 0) Configuration =====
# Path of the merged feature/price table, relative to the notebook location.
file_path = "../data/final_df.csv"
try:
    df_ori = pd.read_csv(file_path)
except FileNotFoundError:
    # Keep the readable message, but stop here: without df_ori every later
    # cell would fail with an unrelated NameError instead of the real cause.
    print(f"*** ERROR: Cannot find {file_path} ***")
    raise

In [None]:
# ===== 1) Data Preparation & Target Construction =====
# Work on a copy so df_ori is never modified and this cell can be re-run.
df = df_ori.copy()
df['datetime'] = pd.to_datetime(df['datetime'])

# Order each symbol's rows chronologically before differencing.
df = df.sort_values(by=['symbol', 'datetime']).reset_index(drop=True)

# --- Log return per symbol ---
# lret_1m = log(close_t) - log(close_{t-1}), taken between consecutive rows
# of the same symbol.
df['lret_1m'] = df.groupby('symbol')['close'].transform(lambda px: np.log(px).diff())

# --- Prediction target ---
# y_target is the same symbol's lret_1m from the following row (t+1).
df['y_target'] = df.groupby('symbol')['lret_1m'].shift(-1)

# --- Remove outliers and missing values ---
initial_rows = len(df)
# Keep |y| <= 0.2 only; rows with a NaN target compare False and are dropped too.
target_in_range = df['y_target'].abs().le(0.2)
df = df.loc[target_in_range]
# Drop rows where either the target or the current log return is NaN.
df = df.dropna(subset=['y_target', 'lret_1m'])

print(f"(Dropped {initial_rows - len(df)} unusal/NaN y value)")

# --- Back to chronological order (datetime first, then symbol) ---
df = df.sort_values(by=['datetime', 'symbol']).reset_index(drop=True)

df

(Dropped 12 unusal/NaN y value)


Unnamed: 0,datetime,symbol,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,f_minsin,f_mincos,...,split_nonpos_flag,shares_out,log_shares_out,eps_surp_pct_final,div_amount,log_shares_out_iqr_outlier,eps_estimate_rz_8,eps_actual,lret_1m,y_target
0,2024-04-30 12:51:00,AMAT,0,1,0,0,0,0,-0.220697,-0.975342,...,1,830897024,20.538016,0.00,0.0,0,0.0,0.00,-0.000199,-0.000100
1,2024-04-30 12:51:00,AMD,0,1,0,0,0,0,-0.220697,-0.975342,...,1,1616140032,21.203306,2.04,0.0,0,0.0,0.62,0.000314,-0.000126
2,2024-04-30 12:51:00,AVGO,0,1,0,0,0,0,-0.220697,-0.975342,...,1,465308000,19.958210,0.00,0.0,0,0.0,0.00,-0.000334,0.000721
3,2024-04-30 12:51:00,MU,0,1,0,0,0,0,-0.220697,-0.975342,...,1,1107369984,20.825254,0.00,0.0,0,0.0,0.00,0.000698,-0.000698
4,2024-04-30 12:51:00,NVDA,0,1,0,0,0,0,-0.220697,-0.975342,...,1,2500000000,21.639557,0.00,0.0,0,0.0,0.00,-0.000769,-0.000907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324883,2025-10-28 15:58:00,AMAT,0,1,0,0,0,0,-0.861629,-0.507538,...,1,796642427,20.495916,0.00,0.0,0,0.0,0.00,-0.000461,-0.000132
324884,2025-10-28 15:58:00,AMD,0,1,0,0,0,0,-0.861629,-0.507538,...,1,1633284837,21.213859,0.00,0.0,0,0.0,0.00,-0.000698,-0.001316
324885,2025-10-28 15:58:00,AVGO,0,1,0,0,0,0,-0.861629,-0.507538,...,1,4722365022,22.275576,0.00,0.0,0,0.0,0.00,0.000161,0.000375
324886,2025-10-28 15:58:00,MU,0,1,0,0,0,0,-0.861629,-0.507538,...,1,1122466035,20.838794,0.00,0.0,0,0.0,0.00,0.000270,-0.000135


In [None]:
# ===== 2) Feature Cleaning & Target / Feature Split =====
# --- Target ---
y = df['y_target']

# --- Feature matrix: everything except target, identifiers and calendar fields ---
non_feature_cols = [
    'y_target', 'lret_1m', 'datetime', 'symbol',
    'year', 'month', 'day', 'minute', 'minute_of_day',
]
# errors='ignore' tolerates columns from the list that are absent in df.
X = df.drop(columns=non_feature_cols, errors='ignore')

print(f"X original shape: {X.shape}")

# --- Flag problematic columns from summary statistics ---
# describe() is transposed so each row is one column of X.
desc = X.describe(percentiles=[0.99]).T

flag_rules = [
    desc['std'] > 1e3,          # extreme scale / noise
    desc['99%'].abs() > 1e3,    # heavy-tailed distribution
    desc['max'].abs() > 1e6,    # extreme outliers
    desc['std'] == 0,           # constant / non-informative columns
]
# A column may satisfy several rules, so bad_cols can contain repeats.
bad_cols = [col for rule in flag_rules for col in desc.index[rule].tolist()]

# De-duplicated, alphabetically ordered list (despite the name, not a set).
bad_cols_set = sorted(set(bad_cols))

# --- Drop the flagged columns ---
X_cleaned = X.drop(columns=bad_cols_set)

X original shape: (324888, 184)


In [None]:
# ===== 3) Time Series Split (first 2/3 for training + last 1/3 for testing) =====
# The split is positional: rows are taken in the current row order of X_cleaned.
split_ratio = 1.0 / 1.5
split_index = int(len(X_cleaned) * split_ratio)

# Leading block -> training/validation, trailing block -> test.
X_val, X_test = X_cleaned.iloc[:split_index], X_cleaned.iloc[split_index:]
y_val, y_test = y.iloc[:split_index], y.iloc[split_index:]

print("--- Data Splitting done ---")
print(f" Validation set (X_val, y_val) shape: {X_val.shape}, {y_val.shape}")
print(f" Testing set (X_test, y_test) shape: {X_test.shape}, {y_test.shape}")

--- Data Splitting done ---
 Validation set (X_val, y_val) shape: (216592, 147), (216592,)
 Testing set (X_test, y_test) shape: (108296, 147), (108296,)


In [188]:
print("\n=== Time Range ===")
# The training portion produced by the split cell is named X_val; no X_train
# is defined in the cells visible here, so referencing it fails on a fresh kernel.
train_dates = df.loc[X_val.index, 'datetime']
test_dates = df.loc[X_test.index, 'datetime']
print(f"Training period: {train_dates.min()} → {train_dates.max()}")
print(f"Testing period: {test_dates.min()} → {test_dates.max()}")


=== Time Range ===
Training period: 2024-04-30 12:51:00 → 2025-04-29 15:59:00
Testing period: 2025-04-29 15:59:00 → 2025-10-28 15:58:00


# LightGBM (5-Feature Variant) — Rolling Backtest

### Purpose
This notebook documents an **ablation experiment** using only **five manually selected features** to test whether a smaller and more interpretable model could reduce overfitting and improve stability compared with the full Lasso-selected feature set.

### Setup
- **Model:** LightGBM regressor (same parameters as the main run)
- **Window:** Rolling training with a 30-minute lookback  
- **Data:** Same test period and symbols as the main Lasso-feature model  
- **Feature set:** Fixed to 5 preselected predictors  
- **Tuning:** No hyperparameter optimization (reusing best parameters from the main notebook)

### Results Summary
| Metric | Full Lasso Feature Model | 5-Feature Model | Comparison |
|:--|:--|:--|:--|
| Overall IC (sample-wise) | **0.0037** | **-0.0106** | ↓ weaker directional consistency |
| Sharpe (sign, minute-level) | **-0.0088** | **-0.0040** | ↔ both near zero |
| Sharpe (long-only, minute-level) | **0.0207** | **0.0172** | ↓ slightly lower |
| Interpretation | Richer feature space → higher predictive power | Simpler but less expressive |  |

### Interpretation
While the 5-feature version simplifies the model and improves runtime efficiency, it fails to capture the same predictive strength as the full Lasso feature model.  
LightGBM benefits from a broader set of weak but complementary predictors; trimming the features overly restricts model flexibility and signal diversity.

### Decision
The **5-feature LightGBM model is not selected** for the final portfolio evaluation.  
It is retained here for **transparency, reproducibility, and future audits** as part of the ablation study.

---

In [None]:
# ========= Inputs you already have =========
# X_lasso: DataFrame (your 75-feature subset)
# y      : Series (aligned with X_lasso by index)
# NOTE(review): X_lasso is not created in the cells shown here; it must already
# exist in the kernel before this cell runs.

# [OPTIONAL] A datetime Series aligned with X_lasso (same index), e.g.
# datetime_series = df.loc[X_lasso.index, "datetime"]
datetime_series = None

# ========= Build canonical base (no other df needed) =========
base_ycol = "y_target"
base = X_lasso.copy()
# Positional assignment: row i of y is attached to row i of X_lasso.
base[base_ycol] = y.values

# With a datetime series supplied, index the base by time and sort it.
if datetime_series is not None:
    base["datetime"] = pd.to_datetime(datetime_series)
    base = base.set_index("datetime").sort_index()

# ========= Split (2/3 Train+Val, 1/3 Test) =========
split_ratio = 1.0 / 1.5
split_index = int(len(base) * split_ratio)

base_features = base.drop(columns=[base_ycol])
base_target = base[base_ycol]

# Note: y_val / y_test defined by an earlier split cell are rebound here.
X_val_lasso = base_features.iloc[:split_index].copy()
y_val = base_target.iloc[:split_index].copy()
X_test_lasso = base_features.iloc[split_index:].copy()
y_test = base_target.iloc[split_index:].copy()

print("✅ Prepared X_val_lasso / y_val. Index type:", type(X_val_lasso.index).__name__)

✅ Prepared X_val_lasso / y_val. Index type: RangeIndex


1. IC Stability Test (Rolling Spearman ρ Stability Filtering)

In [171]:
# ========= IC stability =========
_IC_SUMMARY_COLUMNS = ["feature", "n_windows", "mean_ic", "std_ic", "sign_consistency"]


def _align_target(X: pd.DataFrame, y: pd.Series) -> np.ndarray:
    """Return y as an array whose i-th entry belongs to the i-th row of X."""
    if X.index.equals(y.index):
        # Identical indexes: rows already correspond one-to-one, even when
        # labels repeat (e.g. several symbols sharing one timestamp).
        return y.to_numpy()
    if X.index.is_unique and y.index.is_unique:
        # Unique labels: align by label; labels missing from y become NaN
        # and are dropped later together with other incomplete pairs.
        return y.reindex(X.index).to_numpy()
    raise ValueError(
        "X and y must share an identical index when index labels are duplicated"
    )


def _window_ic(x_values, y_values, min_samples):
    """Spearman IC of one feature inside one window, or None if the window is skipped."""
    pair = pd.DataFrame({"x": np.asarray(x_values), "y": np.asarray(y_values)}).dropna()
    # Skip windows with too few complete pairs or a (near-)constant feature.
    if len(pair) < min_samples or pair["x"].nunique() < 2:
        return None
    return spearmanr(pair["x"].to_numpy(), pair["y"].to_numpy(), nan_policy="omit")[0]


def _summarize_ic(feature, ic_values):
    """Collapse the per-window ICs of one feature into a single summary row."""
    arr = np.asarray(ic_values, dtype=float)
    # An undefined IC (e.g. constant target in a window) carries no sign
    # information, so it must not be counted as a "non-positive" window.
    arr = arr[np.isfinite(arr)]
    if arr.size == 0:
        return {"feature": feature, "n_windows": 0, "mean_ic": np.nan,
                "std_ic": np.nan, "sign_consistency": np.nan}
    return {
        "feature": feature,
        "n_windows": int(arr.size),
        "mean_ic": float(arr.mean()),
        "std_ic": float(arr.std()),            # population std (ddof=0)
        "sign_consistency": float(np.mean(arr > 0)),  # share of windows with IC > 0
    }


def _ic_stability(X, y_values, windows, min_samples):
    """Build the summary table from positional row windows."""
    rows = []
    for col_pos, feat in enumerate(X.columns):
        x_values = X.iloc[:, col_pos].to_numpy()
        ics = []
        for pos in windows:
            ic = _window_ic(x_values[pos], y_values[pos], min_samples)
            if ic is not None:
                ics.append(ic)
        rows.append(_summarize_ic(feat, ics))
    # Explicit columns keep the sort valid even when X has no columns.
    return pd.DataFrame(rows, columns=_IC_SUMMARY_COLUMNS).sort_values("mean_ic", ascending=False)


def ic_stability_monthly(X: pd.DataFrame, y: pd.Series, min_samples=200) -> pd.DataFrame:
    """Per-feature Spearman IC stability across calendar months.

    Parameters
    ----------
    X : DataFrame indexed by a DatetimeIndex (or PeriodIndex); timestamps may repeat.
    y : target Series. Used row-by-row when its index equals X's index,
        otherwise aligned by label (both indexes must then be unique).
    min_samples : minimum number of complete (feature, target) pairs a month
        needs to contribute an IC.

    Returns
    -------
    DataFrame with one row per feature and columns feature, n_windows,
    mean_ic, std_ic, sign_consistency, sorted by mean_ic descending.

    Raises
    ------
    TypeError if X is not time-indexed; ValueError if X and y cannot be aligned.
    """
    if not isinstance(X.index, (pd.DatetimeIndex, pd.PeriodIndex)):
        raise TypeError("ic_stability_monthly requires X to have a DatetimeIndex")
    y_values = _align_target(X, y)
    # One integer key per calendar month; avoids frequency aliases entirely.
    month_key = np.asarray(X.index.year * 12 + X.index.month)
    windows = [np.flatnonzero(month_key == key) for key in np.unique(month_key)]
    windows = [pos for pos in windows if pos.size]  # NaT rows match no key
    return _ic_stability(X, y_values, windows, min_samples)


def ic_stability_chunks(X: pd.DataFrame, y: pd.Series, chunk_size=5000, min_samples=None) -> pd.DataFrame:
    """Per-feature Spearman IC stability across consecutive row chunks.

    Parameters
    ----------
    X : feature DataFrame; rows are cut into consecutive chunks of ``chunk_size``.
    y : target Series, aligned to X as in ``ic_stability_monthly``.
    chunk_size : number of rows per chunk (the last chunk may be shorter).
    min_samples : minimum complete pairs per chunk; ``None`` keeps the
        original rule ``max(200, chunk_size // 2)``.

    Returns
    -------
    Same table layout as ``ic_stability_monthly``.

    Raises
    ------
    ValueError if ``chunk_size`` is not positive or X and y cannot be aligned.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if min_samples is None:
        min_samples = max(200, chunk_size // 2)
    y_values = _align_target(X, y)
    n_rows = len(X)
    windows = [np.arange(start, min(start + chunk_size, n_rows))
               for start in range(0, n_rows, chunk_size)]
    return _ic_stability(X, y_values, windows, min_samples)

# Month-based windows need a DatetimeIndex; any other index type falls back
# to fixed-size row chunks.
has_datetime_index = isinstance(X_val_lasso.index, pd.DatetimeIndex)
ic_df = (
    ic_stability_monthly(X_val_lasso, y_val)
    if has_datetime_index
    else ic_stability_chunks(X_val_lasso, y_val, chunk_size=5000)
)

# Persist the table; a later cell merges it with the gain importances.
ic_df.to_csv("feature_ic_stability.csv", index=False)
print("✅ Saved: feature_ic_stability.csv")

✅ Saved: feature_ic_stability.csv


2. Feature Importance (by Gain)

In [172]:
# ========= Gain importance (train once on X_val_lasso, y_val) =========
BEST_PARAMS_LGBM = dict(
    num_leaves=63, min_child_samples=50, subsample=0.9,
    colsample_bytree=0.9, reg_lambda=1.0, reg_alpha=0.0,
)

# Median-impute, then rebuild a DataFrame so feature names stay attached.
imp = SimpleImputer(strategy="median")
X_imp = pd.DataFrame(
    imp.fit_transform(X_val_lasso),
    index=X_val_lasso.index,
    columns=X_val_lasso.columns,
)

# A single fit on the whole validation split, used only to read importances.
lgbm = LGBMRegressor(
    objective="regression",
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    verbosity=-1,
    force_row_wise=True,
    **BEST_PARAMS_LGBM,
)
lgbm.fit(X_imp, y_val)

gain = lgbm.booster_.feature_importance(importance_type="gain")
gain_df = (
    pd.DataFrame({"feature": X_val_lasso.columns, "gain": gain})
    .sort_values("gain", ascending=False)
)
gain_df.to_csv("feature_gain_importance.csv", index=False)
print("✅ Saved: feature_gain_importance.csv")

# ========= Merge & select =========
THRESH_MEAN_IC = 0.02   # mean IC must exceed this
THRESH_SIGN    = 0.60   # share of windows with positive IC must exceed this
TOP_K_BY_GAIN  = 30     # cap on the number of selected features

# Left join keeps every feature that has a gain, with or without IC stats.
merged = gain_df.merge(ic_df, on="feature", how="left")
merged.to_csv("feature_ic_gain_merged.csv", index=False)
print("✅ Saved: feature_ic_gain_merged.csv")

# Missing statistics are filled with values that fail both thresholds.
passes_mean_ic = merged["mean_ic"].fillna(-1).gt(THRESH_MEAN_IC)
passes_sign = merged["sign_consistency"].fillna(0).gt(THRESH_SIGN)
stable = merged.loc[passes_mean_ic & passes_sign].sort_values("gain", ascending=False)

if stable.empty:
    print("⚠️ No features pass stability thresholds. Falling back to top-K by gain only.")
    ranking = gain_df
else:
    ranking = stable
selected = ranking.head(TOP_K_BY_GAIN)["feature"].tolist()

pd.DataFrame({"feature": selected}).to_csv("selected_features_ic_gain.csv", index=False)
print(f"✅ Saved: selected_features_ic_gain.csv (n={len(selected)})")

✅ Saved: feature_gain_importance.csv
✅ Saved: feature_ic_gain_merged.csv
⚠️ No features pass stability thresholds. Falling back to top-K by gain only.
✅ Saved: selected_features_ic_gain.csv (n=30)


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor

# ==============================
# 0) Inputs & Defaults
# ==============================
# Required:  X_val_lasso, y_val  must already exist (same as before)
# Optional:  X_test_lasso, y_test  if available, a final test evaluation will be run using the best K
# Required:  feature_gain_importance.csv  (generated from the previous step)
gain_df = pd.read_csv("feature_gain_importance.csv")

# Feature names ranked from highest to lowest gain.
ranked_by_gain = gain_df.sort_values("gain", ascending=False)
feature_order = ranked_by_gain["feature"].tolist()

# Candidate feature counts; values larger than the number of ranked features are dropped.
n_ranked = len(feature_order)
K_list = [k for k in (10, 15, 20, 30, 40, 50, 60, 75) if k <= n_ranked]

def sharpe_from_pnl(pnl: pd.Series) -> float:
    """Per-step Sharpe ratio: mean of ``pnl`` divided by its sample std (ddof=1).

    Returns 0.0 when the standard deviation is zero or undefined (NaN).
    No annualisation is applied.
    """
    sigma = pnl.std(ddof=1)
    if np.isnan(sigma) or sigma == 0:
        return 0.0
    return pnl.mean() / sigma

def fold_pnl_sign_no_agg(idx, y_true, y_pred):
    """Per-sample PnL of a sign strategy, without any aggregation.

    Each sample's position is sign(y_pred) in {-1, 0, +1}; its PnL is that
    position times the realised value in ``y_true``. The result is a Series
    indexed by ``idx``.
    """
    position = np.sign(y_pred)
    per_sample_pnl = position * y_true
    return pd.Series(per_sample_pnl, index=idx)

# Fixed LightGBM hyperparameters used for every fit in this cell.
BEST_PARAMS_LGBM = dict(
    num_leaves=63, min_child_samples=50,
    subsample=0.9, colsample_bytree=0.9,
    reg_lambda=1.0, reg_alpha=0.0,
)

def build_lgbm_pipe(params: dict):
    """Return an unfitted Pipeline: median imputation followed by an LGBMRegressor.

    ``params`` is unpacked into the LGBMRegressor constructor on top of the
    fixed settings below (1000 trees, learning rate 0.05, seed 42).
    """
    regressor = LGBMRegressor(
        objective="regression",
        n_estimators=1000,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1,
        verbosity=-1,
        force_row_wise=True,
        **params,
    )
    steps = [
        ("imp", SimpleImputer(strategy="median")),
        ("reg", regressor),
    ]
    return Pipeline(steps)

# ==============================
# 1) CV over K (TimeSeriesSplit=10)
# ==============================
tscv = TimeSeriesSplit(n_splits=10)
rows = []

for K in K_list:
    # Top-K features by gain, evaluated on the same folds for every K.
    feats = feature_order[:K]
    Xk = X_val_lasso[feats]

    all_fold_pnl, ic_list, mse_list, r2_list = [], [], [], []

    for tr_idx, va_idx in tscv.split(Xk):
        X_tr, X_va = Xk.iloc[tr_idx], Xk.iloc[va_idx]
        y_tr, y_va = y_val.iloc[tr_idx], y_val.iloc[va_idx]

        # A fresh pipeline per fold: imputer and model are fitted on the training part only.
        pipe = build_lgbm_pipe(BEST_PARAMS_LGBM)
        pipe.fit(X_tr, y_tr)
        y_hat = pipe.predict(X_va)
        y_va_values = y_va.values

        # Per-sample sign PnL of this fold, kept for the pooled Sharpe below.
        fold_pnl = fold_pnl_sign_no_agg(Xk.index[va_idx], y_va_values, y_hat)
        all_fold_pnl.append(fold_pnl)

        # Best-effort IC: a failure is recorded as NaN rather than aborting the sweep.
        try:
            ic = spearmanr(y_va_values, y_hat, nan_policy="omit")[0]
        except Exception:
            ic = np.nan
        ic_list.append(ic)
        mse_list.append(mean_squared_error(y_va_values, y_hat))
        r2_list.append(r2_score(y_va_values, y_hat))

    # Sharpe is computed once over all validation samples of all folds.
    pnl_concat = pd.concat(all_fold_pnl).sort_index()
    sr_step = sharpe_from_pnl(pnl_concat)

    summary_row = {
        "K": K,
        "Sharpe_step_concat": float(sr_step),
        "IC_mean": float(np.nanmean(ic_list)),
        "IC_std": float(np.nanstd(ic_list)),
        "MSE_mean": float(np.mean(mse_list)),
        "R2_mean": float(np.mean(r2_list)),
        "pnl_len": int(pnl_concat.size),
    }
    rows.append(summary_row)

# Best K first: rank by the pooled per-step Sharpe.
cv_table = pd.DataFrame(rows).sort_values("Sharpe_step_concat", ascending=False)
cv_table.to_csv("feature_count_selection_cv_results.csv", index=False)
print("✅ Saved: feature_count_selection_cv_results.csv")
print(cv_table)

best_row = cv_table.iloc[0]
best_K = int(best_row["K"])
print(f"\n✅ Best K by CV per-step Sharpe: K={best_K} (Sharpe={best_row['Sharpe_step_concat']:.6f}, "
      f"IC_mean={best_row['IC_mean']:.6f})")

# ==============================
# 2) (Optional) Final test on X_test with best K
# ==============================
# Runs only when both test objects exist in the notebook namespace.
if {'X_test_lasso', 'y_test'} <= set(globals()):
    feats = feature_order[:best_K]
    Xtr, Xte = X_val_lasso[feats], X_test_lasso[feats]
    ytr, yte = y_val, y_test

    # Fit on the full validation split, predict the held-out test split.
    pipe = build_lgbm_pipe(BEST_PARAMS_LGBM)
    pipe.fit(Xtr, ytr)
    yhat_te = pipe.predict(Xte)
    yte_values = yte.values

    # sample-level Sharpe
    pnl_te = fold_pnl_sign_no_agg(Xte.index, yte_values, yhat_te)
    sharpe_te = sharpe_from_pnl(pnl_te)

    # Best-effort IC, mirroring the CV loop.
    try:
        ic_te = spearmanr(yte_values, yhat_te, nan_policy="omit")[0]
    except Exception:
        ic_te = np.nan
    mse_te = mean_squared_error(yte_values, yhat_te)
    r2_te = r2_score(yte_values, yhat_te)

    print("\n=== [OOS • Test Split with Best-K] ===")
    print(f"K = {best_K}")
    print(f"Sharpe (per-step, sign): {sharpe_te:.6f}")
    print(f"IC (Spearman)         : {ic_te:.6f}")
    print(f"MSE                    : {mse_te:.8f}")
    print(f"R²                     : {r2_te:.6f}")

✅ Saved: feature_count_selection_cv_results.csv
    K  Sharpe_step_concat   IC_mean    IC_std  MSE_mean   R2_mean  pnl_len
7  75            0.005904  0.009297  0.011091  0.000008 -0.145659   196900
6  60            0.005056  0.007018  0.009052  0.000008 -0.136600   196900
3  30            0.003016  0.007525  0.008921  0.000008 -0.153262   196900
5  50            0.002524  0.004087  0.010204  0.000008 -0.140766   196900
0  10            0.002444 -0.001622  0.010282  0.000008 -0.168871   196900
4  40            0.001823  0.006682  0.011544  0.000008 -0.142505   196900
2  20            0.000886 -0.001647  0.008310  0.000008 -0.155124   196900
1  15            0.000271  0.000648  0.009061  0.000008 -0.161526   196900

✅ Best K by CV per-step Sharpe: K=75 (Sharpe=0.005904, IC_mean=0.009297)

=== [OOS • Test Split with Best-K] ===
K = 75
Sharpe (per-step, sign): 0.005611
IC (Spearman)         : 0.017103
MSE                    : 0.00000587
R²                     : -0.382233


In [None]:
# --- Feature Filtering ---
# Deliberately NOT named `df`: the minute-level dataset bound to `df` is read
# again by the backtest cells further down (df.set_index('datetime'), ...),
# so rebinding it here would break a top-to-bottom run of the notebook.
feat_stats = pd.read_csv("feature_ic_gain_merged.csv").copy()

# --- Cleaning & Derivation ---
# Guarantee that every column used below exists, even if the CSV lacks it.
for col in ["mean_ic", "std_ic", "sign_consistency", "gain"]:
    if col not in feat_stats.columns:
        feat_stats[col] = np.nan

# Normalize feature gain to [0, 1] by the largest gain (guarding against all-NaN / zero).
max_gain = feat_stats["gain"].max() if np.isfinite(feat_stats["gain"]).any() else 1.0
feat_stats["gain_norm"] = feat_stats["gain"].fillna(0.0) / (max_gain if max_gain != 0 else 1.0)

# Handle missing values (avoid all-NaN crashes): medians where available, neutral constants otherwise.
feat_stats["mean_ic"] = feat_stats["mean_ic"].fillna(
    feat_stats["mean_ic"].median() if np.isfinite(feat_stats["mean_ic"]).any() else 0.0
)
feat_stats["std_ic"] = feat_stats["std_ic"].fillna(
    feat_stats["std_ic"].median() if np.isfinite(feat_stats["std_ic"]).any() else 0.5
)
feat_stats["sign_consistency"] = feat_stats["sign_consistency"].fillna(0.5)

# Compute overall stability score (with adjustable weights)
w_sign, w_gain, w_icstd = 0.5, 0.3, 0.2
feat_stats["stability_score"] = (
    w_sign * feat_stats["sign_consistency"] +
    w_gain * feat_stats["gain_norm"] +
    w_icstd * (1 - feat_stats["std_ic"])
)

# ===== Strategy A: Adaptive threshold (quantile-based) =====
# Goal: consistent direction, meaningful contribution, and acceptable volatility
q_sign = feat_stats["sign_consistency"].quantile(0.60)
q_gain = feat_stats["gain_norm"].quantile(0.40)
q_icstd = feat_stats["std_ic"].quantile(0.80)

SIGN_THRESHOLD = max(0.60, q_sign)        # - At least 60% directional consistency or above 60th percentile of all samples
GAIN_MIN      = max(0.02, q_gain)         # - At least top 60% in normalized gain (gain_norm)
IC_MEAN_MIN   = max(-0.002, feat_stats["mean_ic"].quantile(0.40))  # - Allow slightly near-zero mean IC
IC_STD_MAX    = min(0.50, q_icstd)        # - Relaxed volatility constraint (upper bound)

selA = feat_stats.query(
    "sign_consistency >= @SIGN_THRESHOLD and "
    "gain_norm >= @GAIN_MIN and "
    "mean_ic >= @IC_MEAN_MIN and "
    "std_ic <= @IC_STD_MAX"
).sort_values(["stability_score", "gain_norm"], ascending=False)

# ===== Strategy B: If no features pass → use Top-K by stability score =====
TOP_K = 30
if selA.empty:
    selB = feat_stats.sort_values(["stability_score", "gain_norm"], ascending=False).head(TOP_K)
else:
    selB = selA

# ===== Pick the final set and report the strategy that actually produced it =====
# The label is decided by where the rows came from, not by how many there are:
# previously the stability-score fallback was reported as "A" whenever it
# returned no more than TOP_K rows.
if not selA.empty:
    used_strategy = "A: adaptive_thresholds"
    final = selA.head(TOP_K)   # already ordered by stability score
elif not selB.empty:
    used_strategy = "B: stability_score_topK"
    final = selB
else:
    # ===== Strategy C: still empty (extreme case) → fallback to Top-K by gain =====
    selC = feat_stats.sort_values("gain", ascending=False).head(TOP_K)
    used_strategy = "C: gain_topK"
    final = selC

final = final[["feature", "mean_ic", "std_ic", "sign_consistency", "gain", "gain_norm", "stability_score"]]
final.to_csv("selected_features_adaptive.csv", index=False)

print(f"✅ Strategy used: {used_strategy}")
print(f"✅ Saved: selected_features_adaptive.csv (n={len(final)})")
print(final.head(10))

✅ Strategy used: A: adaptive_thresholds
✅ Saved: selected_features_adaptive.csv (n=5)
                    feature   mean_ic    std_ic  sign_consistency      gain  \
0                    vol_60  0.008047  0.022259          0.604651  0.547870   
8   fz_spill_AMAT_lret_lag3  0.006110  0.022549          0.627907  0.363969   
15            f_volume_norm  0.007787  0.019776          0.604651  0.295219   
43                fz_vol_60  0.005210  0.021358          0.651163  0.126349   
44          fz_vol_ratio_15  0.002069  0.018376          0.627907  0.114648   

    gain_norm  stability_score  
0    1.000000         0.797874  
8    0.664334         0.708744  
15   0.538848         0.660025  
43   0.230619         0.590496  
44   0.209261         0.573057  


In [197]:
# ===== Stable Core 5 Features (from adaptive selection) =====
LGBM_CORE_FEATURES = [
    "vol_60",
    "fz_spill_AMAT_lret_lag3",
    "f_volume_norm",
    "fz_vol_60",
    "fz_vol_ratio_15"
]

# 2) Construct X_core based on X_cleaned
# Partition the core list, in order, into columns that exist in X_cleaned and
# columns that do not.
present, missing = [], []
for feature_name in LGBM_CORE_FEATURES:
    bucket = present if feature_name in X_cleaned.columns else missing
    bucket.append(feature_name)

# Report anything that could not be found; such columns are left out of X_core.
if missing:
    print(f"{len(missing)} Core features are missing from X_cleaned (not included in X_core).")
    for feature_name in missing:
        print("   -", feature_name)

# Copy so later edits to X_core never touch X_cleaned.
X_core = X_cleaned.loc[:, present].copy()

print(f"X_core successfully generated: shape = {X_core.shape} (containing {len(present)} features).")

X_core successfully generated: shape = (324888, 5) (containing 5 features).


In [None]:
# Positional 2/3 - 1/3 split of the core-feature matrix, same ratio as before.
split_ratio = 1.0 / 1.5
split_index = int(len(X_core) * split_ratio)

# Leading rows -> validation/training part, trailing rows -> test part.
# y_val / y_test are rebound here from the full target y.
X_val_core, X_test_core = X_core.iloc[:split_index], X_core.iloc[split_index:]
y_val, y_test = y.iloc[:split_index], y.iloc[split_index:]

print("--- Data Splitting done ---")
print(f" Validation set (X_val, y_val): {X_val_core.shape}, {y_val.shape}")
print(f" Testing set (X_test, y_test):   {X_test_core.shape}, {y_test.shape}")

--- Data Splitting done ---
 Validation set (X_val, y_val): (216592, 5), (216592,)
 Testing set (X_test, y_test):   (108296, 5), (108296,)


In [184]:
# ===== 0) Configuration =====
# Length of the rolling training window, in minutes.
ROLLING_WINDOW_MINUTES = 30

# Best parameters from CV
# NOTE(review): per the LightGBM docs, row subsampling is only active when
# subsample_freq (bagging_freq) is > 0; it is not set here, so subsample=0.9
# may have no effect — confirm against the main notebook.
BEST_PARAMS_LGBM = dict(
    num_leaves=63,
    min_child_samples=50,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
)

def build_lgbm_pipe(best_params=BEST_PARAMS_LGBM):
    """Return an unfitted Pipeline: median imputation followed by an LGBMRegressor.

    ``best_params`` is unpacked into the LGBMRegressor constructor; it defaults
    to the BEST_PARAMS_LGBM dict as it was bound when this function was defined.
    """
    regressor = LGBMRegressor(
        objective="regression",
        n_estimators=1000,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1,
        verbosity=-1,
        force_row_wise=True,
        **best_params,
    )
    steps = [
        ("imp", SimpleImputer(strategy="median")),
        ("reg", regressor),
    ]
    return Pipeline(steps)

def sharpe_per_step(series: pd.Series) -> float:
    """Per-step Sharpe ratio: mean of ``series`` over its sample std (ddof=1).

    Returns 0.0 when the standard deviation is zero or undefined (NaN).
    No annualisation is applied.
    """
    sigma = series.std(ddof=1)
    if np.isnan(sigma) or sigma == 0:
        return 0.0
    return series.mean() / sigma

In [181]:
# ===== 1) Prepare backtest data =====
# Time-indexed view of the full dataset; `df` must be the minute-level frame
# that carries 'datetime', 'symbol' and 'y_target'.
df_backtest = df.set_index('datetime')

# Distinct timestamps covered by the test rows, in ascending order.
test_original_indices = X_test_core.index
test_datetimes = np.sort(df.loc[test_original_indices, 'datetime'].unique())

# Use only those model features that exist as columns of the backtest frame.
feature_cols = X_val_core.columns.tolist()
available_cols = [name for name in feature_cols if name in df_backtest.columns]
if len(available_cols) < len(feature_cols):
    miss = sorted(set(feature_cols).difference(available_cols))
    print(f"⚠️ Missing {len(miss)} Lasso features: {miss[:10]}{' ...' if len(miss)>10 else ''}")

# Time-indexed features, target and symbol labels over the whole history.
X_core_full_history = df_backtest[available_cols]
y_full_history = df_backtest['y_target']
meta_full_history = df_backtest[['symbol']]

print("--- Starting minute-level rolling backtest (LGBM + Lasso) ---")
print(f"Testing period: {test_datetimes.min()} → {test_datetimes.max()}")
print(f"Rolling window (training): {ROLLING_WINDOW_MINUTES} minutes")
print(f"LightGBM best parameters: {BEST_PARAMS_LGBM}")

--- Starting minute-level rolling backtest (LGBM + Lasso) ---
Testing period: 2025-04-29T15:59:00.000000000 → 2025-10-28T15:58:00.000000000
Rolling window (training): 30 minutes
LightGBM best parameters: {'num_leaves': 63, 'min_child_samples': 50, 'subsample': 0.9, 'colsample_bytree': 0.9, 'reg_lambda': 1.0, 'reg_alpha': 0.0}


In [185]:
# ===== 2) Rolling window: [t-30m, t-1m] → predict t =====
results = []

lookback = pd.Timedelta(minutes=ROLLING_WINDOW_MINUTES)
one_minute = pd.Timedelta(minutes=1)
min_train_rows = 15 * 5  # windows with fewer rows than this are skipped

for current_dt in tqdm(test_datetimes):
    train_start_dt = current_dt - lookback
    train_end_dt = current_dt - one_minute

    # Training window: every row whose timestamp lies in [start, end] (label slicing is inclusive).
    X_tr = X_core_full_history.loc[train_start_dt:train_end_dt]
    y_tr = y_full_history.loc[train_start_dt:train_end_dt]

    if len(X_tr) < min_train_rows:
        continue

    # Rows to predict: everything stamped with the current minute.
    X_te = X_core_full_history.loc[current_dt]
    if isinstance(X_te, pd.Series):
        # .loc returned a single row as a Series; restore a one-row frame.
        X_te = X_te.to_frame().T

    # A fresh model is fitted for every timestamp.
    pipe = build_lgbm_pipe()
    pipe.fit(X_tr, y_tr)
    y_hat = pipe.predict(X_te)

    meta_rows = meta_full_history.loc[current_dt]
    y_true = y_full_history.loc[current_dt]

    if isinstance(meta_rows, pd.Series):
        # Single-row case: symbol and target come back as scalars.
        results.append({
            "datetime": current_dt,
            "symbol": meta_rows['symbol'],
            "predicted_log_return": float(y_hat[0]),
            "actual_log_return": float(y_true),
        })
    else:
        # Several rows share this minute; predictions follow the same row order.
        for symbol, prediction, actual in zip(meta_rows['symbol'], y_hat, y_true):
            results.append({
                "datetime": current_dt,
                "symbol": symbol,
                "predicted_log_return": float(prediction),
                "actual_log_return": float(actual),
            })

100%|██████████| 21660/21660 [1:41:59<00:00,  3.54it/s]   


In [186]:
# ===== 3) Post-processing: derived columns =====
results_df_core = (
    pd.DataFrame(results)
    .sort_values(["datetime", "symbol"])
    .reset_index(drop=True)
)

# (A) Long-only relative weights (sum=1 per minute)
# Negative predictions are clipped to zero; each remaining prediction is divided
# by the minute's total. Minutes with no positive prediction get weight 0.
positive_prediction = (
    results_df_core["predicted_log_return"].clip(lower=0).rename("positive_prediction")
)
minute_sum = positive_prediction.groupby(results_df_core["datetime"]).transform("sum")
results_df_core["weight_relative"] = np.where(
    minute_sum == 0, 0.0, positive_prediction / minute_sum
)

# (B) Sign weights (-1/0/+1)
results_df_core["weight_sign"] = np.sign(results_df_core["predicted_log_return"])

# (C) Per-sample PnL (sign-based, same as CV)
results_df_core["pnl_sample_sign"] = (
    results_df_core["weight_sign"] * results_df_core["actual_log_return"]
)

# (D) Long-only PnL (relative weights)
results_df_core["pnl_sample_relative"] = (
    results_df_core["weight_relative"] * results_df_core["actual_log_return"]
)

# (E) Aggregate by minute (optional): mean across symbols for the sign strategy,
# weighted sum for the long-only strategy.
by_minute = results_df_core.groupby("datetime")
pnl_minute_sign = by_minute["pnl_sample_sign"].mean()
pnl_minute_relative = by_minute["pnl_sample_relative"].sum()

print("\n--- Backtesting completed ---")
print(f"Total backtest rows: {len(results_df_core)}")
print(results_df_core.head())


--- Backtesting completed ---
Total backtest rows: 99749
             datetime symbol  predicted_log_return  actual_log_return  \
0 2025-04-29 15:59:00   AMAT             -0.000565          -0.014007   
1 2025-04-29 15:59:00    AMD             -0.000243          -0.006581   
2 2025-04-29 15:59:00   AVGO             -0.000243          -0.008008   
3 2025-04-29 15:59:00     MU             -0.000243          -0.019444   
4 2025-04-29 15:59:00   NVDA             -0.000243          -0.025525   

   weight_relative  weight_sign  pnl_sample_sign  pnl_sample_relative  
0              0.0         -1.0         0.014007                 -0.0  
1              0.0         -1.0         0.006581                 -0.0  
2              0.0         -1.0         0.008008                 -0.0  
3              0.0         -1.0         0.019444                 -0.0  
4              0.0         -1.0         0.025525                 -0.0  


In [187]:
# ===== 4) OOS evaluation metrics =====
y_true_all = results_df_core["actual_log_return"].values
y_pred_all = results_df_core["predicted_log_return"].values

# Pooled over every (minute, symbol) row of the backtest.
mse = mean_squared_error(y_true_all, y_pred_all)
r2 = r2_score(y_true_all, y_pred_all)
ic = spearmanr(y_true_all, y_pred_all)[0]

# Per-step Sharpe at three levels: per sample, per minute (sign), per minute (long-only).
sr_step_sign_samples = sharpe_per_step(results_df_core["pnl_sample_sign"])
sr_step_sign_minutes = sharpe_per_step(pnl_minute_sign)
sr_step_relative_minutes = sharpe_per_step(pnl_minute_relative)

print("\n=== [OOS • Test Split] Performance ===")
print(f"MSE                         : {mse:.8f}")
print(f"R²                          : {r2:.6f}")
print(f"IC (Spearman ρ)             : {ic:.6f}")
print(f"Sharpe (per-step, sign) — sample-level : {sr_step_sign_samples:.6f}  (same CV style)")
print(f"Sharpe (per-step, sign) — minute-level : {sr_step_sign_minutes:.6f}  (portfolio level)")
print(f"Sharpe (per-step, long-only relative)  : {sr_step_relative_minutes:.6f}  (portfolio level)")


=== [OOS • Test Split] Performance ===
MSE                         : 0.00000463
R²                          : -0.012640
IC (Spearman ρ)             : -0.010608
Sharpe (per-step, sign) — sample-level : -0.002436  (same CV style)
Sharpe (per-step, sign) — minute-level : -0.004017  (portfolio level)
Sharpe (per-step, long-only relative)  : 0.017159  (portfolio level)


### Conclusion (Rolling Spearman ρ Stability Filtering)

## Summary & Decision

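- **Performance:** The out-of-sample IC (-0.0106 vs. 0.0037) and the long-only minute-level Sharpe (0.0172 vs. 0.0207) are lower than those of the main Lasso-selected feature model; the sign-based minute-level Sharpe is close to zero for both models (-0.0040 vs. -0.0088).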
- **Risk:** Drawdown levels show no meaningful improvement under comparable turnover conditions.
- **Robustness:** After accounting for transaction costs, any marginal advantage disappears or turns negative.
- **Interpretability:** Although the 5-feature set is simpler and more compact, its calibration is weaker and less monotonic.

**Decision:**  
This 5-feature variant is **not selected** for final portfolio construction after the rolling Spearman ρ stability screening.  
It is retained here solely for transparency, reproducibility, and future reference.