# Load Data

In [None]:
import pandas as pd
import platform
import numpy as np
from itertools import product
import matplotlib.pyplot as plt
from datetime import timedelta
from collections import defaultdict
import joblib
import json
import warnings
import time
from scipy.stats.mstats import winsorize
import numba
import pandas_ta as pta

import sys
import os

# Add one directory up to sys.path
sys.path.append(os.path.abspath(".."))

# Internal Libraries
from data_loader import load_and_resample_data, apply_feature_engineering
from backtest import evaluate_regression, evaluate_static_tp_two_contracts
from labeling_utils import label_and_save
from helpers import check_overfit, generate_oof_predictions, is_same_session
#

# Tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Dropout, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import StandardScaler, RobustScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import Huber, MeanSquaredError
import tensorflow as tf
#

# Scikit-learn
from sklearn.base import clone, BaseEstimator, RegressorMixin, ClassifierMixin
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_val_predict
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, StackingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier, ElasticNet, Ridge, ElasticNetCV, RidgeCV
from sklearn.metrics import classification_report, root_mean_squared_error, mean_squared_error, mean_absolute_error, r2_score, \
    confusion_matrix, precision_recall_curve, roc_curve, auc, accuracy_score, classification_report, f1_score, precision_score, ConfusionMatrixDisplay
from sklearn.preprocessing import label_binarize, StandardScaler, LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

# Models and Training
from catboost import CatBoostRegressor, CatBoostClassifier
import lightgbm as lgb
import xgboost as xgb
import optuna
import seaborn as sns
import shap
from sklearn.svm import SVC
#

# Silence every UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
# NOTE(review): this message-specific filter is already covered by the blanket
# UserWarning filter above; it only matters if that one is ever removed.
warnings.filterwarnings("ignore", message=".*There are no meaningful features.*", category=UserWarning)
# Have Optuna log at INFO verbosity.
optuna.logging.set_verbosity(optuna.logging.INFO)

# Market symbol handed to load_and_resample_data() further down.
market = "NQ"

# Initialize features or indicators

In [None]:
# This is your Cell 5: Feature Engineering Function Definitions

# --- Helper Functions for Custom Features (Some will be kept, some modified/called differently) ---
# These helpers now expect columns to be named as pandas_ta typically names them (e.g., RSI_14, EMA_10, MACD_12_26_9, etc.)
def add_market_regime_features(df):
    """Flag trend / chop / volatility regimes from pre-computed indicators.

    Reads the columns ``CHOP_14_1_100``, ``ADX_14`` and ``ATR_14`` and writes,
    in place on ``df``:

    * ``Is_Trending``  - 1 where ADX > 20
    * ``Is_Choppy``    - 1 where CHOP > 60
    * ``Is_High_Vol``  - 1 where ATR is above 1.1x its 100-bar rolling mean
    * ``Is_Low_Vol``   - 1 where ATR is at or below that threshold
    * ``Market_Regime``- string "<Is_Trending>_<Is_High_Vol>"

    While the rolling mean is still NaN (fewer than 20 bars) both volatility
    flags are 0, because comparisons against NaN are False.

    Returns:
        The same ``df`` object, with the new columns added.

    Raises:
        ValueError: if any of the three indicator columns is missing.
    """
    chop_col, adx_col, atr_col = 'CHOP_14_1_100', 'ADX_14', 'ATR_14'

    absent = [name for name in (chop_col, adx_col, atr_col) if name not in df.columns]
    if absent:
        raise ValueError(f"Required columns not found for regime detection: {chop_col}, {adx_col}, {atr_col}")

    atr = df[atr_col]
    # Volatility threshold: 10% above the rolling average ATR.
    vol_cutoff = atr.rolling(100, min_periods=20).mean() * 1.1

    flags = (
        ('Is_Trending', df[adx_col] > 20),
        ('Is_Choppy', df[chop_col] > 60),
        ('Is_High_Vol', atr > vol_cutoff),
        ('Is_Low_Vol', atr <= vol_cutoff),
    )
    for flag_name, mask in flags:
        df[flag_name] = mask.astype(int)

    trend_label = df['Is_Trending'].astype(str)
    vol_label = df['Is_High_Vol'].astype(str)
    df['Market_Regime'] = trend_label + "_" + vol_label

    return df

def add_price_vs_ma(df, price_col='close', ma_col_name='EMA_20', new_col_name_suffix='_vs_EMA20'):
    """Add the ratio price / moving-average as ``<price_col><new_col_name_suffix>``.

    Both inputs are coerced to numeric first (unparseable values become NaN).
    If either column is absent the frame is returned untouched, silently.
    ``df`` is modified in place and also returned.
    """
    if ma_col_name not in df.columns or price_col not in df.columns:
        return df

    price = pd.to_numeric(df[price_col], errors='coerce')
    average = pd.to_numeric(df[ma_col_name], errors='coerce')
    df[price_col + new_col_name_suffix] = price / average
    return df

def add_ma_vs_ma(df, ma1_col_name='EMA_10', ma2_col_name='EMA_20', new_col_name_suffix='_vs_EMA20'):
    """Add the ratio of two moving averages as ``<ma1_col_name><new_col_name_suffix>``.

    Inputs are coerced to numeric; when either column is missing nothing is
    added. ``df`` is modified in place and returned.
    """
    have_both = ma1_col_name in df.columns and ma2_col_name in df.columns
    if not have_both:
        return df

    numerator = pd.to_numeric(df[ma1_col_name], errors='coerce')
    denominator = pd.to_numeric(df[ma2_col_name], errors='coerce')
    df[ma1_col_name + new_col_name_suffix] = numerator / denominator
    return df

def add_ma_slope(df, ma_col_name='EMA_10', new_col_name_suffix='_Slope_10', periods=1):
    """Add the per-bar slope of a moving average.

    The slope is the ``periods``-bar difference divided by ``periods`` and is
    stored as ``<ma_col_name><new_col_name_suffix>``. A missing source column
    is skipped silently. ``df`` is modified in place and returned.
    """
    if ma_col_name not in df.columns:
        return df

    average = pd.to_numeric(df[ma_col_name], errors='coerce')
    change = average.diff(periods)
    df[f'{ma_col_name}{new_col_name_suffix}'] = change / periods
    return df

def add_rsi_signals(df, rsi_col_name='RSI_14', ob_level=70, os_level=30):
    """Add 0/1 overbought and oversold flags for an RSI column.

    Creates ``<rsi_col_name>_Is_Overbought_<ob_level>`` (RSI strictly above
    ``ob_level``) and ``<rsi_col_name>_Is_Oversold_<os_level>`` (RSI strictly
    below ``os_level``). NaN RSI values yield 0 for both flags. Nothing is
    added when the RSI column is missing. ``df`` is modified in place.
    """
    if rsi_col_name not in df.columns:
        return df

    rsi = pd.to_numeric(df[rsi_col_name], errors='coerce')
    signals = (
        (rsi_col_name + f'_Is_Overbought_{ob_level}', rsi > ob_level),
        (rsi_col_name + f'_Is_Oversold_{os_level}', rsi < os_level),
    )
    for column, mask in signals:
        df[column] = mask.astype(int)
    return df

def add_stoch_signals(df, stoch_k_col_name='STOCHk_14_3_3', ob_level=80, os_level=20):
    """Add 0/1 overbought and oversold flags for a stochastic %K column.

    The default column name follows the pandas_ta naming for %K. Flags use
    strict comparisons, so a value exactly on a level is not flagged, and NaN
    inputs produce 0. A missing %K column leaves the frame unchanged.
    ``df`` is modified in place and returned.
    """
    if stoch_k_col_name not in df.columns:
        return df

    percent_k = pd.to_numeric(df[stoch_k_col_name], errors='coerce')
    overbought_name = stoch_k_col_name + f'_Is_Overbought_{ob_level}'
    oversold_name = stoch_k_col_name + f'_Is_Oversold_{os_level}'
    df[overbought_name] = percent_k.gt(ob_level).astype(int)
    df[oversold_name] = percent_k.lt(os_level).astype(int)
    return df

def add_macd_cross_signal(df, macd_col_name='MACD_12_26_9', signal_col_name='MACDs_12_26_9'):
    """Add ``<macd_col_name>_Cross_Signal``: +1 / -1 / 0 for MACD vs signal crosses.

    +1 marks a bar where MACD is above its signal line after being strictly
    below it on the previous bar; -1 marks the opposite; every other bar
    (including the first, which has no predecessor) is 0. Default column
    names follow the pandas_ta naming. If either column is missing the frame
    is returned unchanged. ``df`` is modified in place.
    """
    if macd_col_name not in df.columns or signal_col_name not in df.columns:
        return df

    macd_now = pd.to_numeric(df[macd_col_name], errors='coerce')
    signal_now = pd.to_numeric(df[signal_col_name], errors='coerce')
    macd_before = macd_now.shift(1)
    signal_before = signal_now.shift(1)

    bullish = (macd_now > signal_now) & (macd_before < signal_before)
    bearish = (macd_now < signal_now) & (macd_before > signal_before)

    # The two masks are mutually exclusive, so selection order is irrelevant.
    df[macd_col_name + '_Cross_Signal'] = np.select(
        [bullish.to_numpy(), bearish.to_numpy()], [1, -1], default=0
    )
    return df

def add_price_vs_bb(df, price_col='close', bb_upper_col='BBU_20_2.0', bb_lower_col='BBL_20_2.0'):
    """Add 0/1 flags for price closing outside the Bollinger Bands.

    ``<price_col>_vs_BB_Upper`` is 1 where price is strictly above the upper
    band and ``<price_col>_vs_BB_Lower`` is 1 where it is strictly below the
    lower band. Default band names follow the pandas_ta naming. The frame is
    returned unchanged when any of the three columns is missing; otherwise it
    is modified in place.
    """
    needed = (price_col, bb_upper_col, bb_lower_col)
    if any(name not in df.columns for name in needed):
        return df

    price, upper_band, lower_band = (
        pd.to_numeric(df[name], errors='coerce') for name in needed
    )
    df[price_col + '_vs_BB_Upper'] = price.gt(upper_band).astype(int)
    df[price_col + '_vs_BB_Lower'] = price.lt(lower_band).astype(int)
    return df

def add_daily_vwap(df, high_col='high', low_col='low', close_col='close', volume_col='volume', new_col_name='VWAP_D'):
    """Add a volume-weighted average price that resets every calendar date.

    The typical price (high + low + close) / 3 is weighted by volume and
    accumulated per ``df.index.date``; the running ratio of the two cumulative
    sums is written to ``new_col_name``. Infinite results are replaced by NaN.

    The frame must have a DatetimeIndex; otherwise an error is printed and the
    frame is returned unchanged. ``df`` is modified in place and returned.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        print("Error: DataFrame index must be DatetimeIndex for daily VWAP.")
        return df

    # Coerce into local series so the caller's price/volume columns stay as-is.
    high, low, close, volume = (
        pd.to_numeric(df[name], errors='coerce')
        for name in (high_col, low_col, close_col, volume_col)
    )

    calendar_day = df.index.date
    weighted_price = ((high + low + close) / 3) * volume
    running_weighted = weighted_price.groupby(calendar_day).cumsum()
    running_volume = volume.groupby(calendar_day).cumsum()

    vwap = running_weighted / running_volume
    df[new_col_name] = vwap.replace([np.inf, -np.inf], np.nan)
    return df

def add_candle_features(df):
    """Add candle-geometry features computed from open/high/low/close.

    Columns written in place on ``df``:

    * ``Candle_Range``      - high minus low
    * ``Candle_Body``       - absolute close minus open
    * ``Upper_Wick`` / ``Upper_Wick_Length`` - high minus the top of the body
      (same quantity; the two differ only in how a NaN open/close is treated)
    * ``Lower_Wick``        - bottom of the body minus low
    * ``Body_vs_Range``     - body / range, 0 when the range is 0 or undefined
    * ``Is_Bearish_Wick``   - 1 for a down candle whose upper wick exceeds
      twice the body while the lower wick is under 20% of the body

    OHLC values are coerced to numeric on a private copy, so the source
    columns keep their original dtype.
    """
    ohlc = df[['open', 'high', 'low', 'close']].copy()
    for name in ohlc.columns:
        ohlc[name] = pd.to_numeric(ohlc[name], errors='coerce')

    open_, high, low, close = ohlc['open'], ohlc['high'], ohlc['low'], ohlc['close']
    body_ends = ohlc[['open', 'close']]

    df['Candle_Range'] = high - low
    df['Candle_Body'] = (close - open_).abs()
    df['Upper_Wick'] = high - np.maximum(open_, close)
    df['Upper_Wick_Length'] = high - body_ends.max(axis=1)
    df['Lower_Wick'] = np.minimum(open_, close) - low

    # Zero-range bars would divide by zero: mask them, then report 0.
    safe_range = df['Candle_Range'].replace(0, np.nan)
    body_share = df['Candle_Body'] / safe_range
    df['Body_vs_Range'] = body_share.replace([np.inf, -np.inf], np.nan).fillna(0)

    is_down_bar = close < open_
    long_upper_wick = df['Upper_Wick_Length'] > 2 * df['Candle_Body']
    short_lower_wick = (body_ends.min(axis=1) - low) < 0.2 * df['Candle_Body']
    df['Is_Bearish_Wick'] = (is_down_bar & long_upper_wick & short_lower_wick).astype(int)
    return df

def add_return_features(df, price_col='close'):
    """Add 1-, 3- and 6-bar log returns plus the 1-bar simple return.

    Prices are coerced to numeric and zeros are treated as missing so that no
    ratio divides by zero. Output columns are ``Log_Return_1``,
    ``Log_Return_3``, ``Log_Return_6`` and ``Simple_Return_1``; any infinite
    value is replaced by NaN. ``df`` is modified in place and returned.
    """
    prices = pd.to_numeric(df[price_col], errors='coerce').replace(0, np.nan)
    infinities = [np.inf, -np.inf]

    for lag in (1, 3, 6):
        log_return = np.log(prices / prices.shift(lag))
        df[f'Log_Return_{lag}'] = log_return.replace(infinities, np.nan)

    df['Simple_Return_1'] = prices.pct_change(1).replace(infinities, np.nan)
    return df

def add_rolling_stats(df, price_col='close', window1=14, window2=30):
    """Add rolling dispersion statistics of 1-bar simple returns.

    Writes ``Rolling_Std_Dev_<window1>`` (standard deviation over ``window1``
    bars) and ``Rolling_Skew_<window2>`` / ``Rolling_Kurtosis_<window2>``
    (skewness and kurtosis over ``window2`` bars). Returns are computed from
    the numerically coerced price column with infinities mapped to NaN.
    ``df`` is modified in place and returned.
    """
    prices = pd.to_numeric(df[price_col], errors='coerce')
    bar_returns = prices.pct_change(1).replace([np.inf, -np.inf], np.nan)

    short_window = bar_returns.rolling(window=window1)
    long_window = bar_returns.rolling(window=window2)

    df[f'Rolling_Std_Dev_{window1}'] = short_window.std()
    df[f'Rolling_Skew_{window2}'] = long_window.skew()
    df[f'Rolling_Kurtosis_{window2}'] = long_window.kurt()
    return df

def add_lagged_features(df, cols_to_lag, lags=(1, 3, 6)):
    """Add shifted copies of selected columns as ``<col>_Lag_<n>``.

    Args:
        df: Frame to extend; modified in place.
        cols_to_lag: Iterable of column names. Names that are not present in
            ``df`` are skipped silently.
        lags: Iterable of positive bar offsets. The default is an immutable
            tuple so the shared default object can never be mutated between
            calls; any list or tuple supplied by the caller works as before.

    Returns:
        The same ``df`` object with one new column per (column, lag) pair.
        Source values are coerced to numeric before shifting, so unparseable
        entries show up as NaN in the lagged columns.
    """
    for col_orig in cols_to_lag:
        if col_orig not in df.columns:
            continue
        series_to_lag = pd.to_numeric(df[col_orig], errors='coerce')
        for lag in lags:
            df[f'{col_orig}_Lag_{lag}'] = series_to_lag.shift(lag)
    return df

def add_trend_features(df_input, suffix=''):
    """
    Basic trend / volatility-regime / momentum features for one timeframe.

    NOTE: a second function with this same name is defined further down in
    this cell; once the cell has run, that later definition is the one bound
    to ``add_trend_features`` and this version is no longer reachable.

    Works on a copy of ``df_input``. Requires ``EMA_20``, ``EMA_50``,
    ``VWAP_D``, ``ATR_14`` and ``RSI_14`` (each with ``suffix`` appended) plus
    an unsuffixed ``close`` column. If any is missing a warning is printed and
    the copy is returned without new columns.

    Adds (all suffixed):
      * ``Trend_Direction``  - +1 when EMA_20 > EMA_50, else -1
      * ``Price_vs_VWAP``    - percent distance of close from VWAP_D
      * ``Vol_Regime``       - percent gap between the 24-bar and 120-bar mean ATR
      * ``RSI_Trend_Align``  - 1 when RSI > 50 and EMA_20 > EMA_50
    """
    df = df_input.copy()

    ema_fast_col = f'EMA_20{suffix}'
    ema_slow_col = f'EMA_50{suffix}'
    vwap_col = f'VWAP_D{suffix}'
    atr_col = f'ATR_14{suffix}'
    rsi_col = f'RSI_14{suffix}'

    needed = (ema_fast_col, ema_slow_col, vwap_col, atr_col, rsi_col, 'close')
    if any(name not in df.columns for name in needed):
        print(f"Warning: Missing some required columns for trend features with suffix {suffix}")
        return df

    fast_above_slow = df[ema_fast_col] > df[ema_slow_col]
    atr = df[atr_col]

    # Direction of the EMA stack.
    df[f'Trend_Direction{suffix}'] = np.where(fast_above_slow, 1, -1)

    # Close relative to the daily VWAP, expressed in percent.
    df[f'Price_vs_VWAP{suffix}'] = (df['close'] / df[vwap_col] - 1) * 100

    # Short-horizon ATR relative to long-horizon ATR, in percent.
    short_vol = atr.rolling(24).mean()
    long_vol = atr.rolling(120).mean()
    df[f'Vol_Regime{suffix}'] = (short_vol / long_vol - 1) * 100

    # Momentum agreeing with the EMA trend.
    rsi_bullish = df[rsi_col] > 50
    df[f'RSI_Trend_Align{suffix}'] = (rsi_bullish & fast_above_slow).astype(int)

    return df

def session_key(ts: pd.Timestamp) -> pd.Timestamp:
    """Return the session identifier (a midnight timestamp) for ``ts``.

    The timestamp is moved back 18 hours and truncated to midnight, so bars
    from 18:00 onward share a key with the following day's bars up to 17:59.
    """
    session_offset = timedelta(hours=18)
    shifted = ts - session_offset
    return shifted.normalize()

def add_volume_features(df):
    """Add session VWAP, point-of-control, volume-delta and bar-shape features.

    Expects ``open``, ``high``, ``low``, ``close``, ``volume``, ``session_id``
    and ``ATR_14`` columns. ``df`` is modified in place and returned.

    Notes:
      * The helper columns ``TPV``, ``cum_tpv``, ``cum_vol`` and
        ``session_prev`` are left in the frame.
      * ``POC_Current`` is the close price with the largest summed volume over
        the *whole* session and is mapped to every bar of that session, so
        early bars carry information from later bars of the same session.
      * Percent-of-range features are 0 on bars whose high equals low.
    """
    volume = df['volume']

    # --- Session VWAP -----------------------------------------------------
    df['TPV'] = ((df['high'] + df['low'] + df['close']) / 3) * volume
    running = df.groupby('session_id')[['TPV', 'volume']].cumsum()
    df['cum_tpv'] = running['TPV']
    df['cum_vol'] = running['volume']
    df['VWAP_Session'] = df['cum_tpv'] / df['cum_vol']

    # --- Point of control (approximated with bar closes) ------------------
    volume_by_price = (
        df.assign(price=df['close'])
        .groupby(['session_id', 'price'])['volume']
        .sum()
        .reset_index()
    )
    poc_map = (
        volume_by_price
        .sort_values(['session_id', 'volume'], ascending=[True, False])
        .drop_duplicates('session_id')
        .set_index('session_id')['price']
    )
    df['POC_Current'] = df['session_id'].map(poc_map)

    # Each session points at the one immediately before it in sorted order.
    ordered_sessions = sorted(poc_map.index)
    previous_of = dict(zip(ordered_sessions[1:], ordered_sessions))
    df['session_prev'] = df['session_id'].map(previous_of)
    df['POC_Previous'] = df['session_prev'].map(poc_map)

    df['VWAP_Session_Dist'] = (df['close'] / df["VWAP_Session"] - 1) * 100

    # --- Volume deltas ----------------------------------------------------
    df['Vol_Delta_1'] = volume.diff()
    for span in (2, 3):
        df[f'Vol_Delta_{span}'] = df['volume'].diff(span)
    for span in (2, 3):
        df[f'Vol_Delta_rollsum_{span}'] = df['Vol_Delta_1'].rolling(span).sum()
    for span in (2, 3):
        df[f'Vol_Delta_rollavg_{span}'] = df['Vol_Delta_1'].rolling(span).mean()

    # --- Volume z-scores and spike flags ----------------------------------
    for window in (20, 10, 5):
        rolling_volume = volume.rolling(window)
        zscore = (volume - rolling_volume.mean()) / rolling_volume.std()
        df[f'Vol_zscore_{window}'] = zscore
        df[f'High_Vol_Event_{window}'] = (df[f'Vol_zscore_{window}'] > 2).astype(int)

    # --- Distance from the points of control, in price points -------------
    df['POC_Dist_Current_Points'] = df['close'] - df['POC_Current']
    df['POC_Dist_Previous_Points'] = df['close'] - df['POC_Previous']

    # --- Bar shape as a fraction of the bar range -------------------------
    bar_range = (df['high'] - df['low']).replace(0, np.nan)  # avoid /0
    df['Candle_Body_%'] = ((df['close'] - df['open']).abs() / bar_range).fillna(0)
    df['Close_Pos_%'] = ((df['close'] - df['low']) / bar_range).fillna(0)
    df['Rel_Vol_20'] = df['volume'] / df['volume'].rolling(20).mean()

    # Volume signed by bar direction (flat bars count as up), summed over 3 bars.
    signed_volume = volume.where(df['close'] >= df['open'], -volume)
    df['CVD_3'] = signed_volume.rolling(3).sum()

    body_top = np.maximum(df['open'], df['close'])
    body_bottom = np.minimum(df['open'], df['close'])
    df['Upper_Wick_%'] = ((df['high'] - body_top) / bar_range).fillna(0)
    df['Lower_Wick_%'] = ((body_bottom - df['low']) / bar_range).fillna(0)

    # --- Distance from the midpoint of the previous 20-bar swing ----------
    lookback = 20
    prior_high = df['high'].rolling(lookback).max().shift(1)
    prior_low = df['low'].rolling(lookback).min().shift(1)
    prior_mid = 0.5 * (prior_high + prior_low)
    df['Prev_Swing_Dist'] = (df['close'] - prior_mid) / df['ATR_14']

    return df
    
def add_donchian_dist(df, length=20):
    """Add the ATR-normalised distance of close from the Donchian midline.

    The Donchian channel is the rolling ``length``-bar highest high and lowest
    low; the feature is ``(close - channel midpoint) / ATR_14``.

    Args:
        df: Frame with ``high``, ``low``, ``close`` and ``ATR_14`` columns;
            modified in place.
        length: Channel look-back in bars.

    Returns:
        The same ``df`` with a ``Donchian_Dist_<length>`` column. With the
        default ``length=20`` this is ``Donchian_Dist_20``; previously the
        column was always called ``Donchian_Dist_20`` regardless of
        ``length``, which mislabelled any non-default channel.
    """
    channel_high = df['high'].rolling(length).max()
    channel_low = df['low'].rolling(length).min()
    channel_mid = (channel_high + channel_low) / 2
    df[f'Donchian_Dist_{length}'] = (df['close'] - channel_mid) / df['ATR_14']
    return df

# --- Main Feature Generation Function using pandas_ta ---
def add_all_features(df_input, suffix=''):
    """
    Build the full indicator and derived-feature set for one timeframe.

    Works on a copy of ``df_input``. The OHLCV columns are coerced to numeric
    and rows where any of them is NaN afterwards are dropped. Indicators are
    then appended through the pandas_ta accessor, followed by the custom
    helpers defined in this cell (VWAP, candle geometry, returns, rolling
    stats, lags, market regime, session volume features, Donchian distance).

    Args:
        df_input: Frame with 'open', 'high', 'low', 'close', 'volume' columns.
            A DatetimeIndex is expected: without one only a warning is printed
            here, the daily VWAP is skipped, and the session-id mapping (which
            subtracts a timedelta from each index value) needs datetime-like
            index values.
        suffix: Text appended to every column except the five OHLCV columns
            (this includes helper columns such as ``session_id`` and ``TPV``).

    Returns:
        A new DataFrame holding the OHLCV columns plus all suffixed features.

    Raises:
        ValueError: if any OHLCV column is missing, or (from
            ``add_market_regime_features``) if the CHOP/ADX/ATR columns were
            not produced.
    """
    if not isinstance(df_input.index, pd.DatetimeIndex):
        print(f"Warning: DataFrame for suffix '{suffix}' does not have a DatetimeIndex.")

    df = df_input.copy()  # Work on a copy

    # Ensure base OHLCV columns are numeric and present
    base_cols = ['open', 'high', 'low', 'close', 'volume']
    if not all(col in df.columns for col in base_cols):
        raise ValueError(f"DataFrame must contain {base_cols}. Found: {df.columns.tolist()}")
    for col in base_cols:  # Ensure correct dtypes for pandas_ta
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df.dropna(subset=base_cols, inplace=True)  # Drop rows if OHLCV became NaN

    if df.empty:
        # NOTE(review): despite the wording of this message, processing is not
        # aborted here; the empty frame continues through the pipeline below.
        print(f"DataFrame became empty after coercing OHLCV for suffix '{suffix}'. Returning empty DataFrame.")

    print(f"DataFrame shape for pandas_ta (suffix: {suffix}): {df.shape}")

    # I. Technical Indicators using pandas_ta
    # `append=True` adds the generated columns to df; column names noted in the
    # comments below are the ones the helpers in this cell look for.

    # Volume
    df.ta.sma(close=df['volume'], length=20, append=True, col_names='Volume_SMA_20')
    df = add_daily_vwap(df, new_col_name='VWAP_D')  # Custom daily VWAP, named VWAP_D
    df = add_price_vs_ma(df, ma_col_name='VWAP_D', new_col_name_suffix='_vs_VWAP_D')
    # A price-vs-EMA_21 call used to sit here, before EMA_21 was computed, so
    # the helper silently skipped it. The ratio is produced in the Trend
    # section below (close_vs_EMA21), so the dead call has been removed.

    # Volatility
    df.ta.bbands(length=20, std=2, append=True)  # BBL_20_2.0, BBM_20_2.0, BBU_20_2.0, BBB_20_2.0, BBP_20_2.0
    # Helpers need these names: BBU_20_2.0, BBL_20_2.0
    df = add_price_vs_bb(df, bb_upper_col='BBU_20_2.0', bb_lower_col='BBL_20_2.0')
    df.ta.atr(length=14, append=True, col_names='ATR_14')  # Force the name ATR_14
    df.ta.atr(length=7, append=True, col_names='ATR_7')

    # Trend
    df.ta.sma(length=10, append=True)  # SMA_10
    df.ta.sma(length=20, append=True)  # SMA_20 (also BBM_20_2.0 from bbands)
    df.ta.sma(length=50, append=True)  # SMA_50
    df.ta.ema(length=9, append=True)   # EMA_9
    df.ta.ema(length=10, append=True)  # EMA_10
    df.ta.ema(length=20, append=True)  # EMA_20
    df.ta.ema(length=21, append=True)  # EMA_21
    df.ta.ema(length=50, append=True)  # EMA_50

    df = add_price_vs_ma(df, ma_col_name='EMA_9', new_col_name_suffix='_vs_EMA9')
    df = add_price_vs_ma(df, ma_col_name='EMA_20', new_col_name_suffix='_vs_EMA20')
    df = add_price_vs_ma(df, ma_col_name='EMA_21', new_col_name_suffix='_vs_EMA21')
    df = add_ma_vs_ma(df, ma1_col_name='EMA_10', ma2_col_name='EMA_20', new_col_name_suffix='_vs_EMA20')
    df = add_ma_vs_ma(df, ma1_col_name='EMA_9', ma2_col_name='EMA_21', new_col_name_suffix='_vs_EMA21')
    df = add_ma_slope(df, ma_col_name='EMA_9', new_col_name_suffix='_Slope_9')
    df = add_ma_slope(df, ma_col_name='EMA_10', new_col_name_suffix='_Slope_10')
    df = add_ma_slope(df, ma_col_name='EMA_21', new_col_name_suffix='_Slope_21')
    df = add_ma_slope(df, ma_col_name='EMA_21', new_col_name_suffix='_Slope_21_3', periods=3)
    df = add_ma_slope(df, ma_col_name='EMA_21', new_col_name_suffix='_Slope_21_2', periods=2)

    df.ta.macd(fast=12, slow=26, signal=9, append=True)  # MACD_12_26_9, MACDh_12_26_9, MACDs_12_26_9
    df = add_macd_cross_signal(df, macd_col_name='MACD_12_26_9', signal_col_name='MACDs_12_26_9')

    df.ta.adx(length=14, append=True)  # ADX_14, DMP_14, DMN_14
    # Rename the directional-movement columns to the Plus/Minus DI convention.
    # rename() ignores keys that are not present, so no existence check is needed.
    df.rename(columns={'DMP_14': 'Plus_DI_14', 'DMN_14': 'Minus_DI_14'}, inplace=True)

    df.ta.cci(length=20, append=True, col_names='CCI_20')  # pandas_ta uses CCI_20_0.015 by default

    # Momentum
    df.ta.rsi(length=14, append=True)  # RSI_14
    df.ta.rsi(length=7, append=True)   # RSI_7
    df = add_rsi_signals(df, rsi_col_name='RSI_7')
    df = add_rsi_signals(df, rsi_col_name='RSI_14')
    df.ta.chop(length=14, append=True)  # read back below as CHOP_14_1_100
    df.ta.chop(length=7, append=True)   # read back below as CHOP_7_1_100

    chop_threshold = 61.8

    df['Is_Choppy_14'] = (df['CHOP_14_1_100'] > chop_threshold).astype(int)
    df['Is_Choppy_7'] = (df['CHOP_7_1_100'] > chop_threshold).astype(int)

    df.ta.willr(length=14, append=True)  # WILLR_14

    df.ta.stoch(k=14, d=3, smooth_k=3, append=True)  # STOCHk_14_3_3, STOCHd_14_3_3
    df = add_stoch_signals(df, stoch_k_col_name='STOCHk_14_3_3')

    df.ta.ppo(fast=12, slow=26, signal=9, append=True)  # PPO_12_26_9, PPOh_12_26_9, PPOs_12_26_9
    df.ta.roc(length=10, append=True)  # ROC_10

    # Belt and braces after pandas_ta: strip infinities from the base PPO line
    # and every ROC column and force them numeric. One pass is enough (the
    # operation is idempotent; it used to be run twice by an unused outer loop).
    ppo_cols = [c for c in df.columns if "PPO_" in c and "PPOh" not in c and "PPOs" not in c]
    roc_cols = [c for c in df.columns if "ROC_" in c]
    for actual_col_name in ppo_cols + roc_cols:
        df[actual_col_name] = df[actual_col_name].replace([np.inf, -np.inf], np.nan)
        df[actual_col_name] = pd.to_numeric(df[actual_col_name], errors='coerce')

    # II. Price Action & Basic Features
    df = add_candle_features(df)

    # --- pandas_ta Candlestick Patterns ---
    # Be selective: name="all" would add many columns.
    candle_patterns_to_check = ["doji", "hammer", "engulfing"]
    df.ta.cdl_pattern(name=candle_patterns_to_check, append=True)
    # Rename to the Is_* convention when columns with these names exist; the
    # _pta tag distinguishes them from hand-rolled pattern flags. Engulfing is
    # a signed (+/-) signal.
    df.rename(
        columns={
            'CDLDOJI': 'Is_Doji_pta',
            'CDLHAMMER': 'Is_Hammer_pta',
            'CDLENGULFING': 'Is_Engulfing_pta',
        },
        inplace=True,
    )

    df = add_return_features(df)

    # III. Statistical Features
    df = add_rolling_stats(df)

    # Lagged Features: only lag columns that actually exist at this point.
    cols_to_lag_pta = ['close', 'RSI_14', 'Candle_Body', 'Volume_SMA_20']
    valid_cols_to_lag = [col for col in cols_to_lag_pta if col in df.columns]
    df = add_lagged_features(df, valid_cols_to_lag, lags=[1, 2, 3])
    df = add_market_regime_features(df)
    df["session_id"] = df.index.map(session_key)
    df = add_volume_features(df)
    df = add_donchian_dist(df)

    # --- Suffixing ---
    # Everything that is not one of the original OHLCV columns gets the suffix.
    generated_feature_cols = [col for col in df.columns if col not in base_cols]
    rename_dict = {col: col + suffix for col in generated_feature_cols}
    df.rename(columns=rename_dict, inplace=True)

    return df

def add_trend_features(df_input, suffix=''):
    """
    Extended trend feature set for one timeframe, with suffix-aware naming.

    Works on a copy of ``df_input``. Needs ``EMA_20``, ``EMA_50``, ``VWAP_D``,
    ``ATR_14``, ``RSI_14``, ``MACD_12_26_9`` and ``BBM_20_2.0`` (each with
    ``suffix`` appended) plus unsuffixed ``close`` and ``volume``. When any is
    missing a warning is printed and the copy is returned without new columns.

    Adds, all carrying ``suffix``: Trend_Direction, Trend_Strength,
    Price_vs_EMA20, Price_vs_VWAP, Vol_Regime, Volume_Trend, Trend_Alignment,
    Mean_Reversion, Trend_Acceleration and Trend_Score (clipped to +/-100).
    """
    df = df_input.copy()

    ema20_col = f'EMA_20{suffix}'
    ema50_col = f'EMA_50{suffix}'
    vwap_col = f'VWAP_D{suffix}'
    atr_col = f'ATR_14{suffix}'
    rsi_col = f'RSI_14{suffix}'
    macd_col = f'MACD_12_26_9{suffix}'
    bb_mid_col = f'BBM_20_2.0{suffix}'  # Bollinger middle band

    needed = [
        ema20_col, ema50_col, vwap_col, atr_col, rsi_col, macd_col, bb_mid_col,
        'close', 'volume',  # base columns never carry the suffix
    ]
    if any(name not in df.columns for name in needed):
        print(f"Warning: Missing some required columns for trend features with suffix {suffix}")
        return df

    close = df['close']
    volume = df['volume']
    ema20 = df[ema20_col]
    ema50 = df[ema50_col]
    vwap = df[vwap_col]
    atr = df[atr_col]
    rsi = df[rsi_col]
    macd = df[macd_col]
    bb_mid = df[bb_mid_col]

    direction_name = f'Trend_Direction{suffix}'
    vs_ema_name = f'Price_vs_EMA20{suffix}'
    volume_trend_name = f'Volume_Trend{suffix}'
    alignment_name = f'Trend_Alignment{suffix}'

    ema_uptrend = ema20 > ema50

    # 1. Direction of the EMA stack (+1 / -1).
    df[direction_name] = np.where(ema_uptrend, 1, -1)

    # 2. EMA spread in ATR units, smoothed over 20 bars.
    df[f'Trend_Strength{suffix}'] = ((ema20 - ema50) / atr).rolling(20).mean()

    # 3. Close versus EMA_20, in percent.
    df[vs_ema_name] = (close - ema20) / ema20 * 100

    # 4. Close versus daily VWAP, in percent.
    df[f'Price_vs_VWAP{suffix}'] = (close - vwap) / vwap * 100

    # 5. Short versus long average ATR, in percent; 0 until both windows fill.
    short_vol = atr.rolling(24).mean()
    long_vol = atr.rolling(120).mean()
    df[f'Vol_Regime{suffix}'] = ((short_vol / long_vol - 1) * 100).fillna(0)

    # 6. Relative volume signed by trend direction.
    average_volume = volume.rolling(20).mean()
    df[volume_trend_name] = (volume / average_volume - 1) * df[direction_name]

    # 7. All four bullish conditions true at once.
    all_bullish = ema_uptrend & (close > vwap) & (rsi > 50) & (macd > 0)
    df[alignment_name] = all_bullish.astype(int)

    # 8. Distance from the Bollinger mid band in ATR units, 10-bar average.
    df[f'Mean_Reversion{suffix}'] = ((close - bb_mid) / atr).rolling(10).mean()

    # 9. Change in the EMA spread per bar, in ATR units.
    df[f'Trend_Acceleration{suffix}'] = (ema20.diff() - ema50.diff()) / atr

    # 10. Composite score; the terms are added in a fixed order and clipped.
    score = (
        (df[direction_name] * 20)
        + df[vs_ema_name].clip(-20, 20)
        + (rsi - 50)
        + (np.sign(macd) * 10)
        + df[volume_trend_name].clip(-20, 20)
        + (df[alignment_name] * 20)
    )
    df[f'Trend_Score{suffix}'] = score.clip(-100, 100)

    return df

# --- Time & Session Features (Keep as is) ---
def add_time_session_features(df):
    """Append calendar, cyclical-time and session-window columns to ``df``.

    The frame must be indexed by a ``pd.DatetimeIndex``. When it is not, an
    error line is printed and the very same object is handed back unchanged.
    Otherwise a copy is returned and the caller's frame is left untouched.

    Columns written (in this order): ``Hour_of_Day``, ``Minute_of_Hour``,
    ``Day_of_Week``, ``Time_Sin``/``Time_Cos`` (24-hour period),
    ``Day_Sin``/``Day_Cos`` (7-day period), followed by six 0/1 session
    flags. All hour/minute comparisons use the index values as they are, so
    the windows are expressed in whatever timezone the index carries.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        print("Error: DataFrame index must be DatetimeIndex for time/session features.")
        return df

    out = df.copy()
    stamps = out.index
    out['Hour_of_Day'] = stamps.hour
    out['Minute_of_Hour'] = stamps.minute
    out['Day_of_Week'] = stamps.dayofweek

    hour = out['Hour_of_Day']
    minute = out['Minute_of_Hour']

    # Cyclical encodings: map time-of-day and weekday onto the unit circle so
    # that 23:59 sits next to 00:00 and Sunday next to Monday.
    clock_angle = 2 * np.pi * (hour + minute / 60.0) / 24.0
    weekday_angle = 2 * np.pi * out['Day_of_Week'] / 7.0
    out['Time_Sin'] = np.sin(clock_angle)
    out['Time_Cos'] = np.cos(clock_angle)
    out['Day_Sin'] = np.sin(weekday_angle)
    out['Day_Cos'] = np.cos(weekday_angle)

    # Session windows as boolean masks; insertion order fixes column order.
    session_masks = {
        # 20:00-04:59 (wraps around midnight)
        'Is_Asian_Session': hour.ge(20) | hour.lt(5),
        # 03:00-11:59
        'Is_London_Session': hour.ge(3) & hour.lt(12),
        # 08:00-16:59
        'Is_NY_Session': hour.ge(8) & hour.lt(17),
        # 08:00-11:59, where the London and NY windows intersect
        'Is_Overlap': hour.ge(8) & hour.lt(12),
        # 09:30-10:29
        'Is_US_Open_Hour': (hour.eq(9) & minute.ge(30)) | (hour.eq(10) & minute.lt(30)),
        # 15:00-15:59 plus the single 16:00 timestamp
        'Is_US_Close_Hour': hour.eq(15) | (hour.eq(16) & minute.eq(0)),
    }
    for column, mask in session_masks.items():
        out[column] = mask.astype(int)

    return out

In [None]:
# Optional trade-avoidance filters, keyed by a label. Every entry is
# currently commented out, so the mapping is empty.
avoid_funcs = {
    #'avoid_hour_18_19': avoid_hour_18_19
    #'news_window': avoid_news,
}

# Candidate values per strategy parameter; the full Cartesian product is
# expanded below into one dict per combination.
param_grid_strategy = {
    'SL_ATR_MULT': [1.0, 1.5, 2.0],        # author's note: wider stops
    'TP_ATR_MULT': [3.0, 4.0, 5.0, 8.0],   # author's note: more conservative targets
    'TRAIL_START_MULT': [1.0, 1.5],        # author's note: let winners run
    'TRAIL_STOP_MULT': [0.8, 1.0],         # author's note: tighter trailing stops
    'TICK_VALUE': [20],
}

# Parameter names and their candidate lists as parallel tuples (dict order).
keys = tuple(param_grid_strategy)
values = tuple(param_grid_strategy.values())

# One {parameter: value} dict for every point of the grid.
combinations = [dict(zip(keys, combo)) for combo in product(*values)]

# Cleanup

In [None]:
# Timeframes used throughout this cell. A fresh list is passed to each helper
# so that neither call can observe mutations made by the other.
tf_names = ('5min', '15min', '1h')

# Load the raw bars for `market` and obtain one resampled frame per timeframe.
df_1min, resampled = load_and_resample_data(market, timeframes=list(tf_names))

df_5min = resampled['5min']
df_15min = resampled['15min']
df_1hr = resampled['1h']

print("5min shape:", df_5min.shape)
print("15min shape:", df_15min.shape)
print("1h shape:", df_1hr.shape)


# Run the notebook's feature builders on every timeframe and merge the
# results, with the 5-minute frame passed as the base timeframe.
df_merged = apply_feature_engineering(
    resampled=resampled,
    add_all_features=add_all_features,
    add_time_session_features=add_time_session_features,
    add_trend_features=add_trend_features,
    timeframes=list(tf_names),
    base_tf='5min'
)

# Sanity report before any model fitting: overall size and NaN coverage.
print(f"Original merged dataset shape: {df_merged.shape}")

# A row counts as incomplete when at least one of its columns is NaN.
total_rows = len(df_merged)
nan_rows_count = df_merged.isna().any(axis=1).sum()
nan_rows_pct = (nan_rows_count / total_rows) * 100
print(f"Rows with at least one NaN: {nan_rows_count} out of {total_rows} ({nan_rows_pct:.2f}%)")


# Verify Data
# def check_feature_alignment(df_merged, tf_suffix, feature_keyword, sample_times):
#     """
#     Check if features from the correct timeframe are available and what values they have at key timestamps.
#     """
#     suffix_str = f"_{tf_suffix}"
#     matching_cols = [col for col in df_merged.columns if feature_keyword in col and col.endswith(suffix_str)]
    
#     print(f"\n🔍 Checking features containing '{feature_keyword}' with suffix '{suffix_str}'")

#     if not matching_cols:
#         print(f"⚠️ No matching columns found. Available columns ending with {suffix_str}:")
#         sample_suffix_cols = [col for col in df_merged.columns if col.endswith(suffix_str)]
#         print(sample_suffix_cols[:10])  # Show only first 10 to keep it clean
#         return

#     for time in sample_times:
#         if time not in df_merged.index:
#             print(f"❌ Time {time} not found in df_merged index.")
#         else:
#             print(f"\n⏰ At time {time} — values:")
#             print(df_merged.loc[time, matching_cols])

# sample_times = [
#     pd.Timestamp("2022-01-05 12:05:00-05:00"),
#     pd.Timestamp("2022-01-05 12:15:00-05:00"),
#     pd.Timestamp("2022-01-05 12:59:00-05:00"),
#     pd.Timestamp("2022-01-05 13:00:00-05:00")
# ]

# check_feature_alignment(df_merged, tf_suffix="1h", feature_keyword="RSI", sample_times=sample_times)
# check_feature_alignment(df_merged, tf_suffix="15min", feature_keyword="MACD", sample_times=sample_times)
# check_feature_alignment(df_merged, tf_suffix="5min", feature_keyword="EMA", sample_times=sample_times)


In [None]:
# Look-ahead horizons to label; one label_and_save call is made per entry.
lookahead_options = [12, 6, 3]

# Labeling settings that are identical for every horizon.
label_settings = dict(
    vol_col_name='ATR_14_5min',
    pt_multiplier=2.0,
    sl_multiplier=1.0,
    min_return_percentage=0.0005,
)

for lookahead in lookahead_options:
    # The output suffix combines the horizon with the market identifier.
    # `labeled` keeps only the result of the last horizon processed.
    labeled = label_and_save(
        df_input_features=df_merged,
        lookahead_period=lookahead,
        output_file_suffix=f'{lookahead}{market}',
        feature_columns_for_dropna=[],  # fresh empty list on every call
        **label_settings,
    )

# Train

##### Real Training

In [None]:
# Feature columns fed to the classification model.
#
# Candidates from the earlier feature set that are currently switched off
# (add the quoted column name to the list to re-enable one):
#   VWAP_Session_{15min,1h}
#   Vol_Delta_1_{5min,15min,1h}
#   Vol_Delta_rollavg_3_5min
#   Vol_Delta_3_15min, Vol_Delta_rollsum_3_15min, Vol_Delta_rollavg_3_15min
#   Vol_Delta_2_1h, Vol_Delta_rollsum_2_1h, Vol_Delta_rollavg_2_1h
#   High_Vol_Event_{20,10,5}_{5min,15min,1h}
#   Candle_Body_{5min,15min,1h}, Candle_Body_%_{5min,15min,1h}
#   Is_Choppy_14_15min, Is_Choppy_7_1h
#   RSI_14_5min, RSI_14_15min, RSI_7_1h
#   ATR_14_15min, ATR_7_1h
selected_indicators_class = [
    # --- Core trend / momentum ---
    # Fast and slow EMAs plus their slopes, to capture directional bias.
    'EMA_9_5min',
    'EMA_21_5min',
    'EMA_9_15min',
    'EMA_21_15min',
    'EMA_9_Slope_9_5min',
    'EMA_21_Slope_21_5min',
    'close_vs_EMA9_5min',        # close / EMA_9: above 1 leans long, below 1 short
    'EMA_9_vs_EMA21_5min',       # EMA_9 / EMA_21
    'MACDh_12_26_9_5min',        # MACD histogram as a momentum-direction proxy
    'MACDh_12_26_9_15min',

    # --- Volatility / breakout ---
    'Donchian_Dist_20_5min',     # (close - Donchian mid) / ATR
    'BBB_20_2.0_5min',           # band width: squeeze -> expansion -> direction
    'ATR_14_5min',               # the only ATR kept (14 on the base timeframe)

    # --- Order flow & volume ---
    'Rel_Vol_20_5min',           # volume / SMA(volume, 20)
    'CVD_3_5min',                # short cumulative volume delta
    'Vol_Delta_3_5min',          # retained from v1
    'Vol_Delta_rollsum_3_5min',  # retained from v1
    # Most 15 min / 1 h deltas were dropped: little incremental lift.

    # --- Price-action ratios ---
    'Upper_Wick_%_5min',
    'Lower_Wick_%_5min',
    'CDL_ENGULFING_5min',        # 1/0 flag
    'Prev_Swing_Dist_5min',      # distance to previous swing high/low, in ATRs

    # --- Market regime / filters ---
    'Is_Choppy_14_5min',         # single choppiness flag
    'ADX_14_5min',               # trend strength ...
    'Plus_DI_14_5min',           # ... and side
    'Minus_DI_14_5min',
    'Time_Sin',                  # intraday periodicity
    'Time_Cos',

    # --- Session VWAP ---
    'VWAP_Session_5min',         # 5 min only; taken to subsume higher timeframes
    'close_vs_VWAP_D_5min',      # close / VWAP as a directional bias
]

# Feature columns fed to the regression model. Most groups list the same
# indicator once per timeframe (5 min / 15 min / 1 h).
selected_indicators_reg = [
    # Session VWAP
    'VWAP_Session_5min',
    'VWAP_Session_15min',
    'VWAP_Session_1h',

    # Point of control, current and previous
    'POC_Current_5min',
    'POC_Previous_5min',
    'POC_Current_15min',
    'POC_Previous_15min',
    'POC_Current_1h',
    'POC_Previous_1h',

    # Single-bar volume delta
    'Vol_Delta_1_5min',
    'Vol_Delta_1_15min',
    'Vol_Delta_1_1h',

    # Multi-bar volume delta with rolling sum / average
    'Vol_Delta_3_5min',
    'Vol_Delta_rollsum_3_5min',
    'Vol_Delta_rollavg_3_5min',
    'Vol_Delta_3_15min',
    'Vol_Delta_rollsum_3_15min',
    'Vol_Delta_rollavg_3_15min',
    'Vol_Delta_2_1h',
    'Vol_Delta_rollsum_2_1h',
    'Vol_Delta_rollavg_2_1h',

    # Volume z-score
    'Vol_zscore_20_5min',
    'Vol_zscore_10_15min',
    'Vol_zscore_5_1h',

    # EMA-21 slope
    'EMA_21_Slope_21_3_5min',
    'EMA_21_Slope_21_3_15min',
    'EMA_21_Slope_21_2_1h',

    # Close relative to EMA-21
    'close_vs_EMA21_5min',
    'close_vs_EMA21_15min',
    'close_vs_EMA21_1h',

    # ATR
    'ATR_14_5min',
    'ATR_14_15min',
    'ATR_7_1h',

    # Choppiness index.
    # The original list named 'CHOP_14_1_100_5min' twice and then
    # 'CHOP_7_1_100_5min'; a repeated name yields duplicate columns when the
    # list is used as a DataFrame selector. The entries below follow the
    # 14 / 14 / 7 length-per-timeframe pattern of the ATR group above.
    # NOTE(review): confirm these two columns exist in the merged frame.
    'CHOP_14_1_100_5min',
    'CHOP_14_1_100_15min',
    'CHOP_7_1_100_1h',

    # Clock / session flags
    'Hour_of_Day',
    'Is_Asian_Session',
    'Is_London_Session',
    'Is_NY_Session',
    'Is_Overlap',
    'Is_US_Open_Hour',
    'Is_US_Close_Hour',
]

In [None]:
class CNN1DWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn style regressor backed by a small Keras 1D-CNN.

    Every sample's feature vector is presented to the network as a sequence
    of length ``input_shape`` with one channel. The network is two ``Conv1D``
    layers (each followed by dropout), global average pooling, a 32-unit
    dense layer and a single linear output, compiled with Adam and MSE loss.

    Parameters
    ----------
    input_shape : int
        Number of features per sample (the Conv1D sequence length).
    filters : int, default=64
        Filters in the first convolution; the second uses twice as many.
    kernel_size : int, default=3
        Kernel width of both convolutions.
    dropout : float, default=0.2
        Dropout rate applied after each convolution.
    learning_rate : float, default=0.001
        Adam learning rate.
    """

    def __init__(self, input_shape, filters=64, kernel_size=3, dropout=0.2, learning_rate=0.001):
        self.input_shape = input_shape
        self.filters = filters
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.learning_rate = learning_rate
        # Holds the fitted Keras model; stays None until fit() is called.
        self.model = None

    @staticmethod
    def _with_channel_axis(X):
        """Return ``X`` as a NumPy array of shape ``(n_samples, n_features, 1)``.

        Going through ``np.asarray`` lets DataFrames and nested lists in;
        those types have no ``reshape`` method of their own.
        """
        arr = np.asarray(X)
        return arr.reshape((arr.shape[0], arr.shape[1], 1))

    def build_model(self):
        """Create and compile a fresh, untrained Keras model from the current parameters."""
        model = Sequential([
            Conv1D(self.filters, kernel_size=self.kernel_size, activation='relu', input_shape=(self.input_shape, 1)),
            Dropout(self.dropout),
            Conv1D(self.filters * 2, kernel_size=self.kernel_size, activation='relu'),
            Dropout(self.dropout),
            GlobalAveragePooling1D(),
            Dense(32, activation='relu'),
            Dense(1)
        ])
        model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss='mse')
        return model

    def fit(self, X, y, **kwargs):
        """Build a new network and train it on ``X`` / ``y``.

        Training runs for at most 50 epochs with batch size 32. Keras holds
        out 20% of the samples for validation (``validation_split=0.2``), and
        early stopping with patience 10 restores the best weights seen.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        **kwargs
            Accepted for call compatibility; not forwarded to Keras.

        Returns
        -------
        self
        """
        X_reshaped = self._with_channel_axis(X)
        targets = np.asarray(y)
        # A new model on every call, so refitting never continues old weights.
        self.model = self.build_model()
        self.model.fit(
            X_reshaped, targets,
            epochs=50,
            batch_size=32,
            validation_split=0.2,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
            ],
            verbose=0
        )
        return self

    def predict(self, X):
        """Return a flat array with one prediction per row of ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. The exception derives from both
            ``ValueError`` and ``AttributeError``, so handlers written for
            the former ``AttributeError`` keep working.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This CNN1DWrapper instance is not fitted yet; call fit() before predict()."
            )
        X_reshaped = self._with_channel_axis(X)
        return self.model.predict(X_reshaped).flatten()

In [None]:
def run_lookahead_for_session_regression():
    """Train and evaluate a stacked CatBoost + XGBoost regressor for ``market``.

    NOTE: the function is currently switched off by the bare ``return`` on
    its first line, so a call does nothing and yields ``None``. The rest of
    this docstring describes what runs once that ``return`` is removed.

    Steps:
      1. Read ``parquet/labeled_data_L12_PT2SL1VB12{market}.parquet`` and
         split it chronologically at 2025-05-01 (America/New_York).
      2. Tune both base models with Optuna; studies are stored in SQLite.
      3. Build out-of-fold predictions, standardise them and fit a
         ``RidgeCV`` meta-model on top.
      4. Refit the base models on the whole training window, predict the
         test window and print diagnostics.
      5. Write the tuned parameters to JSON and dump the feature list and
         the meta-model with joblib. Base models and scalers are not saved
         by this function.

    Relies on names defined elsewhere in the notebook: ``market``,
    ``selected_indicators``, ``optuna``, ``xgb``, ``CatBoostRegressor`` and
    ``generate_oof_predictions``.

    Returns
    -------
    dict or None
        ``None`` while the function is disabled; otherwise a dict with keys
        ``'preds_stack'``, ``'X_test'`` (with an added ``datetime`` column)
        and ``'true_values'``.

    Raises
    ------
    KeyError
        If the labeled data has no ``datetime`` column.
    ValueError
        If no column starting with ``reg_value`` is present.
    """
    return
    # === Load data ===
    path = f"parquet/labeled_data_L12_PT2SL1VB12{market}.parquet"
    labeled = pd.read_parquet(path)

    # === Ensure datetime column exists and is parsed ===
    if labeled.index.name == 'datetime' or pd.api.types.is_datetime64_any_dtype(labeled.index):
        labeled = labeled.reset_index()
    if 'datetime' not in labeled.columns:
        raise KeyError("❌ 'datetime' column is missing.")

    labeled['datetime'] = pd.to_datetime(labeled['datetime'])
    labeled = labeled.sort_values('datetime')

    # === Train/test split (chronological) ===
    cutoff_date = pd.Timestamp("2025-05-01", tz="America/New_York")
    train = labeled[labeled['datetime'] < cutoff_date]
    test = labeled[labeled['datetime'] >= cutoff_date]

    train = train.set_index('datetime')
    test = test.set_index('datetime')

    # === Feature selection ===
    # NOTE(review): `selected_indicators` is not defined in the cells shown
    # next to this function (only `selected_indicators_reg` / `_class` are);
    # confirm it is bound before re-enabling this function.
    X_train = train[selected_indicators]
    X_test = test[selected_indicators]

    # === Find regression target column ===
    reg_cols = [col for col in labeled.columns if col.startswith("reg_value")]
    if not reg_cols:
        raise ValueError("❌ No regression target column found starting with 'reg_value'.")
    reg_col = reg_cols[0]
    print(f"📌 Using regression target column: {reg_col}")

    y_train_seq = train[reg_col]
    y_test_seq = test[reg_col]

    print(f"Train range: {train.index.min()} to {train.index.max()} | Rows: {len(train)}")
    print(f"Test range: {test.index.min()} to {test.index.max()} | Rows: {len(test)}")

    ###########################
    ########## Models #########
    ###########################
    def _cv_net_direction_score(make_model, X, y, df, threshold_points):
        """Cross-validated tuning objective shared by both base models.

        For each of 4 ``TimeSeriesSplit`` folds a fresh model from
        ``make_model()`` is fitted, and the fold score is the number of
        correct minus wrong threshold-exceeding long/short calls on the
        validation slice. Returns the mean fold score as a float.
        """
        fold_scores = []
        tscv = TimeSeriesSplit(n_splits=4)

        for train_idx, val_idx in tscv.split(X):
            X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
            prices_val = df.iloc[val_idx]["open"].to_numpy()  # entry prices

            # Uniform sample weights for now; this is the hook for
            # up-weighting selected examples (e.g. negative targets).
            w_tr = np.ones(len(train_idx))

            model = make_model()
            model.fit(X_tr, y_tr, sample_weight=w_tr)

            preds = model.predict(X_val)
            actual = y_val.to_numpy()

            # Express the point threshold relative to each bar's open price.
            # NOTE(review): presumes the target is a fractional return.
            thr_pct = threshold_points / prices_val

            long_pred = preds >= thr_pct
            long_true = actual >= thr_pct
            short_pred = preds <= -thr_pct
            short_true = actual <= -thr_pct

            correct = np.sum(long_pred & long_true) + np.sum(short_pred & short_true)
            wrong = np.sum(long_pred & ~long_true) + np.sum(short_pred & ~short_true)

            fold_scores.append(correct - wrong)

        # Average net correctness across folds.
        return float(np.mean(fold_scores))

    def _optimize_and_pick(objective, study_name, db_path, n_trials, n_jobs, valid_keys):
        """Run (or resume) a maximising Optuna study and return the best
        trial's parameters restricted to ``valid_keys``."""
        study = optuna.create_study(
            direction="maximize",
            study_name=study_name,
            storage=f"sqlite:///{db_path}",
            load_if_exists=True
        )
        study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)

        best = study.best_trial.params
        return {k: v for k, v in best.items() if k in valid_keys}

    def tune_catboost_regressor(X, y, df, study_name="catboost-default", db_path="dbs/catboost.db", n_trials=1, threshold_points=10):
        """Tune CatBoost on the directional objective; return constructor kwargs."""
        def objective(trial):
            params = {
                "iterations"       : trial.suggest_int("iterations", 100, 5000, step=100),
                "depth"            : trial.suggest_int("depth", 4, 10),
                "learning_rate"    : trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "l2_leaf_reg"      : trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
                "random_strength"  : trial.suggest_float("random_strength", 0.5, 5.0),
                "min_data_in_leaf" : trial.suggest_int("min_data_in_leaf", 10, 100),
                # The model trains on RMSE; the study itself scores
                # directional net correctness, not RMSE.
                "loss_function"    : "RMSE",
                "verbose"          : 0,
                "thread_count"     : 3,
                "random_state"     : 42
            }
            return _cv_net_direction_score(
                lambda: CatBoostRegressor(**params), X, y, df, threshold_points
            )

        # Keep only CatBoost constructor arguments.
        valid_keys = {"iterations", "depth", "learning_rate", "l2_leaf_reg", "random_strength", "min_data_in_leaf", "loss_function"}
        return _optimize_and_pick(objective, study_name, db_path, n_trials, 4, valid_keys)

    def tune_xgbm_regressor(X, y, df, study_name="xgbm-default", db_path="dbs/xgbm.db", n_trials=1, threshold_points=10):
        """Tune XGBoost on the directional objective; return constructor kwargs."""
        def objective(trial):
            params = {
                # The model trains on squared error; the study itself scores
                # directional net correctness.
                "objective":        "reg:squarederror",
                "n_estimators":     trial.suggest_int("n_estimators",     100,  5000, step=50),
                "learning_rate":    trial.suggest_float("learning_rate",  0.01, 0.3,  log=True),
                "max_depth":        trial.suggest_int("max_depth",        3,    12),
                "min_child_weight": trial.suggest_int("min_child_weight", 1,    10),
                "subsample":        trial.suggest_float("subsample",       0.6,  1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6,  1.0),
                "gamma":            trial.suggest_float("gamma",           0.0,  0.1),
                "reg_alpha":        trial.suggest_float("reg_alpha",       0.0,  1.0),
                "reg_lambda":       trial.suggest_float("reg_lambda",      0.0,  1.0),
                "random_state":     42,
                "n_jobs":           3,
                "verbosity":        0
            }
            return _cv_net_direction_score(
                lambda: xgb.XGBRegressor(**params), X, y, df, threshold_points
            )

        # Keep the XGBoost constructor arguments that were tuned. The earlier
        # version filtered with CatBoost's key names here, which discarded
        # every tuned value except `learning_rate`.
        valid_keys = {
            "n_estimators", "learning_rate", "max_depth", "min_child_weight",
            "subsample", "colsample_bytree", "gamma", "reg_alpha", "reg_lambda",
        }
        return _optimize_and_pick(objective, study_name, db_path, n_trials, 3, valid_keys)

    ################################################
    ####### Tune models
    xgbm_params          = tune_xgbm_regressor(X_train, y_train_seq, train, study_name=f"xgbm-{market}")
    catboost_params      = tune_catboost_regressor(X_train, y_train_seq, train, study_name=f"catboost-{market}")

    ################################################
    ####### Instantiate base models with the tuned parameters
    catboost    = CatBoostRegressor(**catboost_params, random_state=42, verbose=0)
    xgbm        = xgb.XGBRegressor(**xgbm_params, random_state=42)

    ################################################
    ####### Base Stacks
    # (a) CatBoost out-of-fold predictions
    catboost_models = [catboost]
    catboost_oof = generate_oof_predictions(catboost_models, X_train, y_train_seq, splits=4)
    # NOTE(review): presumes the helper names its single output column
    # "model_0"; the rename aligns it with the column used at test time.
    catboost_oof = catboost_oof.rename(columns={"model_0": "catboost_oof"})

    scaler_cb = StandardScaler()
    catboost_oof_scaled = scaler_cb.fit_transform(catboost_oof)

    # (b) XGBoost out-of-fold predictions
    xgbm_models = [xgbm]
    xgbm_oof = generate_oof_predictions(xgbm_models, X_train, y_train_seq, splits=4)
    xgbm_oof = xgbm_oof.rename(columns={"model_0": "xgbm_oof"})

    scaler_xgb = StandardScaler()
    xgbm_oof_scaled = scaler_xgb.fit_transform(xgbm_oof)

    # === Build the meta-training matrix (one column per base model) ===
    X_meta_train = pd.DataFrame({
        "catboost_oof": catboost_oof_scaled.ravel(),   # flatten into 1D
        "xgbm_oof":     xgbm_oof_scaled.ravel()
    }, index=X_train.index)

    alphas     = np.logspace(-4, 2, 40)
    meta_model = RidgeCV(alphas=alphas)
    meta_model.fit(X_meta_train, y_train_seq)

    # === Retrain base models on the full training set ===
    for model in catboost_models + xgbm_models:
        model.fit(X_train, y_train_seq)

    # === Raw base-model predictions on the test window ===
    # Column names must match those the scalers were fitted on.
    cat_preds = pd.DataFrame({
        "catboost_oof": catboost_models[0].predict(X_test)
    }, index=X_test.index)

    xgb_preds = pd.DataFrame({
        "xgbm_oof": xgbm_models[0].predict(X_test)
    }, index=X_test.index)

    # === Scale the test-time columns with the scalers fitted on OOF data ===
    cat_preds_scaled = scaler_cb.transform(cat_preds)
    xgb_preds_scaled = scaler_xgb.transform(xgb_preds)

    # === Two-column meta feature matrix for the test window ===
    X_meta_test = pd.DataFrame({
        "catboost_oof": cat_preds_scaled.ravel(),
        "xgbm_oof":     xgb_preds_scaled.ravel()
    }, index=X_test.index)

    # === Final stacked prediction ===
    preds_stack = meta_model.predict(X_meta_test)

    corr = np.corrcoef(y_test_seq, preds_stack)[0, 1]
    print(f"Correlation with target: {corr:.4f}")

    ################################################
    ####### Evaluate Model
    def evaluate_model(name, model, Xtr, Xte, ytr, yte, transformed=False):
        """Print train/test MSE and an overfit ratio; return test predictions.

        With ``transformed=True`` the predictions and ``ytr`` are mapped back
        through a signed ``expm1``.
        NOTE(review): ``yte`` is not inverse-transformed in that branch;
        confirm that is intended before using ``transformed=True``.
        """
        train_preds = model.predict(Xtr)
        test_preds = model.predict(Xte)

        if transformed:
            # Inverse-transform predictions (and the training target).
            train_preds = np.sign(train_preds) * (np.expm1(np.abs(train_preds)))
            test_preds = np.sign(test_preds) * (np.expm1(np.abs(test_preds)))
            ytr = np.sign(ytr) * (np.expm1(np.abs(ytr)))

        train_mse = mean_squared_error(ytr, train_preds)
        test_mse = mean_squared_error(yte, test_preds)
        overfit_ratio = test_mse / train_mse if train_mse != 0 else float('inf')

        print(f"\n📊 {name} Performance:")
        print(f"Train MSE: {train_mse:.8f}")
        print(f"Test MSE: {test_mse:.8f}")
        print(f"Overfit ratio (Test / Train): {overfit_ratio:.2f}")
        if overfit_ratio > 1.5:
            print("⚠️ Potential overfitting detected.")
        elif overfit_ratio < 0.7:
            print("⚠️ Possibly underfitting.")
        else:
            print("✅ Generalization looks reasonable.")
        return test_preds

    ####### Tree Based #######
    print("\nEvaluation CatBoost")
    preds_catboost  = evaluate_model("CatBoostRegressorUp", catboost, X_train, X_test, y_train_seq, y_test_seq, transformed=False)

    # Label corrected: the model evaluated here is XGBoost, not LightGBM.
    print("\nEvaluation XGBoost")
    preds_xgbm  = evaluate_model("XGBM", xgbm, X_train, X_test, y_train_seq, y_test_seq, transformed=False)

    ################################################
    ####### Target Distribution
    print("\n🔍 Target distribution Seq:")
    print(y_train_seq.describe())

    ################################################
    ####### Diagnostics for every prediction vector
    for pred in [preds_stack, preds_xgbm, preds_catboost]:
        print("\n🔍 Checking prediction variance:")
        print(f"Min: {pred.min():.8f}")
        print(f"Max: {pred.max():.8f}")
        print(f"Mean: {pred.mean():.8f}")
        print(f"Std Dev: {pred.std():.8f}")
        print(f"First 5 Predictions: {pred[:5]}")

        mae = mean_absolute_error(y_test_seq, pred)
        rmse = np.sqrt(mean_squared_error(y_test_seq, pred))
        r2 = r2_score(y_test_seq, pred)

        print(f"MAE: {mae:.4f}")
        print(f"RMSE: {rmse:.4f}")
        print(f"R²: {r2:.4f}")

    # === Ensure datetime is preserved as a regular column ===
    X_test = X_test.copy()
    X_test["datetime"] = X_test.index

    # === Persist artefacts ===
    metadata = {
        "catboost_params": catboost_params,
        "xgbm_params": xgbm_params,
    }
    with open(f"regression_metadata_{market}.json", "w") as f:
        json.dump(metadata, f, indent=2)

    joblib.dump(list(X_train.columns), f"pkl/model_features_{market}-12combo.pkl")
    joblib.dump(meta_model, f"pkl/stack_model_regression_{market}-12combo.pkl")

    return {
        'preds_stack': preds_stack,
        'X_test': X_test,
        'true_values': y_test_seq
    }

In [None]:
def run_lookahead_for_session_classification(LOOKAHEAD):
    return
    # === Load data ===
    path = f"parquet/labeled_data_{LOOKAHEAD}{market}.parquet"
    labeled = pd.read_parquet(path)

    # === Ensure datetime column exists and is parsed ===
    if labeled.index.name == 'datetime' or pd.api.types.is_datetime64_any_dtype(labeled.index):
        labeled = labeled.reset_index()
    if 'datetime' not in labeled.columns:
        raise KeyError("❌ 'datetime' column is missing.")

    labeled['datetime'] = pd.to_datetime(labeled['datetime'])
    labeled = labeled.sort_values('datetime')

    # === Train/test split ===
    cutoff_date = pd.Timestamp("2025-05-01", tz="America/New_York")
    train = labeled[labeled['datetime'] < cutoff_date]
    test = labeled[labeled['datetime'] >= cutoff_date]

    train = train.set_index('datetime')
    test = test.set_index('datetime')

    # === Feature selection ===
    X_train = train[selected_indicators]
    X_test = test[selected_indicators]

    # === Find regression target column ===
    reg_cols = [col for col in labeled.columns if col.startswith("clf_target")]
    if not reg_cols:
        raise ValueError("❌ No regression target column found starting with 'clf_target'.")
    reg_col = reg_cols[0]
    print(f"📌 Using regression target column: {reg_col}")

    y_train = train[reg_col]
    y_test = test[reg_col]

    print(f"Train range: {train.index.min()} to {train.index.max()} | Rows: {len(train)}")
    print(f"Test range: {test.index.min()} to {test.index.max()} | Rows: {len(test)}")

    ###########################
    ########## Models #########
    ###########################

    def tune_xgboost(X_train, y_train):
        def objective(trial):

            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'max_depth': trial.suggest_int('max_depth', 3, 12),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'gamma': trial.suggest_float('gamma', 0.0, 5.0),
                'eval_metric': 'logloss',
                'objective': 'multi:softmax',  # or 'multi:softprob' if you need probabilities
                'num_class': len(np.unique(y_train))
            }

            tscv = TimeSeriesSplit(n_splits=splits)
            scores = []

            for train_idx, val_idx in tscv.split(X_train):
                X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                sample_weights = compute_sample_weight(class_weight='balanced', y=y_tr)

                model = XGBClassifier(**params, random_state=42, n_jobs=-1)
                model.fit(X_tr, y_tr, sample_weight=sample_weights)

                preds = model.predict(X_val)
                score = f1_score(y_val, preds, average='macro')
                scores.append(score)

            print(f"Trial {trial.number} F1 Score: {np.mean(scores):.5f} | Params: {params}")
            return np.mean(scores)

        study = optuna.create_study(
            direction='maximize',
            study_name=f'xgb_opt_class_{LOOKAHEAD}',
            sampler=optuna.samplers.TPESampler(seed=42),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5),
            storage=f'sqlite:///xgb_opt_study_session_less.db',
            load_if_exists=True
        )
        study.optimize(objective, n_trials=2)
        return study.best_params

    def tune_rf(X_train, y_train):
        def objective(trial):
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
                "max_depth": trial.suggest_int("max_depth", 3, 20),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
                "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
                "class_weight": trial.suggest_categorical("class_weight", [None, "balanced", "balanced_subsample"]),
                "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
            }

            tscv = TimeSeriesSplit(n_splits=splits)
            model = RandomForestClassifier(**params, random_state=42, n_jobs=-1)

            scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring="f1_macro", n_jobs=-1)
            print(f"Trial {trial.number} F1 Score: {scores.mean():.5f} | Params: {params}")
            return scores.mean()

        study = optuna.create_study(
            direction="maximize",
            study_name=f"rf_opt_class_{LOOKAHEAD}",
            sampler=optuna.samplers.TPESampler(seed=42),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5),
            storage=f"sqlite:///rf_opt_study_session_less.db",
            load_if_exists=True
        )
        study.optimize(objective, n_trials=2)
        return study.best_params

    def tune_catboost(X_train, y_train, n_trials=2):
        """Tune a multi-class CatBoostClassifier with Optuna and return the best parameters.

        Each trial is scored by the mean macro-F1 of a TimeSeriesSplit
        cross-validation (``splits`` folds, taken from the enclosing scope).

        Args:
            X_train: Training features passed to ``cross_val_score``.
            y_train: Training class labels.
            n_trials: Number of Optuna trials to run in this call (default 2,
                the value that used to be hard-coded).

        Returns:
            ``study.best_params`` -- only the values suggested through ``trial``
            (fixed settings such as ``loss_function`` are not part of it).
        """
        # Balanced class weights depend only on y_train, so compute them once
        # instead of on every trial.
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=np.unique(y_train),
            y=y_train
        ).tolist()

        def objective(trial):
            bootstrap_type = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli'])

            params = {
                'iterations': trial.suggest_int('iterations', 300, 1500, step=100),
                'depth': trial.suggest_int('depth', 4, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
                'random_strength': trial.suggest_float('random_strength', 0.5, 5.0),
                'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
                'bootstrap_type': bootstrap_type,
                'loss_function': 'MultiClass',
                'eval_metric': 'TotalF1',
                'class_weights': class_weights,
                'verbose': 0
            }

            # bagging_temperature is only suggested for the Bayesian bootstrap.
            if bootstrap_type == 'Bayesian':
                params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)

            model = CatBoostClassifier(**params, random_state=42)

            tscv = TimeSeriesSplit(n_splits=splits)
            scores = cross_val_score(model, X_train, y_train, cv=tscv, scoring='f1_macro', n_jobs=-1)
            mean_score = scores.mean()
            print(f"Trial {trial.number} F1 Score: {mean_score:.5f} | Params: {params}")
            return mean_score

        # NOTE(review): the objective never calls trial.report(), so the
        # MedianPruner configured below has nothing to act on.
        study = optuna.create_study(
            direction='maximize',
            study_name=f'catboost_opt_class_{LOOKAHEAD}',
            sampler=optuna.samplers.TPESampler(seed=42),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5),
            storage='sqlite:///catboost_opt_study_session_less.db',
            load_if_exists=True
        )
        study.optimize(objective, n_trials=n_trials)
        return study.best_params

    def tune_meta_logreg(X_meta, y_meta, n_trials=2):
        """Tune the logistic-regression meta model with Optuna.

        Each trial fits ``StandardScaler -> LogisticRegression`` and is scored
        by the mean macro-F1 of a TimeSeriesSplit cross-validation (``splits``
        folds, taken from the enclosing scope).

        Args:
            X_meta: Meta-feature matrix.
            y_meta: Class labels aligned with ``X_meta``.
            n_trials: Number of Optuna trials to run in this call (default 2,
                the value that used to be hard-coded).

        Returns:
            A dict that can be splatted into ``LogisticRegression``: the best
            suggested values plus the fixed ``solver``/``max_iter`` used during
            tuning. ``C`` is absent when the best trial chose ``penalty=None``.
        """
        # Settings every trial was evaluated with. study.best_params does not
        # contain them, so they are merged into the return value; otherwise the
        # final model would be fit with the default max_iter instead of the one
        # used while tuning.
        fixed_params = {"solver": "lbfgs", "max_iter": 2000}

        def objective(trial):
            penalty = trial.suggest_categorical("penalty", ["l2", None])
            if penalty is not None:
                C = trial.suggest_float("C", 1e-4, 10.0, log=True)
            else:
                C = 1.0  # default, unused

            class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

            params = {
                "penalty": penalty,
                "C": C,
                **fixed_params,
                "class_weight": class_weight,
            }

            model = make_pipeline(StandardScaler(), LogisticRegression(**params, random_state=42))
            tscv = TimeSeriesSplit(n_splits=splits)

            scores = cross_val_score(model, X_meta, y_meta, cv=tscv, scoring="f1_macro", n_jobs=-1)
            mean_score = scores.mean()
            print(f"Trial {trial.number} F1 Score: {mean_score:.5f} | Params: {params}")
            return mean_score

        study = optuna.create_study(
            direction="maximize",
            study_name=f"meta_logreg_class_{LOOKAHEAD}",
            sampler=optuna.samplers.TPESampler(seed=42),
            pruner=optuna.pruners.MedianPruner(n_startup_trials=5),
            storage="sqlite:///meta_logreg_stack_session_less.db",
            load_if_exists=True
        )
        study.optimize(objective, n_trials=n_trials)
        return {**study.best_params, **fixed_params}

    def tune_lstm_classifier_with_optuna(X, y, splits, lookahead, num_classes=2):
        """Search LSTM classifier hyper-parameters with Optuna.

        Every trial is scored by the mean accuracy over a TimeSeriesSplit
        with ``splits`` folds. ``X`` and ``y`` are indexed with integer
        index arrays, so they must support that kind of indexing.

        Returns the parameter dict of the best trial.
        """
        def _fold_accuracy(fit_rows, eval_rows, **hyper):
            # Fit a fresh wrapper on one fold and score it on the held-out rows.
            X_fit, X_eval = X[fit_rows], X[eval_rows]
            y_fit, y_eval = y[fit_rows], y[eval_rows]
            clf = LSTMClassifierWrapper(
                input_shape=X.shape[1],
                verbose=0,
                num_classes=num_classes,
                **hyper
            )
            clf.fit(X_fit, y_fit)
            return accuracy_score(y_eval, clf.predict(X_eval))

        def objective(trial):
            units = trial.suggest_int("units", 16, 128, step=16)
            lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
            batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
            epochs = trial.suggest_int("epochs", 5, 30)

            folds = TimeSeriesSplit(n_splits=splits).split(X)
            fold_scores = [
                _fold_accuracy(fit_rows, eval_rows,
                               units=units, lr=lr, epochs=epochs, batch_size=batch_size)
                for fit_rows, eval_rows in folds
            ]

            mean_acc = np.mean(fold_scores)
            print(f"Trial {trial.number} Accuracy: {mean_acc:.5f} | Params: units={units}, lr={lr}, batch={batch_size}, epochs={epochs}")
            return mean_acc

        study_options = dict(
            direction="maximize",
            study_name="lstm_class_opt",
            storage=f"sqlite:///lstm_class_opt_study{lookahead}_session_less.db",
            load_if_exists=True,
        )
        study = optuna.create_study(**study_options)
        study.optimize(objective, n_trials=2)
        print("Best trial:", study.best_trial.params)
        return study.best_trial.params
    ################################################
    ####### Ensure index consistency
    ####### Sequential #######
    # y_train_seq = y_train_class.loc[X_train_seq.index]
    # y_test_seq = y_test_class.loc[X_test_seq.index]

    ################################################
    ####### Tune models
    ####### Tree Based #######
    # catboost_params     = tune_catboost(X_train_tree, y_train_class)
    # xgboost_params      = tune_xgboost(X_train_tree, y_train_class)
    # rf_params         = tune_rf(X_train_tree, y_train_class)
    # X_lstm = X_train_seq.values
    # y_lstm = y_train_class.values

    ################################################
    ####### Train models
    ####### Tree Based #######
    # xgb_weights = compute_sample_weight('balanced', y_train_class)
    # cb_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_class), y=y_train_class)
    # catboost    = CatBoostClassifier(**catboost_params, random_state=42, class_weights=cb_weights.tolist(), verbose=0)
    # xgboost     = XGBClassifier(**xgboost_params, random_state=42)
    # rf          = RandomForestClassifier(**rf_params, random_state=42)
    # catboost.fit(X_train_tree, y_train_class)
    # xgboost.fit(X_train_tree, y_train_class, sample_weight=xgb_weights)
    # rf.fit(X_train_tree, y_train_class)
    # ####### Sequential #######
    # lstm_model = LSTMClassifierWrapper(input_shape=X_lstm.shape[1])
    # lstm_model.fit(X_lstm, y_lstm)  # wrapper does the reshaping
    # X_lstm_test = X_test_seq.values
    # lstm_preds = lstm_model.predict(X_lstm_test)
    ################################################
    ####### OOF Prediction
    ####### Tree Based #######
    # NOTE(review): `catboost`, `rf`, `X_lstm`, `y_lstm` and `lstm_model` are only
    # assigned in the commented-out lines above; presumably they are defined
    # earlier in this function or left over from a previous notebook run -- confirm.
    oof_tree = generate_oof_predictions_class([catboost, rf], X_train_tree, y_train_class, splits)
    oof_lstm = generate_oof_lstm_classifier(LSTMClassifierWrapper, X_lstm, y_lstm, splits)  # <- Uses sequential input

    ################################################
    ####### Train Meta Model
    ####### Tree Based #######
    # Meta-features for training: out-of-fold outputs of the base models.
    # Columns 0 and 1 of oof_tree are taken as CatBoost and RandomForest,
    # matching the order of the model list passed above.
    X_meta_train = pd.DataFrame({
        'cat': oof_tree.iloc[:, 0].values,
        'rf': oof_tree.iloc[:, 1].values,
        'lstm': oof_lstm.values
    }, index=y_train_class.index)

    # Meta-features for the test set: direct predictions of the base models.
    X_meta_test = pd.DataFrame({
        "cat": catboost.predict(X_test_tree).flatten(),
        "rf": rf.predict(X_test_tree).flatten(),
        "lstm": lstm_model.predict(X_test_seq.values).flatten()
    })

    # Indexes are dropped before concatenating, so base-model outputs and the
    # linear features are joined by row position.
    # assumes X_train_linear / X_test_linear have the same row order as the
    # meta-feature frames -- TODO confirm.
    X_meta_train_combined = pd.concat([
        X_meta_train.reset_index(drop=True),
        X_train_linear.reset_index(drop=True)
    ], axis=1)

    X_meta_test_combined = pd.concat([
        X_meta_test.reset_index(drop=True),
        X_test_linear.reset_index(drop=True)
    ], axis=1)

    # Tune, then fit the final scaled logistic-regression meta model on the
    # combined training meta-features.
    meta_params = tune_meta_logreg(X_meta_train_combined, y_train_class)
    meta_model = make_pipeline(StandardScaler(),LogisticRegression(**meta_params, random_state=42))
    meta_model.fit(X_meta_train_combined, y_train_class)

    ################################################
    ####### Evaluate Model
    def evaluate_model(name, model, Xtr, Xte, ytr, yte):
        """Report train/test accuracy of a fitted model and return its test predictions.

        `name` is only used in the printed header.
        """
        predicted = {
            "train": model.predict(Xtr),
            "test": model.predict(Xte),
        }
        train_acc = accuracy_score(ytr, predicted["train"])
        test_acc = accuracy_score(yte, predicted["test"])

        for line in (
            f"\n📊 {name} Classification Accuracy:",
            f"Train Accuracy: {train_acc:.4f}",
            f"Test Accuracy: {test_acc:.4f}",
        ):
            print(line)

        return predicted["test"]
    
    ####### Tree Based #######
    # Per-model train/test accuracy. The first argument is only a display label.
    # NOTE(review): the labels say "Regressor" although these are classifiers,
    # and the tree models are scored against y_train_seq / y_test_seq, which are
    # only assigned in commented-out code in this part of the function
    # (y_train_class / y_test_class are used further down) -- confirm intended.
    print("\nXGBoost")
    preds_xgboost   = evaluate_model("XGBoostRegressor", xgboost, X_train_tree, X_test_tree, y_train_seq, y_test_seq)
    print("\nCatboost")
    preds_catboost  = evaluate_model("CatBoostRegressor", catboost, X_train_tree, X_test_tree, y_train_seq, y_test_seq)
    print("\nRF")
    preds_rf        = evaluate_model("RandomForest", rf, X_train_tree, X_test_tree, y_train_seq, y_test_seq)
    print("\nLSTM")
    preds_lstm       = evaluate_model("LSTM", lstm_model, X_train_seq.values, X_test_seq.values, y_train_seq.values, y_test_seq.values)
    print("\nMeta Model")
    preds_stack     = evaluate_model("StackingRegressor", meta_model, X_meta_train_combined, X_meta_test_combined, y_train_class.values, y_test_class.values)

    ################################################
    ####### Target Distribution
    print("\n🔍 Target distribution:")
    print(y_train_class.describe())
    
    ################################################
    ####### Choose final model
    ####### Tree Based #######
    # Recomputes the XGBoost test predictions (overwriting the value returned by
    # evaluate_model above) and prints their class distribution and metrics.
    preds_xgboost = xgboost.predict(X_test_tree)
    print("\n🔍 Checking XGBoost prediction distribution (classification):")
    print(f"Classes predicted: {np.unique(preds_xgboost)}")
    print(f"Prediction counts:\n{pd.Series(preds_xgboost).value_counts()}")

    # Classification metrics
    acc = accuracy_score(y_test_class, preds_xgboost)
    f1 = f1_score(y_test_class, preds_xgboost, average='macro')
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score (macro): {f1:.4f}")
    print("\nClassification report:")
    print(classification_report(y_test_class, preds_xgboost))

    ####### Stacked Model #######
    # Same diagnostics for the stacked meta model.
    preds_meta_model = meta_model.predict(X_meta_test_combined)
    print("\n🔍 Checking Meta Model prediction distribution (classification):")
    print(f"Classes predicted: {np.unique(preds_meta_model)}")
    print(f"Prediction counts:\n{pd.Series(preds_meta_model).value_counts()}")

    # Classification metrics
    acc = accuracy_score(y_test_class, preds_meta_model)
    f1 = f1_score(y_test_class, preds_meta_model, average='macro')
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score (macro): {f1:.4f}")
    print("\nClassification report:")
    print(classification_report(y_test_class, preds_meta_model))

    # Persist the tuned hyper-parameters next to the models.
    # NOTE(review): xgboost_params / catboost_params / rf_params are only
    # assigned in commented-out tuning calls in this part of the function;
    # they must come from elsewhere for this to run -- confirm.
    metadata = {
        "lookahead": LOOKAHEAD,
        "xgboost_params": xgboost_params,
        "catboost_params": catboost_params,
        "rf_params": rf_params,
        "meta_params": meta_params,
    }
    with open(f"classifier_metadata_{LOOKAHEAD}.json", "w") as f:
        json.dump(metadata, f, indent=2)
        
    # Only the meta model and the XGBoost model are saved to disk.
    joblib.dump(meta_model, f"stack_model_classifier_LOOKAHEAD_{LOOKAHEAD}_session_less.pkl")
    joblib.dump(xgboost, f"xgboost_model_classifier_LOOKAHEAD_{LOOKAHEAD}_session_less.pkl")

    # Unlike the preds_* variables above (class labels), the returned
    # predictions are class probabilities from predict_proba.
    return {
        'lookahead': LOOKAHEAD,
        'preds_stack': meta_model.predict_proba(X_meta_test_combined),
        'preds_xgboost': xgboost.predict_proba(X_test_tree),
        'X_test_tree': X_test_tree,
        'X_test_linear': X_test_linear,
        'X_meta_test_combined': X_meta_test_combined,
        'true_values': y_test_class.values,
        'label_encoder': le
    }

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_selection import VarianceThreshold
from sklearn.inspection import permutation_importance
from imblearn.over_sampling import RandomOverSampler
import warnings

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# 1) Load & prepare
# `market` and `selected_indicators_class` are defined elsewhere in the notebook.
df = pd.read_parquet(f"parquet/labeled_data_6{market}.parquet")
# Use the first column whose name starts with "clf_target" as the label.
clf_col = [c for c in df.columns if c.startswith("clf_target")][0]
# Keep only rows where the label and every selected feature are present.
df_clean = df.dropna(subset=[clf_col] + selected_indicators_class)

X = df_clean[selected_indicators_class].copy()
y_full = df_clean[clf_col]

# --- Feature Filtering Step 1: Zero/Near-Zero Variance ---
vt = VarianceThreshold(threshold=1e-4)
vt.fit(X)
X = X.loc[:, vt.get_support()]

# --- Feature Filtering Step 2: Multicollinearity Removal ---
# Only the strict upper triangle is inspected, so for each pair with
# |corr| > 0.95 the later column is the one dropped.
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_corr = [col for col in upper.columns if any(upper[col] > 0.95)]
X = X.drop(columns=to_drop_corr)

# 2) Stage-1 target: hit vs no-hit
# Any non-zero label counts as a hit.
y_hit = (y_full != 0).astype(int)

# 3) Stage-2 data: only hits
mask_hits = y_hit == 1
X2_all = X[mask_hits]
# Binary direction target: 1 where the label equals 2, 0 for the other hits.
y_dir = (y_full[mask_hits] == 2).astype(int)  # 0=LONG, 1=SHORT

# --- Feature Filtering Step 3: Permutation Importance on Stage-2 ---
# NOTE(review): the importance model is fit and scored on the same rows
# (all hits in the data set), which also include the rows later used as
# validation folds -- feature selection therefore sees the evaluation data.
imp_model = lgb.LGBMClassifier(
    objective='binary',
    class_weight='balanced',
    n_estimators=100,
    learning_rate=0.05,
    random_state=42
)
imp_model.fit(X2_all, y_dir)
perm = permutation_importance(
    imp_model, X2_all, y_dir,
    n_repeats=5, random_state=42, n_jobs=4
)
imp_series = pd.Series(perm.importances_mean, index=X2_all.columns)
# Drop features whose mean permutation importance is zero or negative.
to_drop_perm = imp_series[imp_series <= 0].index.tolist()
X = X.drop(columns=to_drop_perm)

# Rebuild Stage-2 inputs after dropping
mask_hits = (y_full != 0)
X2_all = X[mask_hits]

# 4) Define models
def _new_cascade_models():
    """Return a fresh, unfitted set of the three binary boosters used by one cascade stage."""
    shared = dict(random_state=42, n_jobs=4)
    return {
        "XGBoost": xgb.XGBClassifier(
            objective="binary:logistic", eval_metric="logloss", **shared
        ),
        "LightGBM": lgb.LGBMClassifier(
            objective="binary", class_weight="balanced", **shared
        ),
        "CatBoost": CatBoostClassifier(
            loss_function="Logloss", class_weights=[1,1],
            random_state=42, verbose=0
        ),
    }


# Both stages use the same configurations but independent estimator instances.
stage1_models = _new_cascade_models()
stage2_models = _new_cascade_models()

# 5) Hyperparameter grids for Stage 2
param_grids = {
    "XGBoost": dict(
        max_depth=[4,6,8],
        learning_rate=[0.05, 0.1],
        n_estimators=[100, 200],
    ),
    "LightGBM": dict(
        num_leaves=[31,63,127],
        learning_rate=[0.05, 0.1],
        n_estimators=[100, 200],
    ),
    "CatBoost": dict(
        depth=[4,6,8],
        learning_rate=[0.05, 0.1],
        iterations=[100, 200],
    ),
}

# 6) Two-stage cascade with TimeSeriesSplit
# For every model family: stage 1 predicts hit vs no-hit, stage 2 predicts the
# direction of the rows stage 1 flagged as hits; both are then merged into one
# 3-class prediction (0=NO-HIT, 1=LONG, 2=SHORT) per outer fold.
from imblearn.pipeline import Pipeline as ImbPipeline

tscv = TimeSeriesSplit(n_splits=4)
for name in stage1_models:
    m1 = stage1_models[name]
    m2 = stage2_models[name]
    print(f"\n=== Two-stage Cascade with {name} ===")

    # Grid keys must address the classifier step of the pipeline built below.
    m2_grid = {f"clf__{param}": values for param, values in param_grids[name].items()}

    for fold, (tr, va) in enumerate(tscv.split(X)):
        print(f"\n--- Fold {fold} ---")

        # Stage 1: Hit vs No-Hit
        X1_tr, X1_va = X.iloc[tr], X.iloc[va]
        y1_tr, y1_va = y_hit.iloc[tr], y_hit.iloc[va]
        m1.fit(X1_tr, y1_tr)
        hit_pred = m1.predict(X1_va)
        print("Stage-1 report:")
        # Explicit labels keep the report from raising when a fold happens to
        # contain (or predict) only one of the classes.
        print(classification_report(
            y1_va, hit_pred,
            labels=[0, 1],
            target_names=["NO-HIT","HIT"]
        ))

        # Stage 2: LONG vs SHORT on true hits
        tr_hits = tr[y1_tr.values == 1]
        X2_tr = X.iloc[tr_hits]
        y2_tr = (y_full.iloc[tr_hits] == 2).astype(int)

        # Oversample the minority direction *inside* the pipeline. Resampling
        # before the grid search appends duplicated rows after the original
        # ones, which breaks the time order TimeSeriesSplit relies on and puts
        # copies of training rows into the validation folds. As a pipeline
        # step the sampler only ever sees the training part of each inner
        # split, and is skipped at predict time.
        cascade_m2 = ImbPipeline([
            ("ros", RandomOverSampler(sampling_strategy='not majority', random_state=42)),
            ("clf", m2),
        ])

        # Hyperparam search
        grid = GridSearchCV(
            cascade_m2, m2_grid,
            cv=TimeSeriesSplit(n_splits=3),
            scoring='f1', n_jobs=4
        )
        grid.fit(X2_tr, y2_tr)
        best_m2 = grid.best_estimator_

        # Predict on predicted hits
        va_hits = va[hit_pred == 1]
        dir_pred = (best_m2.predict(X.iloc[va_hits])
                    if len(va_hits) else np.array([], dtype=int))

        # Assemble final predictions
        final_pred = np.zeros_like(hit_pred)
        final_pred[hit_pred == 1] = dir_pred + 1  # 1→LONG, 2→SHORT

        print("Final 3-class report:")
        print(classification_report(
            y_full.iloc[va], final_pred,
            labels=[0, 1, 2],
            target_names=["NO-HIT","LONG","SHORT"]
        ))


In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

# Exhaustive search over every feature subset from size 1 up to all features,
# scored by 5-fold cross-validated accuracy.
efs_options = {
    "min_features": 1,
    "max_features": len(features),
    "scoring": 'accuracy',
    "cv": 5,
    "n_jobs": -1,
}
efs = EFS(clf, **efs_options)
efs = efs.fit(X.values, y)

for summary_line in (
    f"Best CV accuracy: {efs.best_score_:.4f}",
    f"Best subset: {efs.best_feature_names_}",
):
    print(summary_line)


##### Running Train

In [None]:
# Regression Training
# Collects the output of each regression run; consumed by the test cell below.
reg_results = []


# Make sure the "data" directory exists before the run (no error if it already does).
os.makedirs("data", exist_ok=True)

# Run the regression pipeline once and keep its result.
regression_models = run_lookahead_for_session_regression()
reg_results.append(regression_models)

# Test

In [None]:
# Backtest every regression result with the static take-profit strategy,
# print its key metrics and collect everything for the summary table below.
all_results = []

for result in reg_results:
    preds_stack, X_test_combined, y_test = (
        result['preds_stack'],
        result['X_test'],
        result['true_values'],
    )
    labeled = pd.read_parquet(f"parquet/labeled_data_L12_PT2SL1VB12{market}.parquet")
    df_backtest = labeled.copy()

    print(f"\n🔎 Predicted return range for STACK: min={preds_stack.min():.15f}, max={preds_stack.max():.15f}")
    # Alternative evaluator, kept for reference:
    # results = evaluate_regression(
    #     X_test=X_test_combined,
    #     preds_stack=preds_stack,
    #     labeled=labeled,
    #     df=df_backtest,
    #     avoid_funcs=avoid_funcs,
    #     TICK_VALUE=20,
    #     is_same_session=is_same_session,
    #     long_thresh=0.0009,
    #     TRAIL_START_MULT=1.0,
    #     TRAIL_STOP_MULT=1.0,
    #     short_thresh=-0.00009,
    #     base_contracts=1,
    #     max_contracts=1,
    # )
    backtest_kwargs = dict(
        X_test=X_test_combined,
        preds_stack=preds_stack,
        labeled=labeled,
        df=df_backtest,
        avoid_funcs=avoid_funcs,
        TICK_VALUE=20,
        is_same_session=is_same_session,
        long_thresh=0.00001,
        short_thresh=-0.00001,
        TP_POINTS=10.0,
    )
    results = evaluate_static_tp_two_contracts(**backtest_kwargs)
    all_results.append(results)

    metrics_report = (
        f"\nPnL: ${results['pnl']:.2f}"
        f"\nTrades: {results['trades']}"
        f"\nWin Rate: {results['win_rate']:.2%}"
        f"\nExpectancy: {results['expectancy']:.2f}"
        f"\nProfit Factor: {results['profit_factor']:.2f}"
        f"\nSharpe Ratio: {results['sharpe']:.2f}"
        f"\nLong Trades: {results['long_trades']} | Short Trades: {results['short_trades']}"
        f"\n"
    )
    print(metrics_report)

    print("Avoid Hits:")
    for name, count in results['avoid_hits'].items():
        print(f" - {name}: {count}")

    trade_log = results['results']
    if trade_log.empty or 'pnl' not in trade_log.columns:
        print("\n⚠️ No trades executed, skipping PnL trade breakdown.")
    else:
        print("\n🔢 Top 5 PnL trades:")
        print(trade_log.sort_values(by='pnl', ascending=False).head(5))

        print("\n🔻 Bottom 5 PnL trades:")
        print(trade_log.sort_values(by='pnl', ascending=True).head(5))


# One summary row per backtest, ranked by Sharpe ratio.
summary_fields = ('pnl', 'sharpe', 'expectancy', 'profit_factor', 'win_rate', 'trades', 'results')
summary_df = pd.DataFrame([
    {field: r[field] for field in summary_fields}
    for r in all_results
])
top = summary_df.sort_values(by='sharpe', ascending=False).head(10)
print("\n🏁 Top 10 Configurations Across All Lookaheads:")
print(top[['pnl', 'sharpe', 'expectancy', 'profit_factor', 'win_rate', 'trades']])

# Visualize

In [None]:
best_result = None

def compute_max_consecutive_loss(df):
    """
    Find the contiguous run of trades with the most negative summed PnL.

    Parameters
    ----------
    df : DataFrame with a 'pnl' column and an 'entry_time' column, in the
        order in which the trades should be scanned.

    Returns
    -------
    (max_loss, start_time, end_time)
        `max_loss` is the most negative window sum (0.0 when no window has a
        negative sum); `start_time` / `end_time` are the 'entry_time' values
        of the first and last trade of that window. When no window is
        negative both times are the first row's 'entry_time'. For an empty
        frame the result is (0.0, None, None).
    """
    pnl_series = df['pnl'].values
    if len(pnl_series) == 0:
        # Nothing to scan; positional lookups below would raise IndexError.
        return 0.0, None, None

    max_loss = 0.0
    start_idx = 0
    end_idx = 0

    # Single-pass minimum-subarray scan: `running` is the smallest sum of any
    # window ending at the current trade, `run_start` where that window begins.
    running = 0.0
    run_start = 0
    for j, pnl in enumerate(pnl_series):
        # A positive carried sum can only make the window worse as a "loss",
        # so start a new window here. Written as `not <= 0` so that a NaN
        # carried sum also triggers a restart instead of poisoning the rest.
        if not running <= 0:
            running = 0.0
            run_start = j
        running += pnl
        # Strict comparison keeps the earliest window among equal losses.
        if running < max_loss:
            max_loss = running
            start_idx = run_start
            end_idx = j

    return max_loss, df['entry_time'].iloc[start_idx], df['entry_time'].iloc[end_idx]


# Pick the backtest with the highest Sharpe ratio among those that pass the
# minimum-quality filters below, and attach its cumulative PnL curve and
# maximum drawdown to `best_result`.
for r in all_results:
    # A backtest without trades has no PnL curve; indexing its last row
    # would raise, so skip it explicitly.
    if r['results'].empty:
        print("⚠️ No trades in this result, skipping.")
        continue

    df = r['results'].copy()
    df = df.sort_values(by='entry_time')
    df['cumulative_pnl'] = df['pnl'].cumsum()

    # Count how many trades exited for each reason
    exit_counts = df['exit_reason'].value_counts(dropna=False)
    print(exit_counts)

    if (
        df['cumulative_pnl'].iloc[-1] > 0 and
        r['sharpe'] > 0.01 and
        r['trades'] > 1 and
        r['win_rate'] > 0.001 and
        r['profit_factor'] > 0.01 and
        r['expectancy'] > 0.01 and
        r['pnl'] > 100
    ):
        if best_result is None or r['sharpe'] > best_result['sharpe']:
            best_result = r.copy()
            best_result['cumulative_pnl'] = df['cumulative_pnl']
            best_result['entry_time'] = df['entry_time']

            # === Calculate max drawdown (largest PnL loss from peak)
            cumulative = df['cumulative_pnl']
            rolling_max = cumulative.cummax()
            drawdowns = cumulative - rolling_max
            max_drawdown = drawdowns.min()  # Most negative drop
            best_result['max_drawdown'] = max_drawdown

# === Plot the best strategy (if any passed the filters) ===
if not best_result:
    print("❌ No strategy met the conditions.")
else:
    # Rebuild the equity curve of the winning backtest in chronological order.
    df = best_result['results'].sort_values(by='entry_time')
    df['cumulative_pnl'] = df['pnl'].cumsum()

    max_loss, loss_start, loss_end = compute_max_consecutive_loss(df)

    # Equity curve with the worst losing window shaded in red.
    plt.figure(figsize=(12, 4))
    plt.plot(df['entry_time'], df['cumulative_pnl'], label='Cumulative PnL', color='green')
    plt.axvspan(loss_start, loss_end, color='red', alpha=0.2, label='Max Loss Window')
    plt.title(f"Top Sharpe Strategy | Max Consecutive Loss: {max_loss:.2f} | Cumulative PnL: {best_result['pnl']:.2f}")
    plt.xlabel("Datetime")
    plt.ylabel("Cumulative PnL")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

    for line in (
        f"💣 Max Consecutive PnL Loss: {max_loss:.2f}",
        f"📆 Period: {loss_start} → {loss_end}",
    ):
        print(line)

    # Persist the unsorted trade log of the winning backtest.
    best_result['results'].to_csv("best_strategy_results.csv", index=False)
    print("✅ Saved best_strategy_results.csv")
