# Load Data

In [198]:
import os
import pandas as pd
import numpy as np
import ta
from ta.trend import (
    EMAIndicator,
    MACD,
    PSARIndicator
)
from ta.momentum import (
    RSIIndicator,
    StochRSIIndicator
)
from ta.volatility import (
    AverageTrueRange,
    BollingerBands
)
from ta.volume import (
    VolumeWeightedAveragePrice,
    AccDistIndexIndicator
)
from itertools import product
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.isotonic import IsotonicRegression
from xgboost import XGBRegressor
from sklearn.metrics import classification_report, mean_squared_error
import matplotlib.pyplot as plt
from datetime import timedelta
from collections import defaultdict
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
import optuna
from sklearn.inspection import permutation_importance
import logging
import joblib
import json
import shap

optuna.logging.set_verbosity(optuna.logging.INFO)

# === Load Data ===
# Hard-coded, machine-specific data directory; every *.csv / *.txt file in it
# is read as a headerless, ';'-separated OHLCV file.
#folder_path = "/Users/francopapalardo-aleo/Desktop/repos/TradingAI 2/data/"
folder_path = "C:\\Users\\Franc\\repos\\Trading-ML-model\\data\\"
column_names = ['datetime', 'open', 'high', 'low', 'close', 'volume']
df_list = []
plt.rcParams['font.family'] = 'Segoe UI Emoji'

# os.listdir() returns entries in arbitrary order.  Iterate in sorted order so
# that, when two files contain the same timestamp, the row kept by
# drop_duplicates(keep='first') below is the same on every run and platform.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(('.csv', '.txt')):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, sep=';', header=None, names=column_names)
        df['source_file'] = filename
        df_list.append(df)

df = pd.concat(df_list, ignore_index=True)
df['datetime'] = pd.to_datetime(df['datetime'])
# De-duplicate on timestamp, then order chronologically with a fresh 0..n-1
# index (later code uses the index as a row position).  No resampling is done
# here: the bar size is whatever the input files contain.
df = df.drop_duplicates(subset='datetime', keep='first').reset_index(drop=True)
df = df.sort_values('datetime').reset_index(drop=True)
df[['open', 'high', 'low', 'close', 'volume']] = df[['open', 'high', 'low', 'close', 'volume']].astype(float)

# Base time features
df['hour'] = df['datetime'].dt.hour + df['datetime'].dt.minute / 60  # fractional hour, e.g. 9.5 == 09:30
df['minute'] = df['datetime'].dt.minute
df['day_of_week'] = df['datetime'].dt.dayofweek  # 0 = Monday

# Custom session flags (adjust if needed); Series.between includes both bounds.
df['is_premarket'] = df['hour'].between(7, 9.5)
df['is_lunch'] = df['hour'].between(11.5, 13.5)


# Initialize features or indicators

In [199]:
# === Feature Engineering ===
# EMA
# df['ema_3'] = ta.ema(df['close'], length=3)
# df['ema_8'] = ta.ema(df['close'], length=8)
# df['ema_9'] = ta.ema(df['close'], length=9)
# df['ema_13'] = ta.ema(df['close'], length=13)
# df['ema_21'] = ta.ema(df['close'], length=21)
# df['ema_34'] = ta.ema(df['close'], length=34)
# df['ema_ratio_8_21'] = df['ema_8'] / df['ema_21']
# df['ema_diff_8_21'] = df['ema_8'] - df['ema_21']

# RSI
import ta.volatility


# 6-period RSI of the close.
df['rsi_6'] = RSIIndicator(df['close'], window=6).rsi()
# df['rsi_14'] = ta.rsi(df['close'], length=14)
# df['rsi_21'] = ta.rsi(df['close'], length=21)

# Bollinger Bands for volatility regime
# bb_width = (high band - low band) / moving average, 20-bar window, 2 deviations.
bb = BollingerBands(df['close'], window=20, window_dev=2)
df['bb_width'] = (bb.bollinger_hband() - bb.bollinger_lband()) / bb.bollinger_mavg()

# Stochastic RSI for overbought/oversold with better signals than regular RSI
# A single indicator object supplies both the %K and %D series.
stoch_rsi = StochRSIIndicator(df['close'], window=14, smooth1=3, smooth2=3)
df['stoch_rsi_k'] = stoch_rsi.stochrsi_k()
df['stoch_rsi_d'] = stoch_rsi.stochrsi_d()

# Swing intensity (measures trend strength)
def swing_intensity(high, low, length=10):
    """Return the rolling high-low spread relative to the rolling low.

    Args:
        high: pandas Series of bar highs.
        low: pandas Series of bar lows.
        length: number of bars in the rolling window.

    Returns:
        Series ``(max(high) - min(low)) / min(low)`` over the trailing
        ``length`` bars; NaN until a full window is available.
    """
    window_top = high.rolling(length).max()
    window_bottom = low.rolling(length).min()
    spread = window_top - window_bottom
    return spread / window_bottom

# Intraday seasonality
# 570 == 9 * 60 + 30, so the value is negative for bars stamped before 09:30.
# NOTE(review): both columns are assigned again, with the same expressions,
# further down in this cell.
df['time_from_open'] = (df['datetime'].dt.hour * 60 + df['datetime'].dt.minute) - 570  # Minutes from 9:30
df['normalized_time'] = df['time_from_open'] / 390  # Normalize by trading day length

# Rolling (default 10-bar) high-low spread relative to the rolling low.
df['swing_intensity'] = swing_intensity(df['high'], df['low'])





# Normalized ATR (might be better than raw ATR)
# NOTE(review): this is the same expression as 'atr_pct' below (14-bar ATR
# divided by close), so the two columns carry identical values.
df['natr'] = AverageTrueRange(df['high'], df['low'], df['close'], window=14).average_true_range() / df['close']

# ATR
df['atr_5'] = AverageTrueRange(df['high'], df['low'], df['close'], window=5).average_true_range()
df['atr_14'] = AverageTrueRange(df['high'], df['low'], df['close'], window=14).average_true_range()
# df['atr_30'] = ta.atr(df['high'], df['low'], df['close'], length=30)
df['atr_pct'] = df['atr_14'] / df['close']

# Price relative to recent ranges
# Distance from the close to the 10-bar high / low, expressed in ATR(14) units.
# NOTE(review): any row where atr_14 is 0 produces inf/NaN here - confirm how
# the indicator fills its warm-up rows.
df['close_to_high'] = (df['high'].rolling(10).max() - df['close']) / df['atr_14']
df['close_to_low'] = (df['close'] - df['low'].rolling(10).min()) / df['atr_14']

# MACD
# macd_slow = ta.macd(df['close'], fast=12, slow=26, signal=9)
# df['macd_slow'] = macd_slow['MACDh_12_26_9']
# df['macd_slow_diff'] = macd_slow['MACD_12_26_9'] - macd_slow['MACDs_12_26_9']

# Build the fast (6/13/5) MACD indicator once and read all three series from
# it instead of constructing three identical indicator objects.
_macd_fast_indicator = MACD(df['close'], window_fast=6, window_slow=13, window_sign=5)
macd_fast = _macd_fast_indicator.macd()
# NOTE(review): the 'macd_fast' column stores the *signal* line, not the MACD
# line held in the `macd_fast` variable above.  Kept as-is so existing feature
# values do not change - confirm which one was intended.
df['macd_fast'] = _macd_fast_indicator.macd_signal()
df['macd_fast_diff'] = _macd_fast_indicator.macd_diff()

# VWAP
# df['vwap'] = 
# df['vwap_diff'] = df['close'] - ((df['volume'] * (df['high'] + df['low'] + df['close']) / 3).cumsum() / df['volume'].cumsum())
# Volume delta proxy
# df['volume_delta'] = np.where(df['close'] > df['open'], df['volume'], -df['volume'])
# df['volume_delta_ema'] = df['volume_delta'].ewm(span=14).mean()

# # Candle body and total range
# df['candle_body'] = abs(df['close'] - df['open'])
# Bar range (high - low) with a tiny epsilon so later divisions never hit zero.
df['candle_range'] = df['high'] - df['low'] + 1e-9  # avoid division by zero

# # Candle body % of range
# df['body_pct'] = df['candle_body'] / df['candle_range']

# # Wick sizes (relative to range)
# df['upper_wick'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['candle_range']
# df['lower_wick'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['candle_range']

# Previous
# df['prev_close'] = df['close'].shift(1)
# df['prev_rsi_14'] = df['rsi_14'].shift(1)
# df['prev_macd_fast'] = df['macd_fast'].shift(1)
# df['prev_ema_diff'] = df['ema_diff_8_21'].shift(1)

# First calculate return_1
# One-bar percentage change of the close over the whole frame (not reset per day).
df['return_1'] = df['close'].pct_change(1)

# Then use it for session_vol
# Expanding standard deviation of return_1, restarted for each calendar date.
# NOTE(review): groups are calendar dates, whereas is_same_session() uses an
# 18:00 session boundary - confirm which definition is intended here.
df['session_vol'] = df.groupby(df['datetime'].dt.date)['return_1'].transform(
    lambda x: x.expanding().std()
)

# Rest of your feature calculations
# NOTE(review): repeats the time_from_open / normalized_time assignments made
# earlier in this cell with identical expressions.
df['time_from_open'] = (df['datetime'].dt.hour * 60 + df['datetime'].dt.minute) - 570
df['normalized_time'] = df['time_from_open'] / 390

# Volume-weighted momentum
# return_1 scaled by the bar's volume relative to its 20-bar rolling mean.
df['volume_weighted_return'] = df['return_1'] * (df['volume'] / df['volume'].rolling(20).mean())
# df['above_vwap'] = (df['close'] > df['vwap']).astype(int)
# df['above_ema_21'] = (df['close'] > df['ema_21']).astype(int)
# df['below_vwap'] = (df['close'] < df['vwap']).astype(int)
# df['below_ema_21'] = (df['close'] < df['ema_21']).astype(int)
# df['velocity'] = df['return_1'] - df['return_1'].shift(1)

# df['vol_rolling_mean'] = df['volume'].rolling(20).mean()
# df['vol_spike'] = df['volume'] / (df['vol_rolling_mean'] + 1e-9)

# Breaks
# df['break_high_20'] = (df['high'] > df['high'].rolling(20).max().shift(1)).astype(int)
# df['break_low_20'] = (df['low'] < df['low'].rolling(20).min().shift(1)).astype(int)


# Module-level aliases of the high / low columns of the loaded frame.
highs = df['high']
lows = df['low']

def choppiness_index(high, low, close, length=14):
    """Compute the Choppiness Index over a trailing window.

    CHOP = 100 * log10(sum(TR, length) / (max(high) - min(low))) / log10(length)

    The numerator sums the per-bar *true range*.  The previous implementation
    summed an already-averaged ``length``-period ATR, which smooths the
    numerator a second time and lags the standard indicator.

    Args:
        high: pandas Series of bar highs.
        low: pandas Series of bar lows.
        close: pandas Series of bar closes (same index as ``high``/``low``).
        length: window size in bars.

    Returns:
        Series of index values; NaN until a full window is available and for
        windows whose high-low range is zero (instead of +/-inf).
    """
    prev_close = close.shift(1)
    # True range: the largest of the bar's own range and the two gaps to the
    # previous close.  The first bar has no previous close, so the row-wise
    # max (which skips NaN) falls back to high - low.
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)

    tr_sum = true_range.rolling(length).sum()
    price_range = high.rolling(length).max() - low.rolling(length).min()
    # A completely flat window would divide by zero; mark it missing so the
    # downstream fillna() handles it rather than leaking inf into features.
    price_range = price_range.where(price_range > 0)

    return 100 * np.log10(tr_sum / price_range) / np.log10(length)

# def detect_pivot_highs_lows_3(df, lookback=3, lookforward=3):
#     df['pivot_high_3'] = highs[(highs.shift(lookback) < highs) & (highs.shift(-lookforward) < highs)]
#     df['pivot_low_3'] = lows[(lows.shift(lookback) > lows) & (lows.shift(-lookforward) > lows)]
    
#     df['is_pivot_high_3'] = df['pivot_high_3'].notna().astype(int)
#     df['is_pivot_low_3'] = df['pivot_low_3'].notna().astype(int)
#     return df

def detect_pivot_highs_lows_5(df, lookback=5, lookforward=5):
    """Flag pivot highs/lows by comparing each bar with two reference bars.

    A bar is a pivot high when its high exceeds both the high ``lookback``
    bars earlier and the high ``lookforward`` bars later (mirrored for lows).
    Only those two bars are compared, not every bar in between.

    The columns are read from ``df`` itself; the previous version read the
    module-level ``highs``/``lows`` globals and therefore ignored its argument.

    NOTE(review): ``shift(-lookforward)`` reads future bars, so these flags
    are only knowable ``lookforward`` bars after the fact - using them as
    same-bar model features is look-ahead.

    Args:
        df: DataFrame with 'high' and 'low' columns; modified in place.
        lookback: offset (bars) of the earlier reference bar.
        lookforward: offset (bars) of the later reference bar.

    Returns:
        The same DataFrame with 'pivot_high_5', 'pivot_low_5' (price at the
        pivot, NaN elsewhere) and 'is_pivot_high_5', 'is_pivot_low_5' (0/1).
    """
    high = df['high']
    low = df['low']

    high_mask = (high.shift(lookback) < high) & (high.shift(-lookforward) < high)
    low_mask = (low.shift(lookback) > low) & (low.shift(-lookforward) > low)

    # Assigning the filtered series aligns on the index: non-pivot rows get NaN.
    df['pivot_high_5'] = high[high_mask]
    df['pivot_low_5'] = low[low_mask]

    df['is_pivot_high_5'] = df['pivot_high_5'].notna().astype(int)
    df['is_pivot_low_5'] = df['pivot_low_5'].notna().astype(int)
    return df

def detect_pivot_highs_lows_10(df, lookback=10, lookforward=10):
    """Flag pivot highs/lows by comparing each bar with two reference bars.

    A bar is a pivot high when its high exceeds both the high ``lookback``
    bars earlier and the high ``lookforward`` bars later (mirrored for lows).
    Only those two bars are compared, not every bar in between.

    The columns are read from ``df`` itself; the previous version read the
    module-level ``highs``/``lows`` globals and therefore ignored its argument.

    NOTE(review): ``shift(-lookforward)`` reads future bars, so these flags
    are only knowable ``lookforward`` bars after the fact - using them as
    same-bar model features is look-ahead.

    Args:
        df: DataFrame with 'high' and 'low' columns; modified in place.
        lookback: offset (bars) of the earlier reference bar.
        lookforward: offset (bars) of the later reference bar.

    Returns:
        The same DataFrame with 'pivot_high_10', 'pivot_low_10' (price at the
        pivot, NaN elsewhere) and 'is_pivot_high_10', 'is_pivot_low_10' (0/1).
    """
    high = df['high']
    low = df['low']

    high_mask = (high.shift(lookback) < high) & (high.shift(-lookforward) < high)
    low_mask = (low.shift(lookback) > low) & (low.shift(-lookforward) > low)

    # Assigning the filtered series aligns on the index: non-pivot rows get NaN.
    df['pivot_high_10'] = high[high_mask]
    df['pivot_low_10'] = low[low_mask]

    df['is_pivot_high_10'] = df['pivot_high_10'].notna().astype(int)
    df['is_pivot_low_10'] = df['pivot_low_10'].notna().astype(int)
    return df

# def calc_nearest_sr_distance_fast(df):
#     highs_idx = df.index[df['is_pivot_high'] == 1].tolist()
#     lows_idx = df.index[df['is_pivot_low'] == 1].tolist()

#     res_dist = np.full(len(df), np.nan)
#     sup_dist = np.full(len(df), np.nan)

#     for i in range(len(df)):
#         current_close = df.at[i, 'close']

#         # Resistance: Find all prior pivot highs
#         prior_highs = [abs(current_close - df.at[idx, 'high']) for idx in highs_idx if idx < i]
#         res_dist[i] = min(prior_highs) if prior_highs else np.nan

#         # Support: Find all prior pivot lows
#         prior_lows = [abs(current_close - df.at[idx, 'low']) for idx in lows_idx if idx < i]
#         sup_dist[i] = min(prior_lows) if prior_lows else np.nan

#     df['dist_to_resistance'] = res_dist
#     df['dist_to_support'] = sup_dist

#     df['dist_to_res_pct'] = df['dist_to_resistance'] / df['close']
#     df['dist_to_sup_pct'] = df['dist_to_support'] / df['close']
#     return df

# def compute_fvg(df):
#     df = df.copy()
#     df['fvg_up'] = np.where((df['low'].shift(1) > df['high'].shift(2)), 1, 0)
#     df['fvg_down'] = np.where((df['high'].shift(1) < df['low'].shift(2)), 1, 0)
#     return df

# def compute_liquidity_sweeps(df, swing_window=10):
#     df = df.copy()
#     df['swing_high'] = df['high'].rolling(window=swing_window, center=False).max().shift(1)
#     df['swing_low'] = df['low'].rolling(window=swing_window, center=False).min().shift(1)

#     df['liquidity_sweep_high'] = ((df['high'] > df['swing_high']) & (df['close'] < df['swing_high'])).astype(int)
#     df['liquidity_sweep_low'] = ((df['low'] < df['swing_low']) & (df['close'] > df['swing_low'])).astype(int)

#     return df

# df = compute_fvg(df)
# df = compute_liquidity_sweeps(df)
# df = detect_pivot_highs_lows_3(df)
# Add the 5-bar and 10-bar pivot columns (pivot_*/is_pivot_*) to df.
# NOTE(review): both detectors compare against a bar `lookforward` rows ahead
# (shift(-lookforward)), so the resulting flags contain future information.
df = detect_pivot_highs_lows_5(df)
df = detect_pivot_highs_lows_10(df)

# df['is_pivot_high'] = df[['is_pivot_high_3', 'is_pivot_high_5', 'is_pivot_high_10']].max(axis=1)
# df['is_pivot_low']  = df[['is_pivot_low_3', 'is_pivot_low_5', 'is_pivot_low_10']].max(axis=1)

# df = calc_nearest_sr_distance_fast(df)

# === Add Feature ===
# Choppiness index with the function's default 14-bar window.
df['chop_index'] = choppiness_index(df['high'], df['low'], df['close'])

# === Strategy Setup ===
# Default trade-simulation parameters; the *_MULT values are multiples of ATR.
TICK_VALUE = 5
SL_ATR_MULT = 1.0
TP_ATR_MULT = 3.0
TRAIL_START_MULT = 2.5
TRAIL_STOP_MULT = 1.0
MAX_CONTRACTS = 1

# Grid of strategy parameters to sweep.
param_grid_strategy = {
    'SL_ATR_MULT': [1.0],
    'TP_ATR_MULT': [2.0, 2.5, 3.0, 3.5, 4.0],
    'TRAIL_START_MULT': [0.5, 1.0, 1.5],
    'TRAIL_STOP_MULT': [0.5, 1.0, 1.5],
    'TICK_VALUE': [5],  # optional, or expand for futures like NQ/ES
}

# Cartesian product of the grid: one dict per combination (1*5*3*3*1 = 45).
keys, values = zip(*param_grid_strategy.items())
combinations = [dict(zip(keys, v)) for v in product(*values)]

# features = [
#     'rsi_6', 'rsi_14', 'rsi_21',
#     'ema_3', 'ema_8', 'ema_13', 'ema_9', 'ema_21','ema_34',
#     'ema_ratio_8_21', 'ema_diff_8_21',
#     'macd_slow', 'macd_slow_diff',
#     'macd_fast', 'macd_fast_diff',
#     'atr_5', 'atr_30', 'atr_14', 'atr_pct',
#     'vwap', 'vwap_diff',
#     'candle_body', 'candle_range',
#     'volume', 'chop_index',
#     'hour', 'minute', 'day_of_week',
#     'is_premarket', 'is_lunch',
#     'body_pct',  
#     'upper_wick', 'lower_wick',  # just added
#     'volume_delta_ema',
#     'return_1', 'return_3',
#     'prev_close', 'prev_rsi_14', 'prev_macd_fast', 'prev_ema_diff',
#     'above_vwap', 'above_ema_21',
#     'velocity', 'vol_spike',
#     'break_high_20', 'break_low_20',
#     'is_pivot_high_10', 'is_pivot_low_10',
#     'is_pivot_high_3', 'is_pivot_low_3',
#     'is_pivot_high_5', 'is_pivot_low_5',
#     # 'dist_to_resistance', 'dist_to_support',
#     'fvg_up', 'fvg_down',
#     'liquidity_sweep_high', 'liquidity_sweep_low',
#     'below_vwap', 'below_ema_21',
# ]

# Column names fed to the models; every entry must already exist in df.
features = [
    # Existing features
    'is_pivot_low_5', 'is_pivot_high_5',
    'candle_range', 'rsi_6', 'atr_5',
    'is_pivot_low_10', 'is_pivot_high_10',
    'macd_fast_diff', 'atr_pct',
    'return_1', 'macd_fast', 'volume',
    'hour', 'chop_index', 'day_of_week',
    
    # New features
    'bb_width', 'natr',
    'stoch_rsi_k', 'stoch_rsi_d',
    'volume_weighted_return',
    'swing_intensity',
    'close_to_high', 'close_to_low',
    'session_vol', 'normalized_time'
]

# Mapping of filter name -> callable(row) returning truthy to veto a trade;
# currently empty, so no trades are filtered.
avoid_funcs = {
}

# Replace missing feature values with a sentinel.
# NOTE(review): fillna() does not touch +/-inf, which the ratio features can
# produce when a denominator is zero.
df[features] = df[features].fillna(-999)

def is_same_session(start_time, end_time):
    """Return True when both timestamps fall inside one trading session.

    A session is the 23-hour window that opens at 18:00 and closes at 17:00
    the following day.  The session is chosen from ``start_time``: at or after
    18:00 it is the one opening that day, otherwise the one that opened the
    previous day.

    Args:
        start_time: datetime-like supporting ``replace`` (datetime, Timestamp).
        end_time: datetime-like comparable with ``start_time``.

    Returns:
        bool: True if both values lie within [session open, session close].
    """
    # Zero the microseconds as well: leaving them in place shifted both the
    # open and the close by the sub-second part of start_time.
    session_start = start_time.replace(hour=18, minute=0, second=0, microsecond=0)
    if start_time.hour < 18:
        session_start -= timedelta(days=1)
    session_end = session_start + timedelta(hours=23)
    return session_start <= start_time <= session_end and session_start <= end_time <= session_end

# Trade identifiers recorded per feature combination, keyed by frozenset(combo).
combo_trades = defaultdict(set)

def combo_overlap(c1, c2):
    """Return the share of trades two feature combinations have in common.

    The overlap is ``|A & B| / min(|A|, |B|)`` where A and B are the trade
    sets stored in ``combo_trades`` for each combination.  If either set is
    empty the function returns 1.0.  Looking up an unseen combination creates
    an empty entry in ``combo_trades`` (defaultdict behavior).
    """
    first = combo_trades[frozenset(c1)]
    second = combo_trades[frozenset(c2)]
    if not (first and second):
        return 1.0
    shared = first.intersection(second)
    smaller = min(len(first), len(second))
    return len(shared) / smaller


# Declare Combo function for serialization

In [200]:
def evaluate_regression_combo(
    X_test, preds, labeled, df,
    avoid_funcs,
    SL_ATR_MULT, TP_ATR_MULT, TRAIL_START_MULT, TRAIL_STOP_MULT, TICK_VALUE,
    is_same_session,
    long_thresh=0.003,
    short_thresh=-0.003
):
    """Back-test predicted returns with ATR-based stop, target and trail.

    For every row of ``X_test`` the matching prediction decides the side
    (long at or above ``long_thresh``, short at or below ``short_thresh``,
    otherwise no trade).  Each trade enters at that row's close and is walked
    forward bar by bar through ``df`` until the stop loss, take profit or
    trailing stop is hit; if none is hit it exits at the last close in ``df``.
    Within a bar the stop loss is checked before the take profit.

    Args:
        X_test: DataFrame whose index labels select rows of ``labeled``.
        preds: predicted returns, positionally aligned with ``X_test``.
        labeled: DataFrame providing 'close', 'datetime' and 'atr_14' per label.
        df: full price frame with 'high', 'low', 'close', 'datetime'; its index
            labels are used as consecutive integer positions (0..n-1).
        avoid_funcs: mapping name -> callable(row); a truthy result vetoes the trade.
        SL_ATR_MULT: stop-loss distance in ATR multiples.
        TP_ATR_MULT: upper cap on the take-profit distance in ATR multiples.
        TRAIL_START_MULT: favorable move (ATR multiples) that arms the trail.
        TRAIL_STOP_MULT: trailing-stop distance from the bar close (ATR multiples).
        TICK_VALUE: currency value of a one-point price move.
        is_same_session: callable(entry_time, exit_time) -> bool; trades for
            which it returns False are discarded.
        long_thresh: minimum predicted return to go long.
        short_thresh: maximum predicted return to go short.

    Returns:
        dict with 'pnl', 'trades', 'win_rate', 'expectancy', 'profit_factor'
        (NaN when there are no losing trades), 'sharpe', 'long_trades',
        'short_trades' (signal counts taken *before* filtering), 'avoid_hits',
        'skipped_trades' (vetoed or too close to the end of ``df``) and
        'results' (per-trade DataFrame; empty with no columns if no trades).
    """
    COMMISSION = 3.98  # flat cost subtracted from every trade's gross PnL

    temp_trades_data = []
    skipped_trades = 0
    avoid_hits = defaultdict(int)
    long_trades = 0
    short_trades = 0
    n_rows = len(df)  # loop-invariant; previously recomputed on every bar

    for i, idx in enumerate(X_test.index):
        row = labeled.loc[idx]
        pred_return = preds[i]

        # Decide trade direction
        if pred_return >= long_thresh:
            side = 'long'
            long_trades += 1
        elif pred_return <= short_thresh:
            side = 'short'
            short_trades += 1
        else:
            continue  # skip neutral signals

        # Trade filters
        skip_trade = False
        for name, f in avoid_funcs.items():
            try:
                if f(row):
                    avoid_hits[name] += 1
                    skip_trade = True
            except Exception:
                # Best effort: a filter that cannot evaluate this row does not
                # veto the trade.  Catching Exception (not a bare except)
                # keeps KeyboardInterrupt / SystemExit working.
                logging.debug("avoid filter %r failed on row %r", name, idx, exc_info=True)
                continue
        # Also require at least six forward bars after the entry row.
        if skip_trade or idx >= n_rows - 6:
            skipped_trades += 1
            continue

        # --- Trade Simulation ---
        entry_price = row['close']
        entry_time = row['datetime']
        atr = row['atr_14']

        # Stop Loss (fixed volatility-based)
        sl_price = entry_price - SL_ATR_MULT * atr if side == 'long' else entry_price + SL_ATR_MULT * atr

        # Take Profit (dynamic, from model prediction, clipped)
        expected_move = abs(pred_return) * entry_price
        min_tp = 0.001 * entry_price  # minimum 0.1% move
        max_tp = TP_ATR_MULT * atr
        tp_move = np.clip(expected_move, min_tp, max_tp)
        tp_price = entry_price + tp_move if side == 'long' else entry_price - tp_move

        # Trailing logic
        trail_trigger = entry_price + TRAIL_START_MULT * atr if side == 'long' else entry_price - TRAIL_START_MULT * atr
        trail_stop = None

        max_price, min_price = entry_price, entry_price
        exit_price, exit_time = None, None

        fwd_idx = idx + 1
        while fwd_idx < n_rows:
            fwd_row = df.loc[fwd_idx]
            max_price = max(max_price, fwd_row['high'])
            min_price = min(min_price, fwd_row['low'])

            if (side == 'long' and fwd_row['low'] <= sl_price) or (side == 'short' and fwd_row['high'] >= sl_price):
                exit_price = sl_price
                exit_time = fwd_row['datetime']
                break

            if (side == 'long' and fwd_row['high'] >= tp_price) or (side == 'short' and fwd_row['low'] <= tp_price):
                exit_price = tp_price
                exit_time = fwd_row['datetime']
                break

            # NOTE(review): the trail is re-derived from the bar close only on
            # bars that reach the trigger, and it may move against the trade
            # (no ratchet).  Left unchanged to keep back-test results stable.
            if side == 'long' and fwd_row['high'] >= trail_trigger:
                trail_stop = fwd_row['close'] - TRAIL_STOP_MULT * atr
            if side == 'short' and fwd_row['low'] <= trail_trigger:
                trail_stop = fwd_row['close'] + TRAIL_STOP_MULT * atr

            # Explicit None check: "not armed" must not be confused with a
            # numeric stop level that happens to be falsy.
            if trail_stop is not None:
                if (side == 'long' and fwd_row['low'] <= trail_stop) or (side == 'short' and fwd_row['high'] >= trail_stop):
                    exit_price = trail_stop
                    exit_time = fwd_row['datetime']
                    break

            fwd_idx += 1

        # No exit condition was hit: close at the final bar of the data.
        if exit_price is None:
            exit_price = df.loc[n_rows - 1, 'close']
            exit_time = df.loc[n_rows - 1, 'datetime']

        # Trades spanning a session boundary are dropped (not counted as skipped).
        if not is_same_session(entry_time, exit_time):
            continue

        GROSS_PNL = (exit_price - entry_price) * TICK_VALUE if side == 'long' else (entry_price - exit_price) * TICK_VALUE
        pnl = GROSS_PNL - COMMISSION

        # Maximum favorable / adverse excursion in price units.
        mfe = max_price - entry_price if side == 'long' else entry_price - min_price
        mae = entry_price - min_price if side == 'long' else max_price - entry_price

        temp_trades_data.append({
            'datetime': exit_time,
            'pnl': pnl,
            'mfe': mfe,
            'mae': mae,
            'gross_pnl': GROSS_PNL
        })

    # === Metrics ===
    results = pd.DataFrame(temp_trades_data)
    pnl_total = results['pnl'].sum() if not results.empty else 0
    trades = len(results)
    win_rate = (results['pnl'] > 0).mean() if not results.empty else 0
    expectancy = results['pnl'].mean() if not results.empty else 0
    profit_factor = results[results['pnl'] > 0]['pnl'].sum() / abs(results[results['pnl'] < 0]['pnl'].sum()) if not results.empty and (results['pnl'] < 0).any() else np.nan
    sharpe = results['pnl'].mean() / (results['pnl'].std() + 1e-9) * np.sqrt(trades) if trades > 1 else 0

    return {
        'pnl': pnl_total,
        'trades': trades,
        'win_rate': win_rate,
        'expectancy': expectancy,
        'profit_factor': profit_factor,
        'sharpe': sharpe,
        'long_trades': long_trades,
        'short_trades': short_trades,
        'avoid_hits': dict(avoid_hits),
        'skipped_trades': skipped_trades,
        'results': results
    }

# Cleanup

In [201]:
def compute_future_return_labels(df: pd.DataFrame, lookahead: int, is_same_session_fn) -> pd.DataFrame:
    """Attach ``lookahead``-bar forward returns and a direction label.

    Works on a copy; the caller's frame is not modified.  Rows are addressed
    by position, so ``df`` is expected to be in chronological order.

    Steps:
      1. Log a warning for columns whose names suggest forward-looking data.
      2. Lag every column whose name contains 'rolling' or 'ewm' by one bar so
         a row only carries values available before that bar.
      3. For each row, compute ``close[t + lookahead] / close[t] - 1`` unless
         the two timestamps are in different sessions.

    Args:
        df: frame with at least 'datetime' and 'close' columns.
        lookahead: non-negative number of bars between entry and label bar.
        is_same_session_fn: callable(start_time, end_time) -> bool.

    Returns:
        Copy of the first ``len(df) - lookahead`` rows with 'future_return'
        and 'trade_dir' ('long' if the return is > 0, otherwise 'short');
        rows whose window crosses a session boundary are dropped.

    Raises:
        ValueError: if ``lookahead`` is negative.
    """
    if lookahead < 0:
        raise ValueError(f"lookahead must be non-negative, got {lookahead}")

    # Create a copy to avoid modifying original
    df = df.copy()

    # 1. Forward-looking feature detection (by column-name keyword)
    forward_looking = [
        col for col in df.columns
        if any(x in col.lower() for x in ['future', 'next', 'fwd', 'forward'])
    ]
    if forward_looking:
        logging.warning(f"Potential forward-looking features detected: {forward_looking}")

    # 2. Temporal alignment.  The previous row-by-row loop rebuilt a boolean
    #    mask over the whole frame for every row (O(n^2)), raised IndexError on
    #    the first row whenever a matching column existed, and would have
    #    copied row 0's value down the entire column.  A one-bar shift is the
    #    intended "previous bar's value" and costs O(n).
    lag_cols = [col for col in df.columns if 'rolling' in col or 'ewm' in col]
    if lag_cols:
        df[lag_cols] = df[lag_cols].shift(1)

    # 3. Label computation with session boundary check
    n_labels = max(len(df) - lookahead, 0)
    times = df['datetime'].tolist()
    closes = df['close'].to_numpy()

    future_returns = []
    trade_dirs = []
    for pos in range(n_labels):
        if not is_same_session_fn(times[pos], times[pos + lookahead]):
            future_returns.append(np.nan)
            trade_dirs.append(None)
            continue

        future_return = (closes[pos + lookahead] / closes[pos]) - 1
        future_returns.append(future_return)
        # A zero return is labelled 'short' (strict > 0 for 'long').
        trade_dirs.append('long' if future_return > 0 else 'short')

    # Align output with original df
    df_labeled = df.iloc[:n_labels].copy()
    df_labeled['future_return'] = future_returns
    df_labeled['trade_dir'] = trade_dirs

    # Drop rows without a label (session-crossing windows)
    df_labeled = df_labeled.dropna(subset=['future_return'])

    return df_labeled

In [202]:
# Forward-return horizons (in bars) for which labeled datasets are cached.
lookahead_values = [5, 15, 20]

def label_and_save(lookahead):
    """Label the global ``df`` for one horizon and write it to parquet.

    The output file is ``labeled_data_<lookahead>.parquet`` in the current
    working directory.
    """
    labeled = compute_future_return_labels(
        df,
        lookahead=lookahead,
        is_same_session_fn=is_same_session,
    )
    labeled.to_parquet(f"labeled_data_{lookahead}.parquet")

# Build only the datasets that are not already cached on disk.
for lookahead in lookahead_values:
    if not os.path.exists(f"labeled_data_{lookahead}.parquet"):
        print(f"Processing lookahead {lookahead}...")
        label_and_save(lookahead)
    else:
        print(f"File labeled_data_{lookahead}.parquet already exists. Skipping...")


File labeled_data_5.parquet already exists. Skipping...
File labeled_data_15.parquet already exists. Skipping...
File labeled_data_20.parquet already exists. Skipping...


# Train

##### Cleaned training

In [203]:
class ModelParams:
    """Container for the hyper-parameters of each model in the ensemble.

    Holds one slot per model (random forest, XGBoost, elastic net and the
    meta learner); every slot starts as None.  Instances are persisted with
    joblib under ``model_params_<lookahead>.pkl``.
    """

    def __init__(self):
        # Targets are bound left to right, so attribute order is unchanged.
        self.rf_params = self.xgb_params = self.enet_params = self.meta_params = None

    def save(self, lookahead):
        """Serialize this object to ``model_params_<lookahead>.pkl``."""
        destination = f"model_params_{lookahead}.pkl"
        joblib.dump(self, destination)

    @classmethod
    def load(cls, lookahead):
        """Return the object previously saved for ``lookahead``."""
        source = f"model_params_{lookahead}.pkl"
        return joblib.load(source)

In [204]:
class FeatureManager:
    """Manage feature selection and combinations"""

    def __init__(self):
        self.selected_features = None    # result of the last select_features()
        self.combined_features = None    # result of the last combine_feature_sets()
        self.feature_importances = None  # never assigned by this class's own methods

    def select_features(self, importance_df, top_n=25):
        """Keep the first ``top_n`` entries of ``importance_df['feature']``.

        ``importance_df`` is taken in its existing row order; sort it by
        importance before calling if a ranking is intended.

        Returns:
            list of the selected feature names (also stored on the instance).
        """
        self.selected_features = importance_df.head(top_n)['feature'].tolist()
        return self.selected_features

    def combine_feature_sets(self, rf_features, l1_features=None):
        """Merge two feature lists, dropping duplicates.

        The merged list keeps first-seen order (``rf_features`` first).  The
        previous ``list(set(...))`` produced an order that changed between
        interpreter runs, which made the model's column order, and therefore
        fitted models, non-reproducible.

        Args:
            rf_features: feature names from the random-forest ranking.
            l1_features: optional second list of feature names.

        Returns:
            The combined list (also stored on the instance).  When
            ``l1_features`` is None, ``rf_features`` itself is returned.
        """
        if l1_features is None:
            self.combined_features = rf_features
        else:
            merged = list(rf_features) + list(l1_features)
            # dict preserves insertion order, giving a stable de-duplication.
            self.combined_features = list(dict.fromkeys(merged))
        return self.combined_features

In [205]:
def enhance_model_training(X_train, y_train, params):
    """Build auxiliary training inputs: sample weights, CV folds, feature subsets.

    Args:
        X_train: DataFrame of training features (rows in time order).
        y_train: array-like of targets aligned with ``X_train``.
        params: accepted for interface compatibility; not used.

    Returns:
        dict with
          'sample_weights': ndarray, 2.0 where the target's sign differs from
              the previous sample's sign and 1.0 elsewhere;
          'cv_splits': one-shot generator of (train_idx, val_idx) index lists,
              latest validation block first;
          'feature_sets': list of 3 random column subsets drawn with the
              global NumPy RNG (seed it for reproducibility).
    """

    # 1. Add sample weights based on prediction difficulty
    def compute_sample_weights(y):
        weights = np.ones(len(y))
        # Give higher weights to samples where the target changes sign.
        sign_flips = np.abs(np.diff(np.sign(y))) > 0
        weights[1:][sign_flips] = 2.0  # weights[1:] is a view, so this writes through
        return weights

    sample_weights = compute_sample_weights(y_train)

    # 2. Add time-based validation splits
    def time_based_cv(X, n_splits=5):
        fold_size = len(X) // n_splits
        for i in range(n_splits):
            val_start = (n_splits - i - 1) * fold_size
            if val_start == 0:
                # The earliest block has no history before it.  Yielding it
                # would hand estimators an empty training set, so skip it.
                continue
            val_end = val_start + fold_size
            yield list(range(0, val_start)), list(range(val_start, val_end))

    # 3. Add ensemble diversity through feature sampling
    def create_diverse_features(X, n_subsets=3):
        n_features = X.shape[1]
        # Subset size is drawn from [n_features // 2, n_features).  Clamp the
        # bounds so a one-column frame yields a one-column subset rather than
        # an empty one; for two or more columns the draw is unchanged.
        low = max(1, n_features // 2)
        high = max(low + 1, n_features)
        feature_sets = []
        for _ in range(n_subsets):
            n_selected = np.random.randint(low, high)
            feature_sets.append(np.random.choice(X.columns, n_selected, replace=False))
        return feature_sets

    return {
        'sample_weights': sample_weights,
        'cv_splits': time_based_cv(X_train),
        'feature_sets': create_diverse_features(X_train)
    }

In [206]:
class PerformanceMonitor:
    """Track logged evaluation results and derive a prediction threshold.

    Each call to ``log_performance`` stores one result dict (as produced by
    the trade evaluation: 'win_rate', 'profit_factor', 'sharpe', 'trades',
    'results', ...) together with the threshold that was in use.
    """

    # Scalar keys read from each logged result when building trend frames.
    # Only these are copied, so the per-trade DataFrame and other nested
    # values in a result dict never end up inside a DataFrame cell.
    _SCALAR_METRICS = ('win_rate', 'profit_factor', 'sharpe', 'trades')

    def __init__(self, window_size=20, threshold_bounds=(0.0001, 0.01)):
        """
        Args:
            window_size: number of most recent logged results considered.
            threshold_bounds: (min, max) clamp applied to adjusted thresholds.
        """
        self.performance_history = []
        self.threshold_history = []
        self.window_size = window_size
        self.min_threshold, self.max_threshold = threshold_bounds
        self.metrics_history = {
            'win_rate': [],
            'profit_factor': [],
            'sharpe': [],
            'trades_per_day': [],
            'avg_trade_duration': []
        }

    def _recent_performance(self):
        """Return the scalar metrics of the last ``window_size`` results.

        Missing keys become NaN, so a partially filled result dict does not
        raise.
        """
        recent = self.performance_history[-self.window_size:]
        rows = [
            {key: entry.get(key, np.nan) for key in self._SCALAR_METRICS}
            for entry in recent
        ]
        return pd.DataFrame(rows, columns=list(self._SCALAR_METRICS)).astype(float)

    def update_thresholds(self, current_results):
        """Dynamically adjust prediction thresholds based on multiple metrics.

        Args:
            current_results: result dict of the latest evaluation; its
                'results' frame (if present and non-empty) sets the
                volatility regime.

        Returns:
            0.0005 while fewer than five results have been logged; otherwise
            the last logged threshold scaled by the adjustments below and
            clamped to the configured bounds.
        """
        if len(self.performance_history) < 5:
            return 0.0005  # default threshold

        # Get recent performance window
        recent_metrics = self._recent_performance()

        # 1. Analyze performance trends (mean first difference)
        win_rate_trend = recent_metrics['win_rate'].diff().mean()
        pf_trend = recent_metrics['profit_factor'].diff().mean()
        sharpe_trend = recent_metrics['sharpe'].diff().mean()

        # 2. Calculate volatility regime.  An evaluation with no trades yields
        #    an empty frame without a 'pnl' column; treat that as 'normal'
        #    instead of raising KeyError.
        vol_regime = 'normal'
        if 'results' in current_results:
            trades_df = current_results['results']
            if trades_df is not None and 'pnl' in trades_df:
                recent_pnls = trades_df['pnl'].rolling(self.window_size).std()
                # NOTE(review): with fewer trades than window_size the rolling
                # std is all-NaN, the comparison is False and the regime is
                # 'low'.  Behavior kept as-is.
                vol_regime = 'high' if recent_pnls.mean() > recent_pnls.quantile(0.75) else 'low'

        # 3. Adjust threshold based on multiple factors
        current_threshold = self.threshold_history[-1]
        adjustment = 1.0

        # Performance-based adjustment
        if win_rate_trend < 0 and pf_trend < 0:
            adjustment *= 1.1  # Increase threshold
        elif win_rate_trend > 0 and pf_trend > 0:
            adjustment *= 0.9  # Decrease threshold

        # Volatility-based adjustment
        if vol_regime == 'high':
            adjustment *= 1.2
        elif vol_regime == 'low':
            adjustment *= 0.8

        # Sharpe-based fine-tuning
        if sharpe_trend > 0:
            adjustment *= 0.95

        # Apply adjustment with bounds
        new_threshold = np.clip(
            current_threshold * adjustment,
            self.min_threshold,
            self.max_threshold
        )

        return new_threshold

    def log_performance(self, results, threshold):
        """Record one evaluation result and the threshold used for it.

        When ``results['results']`` is a non-empty trade frame, the derived
        per-run metrics are appended to ``metrics_history`` as well.
        """
        self.performance_history.append(results)
        self.threshold_history.append(threshold)

        # Update detailed metrics
        if 'results' in results and not results['results'].empty:
            trades_df = results['results']

            # Mean number of trades per calendar date of the trade timestamps
            daily_trades = trades_df.groupby(
                trades_df['datetime'].dt.date
            ).size().mean()

            # Mean gap between consecutive trade timestamps, in minutes.
            # NOTE(review): stored as 'avg_trade_duration', but it measures
            # the spacing between trades, not how long each trade was open.
            avg_duration = (
                trades_df['datetime'].diff()
                .mean()
                .total_seconds() / 60  # in minutes
            )

            # Update metrics history
            self.metrics_history['win_rate'].append(results['win_rate'])
            self.metrics_history['profit_factor'].append(results['profit_factor'])
            self.metrics_history['sharpe'].append(results['sharpe'])
            self.metrics_history['trades_per_day'].append(daily_trades)
            self.metrics_history['avg_trade_duration'].append(avg_duration)

    def get_monitoring_stats(self):
        """Return metrics history, latest threshold, stability and recommendations."""
        return {
            'metrics_history': self.metrics_history,
            'current_threshold': self.threshold_history[-1] if self.threshold_history else None,
            'performance_stability': self._calculate_stability(),
            'recommendations': self._generate_recommendations()
        }

    def _calculate_stability(self):
        """Return per-metric std and |lag-1 autocorrelation|.

        Returns None until ``window_size`` results have been logged.
        """
        if len(self.performance_history) < self.window_size:
            return None

        recent_metrics = pd.DataFrame(self.metrics_history)
        return {
            'metric_volatility': recent_metrics.std().to_dict(),
            'trend_strength': recent_metrics.apply(
                lambda x: np.abs(x.autocorr()), axis=0
            ).to_dict()
        }

    def _generate_recommendations(self):
        """Generate textual recommendations from the recent results window."""
        if not self.performance_history:
            return []

        recommendations = []
        recent_perf = self._recent_performance()

        # Check win rate stability
        if recent_perf['win_rate'].std() > 0.1:
            recommendations.append(
                "High win rate volatility detected. Consider adjusting position sizing."
            )

        # Check profit factor
        if recent_perf['profit_factor'].mean() < 1.2:
            recommendations.append(
                "Low profit factor. Consider increasing threshold or reviewing stop loss levels."
            )

        # Check trade frequency.  The old check divided the number of logged
        # runs by window_size, which is never above 1, so "low" always fired
        # and "high" was unreachable.  Use the mean trade count per logged run.
        trade_counts = recent_perf['trades'].dropna()
        if not trade_counts.empty:
            avg_trades = trade_counts.mean()
            if avg_trades < 5:
                recommendations.append(
                    "Low trade frequency. Consider relaxing entry conditions."
                )
            elif avg_trades > 20:
                recommendations.append(
                    "High trade frequency. Consider stricter filtering."
                )

        return recommendations

In [207]:
def enhance_features(df):
    """Append volume-profile, momentum, range and time columns to ``df``.

    The frame is modified in place and also returned.  Requires 'datetime',
    'open', 'high', 'low', 'close' and 'volume' columns.
    """

    def _share_of_group_total(block):
        return block / block.sum()

    # 1. Volume profile: each bar's share of its calendar day's total volume
    per_day = df.groupby(pd.Grouper(key='datetime', freq='1D'))
    df['volume_profile'] = per_day['volume'].transform(_share_of_group_total)

    # 2. Price and volume momentum over several horizons
    close = df['close']
    volume = df['volume']
    for period in (3, 5, 8, 13):
        df[f'momentum_{period}'] = close.pct_change(period)
        df[f'volume_momentum_{period}'] = volume.pct_change(period)

    # 3. Bar range and body, relative to close / open
    bar_span = df['high'] - df['low']
    df['high_low_range'] = bar_span / close
    bar_body = close - df['open']
    df['close_open_range'] = bar_body / df['open']

    # 4. Time-based features
    stamp = df['datetime'].dt
    df['time_of_day'] = stamp.hour + stamp.minute / 60
    df['day_of_week'] = stamp.dayofweek

    return df

In [None]:
class StudyManager:
    """Manage Optuna studies with SQLite storage"""

    def __init__(self, base_path="studies"):
        """Remember ``base_path`` and create the directory if it is missing."""
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)

    def get_study(self, study_name, direction='maximize'):
        """Get or create a study with SQLite storage.

        The database file is ``<base_path>/<study_name>-05-14.db`` (the date
        suffix is hard-coded).

        ``create_study(load_if_exists=True)`` covers both cases: it returns
        the stored study when one with this name exists and creates it
        otherwise.  This replaces a ``load_study`` call wrapped in a bare
        ``except:``, which also swallowed KeyboardInterrupt and hid genuine
        storage errors.

        Args:
            study_name: name of the study inside the storage.
            direction: optimization direction used when the study is created.

        Returns:
            The optuna Study object.
        """
        db_path = os.path.join(self.base_path, f"{study_name}-05-14.db")
        storage = optuna.storages.RDBStorage(
            url=f"sqlite:///{db_path}",
            # Wait up to 300 s on a locked SQLite file instead of failing fast.
            engine_kwargs={"connect_args": {"timeout": 300}}
        )

        return optuna.create_study(
            study_name=study_name,
            storage=storage,
            direction=direction,
            load_if_exists=True
        )

    def save_best_trial(self, study_name, params, score):
        """Save best trial parameters to ``<base_path>/<study_name>_best.json``.

        Args:
            study_name: used to build the output file name.
            params: JSON-serializable parameter mapping.
            score: JSON-serializable score of the trial.
        """
        output_path = os.path.join(self.base_path, f"{study_name}_best.json")
        # Explicit encoding so the file is the same regardless of OS locale.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump({
                'params': params,
                'score': score,
                'timestamp': str(pd.Timestamp.now())
            }, f, indent=2)

In [None]:
class OptimizationManager:
    """Jointly tune the base models and the meta-learner of the stack.

    One Optuna trial samples hyper-parameters for the random forest,
    XGBoost, ElasticNet and the Ridge meta-learner at once and scores the
    resulting ``StackingRegressor`` with time-series cross-validation.
    """

    def __init__(self, n_trials=100, study_name="ensemble_optimization"):
        """Store the trial budget and study name; no training data yet."""
        self.n_trials = n_trials
        self.study_name = study_name
        self.best_params = None
        self.best_score = float('-inf')
        self.X_train = None
        self.y_train = None
        self.study_manager = StudyManager()

    def _evaluate_ensemble(self, rf_params, xgb_params, enet_params, meta_params, weights):
        """Cross-validate a stacking ensemble built from the given parameters.

        Args:
            rf_params: Keyword arguments for ``RandomForestRegressor``.
            xgb_params: Keyword arguments for ``XGBRegressor``.
            enet_params: Keyword arguments for ``ElasticNet``.
            meta_params: Keyword arguments for the ``Ridge`` meta-learner.
            weights: Normalized model weights. Accepted for interface
                compatibility; the stacking regressor does not use them.

        Returns:
            Array of negative-MSE scores, one per fold, or ``[-inf]`` when
            building or fitting the ensemble raised.
        """
        try:
            stack = StackingRegressor(
                estimators=[
                    ('rf', RandomForestRegressor(**rf_params, random_state=42)),
                    ('xgb', XGBRegressor(**xgb_params, random_state=42)),
                    ('enet', ElasticNet(**enet_params, random_state=42))
                ],
                final_estimator=Ridge(**meta_params),
                n_jobs=1  # keep memory usage down: no parallel base fits
            )

            # Three chronological folds, evaluated sequentially.
            return cross_val_score(
                stack,
                self.X_train,
                self.y_train,
                cv=TimeSeriesSplit(n_splits=3),
                scoring='neg_mean_squared_error',
                n_jobs=1
            )

        except Exception as e:
            # logging.exception keeps the traceback, which the plain
            # message alone does not provide when diagnosing failed trials.
            logging.exception(f"Evaluation failed: {e}")
            return np.array([-np.inf])

    def joint_objective(self, trial):
        """Optuna objective: sample every parameter group and score the stack.

        Returns:
            Mean negative MSE over the folds, or ``-inf`` if the trial failed.
        """
        try:
            # Base model parameters
            rf_params = {
                'n_estimators': trial.suggest_int('rf_n_estimators', 300, 800),
                'max_depth': trial.suggest_int('rf_max_depth', 10, 30),
                'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 5, 50)
            }

            xgb_params = {
                'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
                'learning_rate': trial.suggest_float('xgb_lr', 0.01, 0.3),
                'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 500)
            }

            enet_params = {
                'alpha': trial.suggest_float('enet_alpha', 0.0001, 1.0, log=True),
                'l1_ratio': trial.suggest_float('enet_l1_ratio', 0.0, 1.0)
            }

            # Meta-learner parameters
            meta_params = {
                'alpha': trial.suggest_float('meta_alpha', 0.0001, 10.0, log=True),
                'solver': trial.suggest_categorical('meta_solver', ['auto', 'svd', 'cholesky'])
            }

            # Model weights: sampled and normalized to sum to 1. They are
            # recorded in the study but not consumed by _evaluate_ensemble.
            rf_weight = trial.suggest_float('rf_weight', 0.1, 1.0)
            xgb_weight = trial.suggest_float('xgb_weight', 0.1, 1.0)
            enet_weight = trial.suggest_float('enet_weight', 0.1, 1.0)
            total = rf_weight + xgb_weight + enet_weight
            weights = [rf_weight/total, xgb_weight/total, enet_weight/total]

            cv_scores = self._evaluate_ensemble(
                rf_params, xgb_params, enet_params, meta_params, weights
            )

            return np.mean(cv_scores)

        except Exception as e:
            logging.exception(f"Trial failed: {e}")
            return float('-inf')

    def optimize(self, X_train, y_train):
        """Run (or resume) the persisted study and return the best parameters.

        Args:
            X_train: Training features used for cross-validation.
            y_train: Training targets aligned with ``X_train``.

        Returns:
            The parameter dict of the best trial.

        Raises:
            RuntimeError: If no trial produced a finite score, i.e. every
                evaluation failed and there is nothing meaningful to return.
        """
        self.X_train = X_train
        self.y_train = y_train

        # Get or create the study, then continue optimizing it.
        study = self.study_manager.get_study(self.study_name)
        study.optimize(self.joint_objective, n_trials=self.n_trials)

        # Failed trials are recorded with -inf; if that is still the best
        # value, the "best" parameters come from a trial that never trained.
        if not np.isfinite(study.best_value):
            raise RuntimeError(
                f"Study '{self.study_name}' has no trial with a finite score; "
                "all ensemble evaluations failed"
            )

        self.best_params = study.best_params
        self.best_score = study.best_value

        # Save best trial
        self.study_manager.save_best_trial(
            self.study_name,
            self.best_params,
            self.best_score
        )

        return self.best_params

##### Rest

In [210]:
def load_and_split_data(LOOKAHEAD, features, cutoff_date="2025-05-01"):
    """Load the labeled data for one lookahead and split it chronologically.

    Args:
        LOOKAHEAD: Lookahead value; selects ``labeled_data_<LOOKAHEAD>.parquet``
            in the current working directory.
        features: Iterable of feature column names.
        cutoff_date: Rows with ``datetime`` strictly before this date form
            the training set, rows on or after it the test set.

    Returns:
        Tuple ``(X_train_full, X_test_full, y_train, y_test, labeled, train,
        test)`` where ``labeled`` is the cleaned frame and ``train``/``test``
        are its two chronological parts.
    """
    # Materialize once so tuples or other iterables behave like a list both
    # for the concatenation below and for column selection.
    features = list(features)

    labeled = pd.read_parquet(f"labeled_data_{LOOKAHEAD}.parquet")
    # Infinite values are treated like missing ones and dropped with them.
    labeled = labeled.replace([np.inf, -np.inf], np.nan)
    labeled = labeled.dropna(subset=features + ['future_return'])

    cutoff = pd.Timestamp(cutoff_date)
    train = labeled[labeled['datetime'] < cutoff]
    test = labeled[labeled['datetime'] >= cutoff]

    X_train_full, y_train = train[features], train['future_return']
    X_test_full, y_test = test[features], test['future_return']

    return X_train_full, X_test_full, y_train, y_test, labeled, train, test

In [None]:
def optimize_rf_params(X_train_full, y_train, LOOKAHEAD):
    """Tune a random forest with Optuna on purged time-series folds.

    Runs 50 trials maximizing the mean negative MSE over the folds produced
    by ``create_time_series_cv`` and returns the best parameter dict.
    ``LOOKAHEAD`` is accepted but not used.
    """
    def objective(trial):
        try:
            rf_kwargs = dict(
                n_estimators=trial.suggest_int('n_estimators', 300, 800),
                max_depth=trial.suggest_int('max_depth', 10, 30),
                bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
                min_samples_leaf=trial.suggest_int('min_samples_leaf', 5, 50),
                min_samples_split=trial.suggest_int('min_samples_split', 10, 50),
            )
            forest = RandomForestRegressor(random_state=42, n_jobs=1, **rf_kwargs)

            # One refit per fold; each fold is scored on its held-out window.
            fold_scores = []
            for fit_idx, holdout_idx in create_time_series_cv(X_train_full):
                forest.fit(X_train_full.iloc[fit_idx], y_train.iloc[fit_idx])
                holdout_pred = forest.predict(X_train_full.iloc[holdout_idx])
                fold_scores.append(
                    -mean_squared_error(y_train.iloc[holdout_idx], holdout_pred)
                )

            return np.mean(fold_scores)
        except Exception as e:
            # A failed trial is reported as the worst possible score.
            logging.warning(f"Trial failed: {e}")
            return float('-inf')

    rf_study = optuna.create_study(direction='maximize')
    rf_study.optimize(objective, n_trials=50)
    return rf_study.best_params

In [212]:
def optimize_xgb_params(X_train_full, y_train, LOOKAHEAD):
    """Tune an XGBoost regressor with Optuna (50 trials, 5 time-series folds).

    The objective is the mean negative MSE from ``cross_val_score``; the
    best parameter dict is returned. ``LOOKAHEAD`` is accepted but not used.
    """
    def objective(trial):
        booster_kwargs = dict(
            max_depth=trial.suggest_int('max_depth', 3, 10),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
            n_estimators=trial.suggest_int('n_estimators', 100, 500),
            min_child_weight=trial.suggest_int('min_child_weight', 1, 7),
            subsample=trial.suggest_float('subsample', 0.6, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
            gamma=trial.suggest_float('gamma', 0, 5),
        )
        candidate = XGBRegressor(random_state=42, **booster_kwargs)
        fold_scores = cross_val_score(
            candidate,
            X_train_full,
            y_train,
            cv=TimeSeriesSplit(n_splits=5),
            scoring='neg_mean_squared_error',
        )
        return fold_scores.mean()

    xgb_study = optuna.create_study(direction='maximize')
    xgb_study.optimize(objective, n_trials=50)
    return xgb_study.best_params

In [213]:
def optimize_elastic_params(X_train_full, y_train, LOOKAHEAD):
    """Tune an ElasticNet with Optuna (50 trials, 5 time-series folds).

    ``alpha`` is searched on a log scale, ``l1_ratio`` linearly; ``max_iter``
    is fixed at 1000. Returns the best sampled parameters. ``LOOKAHEAD`` is
    accepted but not used.
    """
    def objective(trial):
        sampled_alpha = trial.suggest_float('alpha', 0.0001, 1.0, log=True)
        sampled_l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)
        candidate = ElasticNet(
            alpha=sampled_alpha,
            l1_ratio=sampled_l1_ratio,
            max_iter=1000,
            random_state=42,
        )
        fold_scores = cross_val_score(
            candidate,
            X_train_full,
            y_train,
            cv=TimeSeriesSplit(n_splits=5),
            scoring='neg_mean_squared_error',
        )
        return fold_scores.mean()

    enet_study = optuna.create_study(direction='maximize')
    enet_study.optimize(objective, n_trials=50)
    return enet_study.best_params

In [None]:
def train_base_models(X_train, X_test, y_train, combined_features, training_enhancements, model_params):
    """Scale the features and fit the three base regressors.

    Args:
        X_train: Training feature frame.
        X_test: Test feature frame; only transformed, never fitted on.
        y_train: Training targets.
        combined_features: Column names given to the scaled data frames.
        training_enhancements: Mapping that provides ``'sample_weights'``
            for fitting all three models.
        model_params: Object exposing ``rf_params``, ``xgb_params`` and
            ``enet_params`` keyword dictionaries.

    Returns:
        Tuple ``(models, scaler, X_train_scaled, X_test_scaled, X_train_df,
        X_test_df, None)``. ``models`` maps ``'rf'``, ``'xgb'`` and
        ``'elasticnet'`` to the fitted estimators; the trailing ``None``
        keeps the 7-tuple shape that callers unpack.
    """
    # Fit the scaler on training data only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_train_df = pd.DataFrame(X_train_scaled, columns=combined_features, index=X_train.index)
    X_test_df = pd.DataFrame(X_test_scaled, columns=combined_features, index=X_test.index)

    sample_weights = training_enhancements['sample_weights']

    # Random forest: single job to limit memory usage.
    rf = RandomForestRegressor(
        **model_params.rf_params,
        random_state=42,
        n_jobs=1
    )
    rf.fit(X_train_df, y_train, sample_weight=sample_weights)

    # XGBoost: single job as well.
    xgb = XGBRegressor(
        **model_params.xgb_params,
        eval_metric='rmse',
        random_state=42,
        n_jobs=1
    )
    xgb.fit(X_train_scaled, y_train, sample_weight=sample_weights)

    # ElasticNet
    enet = ElasticNet(**model_params.enet_params, max_iter=1000)
    enet.fit(X_train_scaled, y_train, sample_weight=sample_weights)

    models = {
        'rf': rf,
        'xgb': xgb,
        'elasticnet': enet
    }

    return models, scaler, X_train_scaled, X_test_scaled, X_train_df, X_test_df, None

In [None]:
def create_stacking_ensemble(base_models, meta_model, X_train, y_train):
    """Build a ``StackingRegressor`` from the base models and fit it.

    ``base_models`` must provide the keys ``'rf'``, ``'xgb'`` and
    ``'elasticnet'``; ``meta_model`` becomes the final estimator. The fitted
    ensemble is returned.
    """
    # (name inside the stack, key in base_models)
    stack_layout = (('rf', 'rf'), ('xgb', 'xgb'), ('enet', 'elasticnet'))
    named_estimators = [(alias, base_models[key]) for alias, key in stack_layout]

    ensemble = StackingRegressor(
        estimators=named_estimators,
        final_estimator=meta_model,
        n_jobs=1,
    )
    ensemble.fit(X_train, y_train)
    return ensemble

In [216]:
def evaluate_and_calibrate(stack, X_test_scaled, y_test):
    """Predict with the stack and return isotonic-calibrated predictions.

    NOTE(review): the calibrator is fitted on ``y_test`` and then applied to
    the very predictions it was fitted on, so the calibrated output has seen
    the targets it is later evaluated against - confirm this is intended.
    """
    raw_preds = stack.predict(X_test_scaled)
    calibrator = IsotonicRegression(out_of_bounds='clip')
    calibrator.fit(raw_preds, y_test)
    return calibrator.predict(raw_preds)

In [217]:
def run_backtesting(preds, X_test_df, labeled, df, combinations, thresholds, LOOKAHEAD):
    """Backtest every parameter combination at every threshold.

    Each threshold is applied symmetrically (``+t`` for longs, ``-t`` for
    shorts). The result of each run is tagged with the parameter set under
    ``'params'``; results are returned in combination-major order.
    ``LOOKAHEAD`` is accepted but not used.
    """
    collected = []
    for combo in combinations:
        for cutoff in thresholds:
            run_kwargs = dict(
                X_test=X_test_df,
                preds=preds,
                labeled=labeled,
                df=df,
                avoid_funcs=avoid_funcs,
                SL_ATR_MULT=combo['SL_ATR_MULT'],
                TP_ATR_MULT=combo['TP_ATR_MULT'],
                TRAIL_START_MULT=combo['TRAIL_START_MULT'],
                TRAIL_STOP_MULT=combo['TRAIL_STOP_MULT'],
                TICK_VALUE=combo['TICK_VALUE'],
                is_same_session=is_same_session,
                long_thresh=cutoff,
                short_thresh=-cutoff,
            )
            outcome = evaluate_regression_combo(**run_kwargs)
            outcome['params'] = combo
            collected.append(outcome)

    return collected

In [218]:
def save_model_artifacts(metadata, stack, scaler, study, meta_params, LOOKAHEAD):
    """Persist the run metadata (JSON) and the model artifacts (joblib).

    All files are written to the current working directory and carry the
    lookahead value in their name. The study is stored as its trials frame.
    """
    metadata_path = f"model_metadata_{LOOKAHEAD}.json"
    with open(metadata_path, "w") as handle:
        json.dump(metadata, handle, indent=2)

    pickled_objects = (
        (meta_params, f"meta_model_params_LOOKAHEAD_{LOOKAHEAD}_05-14.pkl"),
        (stack, f"stack_model_LOOKAHEAD_{LOOKAHEAD}_05-14.pkl"),
        (scaler, f"scaler_LOOKAHEAD_{LOOKAHEAD}_05-14.pkl"),
    )
    for artifact, target in pickled_objects:
        joblib.dump(artifact, target)

    # Exported last, after the model files are on disk.
    trials_table = study.trials_dataframe()
    joblib.dump(trials_table, f"xgb_trials_df_{LOOKAHEAD}_05-14.pkl")

In [219]:
def get_shap_importance(model, X_train):
    """Rank features by mean absolute SHAP value, most important first.

    Uses ``shap.TreeExplainer`` on ``model``; returns a frame with the
    columns ``'feature'`` and ``'importance'``.
    """
    shap_matrix = shap.TreeExplainer(model).shap_values(X_train)
    mean_abs_shap = np.abs(shap_matrix).mean(0)
    ranking = pd.DataFrame({
        'feature': X_train.columns,
        'importance': mean_abs_shap,
    })
    return ranking.sort_values('importance', ascending=False)

In [220]:
def create_time_series_cv(X, embargo_size=5, n_splits=3, purge_size=3):
    """Yield expanding-window train/validation index pairs, newest fold first.

    Every fold trains on rows ``[0, train_end)`` and validates on a window
    of ``len(X) // 12`` rows that starts ``purge_size`` rows after the
    training data. Consecutive folds are shifted back in time by the window
    length plus purge and embargo.

    Args:
        X: Any sized container; only ``len(X)`` is used.
        embargo_size: Rows left unused after each validation window.
        n_splits: Maximum number of folds to generate.
        purge_size: Rows skipped between the training data and validation.

    Yields:
        ``(train_indices, validation_indices)`` as integer arrays. Generation
        stops early once the training part would drop below ``len(X) // 4``
        rows; nothing is yielded when the data is too short for a non-empty
        validation window.
    """
    n_samples = len(X)
    min_train_size = n_samples // 4
    val_size = n_samples // 12

    # With fewer than 12 rows the validation window would be empty, which
    # downstream scoring cannot handle, so produce no folds at all.
    if val_size == 0:
        return

    fold_span = val_size + purge_size + embargo_size
    for fold in range(1, n_splits + 1):
        train_end = n_samples - fold * fold_span
        if train_end < min_train_size:
            break

        val_start = train_end + purge_size
        yield (
            np.arange(0, train_end),  # training indices
            np.arange(val_start, val_start + val_size)  # validation indices
        )

In [221]:
def analyze_studies():
    """Print a summary and plot the history of every stored study.

    Scans the ``StudyManager`` directory for database files, reports trial
    count, best value and best parameters, and shows the optimization
    history. Studies without a completed trial are reported and skipped.
    """
    # Matplotlib flavour of the plot, so the title and plt.show() below act
    # on the figure that was actually drawn.
    from optuna.visualization.matplotlib import plot_optimization_history

    # StudyManager.get_study stores a study as "<study_name>-05-14.db";
    # strip the whole suffix to recover the name it expects.
    db_suffix = "-05-14.db"
    study_manager = StudyManager()

    for file in sorted(os.listdir(study_manager.base_path)):
        if not file.endswith(db_suffix):
            continue

        study_name = file[:-len(db_suffix)]
        study = study_manager.get_study(study_name)

        print(f"\nAnalyzing study: {study_name}")
        print(f"Number of trials: {len(study.trials)}")

        try:
            best_trial = study.best_trial
        except ValueError:
            # Raised by Optuna when the study has no completed trial.
            print("No completed trials yet")
            continue

        print(f"Best value: {best_trial.value}")
        print("Best parameters:")
        for key, value in best_trial.params.items():
            print(f"  {key}: {value}")

        # Plot optimization history
        ax = plot_optimization_history(study)
        ax.set_title(f"Optimization History - {study_name}")
        plt.show()

##### Real Training

In [222]:
def _params_for_prefix(best_params, prefix, renames=None):
    """Return the entries of ``best_params`` under ``prefix`` as estimator kwargs.

    The prefix is stripped from the key, ``<prefix>weight`` entries are
    dropped (they are ensemble weights, not constructor arguments) and
    ``renames`` maps shortened trial names back to real argument names.
    """
    renames = renames or {}
    selected = {}
    for key, value in best_params.items():
        if not key.startswith(prefix):
            continue
        name = key[len(prefix):]
        if name == 'weight':
            continue
        selected[renames.get(name, name)] = value
    return selected


def run_lookahead(LOOKAHEAD):
    """Run optimization, training, backtesting and export for one lookahead.

    Args:
        LOOKAHEAD: Lookahead value selecting the labeled data set and the
            names of the study and of the saved artifacts.

    Returns:
        Dict with the keys ``'lookahead'``, ``'results'`` (all backtest
        results), ``'models'`` (fitted base models), ``'stack'``,
        ``'scaler'`` and ``'performance_monitor'``.
    """
    print(f"\n🔍 Running LOOKAHEAD={LOOKAHEAD}")
    logging.info(f"Loading labeled data for LOOKAHEAD={LOOKAHEAD}")

    # Initialize parameters
    model_params = ModelParams()

    # Load and split data
    X_train_full, X_test_full, y_train, y_test, labeled, train, test = load_and_split_data(LOOKAHEAD, features)

    # Initialize feature manager
    feature_manager = FeatureManager()

    # One study per lookahead so runs do not share trials.
    study_name = f"ensemble_optimization_lookahead_{LOOKAHEAD}"
    opt_manager = OptimizationManager(n_trials=100, study_name=study_name)

    # Joint optimization of all parameters
    best_params = opt_manager.optimize(X_train_full, y_train)

    # Split the flat trial parameters back into per-model kwargs. The trial
    # stores XGBoost's learning rate as 'xgb_lr', hence the rename.
    model_params.rf_params = _params_for_prefix(best_params, 'rf_')
    model_params.xgb_params = _params_for_prefix(
        best_params, 'xgb_', renames={'lr': 'learning_rate'}
    )
    model_params.enet_params = _params_for_prefix(best_params, 'enet_')
    model_params.meta_params = _params_for_prefix(best_params, 'meta_')

    # Get enhanced training features
    training_enhancements = enhance_model_training(X_train_full, y_train, model_params)

    # Initialize enhanced performance monitor
    performance_monitor = PerformanceMonitor(
        window_size=20,
        threshold_bounds=(0.0001, 0.01)
    )

    # Train base models with enhancements
    base_models, scaler, X_train_scaled, X_test_scaled, X_train_df, X_test_df, rf_ensemble_preds = train_base_models(
        X_train_full,
        X_test_full,
        y_train,
        feature_manager.combine_feature_sets(features),
        training_enhancements,
        model_params
    )

    # Create and train stacking ensemble
    meta_model = Ridge(**model_params.meta_params)
    stack = create_stacking_ensemble(base_models, meta_model, X_train_scaled, y_train)

    # Evaluate and calibrate predictions
    calibrated_preds = evaluate_and_calibrate(stack, X_test_scaled, y_test)

    # Run backtesting with dynamic thresholds
    threshold = performance_monitor.update_thresholds(None)
    all_results = []

    # Several passes so the threshold can settle.
    for i in range(3):
        results = run_backtesting(
            calibrated_preds, X_test_df, labeled, df,
            combinations, [threshold], LOOKAHEAD
        )

        # Log performance and adjust threshold
        performance_monitor.log_performance(results[0], threshold)
        threshold = performance_monitor.update_thresholds(results[0])
        all_results.extend(results)

        # Get monitoring stats
        monitoring_stats = performance_monitor.get_monitoring_stats()
        logging.info(f"Monitoring stats for iteration {i+1}:")
        logging.info(json.dumps(monitoring_stats, indent=2))

        # Surface any recommendations from the monitor
        for rec in monitoring_stats['recommendations']:
            logging.warning(f"Recommendation: {rec}")

    # Log performance
    performance_monitor.log_performance(all_results[0], threshold)

    # Save artifacts
    metadata = {
        "lookahead": LOOKAHEAD,
        "train_range": [str(train["datetime"].min()), str(train["datetime"].max())],
        "test_range": [str(test["datetime"].min()), str(test["datetime"].max())],
        "features_used": feature_manager.combined_features,
        "model_params": model_params.__dict__
    }

    # save_model_artifacts needs the study itself (for its trials frame)
    # and the meta-learner parameters as separate arguments.
    study = opt_manager.study_manager.get_study(study_name)
    save_model_artifacts(metadata, stack, scaler, study, model_params.meta_params, LOOKAHEAD)

    return {
        'lookahead': LOOKAHEAD,
        'results': all_results,
        'models': base_models,
        'stack': stack,
        'scaler': scaler,
        'performance_monitor': performance_monitor
    }

In [223]:
lookahead_values = [5, 15]

# Lookaheads are processed one after another (no parallel workers); a
# failing lookahead is logged and skipped so the remaining ones still run.
lookahead_results = []
for val in lookahead_values:
    try:
        result = run_lookahead(val)
    except Exception as e:
        logging.error(f"Failed for lookahead {val}: {e}")
    else:
        lookahead_results.append(result)


🔍 Running LOOKAHEAD=5


[I 2025-05-14 19:59:01,986] A new study created in RDB with name: ensemble_optimization_lookahead_5
ERROR:root:Evaluation failed: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

Detailed tracebacks of the workers should have been printed to stderr in the executor process if faulthandler was not disabled.
[I 2025-05-14 19:59:02,580] Trial 0 finished with value: -inf and parameters: {'rf_n_estimators': 770, 'rf_max_depth': 14, 'rf_min_samples_leaf': 31, 'xgb_max_depth': 9, 'xgb_lr': 0.1484716625923573, 'xgb_n_estimators': 265, 'enet_alpha': 0.0031477422680755793, 'enet_l1_ratio': 0.43296654049644157, 'meta_alpha': 0.0057024275778216615, 'meta_solver': 'svd', 'rf_weight': 0.42973650108848815, 'xgb_weight': 0.6528472975753976, 'enet_weight': 0.9588175479046734}. Best is trial 0 with value: -inf.
ERROR:root:Evaluation

KeyboardInterrupt: 

# Visualize

In [None]:
for result in lookahead_results:
    # The scaled test matrix only exists inside run_lookahead, so rebuild it
    # here from the stored scaler and the same split of the labeled data.
    _, X_test_full, _, _, _, _, _ = load_and_split_data(result['lookahead'], features)
    test_matrix = result['scaler'].transform(X_test_full)

    stack_preds = result['stack'].predict(test_matrix)
    rf_preds = result['models']['rf'].predict(test_matrix)
    xgb_preds = result['models']['xgb'].predict(test_matrix)
    enet_preds = result['models']['elasticnet'].predict(test_matrix)

    # Compare the first 100 test predictions of each model with the stack.
    plt.figure(figsize=(12, 4))
    plt.plot(rf_preds[:100], label='RF')
    plt.plot(xgb_preds[:100], label='XGB')
    plt.plot(enet_preds[:100], label='ElasticNet')
    plt.plot(stack_preds[:100], label='Stack', linewidth=2)
    plt.title(f"Model predictions (LOOKAHEAD={result['lookahead']})")
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
for result in lookahead_results:
    # run_lookahead does not put a 'results_df' entry into its result, so
    # skip gracefully instead of failing with a KeyError.
    trades = result.get('results_df')
    if trades is None:
        logging.warning(
            f"No 'results_df' for LOOKAHEAD={result['lookahead']}; skipping PnL plot"
        )
        continue

    # Use a dedicated name: rebinding the module-level `df` would replace
    # the price data that run_lookahead passes to the backtest.
    pnl_df = trades.sort_values(by='datetime')  # Ensure correct order
    pnl_df['cumulative_pnl'] = pnl_df['pnl'].cumsum()

    plt.figure(figsize=(12, 4))
    plt.plot(pnl_df['datetime'], pnl_df['cumulative_pnl'], label='Cumulative PnL', color='green')
    plt.title(f"Cumulative PnL (LOOKAHEAD={result['lookahead']})")
    plt.xlabel("Datetime")
    plt.ylabel("PnL")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

# Test Model

# Sort and Plot

In [None]:
# Predictions
# Pick the lookahead run whose best backtest result has the highest 'pnl'
# and predict with its stacking model.
# NOTE(review): X_test_scaled and y_test are not assigned at module level in
# the code shown above (they are locals of run_lookahead) - confirm they are
# defined before this cell runs.
# y_pred = best_lookahead.predict(X_test)
best_lookahead = max(lookahead_results, key=lambda x: max(r['pnl'] for r in x['results']))
y_pred = best_lookahead['stack'].predict(X_test_scaled)

# Confusion Matrix
# NOTE(review): confusion_matrix and sns are not imported in the notebook's
# opening import cell, and class_mapping is not defined in the visible code -
# verify they exist. The stack is a StackingRegressor, so y_pred presumably
# holds continuous values while these metrics expect discrete class labels;
# this cell looks like a leftover from a classification version - TODO confirm.
labels = sorted(class_mapping)  # Make sure the order matches
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Display Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=labels, digits=2))