# Load Data

In [4]:
import os
import pandas as pd
import numpy as np
import pytz
from ta.trend import (
    MACD,
)
from ta.momentum import (
    RSIIndicator,
    StochRSIIndicator
)
from ta.volatility import (
    AverageTrueRange,
    BollingerBands
)
from ta.volume import (
    VolumeWeightedAveragePrice,
    AccDistIndexIndicator
)
from itertools import product
from sklearn.model_selection import cross_val_score, TimeSeriesSplit, KFold
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.isotonic import IsotonicRegression
from xgboost import XGBRegressor
from sklearn.metrics import classification_report, mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from datetime import timedelta
from collections import defaultdict
from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.base import clone
import optuna
from sklearn.inspection import permutation_importance
import logging
import joblib
import json
import seaborn as sns

optuna.logging.set_verbosity(optuna.logging.INFO)

# === Load Data ===
# Every ';'-separated, header-less .csv/.txt file in `folder_path` is read as
# OHLCV bars and stacked into a single frame.
#folder_path = "/Users/francopapalardo-aleo/Desktop/repos/TradingAI 2/data/"
folder_path = "./data/"
column_names = ['datetime', 'open', 'high', 'low', 'close', 'volume']
df_list = []
plt.rcParams['font.family'] = 'Segoe UI Emoji'

# os.listdir() returns entries in arbitrary order. Sorting makes the concat
# order reproducible, and with it the bar that `keep='first'` retains when the
# same timestamp appears in several files.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(('.csv', '.txt')):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, sep=';', header=None, names=column_names)
        df['source_file'] = filename
        df_list.append(df)

if not df_list:
    # pd.concat([]) would raise the same exception type with a less helpful message.
    raise ValueError(f"No .csv/.txt data files found in {folder_path!r}")

df = pd.concat(df_list, ignore_index=True)
df['datetime'] = pd.to_datetime(df['datetime'])
# Drop bars whose timestamp appears more than once, then order chronologically.
# (No resampling happens here; bars keep the frequency of the input files.)
df = df.drop_duplicates(subset='datetime', keep='first').reset_index(drop=True)
df = df.sort_values('datetime').reset_index(drop=True)
df[['open', 'high', 'low', 'close', 'volume']] = df[['open', 'high', 'low', 'close', 'volume']].astype(float)

# Base time features
# `hour` is fractional here (e.g. 9.5 == 09:30) so it can be range-tested below.
df['hour'] = df['datetime'].dt.hour + df['datetime'].dt.minute / 60
df['minute'] = df['datetime'].dt.minute
df['day_of_week'] = df['datetime'].dt.dayofweek  # 0 = Monday

# Custom session flags (adjust if needed), expressed in fractional hours.
# NOTE(review): Series.between is inclusive on both ends by default, so
# hour == 20.0 is flagged as both postmarket and after-hours.
# NOTE(review): these use the timestamps exactly as stored in the files; no
# timezone conversion has been applied at this point.
df['is_premarket'] = df['hour'].between(7, 9.5)
df['is_lunch'] = df['hour'].between(11.5, 13.5)
df['is_postmarket'] = df['hour'].between(15.5, 20)
df['is_after_hours'] = df['hour'].between(20, 23.5)


# Initialize features or indicators

In [None]:
# === Feature Engineering ===
# Momentum / volatility indicators computed on the close column with the `ta`
# indicator classes imported at the top of the file.

# EMA (disabled)
# NOTE(review): the commented-out lines call `ta.ema` / `ta.rsi`, which are not
# among this file's imports; they would need porting before being re-enabled.
# df['ema_3'] = ta.ema(df['close'], length=3)
# df['ema_8'] = ta.ema(df['close'], length=8)
# df['ema_9'] = ta.ema(df['close'], length=9)
# df['ema_13'] = ta.ema(df['close'], length=13)
# df['ema_21'] = ta.ema(df['close'], length=21)
# df['ema_34'] = ta.ema(df['close'], length=34)
# df['ema_ratio_8_21'] = df['ema_8'] / df['ema_21']
# df['ema_diff_8_21'] = df['ema_8'] - df['ema_21']

# RSI over a 6-bar window
df['rsi_6'] = RSIIndicator(df['close'], window=6).rsi()
# df['rsi_14'] = ta.rsi(df['close'], length=14)
# df['rsi_21'] = ta.rsi(df['close'], length=21)

# Bollinger Bands for volatility regime:
# band width = (upper band - lower band) / middle band, 20-bar window, 2 deviations
bb = BollingerBands(df['close'], window=20, window_dev=2)
df['bb_width'] = (bb.bollinger_hband() - bb.bollinger_lband()) / bb.bollinger_mavg()

# Stochastic RSI for overbought/oversold with better signals than regular RSI
# (14-bar window, both smoothing windows = 3); stores the %K and %D lines
stoch_rsi = StochRSIIndicator(df['close'], window=14, smooth1=3, smooth2=3)
df['stoch_rsi_k'] = stoch_rsi.stochrsi_k()
df['stoch_rsi_d'] = stoch_rsi.stochrsi_d()

# Swing intensity (measures trend strength)
def swing_intensity(high, low, length=10):
    """Return the rolling high/low span expressed as a fraction of the rolling low.

    Args:
        high: Series of bar highs.
        low: Series of bar lows.
        length: Number of bars in the rolling window.

    Returns:
        Series of ``(max(high) - min(low)) / min(low)`` over the trailing
        ``length`` bars; NaN until a full window is available.
    """
    window_top = high.rolling(length).max()
    window_bottom = low.rolling(length).min()
    span = window_top - window_bottom
    return span / window_bottom

# Intraday seasonality
# 570 = 9 * 60 + 30, i.e. minutes elapsed since 09:30; 390 = length of a 6.5 h trading day.
# NOTE(review): this uses the clock time of `df['datetime']` as loaded, while
# add_session_flags() later localizes the same column as UTC and converts it to
# New York time. Confirm which timezone the raw files are in.
df['time_from_open'] = (df['datetime'].dt.hour * 60 + df['datetime'].dt.minute) - 570  # Minutes from 9:30
df['normalized_time'] = df['time_from_open'] / 390  # Normalize by trading day length

df['swing_intensity'] = swing_intensity(df['high'], df['low'])

# ATR(14) is needed by several features below, so compute it once.
atr_14_series = AverageTrueRange(df['high'], df['low'], df['close'], window=14).average_true_range()

# Normalized ATR (might be better than raw ATR)
# NOTE(review): `natr` and `atr_pct` are the same quantity (ATR(14) / close).
df['natr'] = atr_14_series / df['close']

# ATR
df['atr_5'] = AverageTrueRange(df['high'], df['low'], df['close'], window=5).average_true_range()
df['atr_14'] = atr_14_series
# df['atr_30'] = ta.atr(df['high'], df['low'], df['close'], length=30)
df['atr_pct'] = df['atr_14'] / df['close']

# Price relative to recent ranges: distance of close from the 10-bar high/low, in ATR(14) units
df['close_to_high'] = (df['high'].rolling(10).max() - df['close']) / df['atr_14']
df['close_to_low'] = (df['close'] - df['low'].rolling(10).min()) / df['atr_14']

# MACD
# macd_slow = ta.macd(df['close'], fast=12, slow=26, signal=9)
# df['macd_slow'] = macd_slow['MACDh_12_26_9']
# df['macd_slow_diff'] = macd_slow['MACD_12_26_9'] - macd_slow['MACDs_12_26_9']

# Build the fast MACD (6/13/5) once and read all three lines from the same object.
macd_fast_indicator = MACD(df['close'], window_fast=6, window_slow=13, window_sign=5)
macd_fast = macd_fast_indicator.macd()
# NOTE(review): the 'macd_fast' column stores the *signal* line, not the MACD
# line held in `macd_fast` above. Kept as-is so the feature's values do not
# change; confirm which one was intended.
df['macd_fast'] = macd_fast_indicator.macd_signal()
df['macd_fast_diff'] = macd_fast_indicator.macd_diff()

# VWAP (rolling, 14-bar window)
vwap = VolumeWeightedAveragePrice(
    high=df['high'],
    low=df['low'],
    close=df['close'],
    volume=df['volume'],
    window=14
)
# Use the public accessor rather than the indicator's internal `.vwap` attribute.
df['vwap'] = vwap.volume_weighted_average_price()
df['vwap_diff'] = df['close'] - df['vwap']
df['above_vwap'] = (df['close'] > df['vwap']).astype(int)
df['below_vwap'] = (df['close'] < df['vwap']).astype(int)

# Candle total range; the 1e-9 offset keeps the ratios below finite on zero-range bars
df['candle_range'] = df['high'] - df['low'] + 1e-9

# Previous-bar features (disabled)
# df['prev_close'] = df['close'].shift(1)
# df['prev_rsi_14'] = df['rsi_14'].shift(1)
# df['prev_macd_fast'] = df['macd_fast'].shift(1)
# df['prev_ema_diff'] = df['ema_diff_8_21'].shift(1)

# One-bar simple return; computed first because the features below depend on it
df['return_1'] = df['close'].pct_change(1)

# Expanding standard deviation of returns, restarted for every calendar date
# of `df['datetime']`
df['session_vol'] = df.groupby(df['datetime'].dt.date)['return_1'].transform(
    lambda x: x.expanding().std()
)

# Volume relative to its 20-bar mean. The epsilon guards against a zero mean
# and is shared by both features so they treat zero-volume stretches alike.
relative_volume = df['volume'] / (df['volume'].rolling(20).mean() + 1e-9)

# Volume-weighted momentum
df['volume_weighted_return'] = df['return_1'] * relative_volume
# df['above_ema_21'] = (df['close'] > df['ema_21']).astype(int)
# df['below_ema_21'] = (df['close'] < df['ema_21']).astype(int)
# df['velocity'] = df['return_1'] - df['return_1'].shift(1)

df['vol_spike'] = relative_volume

# Candle anatomy, each expressed as a fraction of the bar's total range
df['candle_body'] = (df['close'] - df['open']).abs()
df['body_pct'] = df['candle_body'] / df['candle_range']
df['upper_wick'] = (df['high'] - df[['close', 'open']].max(axis=1)) / df['candle_range']
df['lower_wick'] = (df[['close', 'open']].min(axis=1) - df['low']) / df['candle_range']

# Breaks (disabled)
# df['break_high_20'] = (df['high'] > df['high'].rolling(20).max().shift(1)).astype(int)
# df['break_low_20'] = (df['low'] < df['low'].rolling(20).min().shift(1)).astype(int)

# Module-level aliases of the high/low columns
highs = df['high']
lows = df['low']

def choppiness_index(high, low, close, length=14):
    """Return a choppiness-style index built from ATR and the rolling high/low envelope.

    Computes ``100 * log10(sum(ATR, length) / (max(high) - min(low))) / log10(length)``
    over a trailing window of ``length`` bars.

    Args:
        high: Series of bar highs.
        low: Series of bar lows.
        close: Series of bar closes.
        length: Window used for the ATR, the rolling sum and the envelope.

    Returns:
        Series of index values.
    """
    # The summed series is the `length`-bar ATR, i.e. already an average.
    # NOTE(review): the textbook Choppiness Index sums the raw true range
    # instead; confirm the extra smoothing is intentional.
    smoothed_range = AverageTrueRange(high=high, low=low, close=close, window=length).average_true_range()
    range_total = smoothed_range.rolling(length).sum()
    envelope = high.rolling(length).max() - low.rolling(length).min()
    return 100 * np.log10(range_total / envelope) / np.log10(length)

# def detect_pivot_highs_lows_3(df, lookback=3, lookforward=3):
#     df['pivot_high_3'] = highs[(highs.shift(lookback) < highs) & (highs.shift(-lookforward) < highs)]
#     df['pivot_low_3'] = lows[(lows.shift(lookback) > lows) & (lows.shift(-lookforward) > lows)]
    
#     df['is_pivot_high_3'] = df['pivot_high_3'].notna().astype(int)
#     df['is_pivot_low_3'] = df['pivot_low_3'].notna().astype(int)
#     return df

def detect_pivot_highs_lows_5(df, lookback=5, lookforward=5):
    """Flag bars whose high/low exceeds the bar `lookback` behind and `lookforward` ahead.

    Reads the 'high' and 'low' columns of the frame that is passed in (the
    previous version read module-level globals instead), and adds four columns
    in place:

    * ``pivot_high_5`` / ``pivot_low_5``: the pivot price, NaN elsewhere.
    * ``is_pivot_high_5`` / ``is_pivot_low_5``: 1 on pivot bars, 0 elsewhere.

    Only the two bars at exactly ``lookback`` and ``lookforward`` offsets are
    compared, not every bar in between.

    NOTE(review): ``shift(-lookforward)`` reads a later row, so a pivot is only
    knowable ``lookforward`` bars after it occurs. Using these flags as a
    feature at the pivot bar itself leaks future information.

    Args:
        df: Frame with 'high' and 'low' columns; modified in place.
        lookback: Offset of the earlier comparison bar.
        lookforward: Offset of the later comparison bar.

    Returns:
        The same frame, for chaining.
    """
    highs = df['high']
    lows = df['low']

    high_mask = (highs.shift(lookback) < highs) & (highs.shift(-lookforward) < highs)
    low_mask = (lows.shift(lookback) > lows) & (lows.shift(-lookforward) > lows)

    df['pivot_high_5'] = highs.where(high_mask)
    df['pivot_low_5'] = lows.where(low_mask)

    df['is_pivot_high_5'] = df['pivot_high_5'].notna().astype(int)
    df['is_pivot_low_5'] = df['pivot_low_5'].notna().astype(int)
    return df

def detect_pivot_highs_lows_10(df, lookback=10, lookforward=10):
    """Flag bars whose high/low exceeds the bar `lookback` behind and `lookforward` ahead.

    Reads the 'high' and 'low' columns of the frame that is passed in (the
    previous version read module-level globals instead), and adds four columns
    in place:

    * ``pivot_high_10`` / ``pivot_low_10``: the pivot price, NaN elsewhere.
    * ``is_pivot_high_10`` / ``is_pivot_low_10``: 1 on pivot bars, 0 elsewhere.

    Only the two bars at exactly ``lookback`` and ``lookforward`` offsets are
    compared, not every bar in between.

    NOTE(review): ``shift(-lookforward)`` reads a later row, so a pivot is only
    knowable ``lookforward`` bars after it occurs. Using these flags as a
    feature at the pivot bar itself leaks future information.

    Args:
        df: Frame with 'high' and 'low' columns; modified in place.
        lookback: Offset of the earlier comparison bar.
        lookforward: Offset of the later comparison bar.

    Returns:
        The same frame, for chaining.
    """
    highs = df['high']
    lows = df['low']

    high_mask = (highs.shift(lookback) < highs) & (highs.shift(-lookforward) < highs)
    low_mask = (lows.shift(lookback) > lows) & (lows.shift(-lookforward) > lows)

    df['pivot_high_10'] = highs.where(high_mask)
    df['pivot_low_10'] = lows.where(low_mask)

    df['is_pivot_high_10'] = df['pivot_high_10'].notna().astype(int)
    df['is_pivot_low_10'] = df['pivot_low_10'].notna().astype(int)
    return df

def add_session_flags(df):
    """Convert 'datetime' to New York time and add trading-session indicator columns.

    The frame is modified in place: 'datetime' becomes timezone-aware
    (America/New_York) and 'hour', 'minute' and 'day_of_week' are overwritten
    with values derived from the converted timestamps ('hour' is an integer).

    Sessions, by New York clock time:

    * ``asia``: 19:00 up to (not including) 03:30 the next morning
    * ``london_pre``: 03:30 to 08:00
    * ``london_ny``: 08:00 to 12:00
    * ``ny``: 12:00 to 16:00
    * ``other``: everything else (16:00 to 19:00)

    Args:
        df: Frame with a 'datetime' column. Naive timestamps are treated as
            UTC; timestamps that already carry a timezone are converted
            directly, so calling the function twice is safe.

    Returns:
        The same frame with 'session', 'is_asian_session',
        'is_london_session', 'is_ny_session' and 'is_session_overlap' added.
    """
    df['datetime'] = pd.to_datetime(df['datetime'])
    if df['datetime'].dt.tz is None:
        # tz_localize raises on tz-aware data, so only localize naive timestamps.
        df['datetime'] = df['datetime'].dt.tz_localize('UTC')
    df['datetime'] = df['datetime'].dt.tz_convert('America/New_York')

    df['hour'] = df['datetime'].dt.hour
    df['minute'] = df['datetime'].dt.minute
    df['day_of_week'] = df['datetime'].dt.dayofweek  # 0 = Monday

    # Minutes since local midnight, always in 0..1439.
    minute_of_day = (df['hour'] * 60 + df['minute']).to_numpy()

    # The Asia session wraps past midnight, so it is the union of the evening
    # part (>= 19:00) and the early-morning part (< 03:30). A single chained
    # comparison can never match the early-morning half.
    # np.select takes the first matching condition, so later entries only need
    # their upper bound.
    conditions = [
        (minute_of_day >= 19 * 60) | (minute_of_day < 3 * 60 + 30),  # 7:00 PM to 3:30 AM (Asia)
        minute_of_day < 8 * 60,                                      # 3:30 AM to 8:00 AM (London pre-open)
        minute_of_day < 12 * 60,                                     # 8:00 AM to 12:00 PM (London/NY overlap)
        minute_of_day < 16 * 60,                                     # 12:00 PM to 4:00 PM (NY)
    ]
    labels = ['asia', 'london_pre', 'london_ny', 'ny']
    df['session'] = np.select(conditions, labels, default='other')

    df['is_asian_session'] = (df['session'] == 'asia').astype(int)
    df['is_london_session'] = ((df['session'] == 'london_pre') | (df['session'] == 'london_ny')).astype(int)
    df['is_ny_session'] = (df['session'] == 'ny').astype(int)
    df['is_session_overlap'] = (df['session'] == 'london_ny').astype(int)

    return df

# def calc_nearest_sr_distance_fast(df):
#     highs_idx = df.index[df['is_pivot_high'] == 1].tolist()
#     lows_idx = df.index[df['is_pivot_low'] == 1].tolist()

#     res_dist = np.full(len(df), np.nan)
#     sup_dist = np.full(len(df), np.nan)

#     for i in range(len(df)):
#         current_close = df.at[i, 'close']

#         # Resistance: Find all prior pivot highs
#         prior_highs = [abs(current_close - df.at[idx, 'high']) for idx in highs_idx if idx < i]
#         res_dist[i] = min(prior_highs) if prior_highs else np.nan

#         # Support: Find all prior pivot lows
#         prior_lows = [abs(current_close - df.at[idx, 'low']) for idx in lows_idx if idx < i]
#         sup_dist[i] = min(prior_lows) if prior_lows else np.nan

#     df['dist_to_resistance'] = res_dist
#     df['dist_to_support'] = sup_dist

#     df['dist_to_res_pct'] = df['dist_to_resistance'] / df['close']
#     df['dist_to_sup_pct'] = df['dist_to_support'] / df['close']
#     return df

# def compute_fvg(df):
#     df = df.copy()
#     df['fvg_up'] = np.where((df['low'].shift(1) > df['high'].shift(2)), 1, 0)
#     df['fvg_down'] = np.where((df['high'].shift(1) < df['low'].shift(2)), 1, 0)
#     return df

# def compute_liquidity_sweeps(df, swing_window=10):
#     df = df.copy()
#     df['swing_high'] = df['high'].rolling(window=swing_window, center=False).max().shift(1)
#     df['swing_low'] = df['low'].rolling(window=swing_window, center=False).min().shift(1)

#     df['liquidity_sweep_high'] = ((df['high'] > df['swing_high']) & (df['close'] < df['swing_high'])).astype(int)
#     df['liquidity_sweep_low'] = ((df['low'] < df['swing_low']) & (df['close'] > df['swing_low'])).astype(int)

#     return df

# Apply the structural detectors to the full frame. Each helper adds its
# columns to `df` and returns it.
# df = compute_fvg(df)
# df = compute_liquidity_sweeps(df)
# df = detect_pivot_highs_lows_3(df)
df = detect_pivot_highs_lows_5(df)
df = detect_pivot_highs_lows_10(df)
# From here on `df['datetime']` is timezone-aware (New York) and `df['hour']`
# has been overwritten by add_session_flags with the integer local hour.
df = add_session_flags(df)

# Combined pivot flags: 1 when either the 5-offset or the 10-offset detector fired
df['is_pivot_high'] = df[['is_pivot_high_5', 'is_pivot_high_10']].max(axis=1)
df['is_pivot_low']  = df[['is_pivot_low_5', 'is_pivot_low_10']].max(axis=1)

# df = calc_nearest_sr_distance_fast(df)

# === Add Feature ===
# Choppiness-style index with the helper's default 14-bar window
df['chop_index'] = choppiness_index(df['high'], df['low'], df['close'])

# === Strategy Setup ===
# Default trade-management parameters. The *_MULT values are multiples of ATR.
TICK_VALUE = 5
SL_ATR_MULT = 1.0
TP_ATR_MULT = 3.0
TRAIL_START_MULT = 2.5
TRAIL_STOP_MULT = 1.0
MAX_CONTRACTS = 1

# Candidate values per strategy parameter for the grid search.
param_grid_strategy = {
    'SL_ATR_MULT': [1.0],
    'TP_ATR_MULT': [2.0, 2.5, 3.0, 3.5, 4.0],
    'TRAIL_START_MULT': [0.5, 1.0, 1.5],
    'TRAIL_STOP_MULT': [0.5, 1.0, 1.5],
    'TICK_VALUE': [5],  # optional, or expand for futures like NQ/ES
}

# Expand the grid into one {parameter: value} dict per combination
# (cartesian product, in the grid's insertion order).
keys = tuple(param_grid_strategy)
values = tuple(param_grid_strategy.values())
combinations = [
    {name: choice for name, choice in zip(keys, picked)}
    for picked in product(*values)
]

# features = [
#     'rsi_6', 'rsi_14', 'rsi_21',
#     'ema_3', 'ema_8', 'ema_13', 'ema_9', 'ema_21','ema_34',
#     'ema_ratio_8_21', 'ema_diff_8_21',
#     'macd_slow', 'macd_slow_diff',
#     'macd_fast', 'macd_fast_diff',
#     'atr_5', 'atr_30', 'atr_14', 'atr_pct',
#     'vwap', 'vwap_diff',
#     'candle_body', 'candle_range',
#     'volume', 'chop_index',
#     'hour', 'minute', 'day_of_week',
#     'is_premarket', 'is_lunch',
#     'body_pct',  
#     'upper_wick', 'lower_wick',  # just added
#     'volume_delta_ema',
#     'return_1', 'return_3',
#     'prev_close', 'prev_rsi_14', 'prev_macd_fast', 'prev_ema_diff',
#     'above_vwap', 'above_ema_21',
#     'velocity', 'vol_spike',
#     'break_high_20', 'break_low_20',
#     'is_pivot_high_10', 'is_pivot_low_10',
#     'is_pivot_high_3', 'is_pivot_low_3',
#     'is_pivot_high_5', 'is_pivot_low_5',
#     # 'dist_to_resistance', 'dist_to_support',
#     'fvg_up', 'fvg_down',
#     'liquidity_sweep_high', 'liquidity_sweep_low',
#     'below_vwap', 'below_ema_21',
# ]

# Names of the df columns used as model inputs.
# NOTE: 'hour' is the integer New York hour written by add_session_flags(),
# not the fractional hour computed when the data was loaded.
features = [
    # Existing features
    'is_pivot_low', 'is_pivot_high',
    'candle_range', 'rsi_6', 'atr_5',
    'macd_fast_diff', 'atr_pct',
    'return_1', 'macd_fast', 'volume',
    'hour', 'chop_index', 'day_of_week',
    
    # New features
    'bb_width', 'natr',
    'stoch_rsi_k', 'stoch_rsi_d',
    'volume_weighted_return',
    'swing_intensity',
    'close_to_high', 'close_to_low',
    'session_vol', 'normalized_time',

    # Just added
    'above_vwap', 'below_vwap',
    'vwap', 'vwap_diff',
    'vol_spike', 'body_pct',
    'upper_wick', 'lower_wick',
    'is_asian_session', 'is_london_session', 'is_ny_session',
    'is_session_overlap',
]

# Optional trade filters: maps a filter name to a callable that takes a row and
# returns truthy when the trade should be skipped. Currently empty.
avoid_funcs = {
}

# Replace missing feature values (e.g. indicator warm-up rows) with the sentinel -999.
# NOTE(review): after this, a later dropna() on the feature columns can no
# longer remove warm-up rows; only infinities converted to NaN afterwards are
# still droppable. Confirm the sentinel suits every model fitted on these columns.
df[features] = df[features].fillna(-999)

def is_same_session(start_time, end_time):
    """Return True when both timestamps fall inside the session containing `start_time`.

    A session opens at 18:00 and lasts 23 hours. If `start_time` is before
    18:00, the session that opened at 18:00 on the previous day is used.

    Args:
        start_time: Datetime-like with ``replace`` and ``hour``.
        end_time: Datetime-like comparable with `start_time`.

    Returns:
        True if both timestamps lie within [open, open + 23h], bounds included.
    """
    opens_at = start_time.replace(hour=18, minute=0, second=0)
    if start_time.hour < 18:
        # Before 18:00 the active session is the one opened the evening before.
        opens_at = opens_at - timedelta(days=1)
    closes_at = opens_at + timedelta(hours=23)
    return all(opens_at <= moment <= closes_at for moment in (start_time, end_time))

# Trade identifiers recorded per feature combination, keyed by frozenset of the combo.
combo_trades = defaultdict(set)


def combo_overlap(c1, c2):
    """Return the share of trades two combos have in common.

    The overlap is ``|A ∩ B| / min(|A|, |B|)``, where A and B are the trade
    sets stored in ``combo_trades`` for each combo. If either set is empty the
    combos are treated as fully overlapping and 1.0 is returned.

    Looking up an unseen combo inserts an empty set for it, because
    ``combo_trades`` is a defaultdict.

    Args:
        c1: Iterable of hashable items identifying the first combo.
        c2: Iterable of hashable items identifying the second combo.

    Returns:
        Overlap ratio as a float in [0, 1].
    """
    first = combo_trades[frozenset(c1)]
    second = combo_trades[frozenset(c2)]
    if not (first and second):
        return 1.0
    shared = first.intersection(second)
    return len(shared) / min(len(first), len(second))


TypeError: 'Series' object is not callable

# Declare Combo function for serialization

In [None]:
def evaluate_regression_combo(
    X_test, preds, labeled, df,
    avoid_funcs,
    SL_ATR_MULT, TP_ATR_MULT, TRAIL_START_MULT, TRAIL_STOP_MULT, TICK_VALUE,
    is_same_session,
    long_thresh=0.003,
    short_thresh=-0.003
):
    """Simulate trades from predicted returns and summarise the resulting PnL.

    For every row of ``X_test`` whose prediction crosses a threshold, a trade is
    opened at that bar's close and walked forward bar by bar through ``df``
    until a stop loss, take profit or trailing stop is hit. Checks happen in
    that order within a bar, so a bar touching both stop and target counts as
    a stop.

    Args:
        X_test: Frame whose index selects the rows to evaluate. Index labels are
            used both as labels into ``labeled``/``df`` and compared with
            ``len(df)``, so a 0..n-1 integer index is assumed.
        preds: Predicted returns, positionally aligned with ``X_test``.
        labeled: Frame providing 'close', 'datetime' and 'atr_14' at entry.
        df: Full bar history with 'high', 'low', 'close' and 'datetime'.
        avoid_funcs: Mapping name -> callable(row) returning truthy to veto a trade.
        SL_ATR_MULT: Stop-loss distance in ATR multiples.
        TP_ATR_MULT: Upper bound of the take-profit distance in ATR multiples.
        TRAIL_START_MULT: Favourable move (ATR multiples) that arms the trailing stop.
        TRAIL_STOP_MULT: Trailing-stop distance from the bar close (ATR multiples).
        TICK_VALUE: Multiplier converting a price difference into PnL.
        is_same_session: Callable(entry_time, exit_time) -> bool; trades that
            fail it are discarded.
        long_thresh: Minimum predicted return to go long.
        short_thresh: Maximum predicted return to go short.

    Returns:
        Dict with 'pnl', 'trades', 'win_rate', 'expectancy', 'profit_factor',
        'sharpe', 'long_trades', 'short_trades', 'avoid_hits' and 'results'
        (one row per completed trade). 'long_trades' / 'short_trades' count
        signals before filtering, so they can exceed 'trades'.
    """
    COMMISSION = 3.98  # flat cost subtracted from every trade's gross PnL

    temp_trades_data = []
    skipped_trades = 0  # tallied for debugging; not part of the returned dict
    avoid_hits = defaultdict(int)
    long_trades = 0
    short_trades = 0
    n_rows = len(df)

    for i, idx in enumerate(X_test.index):
        row = labeled.loc[idx]
        pred_return = preds[i]

        # Decide trade direction
        if pred_return >= long_thresh:
            side = 'long'
            long_trades += 1
        elif pred_return <= short_thresh:
            side = 'short'
            short_trades += 1
        else:
            continue  # skip neutral signals
        is_long = side == 'long'

        # Trade filters: every filter is evaluated so all hits are counted.
        skip_trade = False
        for name, f in avoid_funcs.items():
            try:
                vetoed = f(row)
            except Exception as exc:
                # Best effort: a broken filter is ignored rather than aborting
                # the backtest, but KeyboardInterrupt/SystemExit now propagate.
                logging.debug("Avoid filter %r failed on row %r: %s", name, idx, exc)
                continue
            if vetoed:
                avoid_hits[name] += 1
                skip_trade = True
        # Entries within the last 6 bars are also skipped.
        if skip_trade or idx >= n_rows - 6:
            skipped_trades += 1
            continue

        # --- Trade Simulation ---
        entry_price = row['close']
        entry_time = row['datetime']
        atr = row['atr_14']

        # Stop Loss (fixed volatility-based)
        sl_price = entry_price - SL_ATR_MULT * atr if is_long else entry_price + SL_ATR_MULT * atr

        # Take Profit (dynamic, from model prediction, clipped)
        expected_move = abs(pred_return) * entry_price
        min_tp = 0.001 * entry_price  # minimum 0.1% move
        max_tp = TP_ATR_MULT * atr
        tp_move = np.clip(expected_move, min_tp, max_tp)
        tp_price = entry_price + tp_move if is_long else entry_price - tp_move

        # Trailing logic: armed once price reaches the trigger level
        trail_trigger = entry_price + TRAIL_START_MULT * atr if is_long else entry_price - TRAIL_START_MULT * atr
        trail_stop = None

        max_price, min_price = entry_price, entry_price
        exit_price, exit_time = None, None

        fwd_idx = idx + 1
        while fwd_idx < n_rows:
            # Scalar lookups avoid building a full row Series for every bar.
            bar_high = df.at[fwd_idx, 'high']
            bar_low = df.at[fwd_idx, 'low']
            max_price = max(max_price, bar_high)
            min_price = min(min_price, bar_low)

            if (is_long and bar_low <= sl_price) or (not is_long and bar_high >= sl_price):
                exit_price = sl_price
                exit_time = df.at[fwd_idx, 'datetime']
                break

            if (is_long and bar_high >= tp_price) or (not is_long and bar_low <= tp_price):
                exit_price = tp_price
                exit_time = df.at[fwd_idx, 'datetime']
                break

            # The trailing stop only ratchets in the trade's favour; it is
            # never loosened by a later bar that closes further away.
            if is_long and bar_high >= trail_trigger:
                candidate = df.at[fwd_idx, 'close'] - TRAIL_STOP_MULT * atr
                trail_stop = candidate if trail_stop is None else max(trail_stop, candidate)
            elif not is_long and bar_low <= trail_trigger:
                candidate = df.at[fwd_idx, 'close'] + TRAIL_STOP_MULT * atr
                trail_stop = candidate if trail_stop is None else min(trail_stop, candidate)

            # Compare against None explicitly: a stop level is a price, not a flag.
            # NOTE(review): the stop is derived from this bar's close and then
            # tested against this same bar's low/high, which happened earlier
            # in the bar.
            if trail_stop is not None:
                if (is_long and bar_low <= trail_stop) or (not is_long and bar_high >= trail_stop):
                    exit_price = trail_stop
                    exit_time = df.at[fwd_idx, 'datetime']
                    break

            fwd_idx += 1

        if exit_price is None:
            # No exit triggered before the data ended: close at the last bar.
            exit_price = df.at[n_rows - 1, 'close']
            exit_time = df.at[n_rows - 1, 'datetime']

        if not is_same_session(entry_time, exit_time):
            continue

        GROSS_PNL = (exit_price - entry_price) * TICK_VALUE if is_long else (entry_price - exit_price) * TICK_VALUE
        pnl = GROSS_PNL - COMMISSION

        # Maximum favourable / adverse excursion while the trade was open
        mfe = max_price - entry_price if is_long else entry_price - min_price
        mae = entry_price - min_price if is_long else max_price - entry_price

        temp_trades_data.append({
            'datetime': exit_time,
            'pnl': pnl,
            'mfe': mfe,
            'mae': mae,
            'gross_pnl': GROSS_PNL
        })

    # === Metrics ===
    results = pd.DataFrame(temp_trades_data)
    pnl_total = results['pnl'].sum() if not results.empty else 0
    trades = len(results)
    win_rate = (results['pnl'] > 0).mean() if not results.empty else 0
    expectancy = results['pnl'].mean() if not results.empty else 0
    # Undefined (NaN) when there are no losing trades to divide by.
    profit_factor = results[results['pnl'] > 0]['pnl'].sum() / abs(results[results['pnl'] < 0]['pnl'].sum()) if not results.empty and (results['pnl'] < 0).any() else np.nan
    sharpe = results['pnl'].mean() / (results['pnl'].std() + 1e-9) * np.sqrt(trades) if trades > 1 else 0

    return {
        'pnl': pnl_total,
        'trades': trades,
        'win_rate': win_rate,
        'expectancy': expectancy,
        'profit_factor': profit_factor,
        'sharpe': sharpe,
        'long_trades': long_trades,
        'short_trades': short_trades,
        'avoid_hits': dict(avoid_hits),
        'results': results
    }

# Cleanup & Label Generation

In [None]:
def compute_future_return_labels(df: pd.DataFrame, lookahead: int, is_same_session_fn) -> pd.DataFrame:
    """Label each bar with the close-to-close return `lookahead` bars ahead.

    Works on a copy; the input frame is never modified.

    Steps:
        1. Log a warning for columns whose name suggests forward-looking data
           ('future', 'next', 'fwd', 'forward').
        2. Lag every column whose name contains 'rolling' or 'ewm' by one row,
           so each row only carries the previous row's value. This assumes
           rows are in chronological order with unique timestamps. The earlier
           row-by-row version was quadratic in the number of rows and raised
           IndexError on the first row whenever such a column existed.
        3. Compute ``close[t + lookahead] / close[t] - 1`` for every row where
           both bars belong to the same session; other rows are dropped.

    Args:
        df: Frame with at least 'datetime' and 'close' columns.
        lookahead: Number of bars between entry and the labelled future close.
        is_same_session_fn: Callable(start_time, end_time) -> bool.

    Returns:
        Copy of the first ``len(df) - lookahead`` rows, minus cross-session
        rows, with 'future_return' and 'trade_dir' ('long' for a strictly
        positive return, otherwise 'short') added.
    """
    # Create a copy to avoid modifying original
    df = df.copy()

    # 1. Forward-looking feature detection (by column name only)
    suspicious_tags = ('future', 'next', 'fwd', 'forward')
    forward_looking = [
        col for col in df.columns
        if any(tag in str(col).lower() for tag in suspicious_tags)
    ]
    if forward_looking:
        logging.warning(f"Potential forward-looking features detected: {forward_looking}")

    # 2. Temporal alignment: matching columns take the previous row's value.
    lagged_cols = [col for col in df.columns if 'rolling' in str(col) or 'ewm' in str(col)]
    if lagged_cols:
        df[lagged_cols] = df[lagged_cols].shift(1)

    # 3. Label computation with session boundary check
    times = df['datetime'].tolist()
    closes = df['close'].to_numpy()
    future_returns = []
    trade_dirs = []

    for pos in range(len(df) - lookahead):
        if not is_same_session_fn(times[pos], times[pos + lookahead]):
            # The label would span a session boundary; mark it for removal.
            future_returns.append(np.nan)
            trade_dirs.append(None)
            continue

        future_return = (closes[pos + lookahead] / closes[pos]) - 1
        future_returns.append(future_return)
        trade_dirs.append('long' if future_return > 0 else 'short')

    # Align output with original df (the last `lookahead` rows have no label)
    df_labeled = df.iloc[:len(future_returns)].copy()
    df_labeled['future_return'] = future_returns
    df_labeled['trade_dir'] = trade_dirs

    # Drop rows whose label crossed a session boundary
    df_labeled = df_labeled.dropna(subset=['future_return'])

    return df_labeled

In [None]:
# Horizons (in bars) for which labelled datasets are produced.
lookahead_values = [5, 15, 20]


def label_and_save(lookahead):
    """Label the global `df` for one lookahead horizon and write it to parquet.

    The output file is ``labeled_data_<lookahead>.parquet`` in the working directory.
    """
    labeled_frame = compute_future_return_labels(df, lookahead=lookahead, is_same_session_fn=is_same_session)
    labeled_frame.to_parquet(f"labeled_data_{lookahead}.parquet")


# Build each labelled dataset once; an existing parquet file is reused as-is.
# NOTE(review): existing files are not regenerated when the feature code changes;
# delete them to force a rebuild.
for lookahead in lookahead_values:
    if not os.path.exists(f"labeled_data_{lookahead}.parquet"):
        print(f"Processing lookahead {lookahead}...")
        label_and_save(lookahead)
    else:
        print(f"File labeled_data_{lookahead}.parquet already exists. Skipping...")


File labeled_data_5.parquet already exists. Skipping...
File labeled_data_15.parquet already exists. Skipping...
File labeled_data_20.parquet already exists. Skipping...


# Train

##### Real Training

In [None]:
def check_overfit(model, X_tr, X_te, y_tr=None, y_te=None):
    """Print train/test MSE and a rough over/under-fitting verdict.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_tr: Training features.
        X_te: Test features.
        y_tr: Training targets. When omitted, the module-level ``y_train`` is
            used, as before.
        y_te: Test targets. When omitted, the module-level ``y_test`` is used,
            as before.

    Raises:
        NameError: If targets are omitted and the module-level fallback does
            not exist. Pass the targets explicitly when they live in a
            function's local scope.
    """
    # Backward-compatible fallback for callers that rely on the globals.
    if y_tr is None:
        y_tr = y_train
    if y_te is None:
        y_te = y_test

    train_preds = model.predict(X_tr)
    test_preds = model.predict(X_te)
    train_mse = mean_squared_error(y_tr, train_preds)
    test_mse = mean_squared_error(y_te, test_preds)
    # A perfect training fit makes the ratio undefined; report it as infinite.
    ratio = test_mse / train_mse if train_mse != 0 else float('inf')

    print(f"\n📉 Overfitting check:")
    print(f"Train MSE: {train_mse:.8f}")
    print(f"Test MSE: {test_mse:.8f}")
    print(f"Overfit ratio (Test / Train): {ratio:.2f}")
    if ratio > 1.5:
        print("⚠️ Potential overfitting detected.")
    elif ratio < 0.7:
        print("⚠️ Possibly underfitting (too simple).")
    else:
        print("✅ Generalization looks reasonable.")

In [None]:
def generate_oof_predictions(models, X, y, n_splits=5):
    """
    Generate out-of-fold predictions for a list of base models.

    Each model is cloned and refitted on every training fold, so the estimators
    passed in are never modified and need not be fitted beforehand.

    Args:
        models: Sequence of scikit-learn compatible regressors.
        X: Feature matrix; NumPy array or pandas DataFrame.
        y: Target vector; NumPy array or pandas Series.
        n_splits: Number of KFold splits.

    Returns:
        Array of shape (len(X), len(models)); column i holds the out-of-fold
        predictions of models[i].

    NOTE(review): the folds are shuffled (shuffle=True), so with time-ordered
    data a fold is trained on rows that come after its validation rows.
    """
    def _take_rows(data, positions):
        # KFold yields positional indices. pandas objects need .iloc, because
        # plain [] would select columns (DataFrame) or labels (Series).
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    oof_preds = np.zeros((len(X), len(models)))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # The fixed random_state yields identical splits on every call, so compute
    # them once and reuse them for all models.
    folds = list(kf.split(X))

    for i, model in enumerate(models):
        for train_idx, val_idx in folds:
            fold_model = clone(model)  # Avoid reusing fitted model
            fold_model.fit(_take_rows(X, train_idx), _take_rows(y, train_idx))
            oof_preds[val_idx, i] = np.ravel(fold_model.predict(_take_rows(X, val_idx)))
    return oof_preds

In [None]:
def run_lookahead(LOOKAHEAD):
    """
    Train, evaluate, backtest and persist the stacked model for one lookahead.

    Pipeline, driven by ``labeled_data_{LOOKAHEAD}.parquet``:
      1. Split rows at 2025-05-01 into train / test on the ``datetime`` column.
      2. Tune a RandomForest with Optuna and rank features by importance.
      3. Refit on the top-25 features and report permutation importance.
      4. Build the final feature list: top-20 RF features, plus the
         notebook global ``selected_features_l1`` when it exists.
      5. Tune and fit XGBoost / ElasticNet / RandomForest on standardized
         features, tune an XGBoost meta-learner on out-of-fold base
         predictions, and fit a StackingRegressor.
      6. Calibrate the stack with isotonic regression (fitted on training
         rows only) and backtest for every ``combinations`` x threshold pair.
      7. Write metadata JSON plus joblib dumps of the meta params, the stack,
         the scaler and the XGBoost tuning trials.

    Uses names defined elsewhere in the notebook: ``features``, ``df``,
    ``combinations``, ``avoid_funcs``, ``SL_ATR_MULT``, ``TP_ATR_MULT``,
    ``TRAIL_START_MULT``, ``TRAIL_STOP_MULT``, ``TICK_VALUE``,
    ``is_same_session``, ``evaluate_regression_combo`` and
    ``generate_oof_predictions``.

    Parameters
    ----------
    LOOKAHEAD : value identifying the labeled dataset and used to suffix
        every study database and output file.

    Returns
    -------
    dict with the backtest metrics of the LAST evaluated combination /
    threshold pair, the best RF hyperparameters, the test-set predictions of
    each model (``preds_rf``, ``preds_xgb``, ``preds_elasticnet``,
    ``preds_stack``) and that last run's per-trade frame (``results_df``).

    Raises
    ------
    RuntimeError if the backtest grid produced no results.
    """
    print(f"\n🔍 Running LOOKAHEAD={LOOKAHEAD}")
    logging.info(f"Loading labeled data for LOOKAHEAD={LOOKAHEAD}")

    # === Load and split data ===
    labeled = pd.read_parquet(f"labeled_data_{LOOKAHEAD}.parquet")
    labeled = labeled.replace([np.inf, -np.inf], np.nan)
    labeled = labeled.dropna(subset=features + ['future_return'])

    cutoff_date = pd.Timestamp("2025-05-01")
    train = labeled[labeled['datetime'] < cutoff_date]
    test = labeled[labeled['datetime'] >= cutoff_date]

    X_train_full, y_train = train[features], train['future_return']
    X_test_full, y_test = test[features], test['future_return']

    print(f"Train range: {train['datetime'].min()} to {train['datetime'].max()} | Rows: {len(train)}")
    print(f"Test range: {test['datetime'].min()} to {test['datetime'].max()} | Rows: {len(test)}")

    # === Local helpers ===
    def run_study(objective, study_name, db_stem, n_trials):
        """Create or resume the persisted Optuna study and run n_trials more trials."""
        study = optuna.create_study(
            direction='maximize',
            study_name=study_name,
            storage=f'sqlite:///{db_stem}{LOOKAHEAD}.db',
            load_if_exists=True
        )
        study.optimize(objective, n_trials=n_trials)
        return study

    def evaluate_model(name, model, Xtr, Xte):
        """Print train/test MSE for `model` and return its test predictions."""
        train_preds = model.predict(Xtr)
        test_preds = model.predict(Xte)
        train_mse = mean_squared_error(y_train, train_preds)
        test_mse = mean_squared_error(y_test, test_preds)
        overfit_ratio = test_mse / train_mse if train_mse != 0 else float('inf')

        print(f"\n📊 {name} Performance:")
        print(f"Train MSE: {train_mse:.8f}")
        print(f"Test MSE: {test_mse:.8f}")
        print(f"Overfit ratio (Test / Train): {overfit_ratio:.2f}")
        if overfit_ratio > 1.5:
            print("⚠️ Potential overfitting detected.")
        elif overfit_ratio < 0.7:
            print("⚠️ Possibly underfitting.")
        else:
            print("✅ Generalization looks reasonable.")
        return test_preds

    # === Step 1: Initial RF Training ===
    def rf_objective(trial):
        try:
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
                'max_depth': trial.suggest_int('max_depth', 20, 40),
                'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 20),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 40),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
                'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20, 300),  # controls model complexity
            }
            # Negative n_jobs follows the joblib convention (n_cpus + 1 + n_jobs).
            model = RandomForestRegressor(**params, random_state=42, n_jobs=-5)
            tscv = TimeSeriesSplit(n_splits=3)
            return cross_val_score(model, X_train_full, y_train, cv=tscv, scoring='neg_mean_squared_error').mean()
        except Exception as e:
            # Best effort: a failing trial is scored as the worst possible value.
            logging.warning(f"Trial failed: {e}")
            return float('-inf')

    rf_study = run_study(rf_objective, 'rf_opt', 'rf_opt_study', 70)

    rf_best_params = rf_study.best_params
    print(f"✅ Best hyperparameters: {rf_best_params}")

    # === Step 2: Fit on Full Feature Set to Extract Importance ===
    rf_full = RandomForestRegressor(**rf_best_params, random_state=42, n_jobs=-5)
    rf_full.fit(X_train_full, y_train)

    importance_df = pd.DataFrame({
        'feature': X_train_full.columns,
        'importance': rf_full.feature_importances_
    }).sort_values(by='importance', ascending=False)

    print("\n📊 Top 25 Feature Importances (from full set):")
    print(importance_df.head(25))

    # === Step 3: Retrain on Top-N Features ===
    top_features = importance_df.head(25)['feature'].tolist()
    X_train, X_test = X_train_full[top_features], X_test_full[top_features]

    rf_best = RandomForestRegressor(**rf_best_params, random_state=42, n_jobs=-5)
    rf_best.fit(X_train, y_train)

    # Scored against this function's own targets; the module-level
    # check_overfit() reads y_train / y_test that are not visible from here.
    evaluate_model("RandomForest (top-25 features)", rf_best, X_train, X_test)

    # === Step 5: Permutation Importance ===
    perm_result = permutation_importance(
        rf_best, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-5
    )
    perm_df = pd.DataFrame({
        'feature': X_test.columns,
        'importance': perm_result.importances_mean
    }).sort_values(by='importance', ascending=False)

    print("\n📊 Top 25 Permutation Important Features:")
    print(perm_df.head(25))

    # === Step 6: Combine RF + L1-selected features ===
    print("\n🧠 Combining RF + L1 features...")

    top_rf_features = importance_df.head(20)['feature'].tolist()

    l1_selected = globals().get('selected_features_l1')
    if l1_selected is None:
        print("⚠️ 'selected_features_l1' not defined. Using only RF top features.")
        combined_features = top_rf_features
    else:
        # Order-preserving union, restricted to columns that actually exist,
        # so the feature layout is reproducible from run to run.
        known_columns = set(X_train_full.columns)
        combined_features = [
            name
            for name in dict.fromkeys(top_rf_features + list(l1_selected))
            if name in known_columns
        ]

    print(f"\n🔧 Combined selected features (RF + L1):")
    print(combined_features)

    X_train_combined = X_train_full[combined_features]
    X_test_combined = X_test_full[combined_features]

    # === Step 7: Train individual models on combined features ===
    print("\n⚙️ Training individual models...")

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_combined)
    X_test_scaled = scaler.transform(X_test_combined)

    # Labeled views of the scaled matrices (feature names + original row index).
    X_train_rf = pd.DataFrame(X_train_scaled, columns=combined_features, index=X_train_combined.index)
    X_test_rf = pd.DataFrame(X_test_scaled, columns=combined_features, index=X_test_combined.index)

    # NOTE(review): the cv=3 searches below use contiguous, non-time-aware
    # folds, unlike the TimeSeriesSplit used for the RF search - confirm intent.
    def tune_xgb(X, y):
        """Tune the XGBoost base model; returns the Optuna study."""
        def objective(trial):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 100, 500),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                'gamma': trial.suggest_float('gamma', 0, 5.0),  # regularization – helps pruning
                'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0)
            }
            model = XGBRegressor(**params, eval_metric='rmse', random_state=42)
            return cross_val_score(model, X, y, cv=3, scoring='neg_mean_squared_error').mean()

        return run_study(objective, 'xgb_opt', 'xgb_opt_study', 400)

    def tune_elasticnet(X, y):
        """Tune the ElasticNet base model; returns the best parameters."""
        def objective(trial):
            params = {
                'alpha': trial.suggest_float('alpha', 1e-4, 1.0, log=True),
                'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0)
            }
            model = ElasticNet(**params, max_iter=1000)
            return cross_val_score(model, X, y, cv=3, scoring='neg_mean_squared_error').mean()

        return run_study(objective, 'elasticnet_opt', 'elasticnet_opt_study', 400).best_params

    def tune_meta_xgb(base_models_preds, y):
        """
        Tune XGBRegressor as the meta-learner using predictions from base models as input.
        """
        def objective(trial):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'max_depth': trial.suggest_int('max_depth', 2, 6),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
            }
            model = XGBRegressor(**params, random_state=42)
            return cross_val_score(model, base_models_preds, y, cv=3, scoring='neg_mean_squared_error').mean()

        return run_study(objective, 'meta_xgb_stack', 'meta_xgb_stack_', 400).best_params

    xgb_study = tune_xgb(X_train_scaled, y_train)
    xgb_params = xgb_study.best_params
    enet_params = tune_elasticnet(X_train_scaled, y_train)

    xgb = XGBRegressor(**xgb_params, eval_metric='rmse', random_state=42)
    elasticnet = ElasticNet(**enet_params, max_iter=1000)
    rf_best_combined = RandomForestRegressor(**rf_best_params, random_state=42, n_jobs=-5)

    xgb.fit(X_train_scaled, y_train)
    elasticnet.fit(X_train_scaled, y_train)
    # Fit on the SCALED frame: this model is evaluated on scaled inputs below,
    # so training it on the unscaled columns would make that evaluation meaningless.
    rf_best_combined.fit(X_train_rf, y_train)

    # === Meta model ===
    base_models = [
        rf_best_combined,
        xgb,
        elasticnet
    ]

    # Targets are handed over as a plain array because the fold indices are
    # positional, while a pandas Series would be indexed by label.
    base_models_preds_train = generate_oof_predictions(
        base_models, X_train_scaled, y_train.to_numpy()
    )

    simple_avg = base_models_preds_train.mean(axis=1)
    print("Simple OOF ensemble MAE:", mean_absolute_error(y_train, simple_avg))

    np.save("meta_features.npy", base_models_preds_train)

    meta_params = tune_meta_xgb(base_models_preds_train, y_train)
    print("🔍 Best meta-model params:", meta_params)

    meta_model = XGBRegressor(**meta_params, random_state=42)

    # === Step 8: Ensemble  Regressor ===
    stack = StackingRegressor(
        estimators=[('rf', rf_best_combined), ('xgb', xgb), ('enet', elasticnet)],
        final_estimator=meta_model,
        n_jobs=-5
    )
    stack.fit(X_train_scaled, y_train)

    # === Step 9: Evaluate all models ===
    preds_rf = evaluate_model("RandomForest", rf_best_combined, X_train_rf, X_test_rf)
    preds_xgb = evaluate_model("XGBoost", xgb, X_train_scaled, X_test_scaled)
    preds_elasticnet = evaluate_model("ElasticNet", elasticnet, X_train_scaled, X_test_scaled)
    preds_stack = evaluate_model("Stacking Ensemble", stack, X_train_scaled, X_test_scaled)

    # === Step 9.5: Isotonic Regression ===
    # The calibrator is fitted on TRAINING rows only. Fitting it on y_test and
    # then backtesting on the same rows would leak the test labels into the
    # predictions being evaluated.
    # NOTE(review): these are in-sample stack predictions; out-of-fold stack
    # predictions would be a stricter calibration set.
    iso = IsotonicRegression(out_of_bounds='clip')
    iso.fit(stack.predict(X_train_scaled), y_train)
    calibrated_preds = iso.predict(preds_stack)

    # === Step 10: Choose the final model to backtest ===
    preds = calibrated_preds

    # === Step 11: Backtest Strategy ===
    #thresholds = [0.00005, 0.0001, 0.0002, 0.0005, 0.001]
    thresholds = [0.0005, 0.001]

    X_test_df = X_test_rf

    all_results = []

    # Uncalibrated stack metrics on the test set.
    mae = mean_absolute_error(y_test, preds_stack)
    rmse = np.sqrt(mean_squared_error(y_test, preds_stack))
    r2 = r2_score(y_test, preds_stack)

    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²: {r2:.4f}")

    # NOTE(review): `params` is only attached to the result; the multipliers
    # passed to the backtest are the notebook globals - confirm that
    # `combinations` is meant to drive them.
    for params in combinations:
        for thresh in thresholds:
            results = evaluate_regression_combo(
                X_test=X_test_df,
                preds=preds,
                labeled=labeled,
                df=df,
                avoid_funcs=avoid_funcs,
                SL_ATR_MULT=SL_ATR_MULT,
                TP_ATR_MULT=TP_ATR_MULT,
                TRAIL_START_MULT=TRAIL_START_MULT,
                TRAIL_STOP_MULT=TRAIL_STOP_MULT,
                TICK_VALUE=TICK_VALUE,
                is_same_session=is_same_session,
                long_thresh=thresh,
                short_thresh=-thresh
            )

            results['params'] = params
            results['threshold'] = thresh
            all_results.append(results)

            print(
                f"\n✅ LOOKAHEAD={LOOKAHEAD} | Threshold={thresh}"
                f"\nPnL: ${results['pnl']:.2f}"
                f"\nTrades: {results['trades']}"
                f"\nWin Rate: {results['win_rate']:.2%}"
                f"\nExpectancy: {results['expectancy']:.2f}"
                f"\nProfit Factor: {results['profit_factor']:.2f}"
                f"\nSharpe Ratio: {results['sharpe']:.2f}"
                f"\nLong Trades: {results['long_trades']} | Short Trades: {results['short_trades']}"
            )

            print("Avoid Hits:")
            for name, count in results['avoid_hits'].items():
                print(f" - {name}: {count}")

            print("\n🔢 Top 5 PnL trades:")
            print(results['results'].sort_values(by='pnl', ascending=False).head(5))

            print("\n🔻 Bottom 5 PnL trades:")
            print(results['results'].sort_values(by='pnl', ascending=True).head(5))

    if not all_results:
        raise RuntimeError(
            f"No backtest results for LOOKAHEAD={LOOKAHEAD}: 'combinations' is empty."
        )

    summary_df = pd.DataFrame([{
        'pnl': r['pnl'],
        'sharpe': r['sharpe'],
        'expectancy': r['expectancy'],
        'profit_factor': r['profit_factor'],
        'win_rate': r['win_rate'],
        'trades': r['trades'],
        'threshold': r['threshold'],
        **r['params']
    } for r in all_results])
    top = summary_df.sort_values(by='sharpe', ascending=False).head(5)
    print(top)

    metadata = {
        "lookahead": LOOKAHEAD,
        "train_range": [str(train["datetime"].min()), str(train["datetime"].max())],
        "test_range": [str(test["datetime"].min()), str(test["datetime"].max())],
        "features_used": combined_features,
        "rf_params": rf_best_params,
        "xgb_params": xgb_params,
        "enet_params": enet_params
    }
    with open(f"model_metadata_{LOOKAHEAD}.json", "w") as f:
        json.dump(metadata, f, indent=2)

    joblib.dump(meta_params, f"meta_model_params_LOOKAHEAD_{LOOKAHEAD}_05-14.pkl")
    joblib.dump(stack, f"stack_model_LOOKAHEAD_{LOOKAHEAD}_05-14.pkl")
    joblib.dump(scaler, f"scaler_LOOKAHEAD_{LOOKAHEAD}_05-14.pkl")
    # Dump the XGBoost study's trials so the file content matches its name.
    joblib.dump(xgb_study.trials_dataframe(), f"xgb_trials_df_{LOOKAHEAD}_05-14.pkl")

    # Headline metrics are those of the last evaluated combination / threshold.
    final = all_results[-1]
    return {
        'lookahead': LOOKAHEAD,
        'pnl': final['pnl'],
        'win_rate': final['win_rate'],
        'expectancy': final['expectancy'],
        'profit_factor': final['profit_factor'],
        'sharpe': final['sharpe'],
        'trades': final['trades'],
        'best_params': rf_best_params,
        'preds_rf': preds_rf,
        'preds_xgb': preds_xgb,
        'preds_elasticnet': preds_elasticnet,
        'preds_stack': preds_stack,
        'results_df': final['results']
    }

In [None]:
# Horizons to train; run_lookahead reads labeled_data_{lookahead}.parquet for each.
lookahead_values = [5, 15]

# n_jobs=1: a single worker processes the horizons.
lookahead_results = Parallel(n_jobs=1)(
    [delayed(run_lookahead)(horizon) for horizon in lookahead_values]
)

# Visualize

In [None]:
# Plot the first 100 test-set predictions of every model, one figure per lookahead.
# run_lookahead returns prediction arrays (preds_rf, preds_xgb, preds_elasticnet,
# preds_stack), not the fitted models, and its scaled test matrix is local to
# that function, so the stored arrays are plotted directly.
for result in lookahead_results:
    plt.figure(figsize=(12, 4))
    plt.plot(result['preds_rf'][:100], label='RF')
    plt.plot(result['preds_xgb'][:100], label='XGB')
    plt.plot(result['preds_elasticnet'][:100], label='ElasticNet')
    plt.plot(result['preds_stack'][:100], label='Stack', linewidth=2)
    plt.title(f"First 100 Test Predictions (LOOKAHEAD={result['lookahead']})")
    plt.legend()  # the labels above are only shown once a legend is drawn
    plt.tight_layout()
    plt.show()

In [None]:
# Plot cumulative PnL over time for each lookahead's trade list.
for result in lookahead_results:
    # Use a dedicated name: rebinding `df` here would replace the global frame
    # that run_lookahead passes to the backtest as `df=df`.
    trades_df = result.get('results_df')
    if trades_df is None:
        print(f"No per-trade results for LOOKAHEAD={result['lookahead']}; skipping PnL plot.")
        continue

    trades_df = trades_df.sort_values(by='datetime').copy()  # Ensure correct order
    trades_df['cumulative_pnl'] = trades_df['pnl'].cumsum()

    plt.figure(figsize=(12, 4))
    plt.plot(trades_df['datetime'], trades_df['cumulative_pnl'], label='Cumulative PnL', color='green')
    plt.title(f"Cumulative PnL (LOOKAHEAD={result['lookahead']})")
    plt.xlabel("Datetime")
    plt.ylabel("PnL")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Correlation between the base models' test predictions, one heatmap per lookahead.
# lookahead_results is a list with one dict per lookahead, so it has to be
# iterated; indexing the list itself with a string key raises TypeError.
model_labels = ['RF', 'XGB', 'ENet']
for result in lookahead_results:
    # One row per model, so np.corrcoef returns a 3x3 model-by-model matrix.
    preds_matrix = np.vstack([result['preds_rf'], result['preds_xgb'], result['preds_elasticnet']])
    corr_matrix = np.corrcoef(preds_matrix)

    plt.figure(figsize=(6, 4))
    sns.heatmap(corr_matrix, annot=True, xticklabels=model_labels, yticklabels=model_labels, cmap='coolwarm', fmt=".2f")
    plt.title(f"Correlation Between Base Model Predictions (LOOKAHEAD={result['lookahead']})")
    plt.show()

# Test Model

# Sort and Plot

In [None]:
# Pick the lookahead run with the highest PnL, predict with its stacked model,
# then show a confusion matrix and a classification report.
#
# NOTE(review): this cell does not match what run_lookahead returns in this file:
#   - the returned dict has no 'results' or 'stack' key (it exposes 'pnl' and
#     the 'preds_*' arrays), so both lookups below would fail;
#   - X_test_scaled and y_test are assigned inside run_lookahead; confirm they
#     also exist as notebook globals before relying on them here;
#   - class_mapping and confusion_matrix are not defined/imported in the part
#     of the notebook visible here - confirm where they come from;
#   - the stack is a StackingRegressor, so y_pred is continuous, while the
#     confusion matrix / classification report below work on class labels.
# The recorded "max() arg is an empty sequence" error is what max() raises for
# an empty iterable, i.e. lookahead_results (or an x['results']) was empty.

# Predictions
# y_pred = best_lookahead.predict(X_test)
best_lookahead = max(lookahead_results, key=lambda x: max(r['pnl'] for r in x['results']))
y_pred = best_lookahead['stack'].predict(X_test_scaled)

# Confusion Matrix
labels = sorted(class_mapping)  # Make sure the order matches
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Display Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=labels, digits=2))

ValueError: max() arg is an empty sequence