# Import Libraries

In [None]:
import os
import pandas as pd
import platform
import numpy as np
from itertools import product
import matplotlib.pyplot as plt
from datetime import timedelta
from collections import defaultdict
import joblib
import json
import warnings
import time
from scipy.stats.mstats import winsorize
import numba

# TA Indicators
from ta.volatility import BollingerBands, AverageTrueRange
from ta.trend import SMAIndicator, EMAIndicator, MACD, ADXIndicator, PSARIndicator, CCIIndicator
from ta.momentum import RSIIndicator, StochasticOscillator, PercentagePriceOscillator, ROCIndicator
#

# Tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, Dropout, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import StandardScaler, RobustScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import Huber, MeanSquaredError
import tensorflow as tf
#

# Scikit-learn
from sklearn.base import clone, BaseEstimator, RegressorMixin, ClassifierMixin
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, root_mean_squared_error, mean_squared_error, mean_absolute_error, r2_score, accuracy_score, classification_report, f1_score, confusion_matrix
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight

# Models and Training
from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
import lightgbm as lgb
import optuna
import seaborn as sns
import shap
#

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", message=".*There are no meaningful features.*", category=UserWarning)
optuna.logging.set_verbosity(optuna.logging.INFO)

In [None]:
# --- Data loading cell -------------------------------------------------------
# Reads every ';'-separated, header-less OHLCV file in `folder_path`, merges
# them into one frame indexed by New York time, and builds 5-minute and
# 15-minute candles from it.
folder_path = "./../data/"
column_names = ['datetime', 'open', 'high', 'low', 'close', 'volume']
df_list = []

# Set emoji-compatible font based on OS (optional; other platforms keep the default)
system = platform.system()
if system == 'Windows':
    plt.rcParams['font.family'] = 'Segoe UI Emoji'
elif system == 'Linux':
    plt.rcParams['font.family'] = 'Noto Color Emoji'

# Read all files in sorted order: os.listdir() returns entries in arbitrary
# order, and the duplicate handling below keeps the FIRST occurrence of a
# timestamp, so an unsorted listing makes the result depend on the filesystem.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(('.csv', '.txt')):
        file_path = os.path.join(folder_path, filename)
        # Malformed rows are reported as warnings instead of aborting the load.
        df_temp = pd.read_csv(file_path, sep=';', header=None, names=column_names,
                              on_bad_lines='warn')
        df_list.append(df_temp)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not df_list:
    raise ValueError(f"No .csv or .txt files found in '{folder_path}'.")

# Combine and clean
df = pd.concat(df_list, ignore_index=True)
df['datetime'] = pd.to_datetime(df['datetime'], utc=True).dt.tz_convert('America/New_York')
df = df.drop_duplicates(subset='datetime', keep='first').reset_index(drop=True)
df = df.sort_values('datetime').reset_index(drop=True)
df[['open', 'high', 'low', 'close', 'volume']] = df[['open', 'high', 'low', 'close', 'volume']].astype(float)
df = df.set_index('datetime')  # Set index ONCE before resampling

# === 📊 Resample to 5-minute and 15-minute candles ===
# One shared aggregation spec keeps both timeframes consistent.
ohlcv_agg = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
}

# dropna() removes bins that contain no source rows (their OHLC values are NaN).
df_5min = df.resample('5min').agg(ohlcv_agg).dropna()

# Resampled from the original frame (not from df_5min) for accuracy.
df_15min = df.resample('15min').agg(ohlcv_agg).dropna()

# Now df_5min and df_15min have datetime as their index. Ready for features!
print("5-minute data shape:", df_5min.shape)
print("15-minute data shape:", df_15min.shape)
print("\n5-minute data head:\n", df_5min.head())
print("\n15-minute data head:\n", df_15min.head())

# Initialize features

In [None]:
# --- Helper Functions for Custom Features (Keep these as they are) ---
def add_price_vs_ma(df, price_col='close', ma_col_name='EMA_20', new_col_name_suffix='_vs_EMA20'):
    """Add the ratio ``price / moving average`` as a new column.

    The new column is named ``price_col + new_col_name_suffix``. When either
    input column is absent the frame is returned untouched. ``df`` is modified
    in place and also returned.
    """
    needed = (ma_col_name, price_col)
    if all(name in df.columns for name in needed):
        ratio = df[price_col] / df[ma_col_name]
        df[f"{price_col}{new_col_name_suffix}"] = ratio
    return df

def add_ma_vs_ma(df, ma1_col_name='EMA_10', ma2_col_name='EMA_20', new_col_name_suffix='_vs_EMA20'):
    """Add the ratio of two moving-average columns (``ma1 / ma2``).

    The result is stored under ``ma1_col_name + new_col_name_suffix``. Nothing
    is added unless both columns exist. ``df`` is modified in place and returned.
    """
    if ma1_col_name not in df.columns or ma2_col_name not in df.columns:
        return df
    target = ma1_col_name + new_col_name_suffix
    df[target] = df[ma1_col_name].div(df[ma2_col_name])
    return df

def add_ma_slope(df, ma_col_name='EMA_10', new_col_name_suffix='_Slope_10', periods=1):
    """Add the per-bar slope of a moving-average column.

    The slope is ``diff(periods) / periods``. Note that the output column is
    named by ``new_col_name_suffix`` alone (it is NOT prefixed with
    ``ma_col_name``). Frames lacking ``ma_col_name`` are returned unchanged;
    otherwise ``df`` is modified in place and returned.
    """
    if ma_col_name not in df.columns:
        return df
    change = df[ma_col_name].diff(periods)
    df[new_col_name_suffix] = change / periods
    return df

def add_rsi_signals(df, rsi_col_name='RSI_14', ob_level=70, os_level=30):
    """Add 0/1 overbought and oversold flags derived from an RSI column.

    Overbought means strictly above ``ob_level``; oversold means strictly below
    ``os_level``. Columns are named ``<rsi>_Is_Overbought_<ob>`` and
    ``<rsi>_Is_Oversold_<os>``. No-op when the RSI column is missing.
    """
    if rsi_col_name not in df.columns:
        return df
    rsi = df[rsi_col_name]
    flags = {
        f'_Is_Overbought_{ob_level}': rsi > ob_level,
        f'_Is_Oversold_{os_level}': rsi < os_level,
    }
    for tail, mask in flags.items():
        df[rsi_col_name + tail] = mask.astype(int)
    return df

def add_stoch_signals(df, stoch_k_col_name='Stoch_K_14_3', ob_level=80, os_level=20):
    """Add 0/1 overbought/oversold flags for a stochastic %K column.

    A bar is overbought when %K is strictly greater than ``ob_level`` and
    oversold when strictly less than ``os_level``. Returns ``df`` (mutated in
    place); does nothing if the %K column is absent.
    """
    if stoch_k_col_name in df.columns:
        k_line = df[stoch_k_col_name]
        overbought_col = stoch_k_col_name + f'_Is_Overbought_{ob_level}'
        oversold_col = stoch_k_col_name + f'_Is_Oversold_{os_level}'
        df[overbought_col] = k_line.gt(ob_level).astype(int)
        df[oversold_col] = k_line.lt(os_level).astype(int)
    return df

def add_macd_cross_signal(df, macd_col_name='MACD_12_26_9', signal_col_name='MACD_Signal_12_26_9'):
    """Flag bars where the MACD line crosses its signal line.

    Writes ``<macd>_Cross_Signal``: 1 when MACD moved from below to above the
    signal line versus the previous bar, -1 for the opposite move, 0 otherwise.
    No-op when either input column is missing.
    """
    if macd_col_name not in df.columns or signal_col_name not in df.columns:
        return df

    macd_line, signal_line = df[macd_col_name], df[signal_col_name]
    prev_macd, prev_signal = macd_line.shift(1), signal_line.shift(1)

    bullish_cross = (macd_line > signal_line) & (prev_macd < prev_signal)
    bearish_cross = (macd_line < signal_line) & (prev_macd > prev_signal)

    # The two masks are mutually exclusive, so the evaluation order is irrelevant.
    df[macd_col_name + '_Cross_Signal'] = np.select([bullish_cross, bearish_cross], [1, -1], default=0)
    return df

def add_price_vs_bb(df, price_col='close', bb_upper_col='BB_Upper_20_2', bb_lower_col='BB_Lower_20_2'):
    """Add 0/1 flags for price closing outside the Bollinger Bands.

    ``<price>_vs_BB_Upper`` is 1 when price is strictly above the upper band and
    ``<price>_vs_BB_Lower`` is 1 when price is strictly below the lower band.
    The frame is returned unchanged if any of the three columns is missing.
    """
    needed = {price_col, bb_upper_col, bb_lower_col}
    if not needed.issubset(df.columns):
        return df
    price = df[price_col]
    above_upper = price > df[bb_upper_col]
    below_lower = price < df[bb_lower_col]
    df[price_col + '_vs_BB_Upper'] = above_upper.astype(int)
    df[price_col + '_vs_BB_Lower'] = below_lower.astype(int)
    return df

def add_psar_flip_signal(df, psar_col_name='PSAR'):
    """Mark bars where the Parabolic SAR switches sides relative to ``close``.

    An "uptrend" bar is one whose PSAR is strictly below the close. The new
    ``<psar>_Flip_Signal`` column is 1 when the current bar is an uptrend and
    the previous one was not, -1 for the reverse, 0 otherwise. Comparisons
    against the shifted (NaN) first row evaluate to False, so the first bar is
    treated as having no prior uptrend. No-op when ``psar_col_name`` or
    ``close`` is missing.
    """
    if psar_col_name not in df.columns or 'close' not in df.columns:
        return df

    sar, close = df[psar_col_name], df['close']
    bullish = sar < close
    was_bullish = sar.shift(1) < close.shift(1)

    turned_up = bullish & ~was_bullish
    turned_down = was_bullish & ~bullish

    df[psar_col_name + '_Flip_Signal'] = np.select([turned_up, turned_down], [1, -1], default=0)
    return df

def add_daily_vwap(df, high_col='high', low_col='low', close_col='close', volume_col='volume', new_col_name='VWAP'):
    """Add a volume-weighted average price that resets every calendar day.

    VWAP is ``cumsum(typical_price * volume) / cumsum(volume)`` where both
    cumulative sums restart at each new ``df.index.date`` and the typical price
    is ``(high + low + close) / 3``.

    Args:
        df: Frame with a ``DatetimeIndex`` and the high/low/close/volume columns.
        high_col, low_col, close_col, volume_col: Names of the input columns.
        new_col_name: Name of the VWAP column to create.

    Returns:
        ``df`` with the VWAP column added in place. If the index is not a
        ``DatetimeIndex`` an error is printed and ``df`` is returned unchanged.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        print("Error: DataFrame index must be DatetimeIndex for daily VWAP.")
        return df

    session_day = df.index.date
    tpv = ((df[high_col] + df[low_col] + df[close_col]) / 3) * df[volume_col]
    cumulative_tpv = tpv.groupby(session_day).cumsum()
    cumulative_volume = df[volume_col].groupby(session_day).cumsum()

    # Division by a zero cumulative volume can yield +/-inf; map those to NaN.
    # The result is assigned back explicitly: calling replace(inplace=True) on
    # df[col] acts on a temporary under pandas Copy-on-Write and would be lost.
    vwap = cumulative_tpv / cumulative_volume
    df[new_col_name] = vwap.replace([np.inf, -np.inf], np.nan)
    return df

def add_candle_features(df):
    """Add basic candle-geometry columns computed from open/high/low/close.

    Adds ``Candle_Range`` (high - low), ``Candle_Body`` (|close - open|),
    ``Upper_Wick``, ``Lower_Wick`` and ``Body_vs_Range`` (body / range, with
    inf and NaN results mapped to 0). ``df`` is modified in place and returned.
    """
    opens, closes = df['open'], df['close']
    highs, lows = df['high'], df['low']

    df['Candle_Range'] = highs - lows
    df['Candle_Body'] = (closes - opens).abs()
    df['Upper_Wick'] = highs - np.maximum(opens, closes)
    df['Lower_Wick'] = np.minimum(opens, closes) - lows

    # Zero-range candles produce inf or NaN ratios; both are treated as 0.
    body_share = df['Candle_Body'] / df['Candle_Range']
    body_share = body_share.replace([np.inf, -np.inf], np.nan)
    df['Body_vs_Range'] = body_share.fillna(0)
    return df

def add_candlestick_patterns(df):
    """Add 0/1 flags for a few simple candlestick patterns.

    Columns added:
        * ``Is_Doji``: body is less than 10% of the candle range.
        * ``Is_Hammer``: upper wick smaller than the body, lower wick more than
          twice the body, and open or close in the top 30% of the range.
        * ``Is_Engulfing_Bullish`` / ``Is_Engulfing_Bearish``: the current body
          reverses and engulfs the previous bar's body.

    The candle-geometry columns (``Candle_Range``, ``Candle_Body``,
    ``Upper_Wick``, ``Lower_Wick``) are computed via ``add_candle_features``
    when ANY of them is missing; previously only two of the four were checked,
    so a frame lacking just the wick columns raised ``KeyError``.

    Returns:
        ``df`` with the pattern columns added in place.
    """
    geometry_cols = ('Candle_Range', 'Candle_Body', 'Upper_Wick', 'Lower_Wick')
    if any(col not in df.columns for col in geometry_cols):
        df = add_candle_features(df)

    # NaN ratios (zero-range candles) compare False and therefore flag as 0.
    body_share = (df['Candle_Body'] / df['Candle_Range']).replace([np.inf, -np.inf], np.nan)
    df['Is_Doji'] = (body_share < 0.1).astype(int)

    # Price level marking the bottom of the top 30% of the candle range.
    candle_range = df['Candle_Range'].replace([np.inf, -np.inf], np.nan)
    top_zone = df['high'] - candle_range * 0.3
    df['Is_Hammer'] = (
        (df['Upper_Wick'] < df['Candle_Body'])
        & (df['Lower_Wick'] > 2 * df['Candle_Body'])
        & ((df['close'] > top_zone) | (df['open'] > top_zone))
    ).astype(int)

    prev_open = df['open'].shift(1)
    prev_close = df['close'].shift(1)
    df['Is_Engulfing_Bullish'] = (
        (df['close'] > df['open']) & (prev_close < prev_open)
        & (df['close'] > prev_open) & (df['open'] < prev_close)
    ).astype(int)
    df['Is_Engulfing_Bearish'] = (
        (df['close'] < df['open']) & (prev_close > prev_open)
        & (df['close'] < prev_open) & (df['open'] > prev_close)
    ).astype(int)
    return df

def add_return_features(df, price_col='close'):
    """Add log and simple return columns computed from ``price_col``.

    Adds ``Log_Return_1``, ``Log_Return_3`` and ``Log_Return_6`` (log of the
    price ratio over 1, 3 and 6 bars) plus ``Simple_Return_1`` (one-bar percent
    change). Zero prices are treated as missing for the log returns, and any
    +/-inf result is replaced with NaN.

    Args:
        df: Frame containing ``price_col``.
        price_col: Name of the price column to use.

    Returns:
        ``df`` with the return columns added in place.
    """
    price = df[price_col]
    safe_price = price.replace(0, np.nan)  # avoid log(0) and division by zero
    infinities = [np.inf, -np.inf]

    for horizon in (1, 3, 6):
        ratio = safe_price / price.shift(horizon).replace(0, np.nan)
        # Assign the cleaned series directly; replace(inplace=True) on df[col]
        # is chained assignment and does not update df under Copy-on-Write.
        df[f'Log_Return_{horizon}'] = np.log(ratio).replace(infinities, np.nan)

    df['Simple_Return_1'] = price.pct_change(1).replace(infinities, np.nan)
    return df

def add_rolling_stats(df, price_col='close', window1=14, window2=30):
    """Add rolling statistics of one-bar percentage returns.

    Adds ``Rolling_Std_Dev_<window1>`` (standard deviation over ``window1``
    bars) and ``Rolling_Skew_<window2>`` / ``Rolling_Kurtosis_<window2>`` (over
    ``window2`` bars). Infinite returns are treated as missing. ``df`` is
    modified in place and returned.
    """
    pct = df[price_col].pct_change(1)
    pct = pct.replace([np.inf, -np.inf], np.nan)

    short_roll = pct.rolling(window=window1)
    long_roll = pct.rolling(window=window2)

    df[f'Rolling_Std_Dev_{window1}'] = short_roll.std()
    df[f'Rolling_Skew_{window2}'] = long_roll.skew()
    df[f'Rolling_Kurtosis_{window2}'] = long_roll.kurt()
    return df

def add_lagged_features(df, cols_to_lag, lags=(1, 3, 6)):
    """Add lagged copies of selected columns.

    For every column in ``cols_to_lag`` that exists in ``df`` and every lag in
    ``lags``, a column ``<col>_Lag_<lag>`` holding ``df[col].shift(lag)`` is
    added. Columns not present in ``df`` are skipped silently.

    Args:
        df: Frame to extend (modified in place).
        cols_to_lag: Iterable of column names to lag.
        lags: Iterable of positive bar offsets. The default is an immutable
            tuple rather than a list, so it cannot be shared and mutated
            across calls.

    Returns:
        ``df`` with the lag columns added.
    """
    for col in cols_to_lag:
        if col not in df.columns:
            continue
        for lag in lags:
            df[f'{col}_Lag_{lag}'] = df[col].shift(lag)
    return df

# --- Main Feature Generation Function ---
def add_all_features(df, suffix=''):
    """Build the full technical-indicator feature set for one timeframe.

    Works on a copy of ``df`` (the caller's frame is not modified), computes
    volume, volatility, trend, momentum, candle, return, rolling-statistic and
    lag features, then appends ``suffix`` to the name of every column that is
    not one of the base OHLCV columns.

    Args:
        df: Frame with 'open', 'high', 'low', 'close' and 'volume' columns.
            A DatetimeIndex is expected; without one only a warning is printed,
            and the VWAP helper then skips its column (so 'close_vs_VWAP' is
            skipped as well, because its input column is missing).
        suffix: String appended to every generated feature name (e.g. '_5m').

    Returns:
        A new DataFrame holding the unsuffixed base columns plus the suffixed
        feature columns.

    Raises:
        ValueError: If any base OHLCV column is missing.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        print(f"Warning: DataFrame for suffix '{suffix}' does not have a DatetimeIndex. Some features might not work as expected (e.g., VWAP).")
    df = df.copy()
    base_cols = ['open', 'high', 'low', 'close', 'volume']
    if not all(col in df.columns for col in base_cols):
         raise ValueError(f"DataFrame must contain {base_cols} for suffix {suffix}")

    # --- Volume features ---
    df['Volume_SMA_20'] = df['volume'].rolling(window=20).mean()
    df = add_daily_vwap(df, new_col_name='VWAP')
    df = add_price_vs_ma(df, ma_col_name='VWAP', new_col_name_suffix='_vs_VWAP')

    # --- Volatility: Bollinger Bands (20, 2) and ATR(14) ---
    bb_indicator = BollingerBands(close=df['close'], window=20, window_dev=2)
    df['BB_Upper_20_2'] = bb_indicator.bollinger_hband()
    df['BB_Lower_20_2'] = bb_indicator.bollinger_lband()
    df['BB_Mid_20'] = bb_indicator.bollinger_mavg()
    df['BB_Width_20_2'] = bb_indicator.bollinger_wband()
    df['Percent_B_20_2'] = bb_indicator.bollinger_pband()
    df = add_price_vs_bb(df, bb_upper_col='BB_Upper_20_2', bb_lower_col='BB_Lower_20_2')
    df['ATR_14'] = AverageTrueRange(df['high'], df['low'], df['close'], window=14).average_true_range()

    # --- Trend: simple and exponential moving averages ---
    df['SMA_10'] = SMAIndicator(df['close'], window=10).sma_indicator()
    df['SMA_20'] = SMAIndicator(df['close'], window=20).sma_indicator()
    df['SMA_50'] = SMAIndicator(df['close'], window=50).sma_indicator()
    df['EMA_10'] = EMAIndicator(df['close'], window=10).ema_indicator()
    df['EMA_20'] = EMAIndicator(df['close'], window=20).ema_indicator()
    df['EMA_50'] = EMAIndicator(df['close'], window=50).ema_indicator()
    
    # Relative-position features. Resulting names: 'close_vs_EMA20',
    # 'EMA_10_vs_EMA20' and '_Slope_10' (add_ma_slope uses the suffix as the
    # complete column name).
    df = add_price_vs_ma(df, ma_col_name='EMA_20', new_col_name_suffix='_vs_EMA20')
    df = add_ma_vs_ma(df, ma1_col_name='EMA_10', ma2_col_name='EMA_20', new_col_name_suffix='_vs_EMA20')
    df = add_ma_slope(df, ma_col_name='EMA_10', new_col_name_suffix='_Slope_10')

    # --- MACD (12, 26, 9) and its cross signal ---
    macd = MACD(close=df['close'], window_slow=26, window_fast=12, window_sign=9)
    df['MACD_12_26_9'] = macd.macd()
    df['MACD_Signal_12_26_9'] = macd.macd_signal()
    df['MACD_Hist_12_26_9'] = macd.macd_diff()
    df = add_macd_cross_signal(df)

    # --- ADX / directional indicators (14) ---
    adx_indicator = ADXIndicator(df['high'], df['low'], df['close'], window=14)
    df['ADX_14'] = adx_indicator.adx()
    df['Plus_DI_14'] = adx_indicator.adx_pos()
    df['Minus_DI_14'] = adx_indicator.adx_neg()
    
    # --- Parabolic SAR (library defaults) and flip signal ---
    psar_indicator = PSARIndicator(df['high'], df['low'], df['close'])
    df['PSAR'] = psar_indicator.psar()
    df = add_psar_flip_signal(df)

    df['CCI_20'] = CCIIndicator(df['high'], df['low'], df['close'], window=20).cci()

    # --- Momentum: RSI(14) and stochastic oscillator (14, 3) ---
    df['RSI_14'] = RSIIndicator(df['close'], window=14).rsi()
    df = add_rsi_signals(df, rsi_col_name='RSI_14')
    
    stoch = StochasticOscillator(df['high'], df['low'], df['close'], window=14, smooth_window=3)
    df['Stoch_K_14_3'] = stoch.stoch()
    df['Stoch_D_3'] = stoch.stoch_signal()
    df = add_stoch_signals(df, stoch_k_col_name='Stoch_K_14_3')

    # --- MODIFIED SECTION FOR PPO AND ROC ---
    df['PPO_12_26_9'] = PercentagePriceOscillator(df['close'], window_slow=26, window_fast=12, window_sign=9).ppo()
    df['ROC_10'] = ROCIndicator(df['close'], window=10).roc()

    # Explicitly handle potential non-numeric types and inf values for PPO and ROC
    for col_name in ['PPO_12_26_9', 'ROC_10']:
        if col_name in df.columns:
            df[col_name] = df[col_name].replace([np.inf, -np.inf], np.nan) # Replace inf with NaN
            df[col_name] = pd.to_numeric(df[col_name], errors='coerce')   # Convert to numeric, coerce errors to NaN
    # --- END OF MODIFIED SECTION ---

    # --- Price action, returns and rolling statistics ---
    df = add_candle_features(df)
    df = add_candlestick_patterns(df)
    df = add_return_features(df)
    df = add_rolling_stats(df)
    
    # Lags 1, 2 and 3 are generated here (not the helper's default lags).
    cols_to_lag = ['close', 'RSI_14', 'Candle_Body', 'Volume_SMA_20'] # Ensure these exist before lagging
    # Check if cols_to_lag are actually present before trying to lag them
    valid_cols_to_lag = [col for col in cols_to_lag if col in df.columns]
    df = add_lagged_features(df, valid_cols_to_lag, lags=[1,2,3])

    # Everything except the base OHLCV columns is a generated feature and gets
    # the timeframe suffix.
    current_cols = list(df.columns)
    new_feature_cols = [col for col in current_cols if col not in base_cols] # Features are non-base cols
    
    rename_dict = {col: col + suffix for col in new_feature_cols}
    df.rename(columns=rename_dict, inplace=True)
    
    return df

# --- Time & Session Features (Keep as is) ---
def add_time_session_features(df):
    """Add clock, cyclical-encoding and trading-session columns from the index.

    Uses the hour, minute and weekday of the DatetimeIndex exactly as stored
    (no timezone conversion happens here). Returns a copy of ``df`` with:
    ``Hour_of_Day``, ``Minute_of_Hour``, ``Day_of_Week``; sine/cosine encodings
    of time-of-day and day-of-week; and 0/1 session flags defined by hour
    ranges. If the index is not a DatetimeIndex an error is printed and the
    original frame is returned unchanged.
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        print("Error: DataFrame index must be DatetimeIndex for time/session features.")
        return df

    df = df.copy()
    stamps = df.index
    df['Hour_of_Day'] = stamps.hour
    df['Minute_of_Hour'] = stamps.minute
    df['Day_of_Week'] = stamps.dayofweek

    hour = df['Hour_of_Day']
    minute = df['Minute_of_Hour']
    weekday = df['Day_of_Week']

    # Cyclical encodings: 24-hour clock and 7-day week.
    full_turn = 2 * np.pi
    hours_elapsed = hour + minute / 60.0
    df['Time_Sin'] = np.sin(full_turn * hours_elapsed / 24.0)
    df['Time_Cos'] = np.cos(full_turn * hours_elapsed / 24.0)
    df['Day_Sin'] = np.sin(full_turn * weekday / 7.0)
    df['Day_Cos'] = np.cos(full_turn * weekday / 7.0)

    def _hours_in(start, stop):
        # Half-open hour window: start <= hour < stop.
        return (hour >= start) & (hour < stop)

    session_masks = {
        'Is_Asian_Session': (hour >= 20) | (hour < 5),  # wraps past midnight
        'Is_London_Session': _hours_in(3, 12),
        'Is_NY_Session': _hours_in(8, 17),
        'Is_Overlap': _hours_in(8, 12),
        # 09:30 up to (but excluding) 10:30
        'Is_US_Open_Hour': ((hour == 9) & (minute >= 30)) | ((hour == 10) & (minute < 30)),
        # all of hour 15, plus exactly 16:00
        'Is_US_Close_Hour': (hour == 15) | ((hour == 16) & (minute == 0)),
    }
    for column, mask in session_masks.items():
        df[column] = mask.astype(int)
    return df

In [None]:
# def avoid_news(row):
#     ts = row["datetime"]
#     return any(start <= ts <= end for (start, end) in news_windows)

# def avoid_hour_18_19(row):
#     """
#     Avoid trading in the first hour of the session (18:00 to 19:00 inclusive).
#     """
#     if not pd.api.types.is_datetime64_any_dtype(row['datetime']):
#         return False
#     hour = row['datetime'].hour
#     return hour == 18

# Registry of optional "avoid trading" predicates, keyed by name.
# Currently empty; the candidate entries are disabled:
#   'avoid_hour_18_19': avoid_hour_18_19
#   'news_window': avoid_news
avoid_funcs = dict()

# Strategy hyper-parameter grid: every list is one axis of the search space.
param_grid_strategy = dict(
    SL_ATR_MULT=[1.0, 1.5, 0.5],
    TP_ATR_MULT=[2.0, 3.0, 4.0],
    TRAIL_START_MULT=[0.5, 1.0],
    TRAIL_STOP_MULT=[0.5, 1.0],
    TICK_VALUE=[5],
)

# Cartesian product of all axes -> one dict per parameter combination.
keys = tuple(param_grid_strategy.keys())
values = tuple(param_grid_strategy.values())
combinations = [
    {name: setting for name, setting in zip(keys, combo)}
    for combo in product(*values)
]

# Cleanup

In [None]:
# This is Cell 9: Feature Engineering Execution
#
# Builds indicator features for the 5-minute and 15-minute candles, adds
# time/session features to the 5-minute frame, and joins the 15-minute features
# onto the 5-minute rows without look-ahead (see the merge step below).

print("--- Starting Feature Engineering Execution ---")

# Ensure df_5min and df_15min are your REAL data from Cell 2
if 'df_5min' not in locals() or not isinstance(df_5min, pd.DataFrame) or df_5min.empty:
    raise ValueError("REAL df_5min (from Cell 2) is not loaded, not a DataFrame, or is empty before feature engineering!")
if 'df_15min' not in locals() or not isinstance(df_15min, pd.DataFrame) or df_15min.empty:
    raise ValueError("REAL df_15min (from Cell 2) is not loaded, not a DataFrame, or is empty before feature engineering!")

# Ensure required functions are defined
if 'add_all_features' not in locals() or 'add_time_session_features' not in locals():
    raise NameError("Functions 'add_all_features' or 'add_time_session_features' are not defined. Ensure Cell 5 was run.")

print(f"Input df_5min shape for features: {df_5min.shape}")
# Ensure df_5min has base OHLCV columns before passing to add_all_features
expected_cols_5min = ['open', 'high', 'low', 'close', 'volume']
if not all(col in df_5min.columns for col in expected_cols_5min):
    raise ValueError(f"df_5min is missing one of the required columns: {expected_cols_5min}. Found: {df_5min.columns.tolist()}")
# add_all_features works on its own copy, so no defensive .copy() is needed.
df_5min_features = add_all_features(df_5min, suffix='_5m')
print(f"Output df_5min_features shape: {df_5min_features.shape}")


print(f"\nInput df_15min shape for features: {df_15min.shape}")
# Ensure df_15min has base OHLCV columns
expected_cols_15min = ['open', 'high', 'low', 'close', 'volume']
if not all(col in df_15min.columns for col in expected_cols_15min):
    raise ValueError(f"df_15min is missing one of the required columns: {expected_cols_15min}. Found: {df_15min.columns.tolist()}")
df_15min_features = add_all_features(df_15min, suffix='_15m')
print(f"Output df_15min_features shape: {df_15min_features.shape}")


print("\nAdding time and session features to 5-minute data...")
# Time/session columns ('Hour_of_Day', ...) are added after suffixing, so they
# carry no '_5m' suffix. add_time_session_features also returns a copy.
df_5min_final_features = add_time_session_features(df_5min_features)
print(f"Output df_5min_final_features shape: {df_5min_final_features.shape}")


print("\nMerging 5-minute and 15-minute features...")
# Only the suffixed 15-minute feature columns are merged; the 15-minute base
# OHLCV columns are left out so they do not collide with the 5-minute ones.
cols_to_merge_15m = [col for col in df_15min_features.columns if col.endswith('_15m')]

if not cols_to_merge_15m:
    # df_merged will then essentially equal df_5min_final_features; this usually
    # points to a suffixing problem in add_all_features for the 15-minute run.
    print("Warning: No columns ending with '_15m' found in df_15min_features to merge.")

# Ensure inputs to merge are valid and have sorted DatetimeIndex
if not isinstance(df_5min_final_features.index, pd.DatetimeIndex) or \
   not isinstance(df_15min_features.index, pd.DatetimeIndex):
    raise TypeError("Indexes of DataFrames to merge must be DatetimeIndex.")

if df_5min_final_features.empty:
    raise ValueError("df_5min_final_features DataFrame for merging is empty!")
# Only try to merge if there are 15m columns to merge
if cols_to_merge_15m and df_15min_features[cols_to_merge_15m].empty:
    print("Warning: The selection of 15m feature columns resulted in an empty DataFrame. Merging will likely result in only 5m features.")

# --- Look-ahead guard ---------------------------------------------------------
# resample() labels each candle with its START time, but the candle's features
# (close, high, indicators, ...) are only known once the candle has ENDED.
# A 15-minute candle labelled T is complete at T+15min, while the 5-minute row
# labelled t is complete at t+5min. The 5-minute row may therefore only use
# 15-minute candles with T + 15min <= t + 5min, i.e. T + 10min <= t.
# Shifting the 15-minute labels forward by (15min - 5min) before the backward
# as-of join enforces exactly that; joining on the raw start labels would leak
# up to 10 minutes of future data into each 5-minute row.
bar_5m = pd.Timedelta(minutes=5)
bar_15m = pd.Timedelta(minutes=15)
availability_shift = bar_15m - bar_5m

left_5m = df_5min_final_features.sort_index()
if cols_to_merge_15m:
    right_15m = df_15min_features[cols_to_merge_15m].sort_index()
    right_15m.index = right_15m.index + availability_shift
else:
    # Keep `right` a DataFrame so merge_asof still runs and adds no columns.
    right_15m = pd.DataFrame(index=left_5m.index)

df_merged = pd.merge_asof(
    left=left_5m,
    right=right_15m,
    left_index=True,
    right_index=True,
    direction='backward'  # most recent COMPLETED 15-minute candle
)

print("--- Feature Engineering Execution COMPLETE ---")
print(f"Final df_merged shape: {df_merged.shape}")
# print("Final df_merged columns:", df_merged.columns.tolist()) # Uncomment to verify all columns
# print("Final df_merged head:\n", df_merged.head())

In [None]:
import pandas as pd
import numpy as np
import numba # Make sure numba is imported in your first cell (Cell 2)

# --- Target Labeling Function Definitions ---

# --- Add Target Labels to REAL df_merged ---
# This section will ONLY run if your REAL df_merged (from Cell 9) exists and is not empty.
# Attach the regression and classification targets to df_merged. Nothing is
# labeled (only a message is printed) when df_merged is missing or empty.
print("\n--- Adding Target Labels to REAL df_merged ---")
if 'df_merged' not in locals() or not isinstance(df_merged, pd.DataFrame) or df_merged.empty:
    print("CRITICAL ERROR: REAL 'df_merged' was NOT labeled because it was not found or was empty when this cell was run. Check the output of Cell 9 (Feature Engineering Execution).")
else:
    print(f"Using REAL df_merged (shape: {df_merged.shape}) for target labeling.")
    # Every input column the labeling helpers are given must already exist.
    for req_col_real in ['open', 'close', 'high', 'low', 'ATR_14_5m']:
        if req_col_real in df_merged.columns:
            continue
        raise KeyError(f"Essential column '{req_col_real}' for labeling is MISSING from REAL df_merged. Available: {df_merged.columns.tolist()}")

    # 1. Regression Target
    lookahead_val = 6
    regression_col_name = f'reg_target_lookahead{lookahead_val}'
    print(f"\nAdding Regression Target to REAL df_merged: {regression_col_name} ...")
    regression_kwargs = dict(
        price_col_entry='open',
        price_col_exit='close',
        lookahead=lookahead_val,
        vol_col='ATR_14_5m',
        min_vol_threshold=0.1,
        cap_outliers=True,
        lower_cap_percentile=1.0,
        upper_cap_percentile=99.0,
        same_day_trade=True,
    )
    # The helper receives a copy of df_merged; only its return value is stored.
    df_merged[regression_col_name] = compute_regression_labels(df_merged.copy(), **regression_kwargs)
    print(f"Finished adding {regression_col_name}. Description:")
    print(df_merged[regression_col_name].describe(percentiles=[]))

    # 2. Classification Target (the column name encodes the barrier settings)
    pt_val = 2.0
    sl_val = 1.5
    vb_val = 12
    classification_col_name = f'clf_target_numba_pt{pt_val}sl{sl_val}vb{vb_val}'
    print(f"\nAdding Classification Target to REAL df_merged: {classification_col_name} ...")
    classification_kwargs = dict(
        df_prices=df_merged.copy(),
        entry_price_col='close',
        high_col='high',
        low_col='low',
        atr_col='ATR_14_5m',
        pt_atr_mult=pt_val,
        sl_atr_mult=sl_val,
        vertical_barrier_periods=vb_val,
        min_target_return_pct=0.0005,
    )
    df_merged[classification_col_name] = compute_classification_labels_triple_barrier_numba(**classification_kwargs)
    print(f"Finished adding {classification_col_name}. Value Counts:")
    print(df_merged[classification_col_name].value_counts(dropna=False))

    print("\nColumns in REAL df_merged AFTER adding all targets:", df_merged.columns.tolist())
    print(f"REAL df_merged shape after adding targets: {df_merged.shape}")
# --- END OF Adding Target Labels to REAL df_merged ---

# Train

##### Indicator list for model

In [None]:
# Candidate feature names. They must match the columns produced by
# add_all_features / add_time_session_features exactly: helper-generated names
# are built from the SOURCE column name (e.g. 'close_vs_EMA20',
# 'RSI_14_Is_Overbought_70', 'MACD_12_26_9_Cross_Signal'), the EMA slope column
# is named '_Slope_10', and lags are generated for 1, 2 and 3 bars only.
# Earlier entries such as 'Price_vs_EMA20_5m', 'RSI_Is_Overbought_70_5m' or
# 'Close_Lag_6_5m' referred to columns that are never created.
feature_list = [
    # I. Technical Indicators (5m & 15m)
    'SMA_10_5m', 'SMA_20_5m', 'SMA_50_5m', 'EMA_10_5m', 'EMA_20_5m', 'EMA_50_5m',
    'close_vs_EMA20_5m', 'EMA_10_vs_EMA20_5m', '_Slope_10_5m',
    'RSI_14_5m', 'RSI_14_Is_Overbought_70_5m', 'RSI_14_Is_Oversold_30_5m',
    'Stoch_K_14_3_5m', 'Stoch_D_3_5m',
    'Stoch_K_14_3_Is_Overbought_80_5m', 'Stoch_K_14_3_Is_Oversold_20_5m',
    'CCI_20_5m', 'MACD_12_26_9_5m', 'MACD_Signal_12_26_9_5m', 'MACD_Hist_12_26_9_5m',
    'MACD_12_26_9_Cross_Signal_5m', 'PPO_12_26_9_5m', 'ROC_10_5m',
    'ATR_14_5m', 'BB_Upper_20_2_5m', 'BB_Lower_20_2_5m', 'BB_Mid_20_5m',
    'BB_Width_20_2_5m', 'close_vs_BB_Upper_5m', 'close_vs_BB_Lower_5m', 'Percent_B_20_2_5m',
    'ADX_14_5m', 'Plus_DI_14_5m', 'Minus_DI_14_5m', 'PSAR_5m', 'PSAR_Flip_Signal_5m',
    'VWAP_5m', 'close_vs_VWAP_5m', 'Volume_SMA_20_5m',

    'SMA_10_15m', 'SMA_20_15m', 'SMA_50_15m', 'EMA_10_15m', 'EMA_20_15m', 'EMA_50_15m',
    'close_vs_EMA20_15m', 'EMA_10_vs_EMA20_15m', '_Slope_10_15m',
    'RSI_14_15m', 'RSI_14_Is_Overbought_70_15m', 'RSI_14_Is_Oversold_30_15m',
    'Stoch_K_14_3_15m', 'Stoch_D_3_15m',
    'Stoch_K_14_3_Is_Overbought_80_15m', 'Stoch_K_14_3_Is_Oversold_20_15m',
    'CCI_20_15m', 'MACD_12_26_9_15m', 'MACD_Signal_12_26_9_15m', 'MACD_Hist_12_26_9_15m',
    'MACD_12_26_9_Cross_Signal_15m', 'PPO_12_26_9_15m', 'ROC_10_15m',
    'ATR_14_15m', 'BB_Upper_20_2_15m', 'BB_Lower_20_2_15m', 'BB_Mid_20_15m',
    'BB_Width_20_2_15m', 'close_vs_BB_Upper_15m', 'close_vs_BB_Lower_15m', 'Percent_B_20_2_15m',
    'ADX_14_15m', 'Plus_DI_14_15m', 'Minus_DI_14_15m', 'PSAR_15m', 'PSAR_Flip_Signal_15m',
    'VWAP_15m', 'close_vs_VWAP_15m', 'Volume_SMA_20_15m',

    # II. Price Action & Basic Features (Mostly 5m, some 15m)
    'Candle_Range_5m', 'Candle_Body_5m', 'Upper_Wick_5m', 'Lower_Wick_5m', 'Body_vs_Range_5m',
    'Log_Return_1_5m', 'Log_Return_3_5m', 'Log_Return_6_5m', 'Simple_Return_1_5m',
    'Is_Doji_5m', 'Is_Hammer_5m', 'Is_Engulfing_Bullish_5m', 'Is_Engulfing_Bearish_5m',

    'Candle_Range_15m', 'Candle_Body_15m', 'Upper_Wick_15m', 'Lower_Wick_15m', 'Body_vs_Range_15m',
    'Log_Return_1_15m',  # This is a 15-min return
    'Simple_Return_1_15m',

    # III. Statistical Features (5m & 15m)
    'Rolling_Std_Dev_14_5m', 'Rolling_Skew_30_5m', 'Rolling_Kurtosis_30_5m',
    # Lag 6 is never generated (lags are 1, 2, 3), so lag 2 takes its place.
    'close_Lag_1_5m', 'close_Lag_2_5m', 'close_Lag_3_5m', 'RSI_14_Lag_1_5m',

    'Rolling_Std_Dev_14_15m', 'Rolling_Skew_30_15m', 'Rolling_Kurtosis_30_15m',
    'close_Lag_1_15m', 'close_Lag_3_15m', 'RSI_14_Lag_1_15m',

    # IV. Time & Session Features (No suffix needed - apply to 5m base)
    'Hour_of_Day', 'Minute_of_Hour',
    'Time_Sin', 'Time_Cos',
    'Day_of_Week', 'Day_Sin', 'Day_Cos',
    'Is_Asian_Session', 'Is_London_Session', 'Is_NY_Session', 'Is_Overlap',
    'Is_US_Open_Hour', 'Is_US_Close_Hour'
]

print(f"Total number of potential features listed: {len(feature_list)}")

In [None]:
# Keep only rows in which every column (features and targets) is populated.
shape_before_dropna = df_merged.shape
df_final = df_merged.dropna(axis=0, how='any')
print(f"Shape before dropna: {shape_before_dropna}, after dropna: {df_final.shape}")

In [None]:
print("\n--- Separating Features (X) and Targets (y) ---")

# Target column names are rebuilt from the labeling settings; these values must
# match the ones used when the targets were created.
lookahead_val = 6  # lookahead used for the regression target
pt_val = 2.0       # pt_atr_mult
sl_val = 1.5       # sl_atr_mult
vb_val = 12        # vertical_barrier_periods

actual_regression_target_col = f'reg_target_lookahead{lookahead_val}'
actual_classification_target_col = f'clf_target_numba_pt{pt_val}sl{sl_val}vb{vb_val}'

# Both target columns must be present in df_final.
available_columns = df_final.columns
if actual_regression_target_col not in available_columns:
    raise KeyError(f"Regression target column '{actual_regression_target_col}' NOT FOUND in df_final. Available: {df_final.columns.tolist()}")
if actual_classification_target_col not in available_columns:
    raise KeyError(f"Classification target column '{actual_classification_target_col}' NOT FOUND in df_final. Available: {df_final.columns.tolist()}")

target_cols_for_exclusion = [actual_regression_target_col, actual_classification_target_col]

# Features are every column except the targets and the raw 5-minute OHLCV columns.
exclude_cols = target_cols_for_exclusion + ['open', 'high', 'low', 'close', 'volume']
feature_columns = []
for col in df_final.columns:
    if col not in exclude_cols:
        feature_columns.append(col)

if len(feature_columns) == 0:
    raise ValueError("No feature columns found after exclusion. Check column names and logic.")
# --- Critical Check: a target column among the features would leak the label ---
if actual_classification_target_col in feature_columns:
    raise ValueError(f"CRITICAL ERROR: Target column '{actual_classification_target_col}' found in feature_columns! Leakage will occur.")
if actual_regression_target_col in feature_columns:
    raise ValueError(f"CRITICAL ERROR: Target column '{actual_regression_target_col}' found in feature_columns! Leakage will occur.")
# --- End of Critical Check ---

X = df_final[feature_columns]
y_reg = df_final[actual_regression_target_col]
y_clf = df_final[actual_classification_target_col]

print(f"Features (X) shape: {X.shape}")
print(f"Number of features: {len(feature_columns)}") # Should be less than before
print(f"Regression Target (y_reg) shape: {y_reg.shape}, Name: {y_reg.name}")
print(f"Classification Target (y_clf) shape: {y_clf.shape}, Name: {y_clf.name}")

print("\n--- Splitting Data (Train-Validation-Test) ---")

# Chronological split by row position (NO shuffling): the first 70% of rows
# train, the next 15% validate, and the remainder tests.
train_pct = 0.70
val_pct = 0.15
# test_pct = 0.15 (implicit)

data_len = len(X)
train_end_idx = int(data_len * train_pct)
val_end_idx = train_end_idx + int(data_len * val_pct)


def _three_way_split(data):
    """Cut `data` into (train, validation, test) positional slices."""
    return (
        data.iloc[:train_end_idx],
        data.iloc[train_end_idx:val_end_idx],
        data.iloc[val_end_idx:],
    )


X_train, X_val, X_test = _three_way_split(X)
y_reg_train, y_reg_val, y_reg_test = _three_way_split(y_reg)
y_clf_train, y_clf_val, y_clf_test = _three_way_split(y_clf)

print(f"Training set shapes: X={X_train.shape}, y_clf={y_clf_train.shape}")
print(f"Validation set shapes: X={X_val.shape}, y_clf={y_clf_val.shape}")
print(f"Test set shapes: X={X_test.shape}, y_clf={y_clf_test.shape}")

# Verify the date ranges (optional but recommended)
print(f"\nTraining data runs from {X_train.index.min()} to {X_train.index.max()}")
print(f"Validation data runs from {X_val.index.min()} to {X_val.index.max()}")
print(f"Test data runs from {X_test.index.min()} to {X_test.index.max()}")

##### Real Training

In [None]:
# Fallback for running this cell standalone: when the split cell has not been
# executed, fabricate random features and random 3-class labels instead.
if 'X_train' not in locals() or 'y_clf_train' not in locals():
    print("Dummy data for X_train, y_clf_train etc. is being created as they were not found.")
    num_train_samples, num_val_samples = 1000, 200
    # Match the width of a previously built X when one exists; otherwise 140.
    num_features = X.shape[1] if 'X' in locals() else 140
    _dummy_columns = [f'feature_{i}' for i in range(num_features)]

    X_train = pd.DataFrame(np.random.rand(num_train_samples, num_features), columns=_dummy_columns)
    y_clf_train = pd.Series(np.random.randint(0, 3, num_train_samples))

    X_val = pd.DataFrame(np.random.rand(num_val_samples, num_features), columns=_dummy_columns)
    y_clf_val = pd.Series(np.random.randint(0, 3, num_val_samples))

    # The dummy test set has the same number of rows as the validation set.
    X_test = pd.DataFrame(np.random.rand(num_val_samples, num_features), columns=_dummy_columns)
    y_clf_test = pd.Series(np.random.randint(0, 3, num_val_samples))

    print("Dummy data created. Shapes:")
    print(f"X_train: {X_train.shape}, y_clf_train: {y_clf_train.shape}")
    print(f"X_val: {X_val.shape}, y_clf_val: {y_clf_val.shape}")


print("\n--- Training LightGBM Classifier ---")

# 1. Instantiate the LGBMClassifier
#    Baseline, untuned parameters; the Optuna cells further down search for
#    better ones.
lgbm_clf = lgb.LGBMClassifier(
    objective='multiclass', # For 0, 1, 2 labels
    metric='multi_logloss', # Common metric for multiclass
    n_estimators=1000,      # Upper bound on boosting trees; early stopping below may use fewer
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,           # No limit on depth
    random_state=42,        # For reproducibility
    n_jobs=-1,              # Use all available cores
    # class_weight='balanced' # Consider if your classes are imbalanced
)

# Check class distribution in training data
print("\nTraining data class distribution:")
print(y_clf_train.value_counts(normalize=True))

# Diagnostic: report the dtype of a few columns that are inspected before fitting.
print("--- Dtypes in X_train before fitting ---")
problem_cols = ['PPO_12_26_9_5m', 'ROC_10_5m', 'PPO_12_26_9_15m', 'ROC_10_15m']
for col in problem_cols:
    if col in X_train.columns:
        print(f"Dtype of {col}: {X_train[col].dtype}")
    else:
        print(f"Column {col} not found in X_train.")

# 2. Train the model
#    The model is fitted exactly once, with early stopping on the validation
#    set. A preliminary fit without an eval_set would build all n_estimators
#    trees only for the result to be overwritten by this call.
print("\nStarting model training...")
lgbm_clf.fit(
    X_train, y_clf_train,
    eval_set=[(X_val, y_clf_val)], # Evaluate on validation set during training
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=1)] # Stop if no improvement for 50 rounds
)
print("Model training completed.")

# 3. Make predictions on the validation set
print("\nMaking predictions on validation set...")
y_pred_val = lgbm_clf.predict(X_val)
y_pred_proba_val = lgbm_clf.predict_proba(X_val) # Get probabilities

# 4. Evaluate the model
print("\n--- Model Evaluation on Validation Set ---")
accuracy_val = accuracy_score(y_clf_val, y_pred_val)
print(f"Validation Accuracy: {accuracy_val:.4f}")

print("\nClassification Report (Validation):")
class_names = ['No Trade (0)', 'Short (1)', 'Long (2)']
# Every label that occurs in either the truth or the predictions gets a row.
present_labels = sorted(set(y_clf_val) | set(y_pred_val))
# Build exactly one display name per present label. Labels outside 0..2 get a
# generic "Class <label>" name instead of being dropped, so target_names always
# has the same length (and order) as labels; negative labels never index
# class_names from the end.
current_class_names = [
    class_names[i] if 0 <= i < len(class_names) else f"Class {i}"
    for i in present_labels
]

print(classification_report(y_clf_val, y_pred_val, target_names=current_class_names, labels=present_labels, zero_division=0))

print("\nConfusion Matrix (Validation):")
cm = confusion_matrix(y_clf_val, y_pred_val, labels=present_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=current_class_names, yticklabels=current_class_names)
plt.title('Confusion Matrix (Validation Set)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# 5. Feature importances of the fitted classifier, keyed by training column name.
print("\n--- Feature Importances ---")
feature_importances = pd.Series(lgbm_clf.feature_importances_, index=X_train.columns)
top_n = 20

# Rank once; the same top-N slice feeds both the printed table and the chart.
_top_features = feature_importances.sort_values(ascending=False).head(top_n)
print(f"Top {top_n} features:")
print(_top_features)

_fig_height = top_n // 2 if top_n > 10 else 5
plt.figure(figsize=(10, _fig_height))
_top_features.plot(kind='barh')
plt.title(f'Top {top_n} Feature Importances')
plt.xlabel('Importance')
plt.gca().invert_yaxis()  # flip the axis so the first (largest) bar is drawn at the top
plt.show()

In [None]:
# Decide whether real train/validation data is available for the Optuna search;
# if any piece is missing or empty, substitute random dummy data.
_scope = locals()
_clf_inputs_defined = all(
    name in _scope for name in ('X_train', 'y_clf_train', 'X_val', 'y_clf_val')
)

# The `.empty` checks are only reached when every name is defined.
if not _clf_inputs_defined or X_train.empty or y_clf_train.empty or X_val.empty or y_clf_val.empty:

    print("WARNING: Real training/validation data not found or empty. Creating DUMMY data for Optuna demonstration.")
    num_train_samples, num_val_samples = 1000, 200
    # Reuse the column count of an existing X when possible, otherwise default to 140.
    if 'X' in _scope and hasattr(X, 'shape'):
        num_features = X.shape[1]
    else:
        num_features = 140
    _dummy_columns = [f'feature_{i}' for i in range(num_features)]

    X_train = pd.DataFrame(np.random.rand(num_train_samples, num_features), columns=_dummy_columns)
    y_clf_train = pd.Series(np.random.randint(0, 3, num_train_samples))

    X_val = pd.DataFrame(np.random.rand(num_val_samples, num_features), columns=_dummy_columns)
    y_clf_val = pd.Series(np.random.randint(0, 3, num_val_samples))

    print("Dummy data created for Optuna. Shapes:")
    print(f"X_train: {X_train.shape}, y_clf_train: {y_clf_train.shape}")
    print(f"X_val: {X_val.shape}, y_clf_val: {y_clf_val.shape}")
else:
    print("Using existing X_train, y_clf_train, X_val, y_clf_val for Optuna.")
    print(f"X_train shape: {X_train.shape}, y_clf_train shape: {y_clf_train.shape}")
    print(f"X_val shape: {X_val.shape}, y_clf_val shape: {y_clf_val.shape}")


# 1. Define the objective function for Optuna
def objective(trial):
    """Evaluate one Optuna trial for the LightGBM classifier.

    Samples a hyperparameter set from ``trial``, fits an ``LGBMClassifier`` on
    ``X_train`` / ``y_clf_train`` with early stopping against
    ``X_val`` / ``y_clf_val``, and scores it on the validation set.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Macro-averaged F1 score of the validation predictions (to be maximized).
    """
    # Settings that stay constant across all trials.
    fixed_settings = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',  # metric LightGBM tracks while training
        'verbosity': -1,            # keep LightGBM quiet during the search
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1,               # use all cores
    }
    # Search space.
    # NOTE(review): LightGBM documents that row bagging only takes effect when
    # bagging_freq / subsample_freq is non-zero; it is not set here, so
    # 'subsample' may be inert -- confirm.
    sampled_settings = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, step=0.05),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),    # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),  # L2 regularization
    }

    classifier = lgb.LGBMClassifier(**fixed_settings, **sampled_settings)
    classifier.fit(
        X_train, y_clf_train,
        eval_set=[(X_val, y_clf_val)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    )

    # Macro averaging weights every class equally regardless of its frequency.
    val_predictions = classifier.predict(X_val)
    return f1_score(y_clf_val, val_predictions, average='macro', zero_division=0)

# 2. Create (or resume) the Optuna study.
#    The study is persisted in a local SQLite file named after the study, so a
#    later run can pick up where this one stopped. Direction is "maximize"
#    because the objective returns an F1 score.
study_name = 'lgbm_clf_optimization_v1'
storage_name = f"sqlite:///{study_name}.db"

try:
    study = optuna.load_study(study_name=study_name, storage=storage_name)
except KeyError:
    # No study with this name in the storage yet: start a fresh one.
    study = optuna.create_study(direction="maximize", study_name=study_name, storage=storage_name)
    print(f"Starting new study: {study_name}")
else:
    print(f"Resuming existing study: {study_name}")


# 3. Run the optimization.
#    n_trials is the number of hyperparameter combinations to evaluate;
#    pass a timeout in seconds (e.g. timeout=3600) to bound the wall time.
print("\nStarting Optuna optimization with, for example, 50 trials...")
study.optimize(objective, n_trials=50, timeout=None)

# 4. Report the best trial.
print("\n--- Optuna Optimization Finished ---")
print(f"Number of finished trials: {len(study.trials)}")

best_trial = study.best_trial
print(f"Best trial F1-score (macro): {best_trial.value:.4f}")
print("Best hyperparameters found:")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

# Retrain a classifier with the best hyperparameters found by Optuna and
# re-evaluate it on the validation set to see the effect of tuning.

print("\nTraining a new model with the best hyperparameters found by Optuna...")
# Build a fresh dict (rather than mutating the trial's own params mapping) and
# add back the fixed settings that were not part of the search space.
best_params = {
    **best_trial.params,
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'verbosity': -1,
    'random_state': 42,
    'n_jobs': -1,
}

final_lgbm_clf = lgb.LGBMClassifier(**best_params)

# Optional next step: refit on train + validation combined and evaluate on the
# test set. Such a refit has no held-out eval_set, so it must not use the
# early-stopping callback.

# For now, just retrain on X_train and evaluate on X_val to see effect of best params
final_lgbm_clf.fit(X_train, y_clf_train, eval_set=[(X_val, y_clf_val)],
                   callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
y_pred_val_best = final_lgbm_clf.predict(X_val)

print("\n--- Evaluation of Model with Best Hyperparameters (on Validation Set) ---")
accuracy_val_best = accuracy_score(y_clf_val, y_pred_val_best)
print(f"Validation Accuracy with Best Params: {accuracy_val_best:.4f}")
print("\nClassification Report with Best Params (Validation):")
class_names = ['No Trade (0)', 'Short (1)', 'Long (2)']
present_labels_best = sorted(set(y_clf_val) | set(y_pred_val_best))
# One display name per present label, in the same order as the labels; labels
# outside 0..2 fall back to "Class <label>" so target_names and labels always
# have matching lengths.
current_class_names_best = [
    class_names[i] if 0 <= i < len(class_names) else f"Class {i}"
    for i in present_labels_best
]
print(classification_report(y_clf_val, y_pred_val_best, target_names=current_class_names_best, labels=present_labels_best, zero_division=0))

In [None]:
# Decide whether real train/validation data (with regression targets) is
# available for the regressor search; otherwise fabricate random dummy data.
_scope = locals()
_reg_inputs_defined = all(
    name in _scope for name in ('X_train', 'y_reg_train', 'X_val', 'y_reg_val')
)

# The `.empty` checks are only reached when every name is defined.
if not _reg_inputs_defined or X_train.empty or y_reg_train.empty or X_val.empty or y_reg_val.empty:

    print("WARNING: Real training/validation data not found or empty. Creating DUMMY data for Optuna Regressor demonstration.")
    num_train_samples, num_val_samples = 1000, 200
    if 'X' in _scope and hasattr(X, 'shape'):
        num_features = X.shape[1]
    else:
        num_features = 140
    _dummy_columns = [f'feature_{i}' for i in range(num_features)]

    X_train = pd.DataFrame(np.random.rand(num_train_samples, num_features), columns=_dummy_columns)
    # Dummy targets are uniform in [-0.0005, 0.0005): small random returns.
    y_reg_train = pd.Series(np.random.rand(num_train_samples) * 0.001 - 0.0005)

    X_val = pd.DataFrame(np.random.rand(num_val_samples, num_features), columns=_dummy_columns)
    y_reg_val = pd.Series(np.random.rand(num_val_samples) * 0.001 - 0.0005)

    print("Dummy data created for Optuna Regressor. Shapes:")
    print(f"X_train: {X_train.shape}, y_reg_train: {y_reg_train.shape}")
    print(f"X_val: {X_val.shape}, y_reg_val: {y_reg_val.shape}")
else:
    print("Using existing X_train, y_reg_train, X_val, y_reg_val for Optuna Regressor.")
    print(f"X_train shape: {X_train.shape}, y_reg_train shape: {y_reg_train.shape}")
    print(f"X_val shape: {X_val.shape}, y_reg_val shape: {y_reg_val.shape}")

# 1. Define the objective function for Optuna Regressor
def objective_regressor(trial):
    """Evaluate one Optuna trial for the LightGBM regressor.

    Samples hyperparameters from ``trial``, fits an ``LGBMRegressor`` on
    ``X_train`` / ``y_reg_train`` with early stopping against
    ``X_val`` / ``y_reg_val``, and scores the validation predictions.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Root-mean-squared error on the validation set (to be minimized).
    """
    # Constant settings: the training objective is L1 (MAE) while RMSE is the
    # metric tracked for evaluation and early stopping.
    fixed_settings = {
        'objective': 'regression_l1',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1,
    }
    # Search space.
    # NOTE(review): 'subsample' may have no effect because subsample_freq is
    # left unset (LightGBM documents bagging as disabled at frequency 0) -- confirm.
    sampled_settings = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 5.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 5.0, log=True),
    }

    regressor = lgb.LGBMRegressor(**fixed_settings, **sampled_settings)
    regressor.fit(
        X_train, y_reg_train,
        eval_set=[(X_val, y_reg_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    )

    # RMSE is the square root of the mean squared error.
    val_predictions = regressor.predict(X_val)
    return np.sqrt(mean_squared_error(y_reg_val, val_predictions))

# 2. Create (or resume) the Optuna study for the regressor.
#    Persisted in a local SQLite file; direction is "minimize" because the
#    objective returns RMSE.
study_name_reg = 'lgbm_regressor_optimization_v1'
storage_name_reg = f"sqlite:///{study_name_reg}.db"

try:
    study_reg = optuna.load_study(study_name=study_name_reg, storage=storage_name_reg)
except KeyError:
    # Nothing stored under this name yet: start a fresh study.
    study_reg = optuna.create_study(direction="minimize", study_name=study_name_reg, storage=storage_name_reg)
    print(f"Starting new regressor study: {study_name_reg}")
else:
    print(f"Resuming existing regressor study: {study_name_reg}")

# 3. Run the optimization
print("\nStarting Optuna optimization for LGBMRegressor (e.g., 50 trials)...")
study_reg.optimize(objective_regressor, n_trials=50, timeout=None)

# 4. Report the best regressor trial
print("\n--- Optuna Regressor Optimization Finished ---")
print(f"Number of finished trials: {len(study_reg.trials)}")

best_trial_reg = study_reg.best_trial
print(f"Best trial RMSE: {best_trial_reg.value:.6f}")
print("Best hyperparameters found for LGBMRegressor:")
for key, value in best_trial_reg.params.items():
    print(f"    {key}: {value}")

# Keep the winning parameters for the following cells, re-adding the fixed
# settings that were not part of the search space.
best_regressor_params = best_trial_reg.params
best_regressor_params.update({
    'objective': 'regression_l1',
    'metric': 'rmse',
    'verbosity': -1,
    'random_state': 42,
    'n_jobs': -1,
})

print("\nBest regressor parameters stored.")

In [None]:
# Fallback hyperparameters when the regressor tuning cell has not been run.
if 'best_regressor_params' not in locals():
    print("WARNING: 'best_regressor_params' not found. Using placeholder default parameters for LGBMRegressor.")
    best_regressor_params = dict(
        objective='regression_l1',
        metric='rmse',
        verbosity=-1,
        random_state=42,
        n_jobs=-1,
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        boosting_type='gbdt',
        max_depth=-1,
        min_child_samples=20,
        subsample=1.0,
        colsample_bytree=1.0,
        reg_alpha=0.0,
        reg_lambda=0.0,
    )

# Fallback data: three consecutive 5-minute-spaced blocks (train, val, test).
if 'X_train' not in locals() or X_train.empty:
    print("WARNING: Real X_train not found or empty. Creating DUMMY data for demonstration.")
    num_features = 140
    _dummy_columns = [f'feature_{i}' for i in range(num_features)]
    _bar = pd.Timedelta(minutes=5)

    _train_index = pd.date_range(start='2023-01-01', periods=1000, freq='5min')
    X_train = pd.DataFrame(np.random.rand(1000, num_features), columns=_dummy_columns, index=_train_index)
    y_reg_train = pd.Series(np.random.rand(1000) * 0.001 - 0.0005, index=X_train.index)

    # Validation starts one bar after training ends; test one bar after validation.
    _val_index = pd.date_range(start=X_train.index[-1] + _bar, periods=200, freq='5min')
    X_val = pd.DataFrame(np.random.rand(200, num_features), columns=_dummy_columns, index=_val_index)

    _test_index = pd.date_range(start=X_val.index[-1] + _bar, periods=200, freq='5min')
    X_test = pd.DataFrame(np.random.rand(200, num_features), columns=_dummy_columns, index=_test_index)
    print("Dummy X_train, y_reg_train, X_val, X_test created.")


print("\n--- Generating Out-of-Sample Regressor Predictions ---")

# 1. Get out-of-sample predictions for the training set (X_train) using a manual loop
n_splits_cv = 5
tscv = TimeSeriesSplit(n_splits=n_splits_cv)

# Fraction of each fold's training window (its most recent rows) that is held
# back purely to drive early stopping.
early_stop_holdout_frac = 0.2

# Initialize a Series to store OOS predictions for X_train, filled with NaNs
y_pred_reg_train_oos = pd.Series(np.nan, index=X_train.index, name='predicted_regression_target')

print(f"Generating OOS predictions for X_train using manual TimeSeriesSplit (n_splits={n_splits_cv})...")
for fold_num, (train_index, test_index) in enumerate(tscv.split(X_train, y_reg_train)):
    print(f"  Processing Fold {fold_num + 1}/{n_splits_cv}...")

    regressor_fold = lgb.LGBMRegressor(**best_regressor_params)

    # Early stopping must not look at the fold being predicted: choosing the
    # number of trees from the prediction fold's own targets would leak them
    # into the "out-of-sample" predictions. The tail of the training window is
    # used as the early-stopping set instead.
    n_holdout = int(len(train_index) * early_stop_holdout_frac)
    if 0 < n_holdout < len(train_index):
        fit_index, holdout_index = train_index[:-n_holdout], train_index[-n_holdout:]
        regressor_fold.fit(
            X_train.iloc[fit_index], y_reg_train.iloc[fit_index],
            eval_set=[(X_train.iloc[holdout_index], y_reg_train.iloc[holdout_index])],
            eval_metric='rmse', # Same metric as objective_regressor
            callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)] # Reduced stopping_rounds for CV
        )
    else:
        # Training window too small to split: fit on all of it, no early stopping.
        regressor_fold.fit(X_train.iloc[train_index], y_reg_train.iloc[train_index])

    y_pred_reg_train_oos.iloc[test_index] = regressor_fold.predict(X_train.iloc[test_index])

# Rows that never appear in a test fold (the initial segment of X_train) keep
# their NaN; they have no out-of-sample prediction.
print("Finished OOS predictions for X_train.")
print(f"Number of NaNs in y_pred_reg_train_oos (expected for initial folds): {y_pred_reg_train_oos.isnull().sum()}")


# 2. Fit one final regressor on all of X_train and use it to predict X_val / X_test.
final_regressor = lgb.LGBMRegressor(**best_regressor_params)
print("\nTraining final regressor on full X_train...")

# Early stopping needs validation targets; fall back to a plain fit without them.
# NOTE(review): when early stopping uses (X_val, y_reg_val), the X_val
# predictions below come from a model whose tree count was chosen on those same
# targets -- confirm this is acceptable for the downstream feature.
_val_targets_available = 'y_reg_val' in locals() and not y_reg_val.empty
if not _val_targets_available:
    print("y_reg_val not available for early stopping, fitting final_regressor on X_train only.")
    final_regressor.fit(X_train, y_reg_train)
else:
    print("Using X_val, y_reg_val for early stopping of final_regressor.")
    final_regressor.fit(
        X_train, y_reg_train,
        eval_set=[(X_val, y_reg_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    )
print("Finished training final regressor.")


print("\nGenerating predictions for X_val and X_test...")
y_pred_reg_val_values = final_regressor.predict(X_val)
y_pred_reg_val = pd.Series(
    y_pred_reg_val_values,
    index=X_val.index,
    name='predicted_regression_target',
)

# X_test is optional: without it an empty, correctly named Series is produced.
_test_features_available = 'X_test' in locals() and not X_test.empty
if not _test_features_available:
    print("X_test not found or empty. Skipping predictions for X_test.")
    y_pred_reg_test = pd.Series(dtype='float64', name='predicted_regression_target')
else:
    y_pred_reg_test_values = final_regressor.predict(X_test)
    y_pred_reg_test = pd.Series(
        y_pred_reg_test_values,
        index=X_test.index,
        name='predicted_regression_target',
    )
    print("Finished predictions for X_val and X_test.")


# --- Display the head and shape of each prediction Series ---
# Train and validation samples are always shown; the test sample is shown only
# when test predictions were actually produced.
_prediction_samples = (
    ("\nSample of OOS Regressor Predictions for X_train:", y_pred_reg_train_oos, True),
    ("\nSample of Regressor Predictions for X_val:", y_pred_reg_val, True),
    ("\nSample of Regressor Predictions for X_test:", y_pred_reg_test, not y_pred_reg_test.empty),
)
for _title, _predictions, _should_show in _prediction_samples:
    if not _should_show:
        continue
    print(_title)
    print(_predictions.head())
    print(f"Shape: {_predictions.shape}")

In [None]:
# Fallback feature frames for running this cell standalone.
if 'X_train' not in locals() or X_train.empty:
    print("WARNING: Real X_train not found or empty. Creating DUMMY X_train for demonstration.")
    # Keep the column count of an existing (empty) X_train; otherwise use 140.
    if 'X_train' in locals() and hasattr(X_train, 'shape'):
        num_features_orig = X_train.shape[1]
    else:
        num_features_orig = 140
    _dummy_columns = [f'feature_{i}' for i in range(num_features_orig)]

    # Each dummy frame takes its index (and hence its length) from the matching
    # prediction Series when that Series exists and is non-empty; otherwise a
    # default 5-minute date range is used.
    if 'y_pred_reg_train_oos' in locals() and not y_pred_reg_train_oos.empty:
        _train_index = y_pred_reg_train_oos.index
    else:
        _train_index = pd.date_range('2023-01-01', periods=200, freq='5min')
    X_train = pd.DataFrame(np.random.rand(len(_train_index), num_features_orig),
                           columns=_dummy_columns, index=_train_index)

    if 'y_pred_reg_val' in locals() and not y_pred_reg_val.empty:
        _val_index = y_pred_reg_val.index
    else:
        _val_index = pd.date_range('2023-02-01', periods=50, freq='5min')
    X_val = pd.DataFrame(np.random.rand(len(_val_index), num_features_orig),
                         columns=_dummy_columns, index=_val_index)

    if 'y_pred_reg_test' in locals() and not y_pred_reg_test.empty:
        _test_index = y_pred_reg_test.index
    else:
        _test_index = pd.date_range('2023-03-01', periods=50, freq='5min')
    X_test = pd.DataFrame(np.random.rand(len(_test_index), num_features_orig),
                          columns=_dummy_columns, index=_test_index)
    print("Dummy X_train, X_val, X_test created.")

# The regressor predictions are mandatory inputs for this cell: fail early if
# any of the three Series is missing.
_scope = locals()
_prediction_names = ('y_pred_reg_train_oos', 'y_pred_reg_val', 'y_pred_reg_test')
if not all(name in _scope for name in _prediction_names):
    raise NameError("Regressor predictions (e.g., y_pred_reg_train_oos, y_pred_reg_val, y_pred_reg_test) not found. "
                    "Ensure the previous cell (generating OOS predictions) was run successfully.")

print("\n--- Adding Regressor Predictions as Features ---")

new_feature_name = 'predicted_regression_target' # This is the name used in y_pred_reg_* Series

# Build augmented copies (the originals stay untouched, so the cell can be
# re-run) with the regressor prediction attached as one extra column. The
# assignment aligns each prediction Series to the frame by index.
X_train_augmented = X_train.assign(**{new_feature_name: y_pred_reg_train_oos})
X_val_augmented = X_val.assign(**{new_feature_name: y_pred_reg_val})
X_test_augmented = X_test.assign(**{new_feature_name: y_pred_reg_test})

print("\nShapes after adding regressor prediction feature:")
print(f"X_train_augmented shape: {X_train_augmented.shape} (Original X_train shape: {X_train.shape})")
print(f"X_val_augmented shape: {X_val_augmented.shape}   (Original X_val shape: {X_val.shape})")
print(f"X_test_augmented shape: {X_test_augmented.shape}  (Original X_test shape: {X_test.shape})")

print(f"\nHead of X_train_augmented with new '{new_feature_name}' feature:")
# Show the new feature next to (up to) the first two original columns,
# keeping only names that really exist in the augmented frame.
cols_to_display = [
    col for col in [new_feature_name, *X_train.columns[:2]]
    if col in X_train_augmented.columns
]
if not cols_to_display:
    print("Could not display head, original columns not found (likely due to dummy data path).")
else:
    print(X_train_augmented[cols_to_display].head())


print(f"\nChecking for NaNs in '{new_feature_name}' feature after adding:")
# Validation and test counts should be 0; the training frame carries NaNs for
# rows that never received an out-of-sample prediction.
for _frame_name, _frame in (
    ("X_train_augmented", X_train_augmented),
    ("X_val_augmented", X_val_augmented),
    ("X_test_augmented", X_test_augmented),
):
    print(f"NaNs in {_frame_name}['{new_feature_name}']: {_frame[new_feature_name].isnull().sum()}")

# IMPORTANT: Handle NaNs in X_train_augmented before training the classifier
# The NaNs in 'predicted_regression_target' for X_train_augmented come from the initial
# folds of TimeSeriesSplit where no OOS prediction was made.
# You must align y_clf_train if you drop these rows.

# Example of handling NaNs:
# print("\n--- Handling NaNs in Augmented Training Data ---")
# print(f"Shape of X_train_augmented before NaN drop: {X_train_augmented.shape}")
# print(f"Shape of y_clf_train before NaN drop: {y_clf_train.shape}")

# # Identify rows in X_train_augmented where the new feature is NaN
# nan_mask_train = X_train_augmented[new_feature_name].isnull()

# # Drop these rows from both X_train_augmented and y_clf_train
# X_train_augmented_final = X_train_augmented[~nan_mask_train]
# y_clf_train_final = y_clf_train[~nan_mask_train] # Ensure y_clf_train is available and aligned

# print(f"Shape of X_train_augmented_final after NaN drop: {X_train_augmented_final.shape}")
# print(f"Shape of y_clf_train_final after NaN drop: {y_clf_train_final.shape}")
# print(f"Number of NaNs dropped from training data: {nan_mask_train.sum()}")

# Now, X_train_augmented_final and y_clf_train_final would be used for re-tuning/re-training the classifier.
# X_val_augmented and X_test_augmented should not have NaNs in this new feature.

In [None]:
# Fallback: fabricate augmented dummy data when the real augmented frames or
# labels are missing or empty.
_scope = locals()
_aug_inputs_defined = all(
    name in _scope
    for name in ('X_train_augmented', 'y_clf_train', 'X_val_augmented', 'y_clf_val')
)

# The `.empty` checks are only reached when every name is defined.
if not _aug_inputs_defined or \
   X_train_augmented.empty or y_clf_train.empty or X_val_augmented.empty or y_clf_val.empty:

    print("WARNING: Real augmented training/validation data not found or empty. Creating DUMMY data for Optuna demonstration.")
    num_train_samples_aug, num_val_samples_aug = 1000, 200
    if 'X_train_augmented' in _scope and hasattr(X_train_augmented, 'shape'):
        num_features_aug = X_train_augmented.shape[1]
    else:
        num_features_aug = 148

    # Generic feature columns followed by the regressor-prediction column.
    X_train_augmented_cols = [f'feature_{i}' for i in range(num_features_aug - 1)]
    X_train_augmented_cols.append('predicted_regression_target')
    X_train_augmented = pd.DataFrame(
        np.random.rand(num_train_samples_aug, num_features_aug),
        columns=X_train_augmented_cols,
    )
    # Blank out 15% of the prediction column at random rows to simulate the
    # missing out-of-sample predictions.
    nan_indices = np.random.choice(
        X_train_augmented.index,
        size=int(num_train_samples_aug * 0.15),
        replace=False,
    )
    X_train_augmented.loc[nan_indices, 'predicted_regression_target'] = np.nan

    # Labels share the default integer index of the dummy frame.
    y_clf_train = pd.Series(np.random.randint(0, 3, num_train_samples_aug))

    X_val_augmented_cols = list(X_train_augmented_cols)
    X_val_augmented = pd.DataFrame(
        np.random.rand(num_val_samples_aug, num_features_aug),
        columns=X_val_augmented_cols,
    )
    y_clf_val = pd.Series(np.random.randint(0, 3, num_val_samples_aug))

    print("Dummy augmented data created. Shapes before NaN handling:")
    print(f"X_train_augmented: {X_train_augmented.shape}, y_clf_train: {y_clf_train.shape}")
    print(f"X_val_augmented: {X_val_augmented.shape}, y_clf_val: {y_clf_val.shape}")


# 1. Handle NaNs in X_train_augmented (from the 'predicted_regression_target' feature)
#    and keep y_clf_train aligned with the surviving rows.
print("\n--- Handling NaNs in Augmented Training Data ---")
new_feature_name = 'predicted_regression_target'

if new_feature_name not in X_train_augmented.columns:
    raise KeyError(f"Column '{new_feature_name}' not found in X_train_augmented. Ensure previous cell ran correctly.")

print(f"Shape of X_train_augmented before NaN drop: {X_train_augmented.shape}")
print(f"Shape of y_clf_train before NaN drop: {y_clf_train.shape}")
print(f"NaNs in X_train_augmented['{new_feature_name}'] before drop: {X_train_augmented[new_feature_name].isnull().sum()}")

# Create a mask for rows where the new feature is NaN
nan_mask_train = X_train_augmented[new_feature_name].isnull()

# Features and labels can only be filtered together when they have one label
# per row; anything else is an upstream error that must not be papered over.
if len(X_train_augmented) != len(y_clf_train):
    raise ValueError(
        f"X_train_augmented has {len(X_train_augmented)} rows but y_clf_train has "
        f"{len(y_clf_train)}; cannot align features and labels."
    )

# getattr covers a y_clf_train without an index (e.g. a NumPy array).
if not X_train_augmented.index.equals(getattr(y_clf_train, 'index', None)):
    print("WARNING: Index of X_train_augmented and y_clf_train do not match! "
          "Aligning y_clf_train by row position instead of by index label.")

# Filter both objects with the same positional boolean array. Using the raw
# array (not the index-carrying Series) keeps rows and labels paired even when
# the two indices differ, where a label-aligned mask would raise or misalign.
_keep_rows = ~nan_mask_train.to_numpy()
X_train_augmented_final = X_train_augmented[_keep_rows]
y_clf_train_final = y_clf_train[_keep_rows]

print(f"\nShape of X_train_augmented_final after NaN drop: {X_train_augmented_final.shape}")
print(f"Shape of y_clf_train_final after NaN drop: {y_clf_train_final.shape}")
num_nans_dropped = nan_mask_train.sum()
print(f"Number of rows dropped from training data due to NaNs in '{new_feature_name}': {num_nans_dropped}")

# The validation frame is expected to be NaN-free in the new feature; this only
# warns and leaves the data unchanged.
if X_val_augmented[new_feature_name].isnull().any():
    print(f"WARNING: NaNs found in '{new_feature_name}' of X_val_augmented. This is unexpected.")

# 2. Define the objective function for Optuna (for the classifier with augmented features)
def objective_classifier_augmented(trial):
    """Optuna objective for the LightGBM classifier on the augmented features.

    Samples one hyperparameter set from ``trial``, fits an ``LGBMClassifier``
    on ``X_train_augmented_final`` / ``y_clf_train_final`` with early stopping
    (50 rounds, multi-logloss on the augmented validation set), and scores it.

    Args:
        trial: object exposing ``suggest_int`` / ``suggest_float``; the sampled
            values are stored under names carrying an ``_aug`` suffix.

    Returns:
        Macro-averaged F1 of the fitted model on ``X_val_augmented`` /
        ``y_clf_val`` (``zero_division=0``).
    """
    # Settings that are identical for every trial.
    fixed_settings = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1,
    }

    # Search space.  The suggest_* calls are kept in this exact order so the
    # sampler is queried in the same sequence on every run.
    sampled_settings = {
        'n_estimators': trial.suggest_int('n_estimators_aug', 200, 2500, step=100),
        'learning_rate': trial.suggest_float('learning_rate_aug', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves_aug', 20, 200),
        'max_depth': trial.suggest_int('max_depth_aug', 4, 15),
        'min_child_samples': trial.suggest_int('min_child_samples_aug', 5, 100),
        'subsample': trial.suggest_float('subsample_aug', 0.5, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree_aug', 0.5, 1.0, step=0.05),
        'reg_alpha': trial.suggest_float('reg_alpha_aug', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda_aug', 1e-3, 10.0, log=True),
    }

    # class_weight is passed explicitly as None (no class re-weighting).
    params = {**fixed_settings, **sampled_settings, 'class_weight': None}

    candidate_clf = lgb.LGBMClassifier(**params)
    candidate_clf.fit(
        X_train_augmented_final,
        y_clf_train_final,
        eval_set=[(X_val_augmented, y_clf_val)],
        eval_metric='multi_logloss',
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    )

    val_predictions = candidate_clf.predict(X_val_augmented)
    return f1_score(y_clf_val, val_predictions, average='macro', zero_division=0)

# 3. Create and run the Optuna study for the augmented classifier
# The study is persisted in a local SQLite file named after the study, so
# re-running this cell continues the same study instead of starting over.
study_name_clf_aug = 'lgbm_clf_augmented_optimization_cw_none_v1'
storage_name_clf_aug = f"sqlite:///{study_name_clf_aug}.db"

# load_if_exists=True returns the stored study when the name is already taken
# and creates it otherwise -- the documented load-or-create path, which avoids
# depending on the exception type raised by load_study for a missing study.
study_clf_aug = optuna.create_study(
    direction="maximize",
    study_name=study_name_clf_aug,
    storage=storage_name_clf_aug,
    load_if_exists=True,
)
if study_clf_aug.trials:
    # Trials already recorded: this run adds to them.
    print(f"\nResuming existing augmented classifier study: {study_name_clf_aug}")
else:
    print(f"\nStarting new augmented classifier study: {study_name_clf_aug}")

print(f"\nStarting Optuna optimization for Augmented LGBMClassifier (e.g., 50 trials)...")
# Adjust n_trials as needed.  Each execution of this cell runs n_trials
# additional trials on top of whatever the study already contains.
study_clf_aug.optimize(objective_classifier_augmented, n_trials=50, timeout=None)

# 4. Print the best trial results for the augmented classifier
print("\n--- Optuna Augmented Classifier Optimization Finished ---")
print(f"Number of finished trials: {len(study_clf_aug.trials)}")

best_trial_clf_aug = study_clf_aug.best_trial
print(f"Best trial F1-score (macro) for augmented classifier: {best_trial_clf_aug.value:.4f}")
print("Best hyperparameters found for Augmented LGBMClassifier:")

# Trial parameters are stored with an '_aug' suffix; strip it to recover the
# LightGBM keyword names.  Only a trailing suffix is removed, so a parameter
# whose name contains '_aug' elsewhere is left intact.
optuna_param_suffix = '_aug'
best_classifier_params_augmented = {}
for key, value in best_trial_clf_aug.params.items():
    if key.endswith(optuna_param_suffix):
        param_name = key[:-len(optuna_param_suffix)]
    else:
        param_name = key
    best_classifier_params_augmented[param_name] = value
    print(f"    {param_name}: {value}")


# Add the fixed (non-tuned) parameters back so the dict can be passed straight
# to lgb.LGBMClassifier(**best_classifier_params_augmented).
best_classifier_params_augmented.update({
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'verbosity': -1,
    'random_state': 42,
    'n_jobs': -1,
})

print("\nBest augmented classifier parameters stored in 'best_classifier_params_augmented'.")

# 5. Train and evaluate the final augmented classifier
# Refit with the best hyperparameters on the NaN-filtered training data (early
# stopping on the augmented validation set), then report validation accuracy,
# the per-class report, the confusion matrix and the feature importances.
print("\nTraining a new model with the best hyperparameters for the augmented classifier...")
final_lgbm_clf_augmented = lgb.LGBMClassifier(**best_classifier_params_augmented)
final_lgbm_clf_augmented.fit(
    X_train_augmented_final,
    y_clf_train_final,
    eval_set=[(X_val_augmented, y_clf_val)],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
)

y_pred_val_best_aug = final_lgbm_clf_augmented.predict(X_val_augmented)

print("\n--- Evaluation of Augmented Model with Best Hyperparameters (on Validation Set) ---")
accuracy_val_best_aug = accuracy_score(y_clf_val, y_pred_val_best_aug)
print(f"Validation Accuracy with Best Params (Augmented): {accuracy_val_best_aug:.4f}")

print("\nClassification Report with Best Params (Augmented Validation):")
class_names = ['No Trade (0)', 'Short (1)', 'Long (2)']
# Report only the labels that occur in the truth or in the predictions.
present_labels_best_aug = sorted(set(y_clf_val).union(y_pred_val_best_aug))
current_class_names_best_aug = [
    class_names[label] for label in present_labels_best_aug if label < len(class_names)
]
if len(current_class_names_best_aug) == 0:
    # None of the labels maps onto class_names: fall back to generic names.
    current_class_names_best_aug = [f"Class {label}" for label in present_labels_best_aug]
val_report_best_aug = classification_report(
    y_clf_val,
    y_pred_val_best_aug,
    target_names=current_class_names_best_aug,
    labels=present_labels_best_aug,
    zero_division=0,
)
print(val_report_best_aug)

print("\nConfusion Matrix (Augmented Validation):")
cm_aug = confusion_matrix(y_clf_val, y_pred_val_best_aug, labels=present_labels_best_aug)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_aug,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=current_class_names_best_aug,
    yticklabels=current_class_names_best_aug,
)
plt.title('Confusion Matrix - Augmented Model (Validation Set)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print("\n--- Feature Importances (Augmented Model) ---")
if not hasattr(final_lgbm_clf_augmented, 'feature_importances_'):
    print("Could not retrieve feature importances from the augmented model.")
else:
    feature_importances_aug = pd.Series(
        final_lgbm_clf_augmented.feature_importances_,
        index=X_train_augmented_final.columns,
    )
    top_n_aug = 20
    # Rank once and reuse the same ranking for the printout and the chart.
    top_importances_aug = feature_importances_aug.sort_values(ascending=False).head(top_n_aug)
    print(f"Top {top_n_aug} features (Augmented Model):")
    print(top_importances_aug)

    # Taller figure when more than 10 bars are drawn.
    if top_n_aug > 10:
        importance_fig_height = top_n_aug // 2
    else:
        importance_fig_height = 5
    plt.figure(figsize=(10, importance_fig_height))
    top_importances_aug.plot(kind='barh')
    plt.title(f'Top {top_n_aug} Feature Importances (Augmented Model)')
    plt.xlabel('Importance')
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()  # keep long feature names from overlapping
    plt.show()

In [None]:
print("\n--- Training Final Augmented Model on Full Train+Val Data ---")
# Stack the NaN-filtered training split and the validation split; features and
# targets are concatenated in the same order so the rows stay paired.
X_train_val_augmented = pd.concat([X_train_augmented_final, X_val_augmented])
y_clf_train_val = pd.concat([y_clf_train_final, y_clf_val])

final_model_for_test = lgb.LGBMClassifier(**best_classifier_params_augmented)
# No early stopping here: validation data is part of the training set, so there
# is no held-out set left to monitor before the test evaluation.
# NOTE(review): the tuned models were fitted with early stopping, whereas this
# fit runs all `n_estimators` rounds -- consider capping it at
# final_lgbm_clf_augmented.best_iteration_; confirm the intended behaviour.
final_model_for_test.fit(X_train_val_augmented, y_clf_train_val)

print("\n--- Final Evaluation on Test Set (Augmented Model) ---")
y_pred_test_aug = final_model_for_test.predict(X_test_augmented)
accuracy_test_aug = accuracy_score(y_clf_test, y_pred_test_aug)
print(f"Test Accuracy (Augmented): {accuracy_test_aug:.4f}")
print("\nClassification Report (Augmented Test Set):")
# Labels and display names are derived from the TEST data.  Reusing the label
# list computed on the validation set would raise (or mislabel rows) whenever
# the two splits do not contain the same classes, because `target_names` must
# have exactly one entry per label.
present_labels_test_aug = sorted(set(y_clf_test) | set(y_pred_test_aug))
current_class_names_test_aug = [
    class_names[label] if 0 <= label < len(class_names) else f"Class {label}"
    for label in present_labels_test_aug
]
print(classification_report(
    y_clf_test,
    y_pred_test_aug,
    target_names=current_class_names_test_aug,
    labels=present_labels_test_aug,
    zero_division=0,
))
# Plot confusion matrix for test set

In [None]:
# Custom probability thresholds, evaluated on the validation set.
#
# Uses the tuned classifier `final_lgbm_clf_augmented` together with
# X_val_augmented / y_clf_val.  Decision rule: take the class with the highest
# predicted probability, but if that probability is below the minimum required
# for THAT class, fall back to 'No Trade (0)'.  Being more selective about
# 'Short' and 'Long' is meant to raise their precision.
#
# Alternatives that are not implemented here:
#   * one global confidence threshold for taking any trade;
#   * per-class values such as {0: 0.40, 1: 0.35, 2: 0.45}.
# Good values have to be found by comparing predict_proba output with the
# actual labels on the validation set (e.g. 'Long' may only become precise
# above 0.55).

print("\n--- Applying Custom Probability Thresholds ---")

# Class probabilities for every validation row.
y_pred_proba_val_aug = final_lgbm_clf_augmented.predict_proba(X_val_augmented)

# Example values only -- tune them.  Class 0 = 'No Trade', 1 = 'Short', 2 = 'Long'.
custom_thresholds = {
    0: 0.0,   # 'No Trade' is always accepted when it has the highest probability
    1: 0.38,  # at least 38% confidence to call a 'Short', otherwise 'No Trade'
    2: 0.38,  # at least 38% confidence to call a 'Long', otherwise 'No Trade'
}

# Vectorized form of the rule: most likely class per row, its probability, and
# the minimum confidence required for that class (0.0 for unlisted classes).
top_class_val = np.argmax(y_pred_proba_val_aug, axis=1)
top_proba_val = y_pred_proba_val_aug[np.arange(len(y_pred_proba_val_aug)), top_class_val]
min_proba_by_class_val = np.array(
    [custom_thresholds.get(class_idx, 0.0) for class_idx in range(y_pred_proba_val_aug.shape[1])]
)
confident_enough_val = top_proba_val >= min_proba_by_class_val[top_class_val]
# Rows that fail their class threshold default to 'No Trade' (0).
y_pred_val_custom_thresholds = np.where(confident_enough_val, top_class_val, 0).astype(int)

print("\nClassification Report with Custom Thresholds (Validation):")
# Report only the labels present in the truth or in the thresholded predictions.
present_labels_custom = sorted(set(y_clf_val).union(y_pred_val_custom_thresholds))
current_class_names_custom = [
    class_names[label] for label in present_labels_custom if label < len(class_names)
]
if len(current_class_names_custom) == 0:
    current_class_names_custom = [f"Class {label}" for label in present_labels_custom]

val_report_custom = classification_report(
    y_clf_val,
    y_pred_val_custom_thresholds,
    target_names=current_class_names_custom,
    labels=present_labels_custom,
    zero_division=0,
)
print(val_report_custom)

print("\nConfusion Matrix with Custom Thresholds (Validation):")
cm_custom = confusion_matrix(y_clf_val, y_pred_val_custom_thresholds, labels=present_labels_custom)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_custom,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=current_class_names_custom,
    yticklabels=current_class_names_custom,
)
plt.title('Confusion Matrix - Custom Thresholds (Validation Set)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
# Example threshold grid search, run on the validation-set probabilities.
# Every (short, long) pair from `possible_thresholds` is applied with the
# "highest-probability class, else 'No Trade'" rule, and the pair with the best
# F1 for 'Long' (class 2) is kept, provided its 'Long' precision exceeds 0.55.

possible_thresholds = [0.35, 0.40, 0.45, 0.50, 0.55, 0.60, 0.65]
best_f1_long = 0
best_thresholds_for_long_f1 = {}

# Assuming final_lgbm_clf_augmented is the model from the class_weight=None Optuna run
y_pred_proba_val_aug = final_lgbm_clf_augmented.predict_proba(X_val_augmented)

# The most likely class and its probability do not depend on the thresholds,
# so they are computed once, outside the grid loop.
default_no_trade_class = 0
top_class_grid = np.argmax(y_pred_proba_val_aug, axis=1)
top_proba_grid = y_pred_proba_val_aug[np.arange(len(y_pred_proba_val_aug)), top_class_grid]
n_classes_grid = y_pred_proba_val_aug.shape[1]

print("--- Searching for Optimal Thresholds (Example for Long F1) ---")
for short_thresh, long_thresh in product(possible_thresholds, possible_thresholds):
    custom_thresholds = {
        0: 0.0,  # 'No Trade' is accepted whenever it has the highest probability
        1: short_thresh,
        2: long_thresh,
    }

    # Minimum confidence per class index (0.0 for classes without an entry).
    min_proba_by_class_grid = np.array(
        [custom_thresholds.get(class_idx, 0.0) for class_idx in range(n_classes_grid)]
    )
    y_pred_val_custom = np.where(
        top_proba_grid >= min_proba_by_class_grid[top_class_grid],
        top_class_grid,
        default_no_trade_class,
    ).astype(int)

    # Per-class metrics; the report dict is keyed by the label as a string.
    report = classification_report(y_clf_val, y_pred_val_custom, output_dict=True, zero_division=0)
    long_stats = report.get('2', {})    # class 2 = Long
    short_stats = report.get('1', {})   # class 1 = Short

    f1_long = long_stats.get('f1-score', 0)
    precision_long = long_stats.get('precision', 0)
    recall_long = long_stats.get('recall', 0)

    precision_short = short_stats.get('precision', 0)
    recall_short = short_stats.get('recall', 0)

    # Example criterion: best Long F1 among combinations whose Long precision
    # stays above 0.55.
    if f1_long > best_f1_long and precision_long > 0.55:
        best_f1_long = f1_long
        best_thresholds_for_long_f1 = {
            'short': short_thresh,
            'long': long_thresh,
            'long_f1': f1_long,
            'long_precision': precision_long,
            'long_recall': recall_long,
            'short_precision': precision_short,
            'short_recall': recall_short,
        }

    # To inspect every combination, print or store the metrics here, e.g.:
    # print(f"Thresh S:{short_thresh:.2f}/L:{long_thresh:.2f} -> P_L:{precision_long:.2f} R_L:{recall_long:.2f} F1_L:{f1_long:.2f} | P_S:{precision_short:.2f} R_S:{recall_short:.2f}")

print("\nBest thresholds found based on example criteria (max Long F1 with P_L > 0.55):")
print(best_thresholds_for_long_f1)

In [None]:
# Apply the chosen probability thresholds to the TEST set and report accuracy,
# the per-class report and the confusion matrix.  When the trained model or the
# test data are missing from the session, random stand-ins are created so the
# cell can still be run for demonstration.
if 'final_lgbm_clf_augmented' not in locals():
    print("WARNING: 'final_lgbm_clf_augmented' not found. Loading a dummy model for demonstration.")

    class DummyModel:
        """Stand-in for the trained LightGBM model; returns random probabilities."""

        def predict_proba(self, X):
            # Normalize each row by ITS OWN sum so the three values form a
            # probability distribution (dividing by the row sums of a second,
            # independent random draw does not give rows that sum to 1).
            raw_probs = np.random.rand(len(X), 3)
            return raw_probs / raw_probs.sum(axis=1, keepdims=True)

    final_lgbm_clf_augmented = DummyModel()

if 'X_test_augmented' not in locals() or 'y_clf_test' not in locals() or X_test_augmented.empty:
    print("WARNING: Real X_test_augmented or y_clf_test not found. Creating DUMMY data.")
    num_test_samples = 1000
    # Reuse the column count of an existing frame when there is one; 148 is
    # the fallback, also used when the existing frame has no columns at all.
    if 'X_test_augmented' in locals() and hasattr(X_test_augmented, 'shape'):
        num_features_aug = X_test_augmented.shape[1]
    else:
        num_features_aug = 148
    if num_features_aug < 1:
        num_features_aug = 148
    X_test_augmented_cols = [f'feature_{i}' for i in range(num_features_aug - 1)] + ['predicted_regression_target']
    X_test_augmented = pd.DataFrame(np.random.rand(num_test_samples, num_features_aug), columns=X_test_augmented_cols)
    y_clf_test = pd.Series(np.random.randint(0, 3, num_test_samples))
    print("Dummy test data created.")


print("\n--- Applying Chosen Thresholds to Test Set Predictions ---")

# 1. Get probability predictions on the TEST SET
y_pred_proba_test_aug = final_lgbm_clf_augmented.predict_proba(X_test_augmented)

# 2. Define the chosen thresholds from the validation-set tuning
chosen_thresholds = {
    # class_label: min_probability_required
    0: 0.0,   # 'No Trade' is accepted whenever it has the highest probability
    1: 0.35,  # 'best' threshold for Short
    2: 0.35,  # 'best' threshold for Long
}
default_no_trade_class = 0

print(f"Using custom thresholds: {chosen_thresholds}")

# 3. Apply thresholds: keep the most likely class when its probability reaches
#    the minimum for that class (0.0 for unlisted classes), else 'No Trade'.
top_class_test_aug = np.argmax(y_pred_proba_test_aug, axis=1)
top_proba_test_aug = y_pred_proba_test_aug[np.arange(len(y_pred_proba_test_aug)), top_class_test_aug]
min_proba_by_class_test_aug = np.array(
    [chosen_thresholds.get(class_idx, 0.0) for class_idx in range(y_pred_proba_test_aug.shape[1])]
)
y_pred_test_custom_thresholds = np.where(
    top_proba_test_aug >= min_proba_by_class_test_aug[top_class_test_aug],
    top_class_test_aug,
    default_no_trade_class,
).astype(int)

# 4. Evaluate performance on the TEST SET with these custom thresholds
print("\n--- Final Evaluation on Test Set with Custom Thresholds ---")
accuracy_test_custom = accuracy_score(y_clf_test, y_pred_test_custom_thresholds)
print(f"Test Accuracy with Custom Thresholds: {accuracy_test_custom:.4f}")

print("\nClassification Report with Custom Thresholds (Test Set):")
class_names = ['No Trade (0)', 'Short (1)', 'Long (2)']
present_labels_test_custom = sorted(set(y_clf_test) | set(y_pred_test_custom_thresholds))
# One display name per reported label: `target_names` must be as long as
# `labels`, so labels outside class_names get a generic name instead of being
# dropped from the list.
current_class_names_test_custom = [
    class_names[label] if 0 <= label < len(class_names) else f"Class {label}"
    for label in present_labels_test_custom
]

print(classification_report(
    y_clf_test,
    y_pred_test_custom_thresholds,
    target_names=current_class_names_test_custom,
    labels=present_labels_test_custom,
    zero_division=0,
))

print("\nConfusion Matrix with Custom Thresholds (Test Set):")
cm_test_custom = confusion_matrix(y_clf_test, y_pred_test_custom_thresholds, labels=present_labels_test_custom)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_test_custom,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=current_class_names_test_custom,
    yticklabels=current_class_names_test_custom,
)
plt.title('Confusion Matrix - Custom Thresholds (Test Set)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score


# Systematic grid search over (short, long) probability thresholds on the
# validation set.  Each pair is applied with the "highest-probability class,
# else 'No Trade'" rule; per-class precision/recall/F1 and trade counts are
# collected for every pair, and the pair with the highest F1_Short + F1_Long is
# kept among those whose Short and Long precision both exceed 0.55.
if 'final_lgbm_clf_augmented' not in locals():
    raise NameError("Trained model 'final_lgbm_clf_augmented' not found.")
if not ('X_val_augmented' in locals() and 'y_clf_val' in locals()):
    raise NameError("Validation data (X_val_augmented, y_clf_val) not found.")

print("--- Starting Systematic Threshold Optimization on Validation Set ---")

# Probability predictions on the validation set
y_pred_proba_val = final_lgbm_clf_augmented.predict_proba(X_val_augmented)

# Threshold grids (start, stop, step); adjust the range or the step to make
# the search coarser or finer.
short_thresholds_to_try = np.arange(0.30, 0.70, 0.05)
long_thresholds_to_try = np.arange(0.30, 0.70, 0.05)
default_no_trade_class = 0  # label used for 'No Trade'

best_overall_score = -1  # lower than any achievable F1 sum
best_thresholds = {}
best_metrics = {}

results_list = []

# The most likely class and its probability are independent of the thresholds,
# so they are computed once for the whole grid.
top_class_sweep = np.argmax(y_pred_proba_val, axis=1)
top_proba_sweep = y_pred_proba_val[np.arange(len(y_pred_proba_val)), top_class_sweep]
n_classes_sweep = y_pred_proba_val.shape[1]

for short_thresh, long_thresh in product(short_thresholds_to_try, long_thresholds_to_try):
    current_custom_thresholds = {
        0: 0.0,  # could be raised to demand a minimum confidence for 'No Trade', e.g. 0.34
        1: short_thresh,
        2: long_thresh,
    }

    # Minimum confidence per class index (0.0 for classes without an entry).
    min_proba_by_class_sweep = np.array(
        [current_custom_thresholds.get(class_idx, 0.0) for class_idx in range(n_classes_sweep)]
    )
    y_pred_val_custom = np.where(
        top_proba_sweep >= min_proba_by_class_sweep[top_class_sweep],
        top_class_sweep,
        default_no_trade_class,
    ).astype(int)

    # Metrics for the actionable classes; labels=[k] restricts the macro
    # average to class k alone.
    precision_s = precision_score(y_clf_val, y_pred_val_custom, labels=[1], average='macro', zero_division=0)
    recall_s = recall_score(y_clf_val, y_pred_val_custom, labels=[1], average='macro', zero_division=0)
    f1_s = f1_score(y_clf_val, y_pred_val_custom, labels=[1], average='macro', zero_division=0)

    precision_l = precision_score(y_clf_val, y_pred_val_custom, labels=[2], average='macro', zero_division=0)
    recall_l = recall_score(y_clf_val, y_pred_val_custom, labels=[2], average='macro', zero_division=0)
    f1_l = f1_score(y_clf_val, y_pred_val_custom, labels=[2], average='macro', zero_division=0)

    # Number of trades signalled for each direction
    num_short_trades = np.sum(y_pred_val_custom == 1)
    num_long_trades = np.sum(y_pred_val_custom == 2)

    trade_metrics = {
        'P_Short': precision_s, 'R_Short': recall_s, 'F1_Short': f1_s, 'N_Short': num_short_trades,
        'P_Long': precision_l, 'R_Long': recall_l, 'F1_Long': f1_l, 'N_Long': num_long_trades,
    }

    results_list.append({
        'short_thresh': short_thresh,
        'long_thresh': long_thresh,
        **trade_metrics,
        'Overall_F1_Macro': f1_score(y_clf_val, y_pred_val_custom, average='macro', zero_division=0),
    })

    # "Best" is subjective; this example maximizes F1_Short + F1_Long subject
    # to a minimum precision of 0.55 for both classes.
    current_score = f1_s + f1_l
    meets_precision_floor = precision_s > 0.55 and precision_l > 0.55
    if meets_precision_floor and current_score > best_overall_score:
        best_overall_score = current_score
        best_thresholds = {'short': short_thresh, 'long': long_thresh}
        best_metrics = dict(trade_metrics)

# Tabulate all combinations for easier analysis
df_threshold_results = pd.DataFrame(results_list)
print("\n--- Threshold Tuning Results (Validation Set) ---")
# Ten best rows by Long F1, ties broken by Long precision; change the sort
# keys to rank by another metric.
print(df_threshold_results.sort_values(by=['F1_Long', 'P_Long'], ascending=[False, False]).head(10))


print("\nBest Thresholds Found (example criteria: Maximize Short_F1+Long_F1 with P_Short & P_Long > 0.55):")
if not best_thresholds:
    print("No thresholds met the example criteria (e.g., P_Short & P_Long > 0.55). Adjust criteria or threshold range.")
else:
    print(f"Thresholds: Short={best_thresholds['short']:.2f}, Long={best_thresholds['long']:.2f}")
    print("Metrics with these thresholds on Validation Set:")
    for metric, value in best_metrics.items():
        # Floats get four decimals; counts are printed as-is.
        shown_value = f"{value:.4f}" if isinstance(value, float) else f"{value}"
        print(f"  {metric}: {shown_value}")


In [None]:
# Apply the thresholds chosen on the validation set to the TEST set and report
# accuracy, the per-class report and the confusion matrix.  Missing model or
# test data are replaced by random stand-ins so the cell can run on its own.
if 'final_lgbm_clf_augmented' not in locals():
    print("WARNING: 'final_lgbm_clf_augmented' not found. Loading a dummy model for demonstration.")

    class DummyModel:  # replace with loading the real trained model in a new session
        def predict_proba(self, X):
            # Random scores with the 'Short' column scaled up by 10%, then
            # normalized so that every row sums to 1.
            raw_probs = np.random.rand(len(X), 3)
            raw_probs[:, 1] = raw_probs[:, 1] * 1.1
            row_totals = raw_probs.sum(axis=1, keepdims=True)
            return raw_probs / row_totals

    final_lgbm_clf_augmented = DummyModel()

if 'X_test_augmented' not in locals() or 'y_clf_test' not in locals() or X_test_augmented.empty:
    print("WARNING: Real X_test_augmented or y_clf_test not found. Creating DUMMY data.")
    num_test_samples = 1000
    # Take the column count from X_test_augmented when it exists, else 148.
    if 'X_test_augmented' in locals() and hasattr(X_test_augmented, 'shape'):
        num_features_aug = X_test_augmented.shape[1]
    else:
        num_features_aug = 148

    X_test_augmented_cols = [f'feature_{i}' for i in range(num_features_aug - 1)]
    X_test_augmented_cols = X_test_augmented_cols + ['predicted_regression_target']
    X_test_augmented = pd.DataFrame(
        np.random.rand(num_test_samples, num_features_aug),
        columns=X_test_augmented_cols,
    )
    y_clf_test = pd.Series(np.random.randint(0, 3, num_test_samples))
    print("Dummy test data created.")


print("\n--- Applying YOUR CHOSEN Thresholds to Test Set Predictions ---")

# 1. Probability predictions on the TEST SET
y_pred_proba_test = final_lgbm_clf_augmented.predict_proba(X_test_augmented)

# 2. The thresholds chosen during validation-set tuning
chosen_thresholds = {
    # class_label: min_probability_required
    0: 0.0,   # 'No Trade' is accepted whenever it has the highest probability
    1: 0.40,  # 'best' threshold for Short from validation tuning
    2: 0.35,  # 'best' threshold for Long from validation tuning
}
default_no_trade_class = 0

print(f"Using custom thresholds for Test Set: Short={chosen_thresholds[1]:.2f}, Long={chosen_thresholds[2]:.2f}")

# 3. Apply thresholds: keep the most likely class when its probability reaches
#    the minimum for that class (0.0 for unlisted classes), else 'No Trade'.
top_class_test = np.argmax(y_pred_proba_test, axis=1)
top_proba_test = y_pred_proba_test[np.arange(len(y_pred_proba_test)), top_class_test]
min_proba_by_class_test = np.array(
    [chosen_thresholds.get(class_idx, 0.0) for class_idx in range(y_pred_proba_test.shape[1])]
)
y_pred_test_custom = np.where(
    top_proba_test >= min_proba_by_class_test[top_class_test],
    top_class_test,
    default_no_trade_class,
).astype(int)

# 4. Evaluate performance on the TEST SET with these custom thresholds
print("\n--- Final Evaluation on Test Set with YOUR CHOSEN Custom Thresholds ---")
accuracy_test_custom = accuracy_score(y_clf_test, y_pred_test_custom)
print(f"Test Accuracy with Custom Thresholds: {accuracy_test_custom:.4f}")

print("\nClassification Report with Custom Thresholds (Test Set):")
class_names = ['No Trade (0)', 'Short (1)', 'Long (2)']
# Labels come from the test truth and the thresholded test predictions.
present_labels_test_custom = sorted(set(y_clf_test).union(y_pred_test_custom))
current_class_names_test_custom = [
    class_names[label] for label in present_labels_test_custom if label < len(class_names)
]
if len(current_class_names_test_custom) == 0:
    current_class_names_test_custom = [f"Class {label}" for label in present_labels_test_custom]

test_report_custom = classification_report(
    y_clf_test,
    y_pred_test_custom,
    target_names=current_class_names_test_custom,
    labels=present_labels_test_custom,
    zero_division=0,
)
print(test_report_custom)

print("\nConfusion Matrix with Custom Thresholds (Test Set):")
cm_test_custom = confusion_matrix(y_clf_test, y_pred_test_custom, labels=present_labels_test_custom)
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_test_custom,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=current_class_names_test_custom,
    yticklabels=current_class_names_test_custom,
)
plt.title('Confusion Matrix - Custom Thresholds (Test Set)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()