In [1]:
from itertools import product
import pandas as pd
from datetime import datetime
from utils.data_loader import load_and_resample_data
from utils.labeling_utils import label_and_save
from utils.pipeline import apply_feature_engineering
from optimization.classification import tune_rf, tune_xgboost, tune_lgbm, tune_catboost
from utils.metrics import evaluate_model, classification_insights, visualize_results
from config import ROOT, DB_DIR, RAW_DIR, DATA_DIR
from sklearn.ensemble import RandomForestClassifier

# --- Run configuration ---
MARKET = "NQ"   # symbol handed to the loader, labeler and tuner as `market`
LOOKAHEAD = 6   # picks which labeled_data_{LOOKAHEAD}{MARKET}.parquet gets modelled

# Filters passed to the backtest; left empty here, so nothing is registered.
avoid_funcs = dict(
    # avoid_hour_18_19=avoid_hour_18_19,
    # news_window=avoid_news,
)

# Strategy parameter grid; every combination of these values is enumerated below.
param_grid_strategy = dict(
    SL_ATR_MULT=[1.0, 1.5, 2.0],        # wider stops
    TP_ATR_MULT=[3.0, 4.0, 5.0, 8.0],   # more conservative targets
    TRAIL_START_MULT=[1.0, 1.5],        # let winners run
    TRAIL_STOP_MULT=[0.8, 1.0],         # tighter trailing stops
    TICK_VALUE=[20],
)

# Parameter names and their candidate lists, in grid order.
keys = tuple(param_grid_strategy)
values = tuple(param_grid_strategy.values())

# One dict per point of the Cartesian product of the candidate lists.
combinations = [dict(zip(keys, combo)) for combo in product(*values)]

In [2]:
# Timeframes requested from the loader and the feature pipeline.
# Defined once so the two calls cannot drift apart.
TIMEFRAMES = ("5min", "15min", "1h")
BASE_TF = TIMEFRAMES[0]  # features are merged onto this timeframe

# Load the raw data plus its resampled views.
# The CSV folder is derived from MARKET (it used to be hard-coded to "NQ"),
# so changing the symbol in the config cell is enough to switch instruments.
raw_1m, resampled = load_and_resample_data(
    market=MARKET,
    timeframes=list(TIMEFRAMES),
    csv_dir=f"{RAW_DIR}/{MARKET}",
)

# Build the feature frame from the resampled data.
df_features = apply_feature_engineering(
    resampled,
    timeframes=list(TIMEFRAMES),
    base_tf=BASE_TF,
    features=None,  # or e.g. ["add_rsi_signals","add_ma_slope"]
)

# Quick sanity numbers: frame shape and the count of rows holding at least one NaN.
print("Result:", df_features.shape)
print("NaNs:", df_features.isna().any(axis=1).sum())


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Result: (382362, 374)
NaNs: 2472


In [3]:
# Label the feature frame once per lookahead horizon; each run saves its own parquet.
lookahead_options = [12, 6, 3]

# Arguments that are identical for every horizon.
label_settings = {
    "df_input_features": df_features,
    "vol_col_name": 'ATR_14_5min',
    "pt_multiplier": 2.0,
    "sl_multiplier": 1.0,
    "min_return_percentage": 0.0005,
    "market": MARKET,
}

for lookahead in lookahead_options:
    # `labeled` is rebound on every pass, so only the last horizon's return value
    # is still bound once the loop ends.
    labeled = label_and_save(
        lookahead_period=lookahead,
        output_file_suffix=str(lookahead) + MARKET,
        feature_columns_for_dropna=[],
        **label_settings,
    )


--- Processing for output suffix: 12NQ ---

Adding Classification Target: clf_target_numba_12 ...
clf_target_numba_12 NaNs: 0
Rows after dropping NaNs from targets: 382362
✅ Saved ../parquet/NQ\labeled_data_12NQ.parquet with 382362 rows

--- Processing for output suffix: 6NQ ---

Adding Classification Target: clf_target_numba_6 ...
clf_target_numba_6 NaNs: 0
Rows after dropping NaNs from targets: 382362
✅ Saved ../parquet/NQ\labeled_data_6NQ.parquet with 382362 rows

--- Processing for output suffix: 3NQ ---

Adding Classification Target: clf_target_numba_3 ...
clf_target_numba_3 NaNs: 0
Rows after dropping NaNs from targets: 382362
✅ Saved ../parquet/NQ\labeled_data_3NQ.parquet with 382362 rows


In [4]:
# Earlier candidate feature set, kept commented out for reference:
# selected_indicators_class = [
#     "Session_5min", "Time_of_Day_5min",
#         "Open_of_Day_5min", "High_of_Day_5min", "Low_of_Day_5min",
#         "Prev_High_5min", "Prev_Low_5min",
#         "Price_vs_Open_5min", "Price_vs_Session_High_Low_5min",
#         "Volume_Spike_5min", "CVD_3_5min",
#         "Stop_Hunt_5min", "Bull_Engulfing_5min", "Bear_Engulfing_5min",
#         "FVG_Exists_5min"
# ]

# Active feature columns for the classifier (author's note: pretty good results).
# Order is preserved because it fixes the column order of X_train / X_test.
selected_indicators_class = [
    "POC_Dist_Current_Points_1h",
    "POC_Dist_Current_Points_5min",
    "Day_of_Week",
    "POC_Dist_Current_Points_15min",
    "Day_Sin",
    "RSI_7_5min",
    "DMN_14_1h",
    "Trend_Score_15min",
    "Trend_Strength_5min",
    "Prev_Swing_Dist_15min",
    "Time_Sin",
    "Volume_Trend_15min",
    "Is_Trending_5min",
    "Is_Trending_15min",
]

# Wide feature list covering base columns, 5min/15min/1h indicators and calendar/session flags.
# It is not referenced by any cell visible in this notebook section.
# NOTE(review): the three timeframe groups are not named symmetrically — e.g. 'DMN_14_5min' vs
# 'Minus_DI_14_15min' / 'Minus_DI_14_1h', and 'TPV_15min' vs 'cum_tpv_5min' / 'cum_tpv_1h'.
# Some entries also appear for one timeframe only ('Mean_Reversion_5min', 'PPOh_12_26_9_1h',
# 'Rolling_Std_Dev_14_1h'; no 'Rel_Vol_20_1h'). Verify against df_features.columns before use.
selected_indicators_reg = [
    'open', 'high', 'low', 'close', 'volume', 'Volume_SMA_20_5min', 'BBB_20_2.0_5min', 'BBP_20_2.0_5min', 
    'close_vs_BB_Upper_5min', 'close_vs_BB_Lower_5min', 'ATR_14_5min', 'EMA_21_Slope_21_3_5min', 'MACDh_12_26_9_5min', 
    'MACD_12_26_9_Cross_Signal_5min', 'ADX_14_5min', 'Plus_DI_14_5min', 'DMN_14_5min', 'RSI_14_5min', 
    'RSI_7_Is_Overbought_70_5min', 'RSI_7_Is_Oversold_30_5min', 'RSI_14_Is_Overbought_70_5min', 'RSI_14_Is_Oversold_30_5min', 
    'CHOP_14_1_100_5min', 'CHOP_7_1_100_5min', 'Is_Choppy_14_5min', 'Is_Choppy_7_5min', 'STOCHk_14_3_3_Is_Overbought_80_5min', 
    'STOCHk_14_3_3_Is_Oversold_20_5min', 'PPO_12_26_9_5min', 'ROC_10_5min', 'Candle_Range_5min', 'Candle_Body_5min', 'Upper_Wick_5min', 
    'Lower_Wick_5min', 'Body_vs_Range_5min', 'CDL_DOJI_10_0.1_5min', 'CDL_HAMMER_5min', 'CDL_ENGULFING_5min', 'Rolling_Skew_30_5min', 
    'Rolling_Kurtosis_30_5min', 'Candle_Body_Lag_1_5min', 'Candle_Body_Lag_2_5min', 'Candle_Body_Lag_3_5min', 'Is_Trending_5min', 
    'Is_Choppy_5min', 'Is_High_Vol_5min', 'cum_tpv_5min', 'VWAP_Session_Dist_5min', 'Vol_Delta_1_5min', 'Vol_Delta_2_5min', 
    'Vol_Delta_3_5min', 'Vol_zscore_20_5min', 'High_Vol_Event_20_5min', 'Vol_zscore_10_5min', 'High_Vol_Event_10_5min', 
    'Vol_zscore_5_5min', 'POC_Dist_Current_Points_5min', 'POC_Dist_Previous_Points_5min', 'Close_Pos_%_5min', 'Rel_Vol_20_5min', 
    'CVD_3_5min', 'Upper_Wick_%_5min', 'Lower_Wick_%_5min', 'Trend_Direction_5min', 'Trend_Strength_5min', 'Vol_Regime_5min', 
    'Volume_Trend_5min', 'Trend_Alignment_5min', 'Mean_Reversion_5min', 'Trend_Score_5min', 'Hour_of_Day', 'Minute_of_Hour', 
    'Day_of_Week', 'Time_Sin', 'Time_Cos', 'Day_Sin', 'Day_Cos', 'Is_Asian_Session', 'Is_London_Session', 'Is_NY_Session', 'Is_Overlap', 
    'Is_US_Open_Hour', 'Is_US_Close_Hour', 'Volume_SMA_20_15min', 'BBB_20_2.0_15min', 'BBP_20_2.0_15min', 'close_vs_BB_Upper_15min', 
    'close_vs_BB_Lower_15min', 'ATR_14_15min', 'MACDh_12_26_9_15min', 'MACD_12_26_9_Cross_Signal_15min', 'ADX_14_15min', 'Plus_DI_14_15min', 
    'Minus_DI_14_15min', 'RSI_14_15min', 'RSI_7_Is_Overbought_70_15min', 'RSI_7_Is_Oversold_30_15min', 'RSI_14_Is_Overbought_70_15min', 
    'RSI_14_Is_Oversold_30_15min', 'CHOP_14_1_100_15min', 'CHOP_7_1_100_15min', 'Is_Choppy_14_15min', 'Is_Choppy_7_15min', 
    'STOCHk_14_3_3_Is_Overbought_80_15min', 'STOCHk_14_3_3_Is_Oversold_20_15min', 'PPO_12_26_9_15min', 'ROC_10_15min', 
    'Candle_Range_15min', 'Candle_Body_15min', 'Upper_Wick_15min', 'Lower_Wick_15min', 'Body_vs_Range_15min', 'CDL_DOJI_10_0.1_15min', 
    'CDL_HAMMER_15min', 'CDL_ENGULFING_15min', 'Rolling_Skew_30_15min', 'Rolling_Kurtosis_30_15min', 'Candle_Body_Lag_1_15min', 
    'Candle_Body_Lag_2_15min', 'Candle_Body_Lag_3_15min', 'Is_Trending_15min', 'Is_Choppy_15min', 'Is_High_Vol_15min', 'TPV_15min', 
    'VWAP_Session_Dist_15min', 'Vol_Delta_1_15min', 'Vol_Delta_2_15min', 'Vol_Delta_3_15min', 'Vol_zscore_20_15min', 'High_Vol_Event_20_15min',
    'Vol_zscore_10_15min', 'High_Vol_Event_10_15min', 'Vol_zscore_5_15min', 'POC_Dist_Current_Points_15min', 'POC_Dist_Previous_Points_15min', 
    'Close_Pos_%_15min', 'Rel_Vol_20_15min', 'CVD_3_15min', 'Upper_Wick_%_15min', 'Lower_Wick_%_15min', 'Trend_Direction_15min', 
    'Trend_Strength_15min', 'Vol_Regime_15min', 'Volume_Trend_15min', 'Trend_Alignment_15min', 'Trend_Score_15min', 'Volume_SMA_20_1h', 
    'BBB_20_2.0_1h', 'BBP_20_2.0_1h', 'close_vs_BB_Upper_1h', 'close_vs_BB_Lower_1h', 'ATR_14_1h', 'MACDh_12_26_9_1h', 
    'MACD_12_26_9_Cross_Signal_1h', 'ADX_14_1h', 'Plus_DI_14_1h', 'Minus_DI_14_1h', 'RSI_14_1h', 'RSI_7_Is_Overbought_70_1h', 
    'RSI_7_Is_Oversold_30_1h', 'RSI_14_Is_Overbought_70_1h', 'RSI_14_Is_Oversold_30_1h', 'CHOP_14_1_100_1h', 'CHOP_7_1_100_1h', 
    'Is_Choppy_14_1h', 'Is_Choppy_7_1h', 'STOCHk_14_3_3_Is_Overbought_80_1h', 'STOCHk_14_3_3_Is_Oversold_20_1h', 'PPO_12_26_9_1h', 
    'PPOh_12_26_9_1h', 'ROC_10_1h', 'Candle_Range_1h', 'Candle_Body_1h', 'Upper_Wick_1h', 'Lower_Wick_1h', 'Body_vs_Range_1h', 
    'CDL_DOJI_10_0.1_1h', 'CDL_HAMMER_1h', 'CDL_ENGULFING_1h', 'Rolling_Std_Dev_14_1h', 'Rolling_Skew_30_1h', 'Rolling_Kurtosis_30_1h', 
    'Candle_Body_Lag_1_1h', 'Candle_Body_Lag_2_1h', 'Candle_Body_Lag_3_1h', 'Is_Trending_1h', 'Is_Choppy_1h', 'Is_High_Vol_1h', 'cum_tpv_1h', 
    'VWAP_Session_Dist_1h', 'Vol_Delta_1_1h', 'Vol_Delta_2_1h', 'Vol_Delta_3_1h', 'Vol_zscore_20_1h', 'High_Vol_Event_20_1h', 'Vol_zscore_10_1h', 
    'High_Vol_Event_10_1h', 'Vol_zscore_5_1h', 'POC_Dist_Current_Points_1h', 'POC_Dist_Previous_Points_1h', 'Close_Pos_%_1h', 'CVD_3_1h', 
    'Upper_Wick_%_1h', 'Lower_Wick_%_1h', 'Trend_Direction_1h', 'Trend_Strength_1h', 'Vol_Regime_1h', 'Volume_Trend_1h', 'Trend_Alignment_1h', 'Trend_Score_1h'
    ]

In [5]:
# === Classification Starts Here ===
# Load the labeled frame for the configured horizon (LOOKAHEAD) and market.
labeled = pd.read_parquet(f"../parquet/{MARKET}/labeled_data_{LOOKAHEAD}{MARKET}.parquet")

# === Ensure datetime column exists and is parsed ===
index_holds_time = (
    labeled.index.name == 'datetime'
    or pd.api.types.is_datetime64_any_dtype(labeled.index)
)
if index_holds_time and 'datetime' not in labeled.columns:
    # Name the index first: reset_index() would otherwise turn an unnamed
    # DatetimeIndex into a column called 'index' and the check below would fail.
    labeled = labeled.rename_axis('datetime').reset_index()
if 'datetime' not in labeled.columns:
    raise KeyError("❌ 'datetime' column is missing.")

labeled['datetime'] = pd.to_datetime(labeled['datetime'])
if labeled['datetime'].dt.tz is None:
    # A tz-naive column cannot be compared with the tz-aware cutoff below.
    # Naive timestamps are treated as UTC, the same assumption the backtest cell makes.
    labeled['datetime'] = labeled['datetime'].dt.tz_localize("UTC")
labeled = labeled.sort_values('datetime')

# === Train/test split ===
# Chronological split: everything before the cutoff trains, the rest is held out.
cutoff_date = pd.Timestamp("2025-01-01", tz="America/New_York")
train = labeled[labeled['datetime'] < cutoff_date]
test = labeled[labeled['datetime'] >= cutoff_date]
if train.empty or test.empty:
    raise ValueError(
        f"❌ Empty split around {cutoff_date}: train rows={len(train)}, test rows={len(test)}."
    )

train = train.set_index('datetime')
test = test.set_index('datetime')

# === Feature selection ===
X_train = train[selected_indicators_class]
X_test = test[selected_indicators_class]

# === Target selection ===
colum_finder = "clf_target" # reg
label_column = [
    col for col in labeled.columns
    if isinstance(col, str) and col.startswith(colum_finder)
]
if not label_column:
    raise ValueError(f"❌ No classification target column found starting with {colum_finder}.")
if len(label_column) > 1:
    # Still use the first match, but make the ambiguity visible.
    print(f"⚠️ Multiple target columns match '{colum_finder}': {label_column}; using the first.")
lab_column = label_column[0]
print(f"📌 Using classification target column: {lab_column}")

y_train = train[lab_column]
y_test = test[lab_column]

print(f"Train range: {train.index.min()} to {train.index.max()} | Rows: {len(train)}")
print(f"Test range: {test.index.min()} to {test.index.max()} | Rows: {len(test)}")


📌 Using regression target column: clf_target_numba_6
Train range: 2020-01-01 18:00:00-05:00 to 2024-12-31 17:00:00-05:00 | Rows: 351271
Test range: 2025-01-01 18:00:00-05:00 to 2025-06-10 20:00:00-04:00 | Rows: 31091


In [6]:
# Tune the RandomForest on the training window.
# The recorded log shows the study named with this id being resumed rather than recreated.
rf_study_id = "03"
rf_best = tune_rf(X_train, y_train, n_trials=1, unique_id=rf_study_id, market=MARKET)

print("RF best:", rf_best)

[I 2025-06-18 13:48:03,515] Using an existing study with name 'rf_opt_class_NQ_03' instead of creating a new one.


4 splits are possible to use for optimization.



[I 2025-06-18 13:50:30,522] Trial 11 finished with value: 0.4319475618091908 and parameters: {'n_estimators': 800, 'max_depth': 12, 'max_leaf_nodes': 100, 'min_samples_split': 40, 'min_samples_leaf': 25, 'max_features': 'sqrt', 'class_weight': 'balanced', 'criterion': 'entropy'}. Best is trial 1 with value: 0.43348287485283415.


RF best: {'n_estimators': 300, 'max_depth': 7, 'max_leaf_nodes': 100, 'min_samples_split': 40, 'min_samples_leaf': 30, 'max_features': 0.5, 'class_weight': 'balanced', 'criterion': 'entropy'}


In [7]:
# Fit the final forest on the whole training window with the tuned hyper-parameters.
# - n_jobs=-1 builds the trees on all cores; with random_state pinned the fitted model is unchanged.
#   It is listed first so a tuned n_jobs (should the tuner ever return one) takes precedence.
# - random_state is listed last so the seed always stays pinned, instead of a duplicate-keyword
#   TypeError if the tuned dict carried one.
rf_params = {"n_jobs": -1, **rf_best, "random_state": 42}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

In [8]:
# Headline train/test accuracy for the fitted forest.
preds_rf = evaluate_model("RandomForest", rf, X_train, X_test, y_train, y_test)

# Detailed diagnostics on the same split.
# NOTE(review): the labels presumably map to target values 0/1/2 in that order — confirm.
insight_inputs = dict(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
results = classification_insights(
    model=rf,
    class_names=["flat", "long", "short"],  # optional
    **insight_inputs,
)


📊 RandomForest Classification Accuracy:
Train Accuracy: 0.4944
Test Accuracy: 0.4661
=== Target distribution (train) ===
clf_target_numba_6
0    0.654011
2    0.178799
1    0.167190
Name: proportion, dtype: float64

=== Target distribution (test) ===
clf_target_numba_6
0    0.659001
2    0.184652
1    0.156347
Name: proportion, dtype: float64

=== Prediction distribution (test) ===
pred
0    0.411341
2    0.306487
1    0.282172
Name: proportion, dtype: float64

Accuracy: 0.4661
F1 Score (macro): 0.4129

Classification report:
              precision    recall  f1-score   support

        flat       0.76      0.48      0.59     20489
        long       0.24      0.44      0.31      4861
       short       0.27      0.45      0.34      5741

    accuracy                           0.47     31091
   macro avg       0.43      0.46      0.41     31091
weighted avg       0.59      0.47      0.50     31091

Confusion matrix:
[[9773 5143 5573]
 [1369 2127 1365]
 [1647 1503 2591]]


In [9]:
# NOTE(review): this import is better placed in the import cell at the top of the notebook.
from utils.backtest import evaluate_classification

all_results = []

# Class probabilities for the test window; this rebinds `preds_rf` from the evaluation cell.
preds_rf = rf.predict_proba(X_test)

# 1. The backtest is handed `labeled` with a DatetimeIndex.
if not isinstance(labeled.index, pd.DatetimeIndex):
    if "datetime" in labeled.columns:
        labeled = labeled.set_index("datetime")
    else:
        raise ValueError("No 'datetime' column to set as index!")

# 2. Make the index timezone-aware and express it in New York time:
#    a naive index is assumed to be UTC; an aware one is only converted.
if labeled.index.tz is None:
    labeled.index = labeled.index.tz_localize("UTC")
labeled.index = labeled.index.tz_convert("America/New_York")

# 3. Run the backtest on the test window.
print(
    f"\n🔎 Predicted class-probability range for RandomForest: "
    f"min={preds_rf.min():.15f}, max={preds_rf.max():.15f}"
)
# NOTE(review): TICK_VALUE=6 and the zero trailing multipliers differ from the values in
# param_grid_strategy (TICK_VALUE 20, trailing multipliers 0.8-1.5) — confirm this is intended.
results = evaluate_classification(
    X_test,
    preds_rf,
    labeled,
    avoid_funcs,
    TRAIL_START_MULT=0,
    TRAIL_STOP_MULT=0,
    TICK_VALUE=6
)

all_results.append(results)

# Headline statistics of this run.
print(
    f"\nPnL: ${results['pnl']:.2f}"
    f"\nTrades: {results['trades']}"
    f"\nWin Rate: {results['win_rate']:.2%}"
    f"\nExpectancy: {results['expectancy']:.2f}"
    f"\nProfit Factor: {results['profit_factor']:.2f}"
    f"\nSharpe Ratio: {results['sharpe']:.2f}"
    f"\nLong Trades: {results['long_trades']} | Short Trades: {results['short_trades']}"
    f"\n"
)

# How often each registered avoid-filter fired (nothing is listed when no filter is registered).
print("Avoid Hits:")
for name, count in results['avoid_hits'].items():
    print(f" - {name}: {count}")

# Best and worst individual trades, when any trade was taken.
trade_log = results['results']
if not trade_log.empty and 'pnl' in trade_log.columns:
    print("\n🔢 Top 5 PnL trades:")
    print(trade_log.sort_values(by='pnl', ascending=False).head(5))

    print("\n🔻 Bottom 5 PnL trades:")
    print(trade_log.sort_values(by='pnl', ascending=True).head(5))
else:
    print("\n⚠️ No trades executed, skipping PnL trade breakdown.")


# One summary row per collected run, ranked by Sharpe ratio.
summary_df = pd.DataFrame([{
    'pnl': r['pnl'],
    'sharpe': r['sharpe'],
    'expectancy': r['expectancy'],
    'profit_factor': r['profit_factor'],
    'win_rate': r['win_rate'],
    'trades': r['trades'],
    'results': r['results'],
} for r in all_results])
top = summary_df.sort_values(by='sharpe', ascending=False).head(10)
print("\n🏁 Top 10 Configurations Across All Lookaheads:")
print(top[['pnl', 'sharpe', 'expectancy', 'profit_factor', 'win_rate', 'trades']])


🔎 Predicted return range for STACK: min=0.076688895621209, max=0.794208808843630

PnL: $160887.84
Trades: 2522
Win Rate: 53.33%
Expectancy: 63.79
Profit Factor: 1.67
Sharpe Ratio: 9.85
Long Trades: 1213 | Short Trades: 1309

Avoid Hits:

🔢 Top 5 PnL trades:
                    entry_time                 exit_time  side  entry_price  \
1521 2025-04-07 10:30:00-04:00 2025-04-07 14:20:00-04:00  long     17314.00   
1575 2025-04-09 13:45:00-04:00 2025-04-09 14:30:00-04:00  long     18651.75   
1577 2025-04-09 15:00:00-04:00 2025-04-09 15:35:00-04:00  long     18892.00   
1573 2025-04-09 13:15:00-04:00 2025-04-09 13:30:00-04:00  long     18100.00   
1519 2025-04-07 09:55:00-04:00 2025-04-07 10:15:00-04:00  long     17338.25   

        exit_price          pnl     mfe    mae    gross_pnl  position_size  \
1521  17821.067575  3038.425449  3060.0  985.5  3042.405449            1.0   
1575  19087.957688  2613.266126  2763.0    0.0  2617.246126            1.0   
1577  19257.203740  2187.242437 

In [11]:
# FIXME: in the recorded run this call ended with
#   UnboundLocalError: cannot access local variable 'best_result' where it is not associated with a value
# `best_result` is not a notebook variable, so the failure comes from code reached through this call.
# Presumably the helper binds it only on some branch — confirm in utils.metrics.
# NOTE(review): the execution count jumps from In [9] to In [11]; check for a removed cell this depends on.
visualize_results(all_results)

exit_reason
TP             1329
SL             1166
SESSION_END      27
Name: count, dtype: int64


UnboundLocalError: cannot access local variable 'best_result' where it is not associated with a value