In [None]:
from itertools import product
import pandas as pd
from datetime import datetime
from utils.data_loader import load_and_resample_data
from utils.helpers import save_model_pkl
from utils.labeling_utils import label_and_save
from utils.pipeline import apply_feature_engineering
from optimization.classification import tune_rf, tune_xgboost, tune_lgbm, tune_catboost
from utils.metrics import evaluate_model, classification_insights, visualize_results
from config import ROOT, DB_DIR, RAW_DIR, DATA_DIR
from sklearn.ensemble import RandomForestClassifier

# --- Run configuration ---
MARKET = "<your-market>"  # placeholder: replace with the market identifier
LOOKAHEAD = 6  # horizon used to pick the labeled parquet and to tag the saved model

# Registry of name -> callable passed to the backtest; left empty here.
# Example entries (the callables must be imported/defined before enabling them):
#   'avoid_hour_18_19': avoid_hour_18_19,
#   'news_window': avoid_news,
avoid_funcs = {}

# Candidate values per strategy parameter; every key contributes one axis of the grid.
param_grid_strategy = dict(
    SL_ATR_MULT=[1.0, 1.5, 2.0],
    TP_ATR_MULT=[3.0, 4.0, 5.0, 8.0],
    TRAIL_START_MULT=[1.0, 1.5],
    TRAIL_STOP_MULT=[0.8, 1.0],
    TICK_VALUE=[20],
)

# Cartesian product of the grid, materialised as one {parameter: value} dict per combination.
keys = tuple(param_grid_strategy)
values = tuple(param_grid_strategy.values())
combinations = list(map(lambda picks: dict(zip(keys, picks)), product(*values)))

In [None]:
# Timeframes shared by the loader and the feature pipeline.
# These are example values; choose the resamples appropriate for your market.
resample_timeframes = ["5min", "15min", "1h"]

# Load the raw data together with its resampled frames from RAW_DIR/<MARKET>.
raw_1m, resampled = load_and_resample_data(
    market=MARKET,
    timeframes=list(resample_timeframes),
    csv_dir=f"{RAW_DIR}/{MARKET}",
)

# Build the merged feature frame on the base timeframe.
# base_tf is an example value; features=None can be replaced by an explicit
# list such as ["add_rsi_signals", "add_ma_slope"].
df_features = apply_feature_engineering(
    resampled,
    timeframes=list(resample_timeframes),
    base_tf="5min",
    features=None,
)

# Quick sanity report: frame shape and the number of rows holding at least one NaN.
print("Result:", df_features.shape)
rows_with_nan = df_features.isna().any(axis=1).sum()
print("NaNs:", rows_with_nan)


In [None]:
# Label the feature frame once per horizon; each call is tagged with a
# "<lookahead><MARKET>" suffix, which is the naming the classification cell reads back.
# Every numeric setting below is an example - adjust to what suits your market.
lookahead_options = [12, 6, 3]

for lookahead in lookahead_options:
    suffix = f'{lookahead}{MARKET}'
    # `labeled` is rebound on every pass, so after the loop it holds the result
    # of the last horizon only.
    labeled = label_and_save(
        df_input_features=df_features,
        lookahead_period=lookahead,
        vol_col_name='ATR_14_5min',
        pt_multiplier=2.0,
        sl_multiplier=1.0,
        min_return_percentage=0.0005,
        output_file_suffix=suffix,
        feature_columns_for_dropna=[],
        market=MARKET,
    )

In [None]:
# Feature columns used to build X_train / X_test for the classifier.
# An empty list selects zero columns, so fill this in before running the training cells.
selected_indicators_class = []

# Feature columns for a regression model; not referenced by the other cells shown in this notebook.
selected_indicators_reg = []

In [None]:
# === Classification Starts Here ===
# Read the labeled dataset for the configured LOOKAHEAD / MARKET.
labeled = pd.read_parquet(f"../parquet/{MARKET}/labeled_data_{LOOKAHEAD}{MARKET}.parquet")

# === Ensure datetime column exists and is parsed ===
if labeled.index.name == 'datetime' or pd.api.types.is_datetime64_any_dtype(labeled.index):
    if 'datetime' not in labeled.columns:
        # A datetime index that is unnamed (or named differently) would otherwise be
        # reset into a column called 'index'/<its name> and fail the check below.
        labeled = labeled.rename_axis('datetime')
    labeled = labeled.reset_index()
if 'datetime' not in labeled.columns:
    raise KeyError("❌ 'datetime' column is missing.")

labeled['datetime'] = pd.to_datetime(labeled['datetime'])
if pd.api.types.is_datetime64_dtype(labeled['datetime']):
    # tz-naive timestamps cannot be compared with the tz-aware cutoff below.
    # Treat them as UTC, the same assumption the backtest cell makes for a naive index.
    labeled['datetime'] = labeled['datetime'].dt.tz_localize("UTC")
labeled = labeled.sort_values('datetime')

# === Train/test split ===
# Chronological split: everything before the cutoff trains, the rest is held out.
cutoff_date = pd.Timestamp("2025-01-01", tz="America/New_York")  # example value, adjust as appropriate
train = labeled[labeled['datetime'] < cutoff_date]
test = labeled[labeled['datetime'] >= cutoff_date]

if train.empty or test.empty:
    # Fail here with a clear message instead of deep inside the tuner / estimator.
    raise ValueError(
        f"❌ Empty split around cutoff {cutoff_date}: train rows={len(train)}, test rows={len(test)}."
    )

train = train.set_index('datetime')
test = test.set_index('datetime')

# === Feature selection ===
if not selected_indicators_class:
    # An empty list would silently produce zero-column feature matrices.
    raise ValueError("❌ selected_indicators_class is empty; list the feature columns to train on.")
X_train = train[selected_indicators_class]
X_test = test[selected_indicators_class]

# === Target selection ===
# The first column whose name starts with the prefix is used as the label.
colum_finder = "clf_target"  # example value, adjust as appropriate
label_column = [col for col in labeled.columns if col.startswith(colum_finder)]
if not label_column:
    raise ValueError(f"❌ No classification target column found starting with {colum_finder}.")
lab_column = label_column[0]
print(f"📌 Using classification target column: {lab_column}")

y_train = train[lab_column]
y_test = test[lab_column]

print(f"Train range: {train.index.min()} to {train.index.max()} | Rows: {len(train)}")
print(f"Test range: {test.index.min()} to {test.index.max()} | Rows: {len(test)}")


In [None]:
# Tune the random forest on the training split. The result is later unpacked
# as keyword arguments into RandomForestClassifier.
# n_trials=1 keeps this a smoke run; tuners for other model families are
# imported at the top of the notebook as alternatives.
rf_best = tune_rf(X_train, y_train, n_trials=1, unique_id="00", market=MARKET)
print("RF best:", rf_best)

In [None]:
# Build the forest from the tuned parameters with a pinned seed, then fit it
# on the training split (the fitted estimator is the cell's displayed value).
rf = RandomForestClassifier(random_state=42, **rf_best)
rf.fit(X_train, y_train)

In [None]:
# Score the fitted forest on both splits.
preds_rf = evaluate_model("RandomForest", rf, X_train, X_test, y_train, y_test)

# Display names for the classes - example values, adjust to your own label encoding.
insight_class_names = ["flat", "long", "short"]

# Per-class diagnostics for the same model and splits.
results = classification_insights(
    model=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    class_names=insight_class_names,
)

In [None]:
from utils.backtest import evaluate_classification

all_results = []

# Class-probability matrix for the test split (scikit-learn orders the columns by rf.classes_).
preds_rf = rf.predict_proba(X_test)

# 1) The backtest receives `labeled` indexed by timestamp.
if not isinstance(labeled.index, pd.DatetimeIndex):
    if "datetime" in labeled.columns:
        labeled = labeled.set_index("datetime")
    else:
        raise ValueError("No 'datetime' column to set as index!")

# 2) Make the index timezone-aware and express it in New York time:
#    a naive index is assumed to be UTC; an aware one is only converted.
if labeled.index.tz is None:
    labeled.index = labeled.index.tz_localize("UTC")
labeled.index = labeled.index.tz_convert("America/New_York")

# 3) Run the backtest on the test-split predictions.
# NOTE(review): param_grid_strategy / combinations are not used here - confirm whether
# a sweep over those settings was intended instead of this single configuration.
print(f"\n🔎 Predicted probability range for RF: min={preds_rf.min():.15f}, max={preds_rf.max():.15f}")
results = evaluate_classification(
    X_test,
    preds_rf,
    labeled,
    avoid_funcs,
    TRAIL_START_MULT=0,  # example value, adjust as appropriate
    TRAIL_STOP_MULT=0,  # example value, adjust as appropriate
    TICK_VALUE=6  # example value, adjust as appropriate
)

all_results.append(results)

# Headline metrics of this run.
print(
    f"\nPnL: ${results['pnl']:.2f}"
    f"\nTrades: {results['trades']}"
    f"\nWin Rate: {results['win_rate']:.2%}"
    f"\nExpectancy: {results['expectancy']:.2f}"
    f"\nProfit Factor: {results['profit_factor']:.2f}"
    f"\nSharpe Ratio: {results['sharpe']:.2f}"
    f"\nLong Trades: {results['long_trades']} | Short Trades: {results['short_trades']}"
    f"\n"
)

# How often each registered avoid-function fired.
print("Avoid Hits:")
for name, count in results['avoid_hits'].items():
    print(f" - {name}: {count}")

# Best / worst individual trades, only when the trade table has a 'pnl' column.
if not results['results'].empty and 'pnl' in results['results'].columns:
    print("\n🔢 Top 5 PnL trades:")
    print(results['results'].sort_values(by='pnl', ascending=False).head(5))

    print("\n🔻 Bottom 5 PnL trades:")
    print(results['results'].sort_values(by='pnl', ascending=True).head(5))
else:
    print("\n⚠️ No trades executed, skipping PnL trade breakdown.")


# One summary row per backtest run collected in all_results, ranked by Sharpe ratio.
summary_df = pd.DataFrame([{
    'pnl': r['pnl'],
    'sharpe': r['sharpe'],
    'expectancy': r['expectancy'],
    'profit_factor': r['profit_factor'],
    'win_rate': r['win_rate'],
    'trades': r['trades'],
    'results': r['results'],
} for r in all_results])
top = summary_df.sort_values(by='sharpe', ascending=False).head(10)
print("\n🏁 Top 10 Configurations Across All Lookaheads:")
print(top[['pnl', 'sharpe', 'expectancy', 'profit_factor', 'win_rate', 'trades']])

In [None]:
# Render the collected backtest results (all_results is the list filled by the backtest cell).
visualize_results(all_results)

In [None]:
# Persist the fitted forest, tagged with market, model kind, lookahead and run id.
# NOTE(review): "00" presumably mirrors the unique_id passed to tune_rf - confirm.
save_model_pkl(rf, MARKET, "classifier", LOOKAHEAD, "00")