In [None]:
# Cell 1: Imports and model definitions
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.utils import resample
from xgboost import XGBClassifier, XGBRegressor

# Classification candidates, keyed by the short name that appears in the
# result dictionaries. The evaluation helper fits fresh copies built from
# these parameters, so the instances below only act as templates.
#
# Every stochastic estimator carries random_state=42 so repeated runs of the
# notebook produce the same scores.
CLASSIFIERS = {
    # n_jobs=-4 follows the joblib convention (n_cpus + 1 + n_jobs workers),
    # i.e. three cores are left free for the rest of the machine.
    'logistic': LogisticRegression(max_iter=1000, n_jobs=-4, verbose=2),
    'rf': RandomForestClassifier(
        n_estimators=100,
        n_jobs=-4,
        random_state=42,
        verbose=2
    ),
    # NOTE(review): device='cuda' assumes a CUDA-enabled XGBoost build and a
    # visible GPU -- confirm on the target machine.
    'xgb': XGBClassifier(
        eval_metric='logloss',
        tree_method='hist',
        device='cuda',
        n_jobs=1,
        random_state=42
    ),
    # NOTE(review): device='gpu' assumes a GPU-enabled LightGBM build -- confirm.
    'lgbm': LGBMClassifier(device='gpu', n_jobs=1, random_state=42, verbose=2),
    'mlp': MLPClassifier(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        solver='adam',
        max_iter=300,
        random_state=42,
        verbose=True
    )
}

# Regression candidates, keyed by the short name that appears in the result
# dictionaries. The evaluation helper fits fresh copies built from these
# parameters, so the instances below only act as templates.
#
# Every stochastic estimator carries random_state=42 so repeated runs of the
# notebook produce the same scores.
REGRESSORS = {
    'linear': LinearRegression(),
    # n_jobs=-4 follows the joblib convention (n_cpus + 1 + n_jobs workers),
    # i.e. three cores are left free for the rest of the machine.
    'rf': RandomForestRegressor(
        n_estimators=100,
        n_jobs=-4,
        random_state=42,
        verbose=2
    ),
    # NOTE(review): device='cuda' assumes a CUDA-enabled XGBoost build and a
    # visible GPU -- confirm on the target machine.
    'xgb': XGBRegressor(
        tree_method='hist',
        device='cuda',
        n_jobs=1,
        random_state=42
    ),
    # NOTE(review): device='gpu' assumes a GPU-enabled LightGBM build -- confirm.
    'lgbm': LGBMRegressor(device='gpu', n_jobs=1, random_state=42, verbose=2),
    'mlp': MLPRegressor(
        hidden_layer_sizes=(128, 64),
        activation='relu',
        solver='adam',
        max_iter=300,
        random_state=42,
        verbose=True
    )
}


In [None]:
# Cell 2: Helper functions for training & evaluation
def _upsample_to_majority(X_part, y_part, random_state):
    """Oversample every class (with replacement) up to the majority-class size.

    Parameters
    ----------
    X_part : pandas.DataFrame
        Training features.
    y_part : pandas.Series
        Training labels, positionally aligned with ``X_part``.
    random_state : int
        Seed used both for the per-class resampling and for the final shuffle.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Balanced, shuffled features and labels.
    """
    # Use an internal column name so unnamed targets work and the label can
    # never collide with a feature column.
    target_col = "__target__"
    while target_col in X_part.columns:
        target_col = "_" + target_col

    frame = pd.concat(
        [
            X_part.reset_index(drop=True),
            y_part.reset_index(drop=True).rename(target_col),
        ],
        axis=1,
    )
    class_counts = frame[target_col].value_counts()
    max_count = class_counts.max()

    upsampled = [
        resample(
            frame[frame[target_col] == cls],
            replace=True,
            n_samples=max_count,
            random_state=random_state,
        )
        for cls in class_counts.index
    ]
    # Shuffle so the classes are interleaved rather than stacked block by block.
    balanced = pd.concat(upsampled).sample(frac=1, random_state=random_state)

    y_bal = balanced[target_col].copy()
    y_bal.name = y_part.name  # restore the caller's label name
    return balanced.drop(columns=[target_col]), y_bal


def evaluate_classifier(models, feature_sets, X, y, test_size=0.2, random_state=42):
    """Fit every classifier on every feature set and score it on a held-out split.

    The data is split once; for each feature set the training portion is
    class-balanced by oversampling, while the test portion is left untouched
    so the metrics reflect the original class distribution.

    NOTE(review): ``train_test_split`` shuffles rows by default. If ``X`` is
    time-ordered this mixes past and future rows -- confirm that is intended.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted estimator used as a template.
    feature_sets : dict
        Maps a feature-set name to a list of column names present in ``X``.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Class labels aligned with ``X`` (may be unnamed).
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed for the split, the oversampling and the shuffle.

    Returns
    -------
    dict
        ``{(feature_set_name, model_name): {'accuracy': ..., 'f1_macro': ...}}``.

    Raises
    ------
    KeyError
        If a feature set names a column that ``X`` does not contain.
    """
    results = {}
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    for fs_name, features in feature_sets.items():
        X_bal, y_bal = _upsample_to_majority(X_train[features], y_train, random_state)
        Xte = X_test[features]

        for name, model in models.items():
            # clone() copies the hyper-parameters without fitted state and,
            # unlike Class(**get_params()), also works for pipelines and
            # meta-estimators whose deep params are not constructor arguments.
            m = clone(model)
            m.fit(X_bal, y_bal)
            preds = m.predict(Xte)
            results[(fs_name, name)] = {
                'accuracy': accuracy_score(y_test, preds),
                'f1_macro': f1_score(y_test, preds, average='macro')
            }
    return results

def evaluate_regressor(models, feature_sets, X, y, test_size=0.2, random_state=42):
    """Fit every regressor on every feature set and score it on a held-out split.

    NOTE(review): ``train_test_split`` shuffles rows by default. If ``X`` is
    time-ordered this mixes past and future rows -- confirm that is intended.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted estimator used as a template.
    feature_sets : dict
        Maps a feature-set name to a list of column names present in ``X``.
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Continuous target aligned with ``X``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    dict
        ``{(feature_set_name, model_name): {'r2': ..., 'mse': ...}}``.

    Raises
    ------
    KeyError
        If a feature set names a column that ``X`` does not contain.
    """
    results = {}
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    for fs_name, features in feature_sets.items():
        Xtr, Xte = X_train[features], X_test[features]
        for name, model in models.items():
            # clone() copies the hyper-parameters without fitted state and,
            # unlike Class(**get_params()), also works for pipelines and
            # meta-estimators whose deep params are not constructor arguments.
            m = clone(model)
            m.fit(Xtr, y_train)
            preds = m.predict(Xte)
            results[(fs_name, name)] = {
                'r2': r2_score(y_test, preds),
                'mse': mean_squared_error(y_test, preds)
            }
    return results

In [None]:
# Optional LLM-driven feature engineering, currently disabled. Only the
# parquet load below is executed; the rest is kept for reference.

# from langchain_openai import ChatOpenAI
# import os
# import yaml
# import pandas as pd
# from pprint import pprint

# from ai_data_science_team.agents import FeatureEngineeringAgent

# Load the labeled dataset (path is relative to the kernel's working directory).
df = pd.read_parquet("../../notebooks/parquet/labeled_data_6NQ.parquet")
# df

# MODEL    = "gpt-4o-mini"
# LOG      = True
# LOG_PATH = os.path.join(os.getcwd(), "logs/")

# SECURITY: never paste an API key into the notebook, not even in a comment.
# Export OPENAI_API_KEY in the environment before starting the kernel (or
# prompt for it with getpass), and rotate any key that was ever saved here.
# assert "OPENAI_API_KEY" in os.environ, "set OPENAI_API_KEY before running"

# llm = ChatOpenAI(model = MODEL)

# llm

# feature_engineering_agent = FeatureEngineeringAgent(
#     model = llm,
#     log=LOG,
#     log_path=LOG_PATH
# )

# feature_engineering_agent


# feature_engineering_agent.invoke_agent(
#     data_raw=df,
#     user_instructions="Inspect the data. Make any new features and transformations that you think will be useful for predicting the target variable.",
#     target_variable="Churn",
#     max_retries=3,
#     retry_count=0,
# )

In [None]:
# Keep only complete rows. Rebinding (instead of inplace=True) follows the
# non-mutating pandas idiom and leaves the loaded frame object untouched.
df = df.dropna()
print(df.columns.to_list())
# Candidate feature groups, one per trading strategy. Each value is the ordered
# list of column names that the evaluation helpers select from the data.
feature_sets = dict(
    ema_crossover=[
        # moving averages across timeframes
        "EMA_9_5min",
        "EMA_21_5min",
        "EMA_50_15min",
        "EMA_200_1h",
        "EMA_9_minus_21_5min",  # custom feature: EMA_9 minus EMA_21
        # momentum
        "MACDh_12_26_9_5min",
        "RSI_7_5min",
        "RSI_14_15min",
        # candle shape
        "Candle_Body_vs_Range_5min",
        "Upper_Wick_5min",
        "Lower_Wick_5min",
        # volume
        "volume",
        "Rel_Vol_20_5min",
        # trend strength / chop filter
        "ADX_14_15min",
        "Is_Choppy_14_5min",
    ],
    support_resistance=[
        # swing levels
        "Prev_Swing_High_5min",
        "Prev_Swing_Low_5min",
        "Dist_to_Closest_SR_5min",  # custom feature: distance to the nearest zone
        # volatility
        "ATR_14_5min",
        "ATR_14_15min",
        # volume
        "volume",
        "Rel_Vol_20_5min",
        "CVD_3_5min",
        # candle patterns
        "Bull_Engulfing_5min",
        "Bear_Engulfing_5min",
        "Doji_5min",
        # NOTE(review): other sets use "Candle_Body_vs_Range_5min" -- confirm
        # that this differently named column really exists in the data.
        "Body_vs_Range_5min",
        "Upper_Wick_5min",
        "Lower_Wick_5min",
        # momentum
        "RSI_14_5min",
        "RSI_14_Divergence_5min",  # only available if divergence is computed
        # time context
        "Time_of_Day_5min",
        "Session_5min",
    ],
    ict_po3=[
        # time context
        "Session_5min",
        "Time_of_Day_5min",
        # daily reference levels
        "Open_of_Day_5min",
        "High_of_Day_5min",
        "Low_of_Day_5min",
        "Prev_High_5min",
        "Prev_Low_5min",
        # price relative to those levels
        "Price_vs_Open_5min",
        "Price_vs_Session_High_Low_5min",
        # volume
        "Volume_Spike_5min",
        "CVD_3_5min",
        # patterns
        "Stop_Hunt_5min",
        "Bull_Engulfing_5min",
        "Bear_Engulfing_5min",
        "FVG_Exists_5min",
    ],
    vwap_bounce=[
        # VWAP
        "VWAP_Session_5min",
        "close_vs_VWAP_D_5min",
        # momentum / chop filter
        "RSI_14_5min",
        "MACDh_12_26_9_5min",
        "Is_Choppy_14_5min",
        # candle shape
        "Candle_Body_vs_Range_5min",
        "Upper_Wick_5min",
        "Lower_Wick_5min",
        # volume
        "volume",
        "Rel_Vol_20_5min",
        "CVD_3_5min",
        # trend
        "EMA_21_5min",
        "EMA_21_Slope_21_5min",
        # time context
        "Time_of_Day_5min",
        "Session_5min",
    ],
    fair_value_gap=[
        # gap description
        "FVG_Exists_5min",
        "FVG_Size_5min",
        "FVG_Position_5min",  # position = above/below price
        # volatility / volume
        "ATR_14_5min",
        "Candle_Range_5min",
        "Volume_Spike_5min",
        # trend
        "EMA_21_5min",
        "EMA_50_15min",
        "EMA_Trend_Confirmed_5min",
        # momentum
        "RSI_14_5min",
        "MACDh_12_26_9_5min",
        # swing levels
        "Prev_Swing_Low_5min",
        "Prev_Swing_High_5min",
        "Dist_to_Closest_SR_5min",
        # time context
        "Session_5min",
    ],
)

# Separate the two targets from the candidate feature columns.
X = df.drop(columns=['clf_target_numba_6', 'reg_target_lookahead6'])
# Integer-encode object-dtype (string) columns so every estimator gets numeric input.
X = X.apply(lambda col: col.astype('category').cat.codes if col.dtype == 'object' else col)
y_clf = df['clf_target_numba_6']
y_reg = df['reg_target_lookahead6']

# Fail fast: training is expensive, so check the inputs before fitting anything
# rather than hitting a KeyError after several models have already been trained.
if X.empty:
    raise ValueError("No rows left to train on after dropping incomplete rows.")

missing_by_set = {
    fs_name: [col for col in features if col not in X.columns]
    for fs_name, features in feature_sets.items()
}
missing_by_set = {fs_name: cols for fs_name, cols in missing_by_set.items() if cols}
if missing_by_set:
    raise KeyError(f"feature_sets reference columns absent from the data: {missing_by_set}")

clf_results = evaluate_classifier(CLASSIFIERS, feature_sets, X, y_clf)
reg_results = evaluate_regressor(REGRESSORS, feature_sets, X, y_reg)

print(clf_results)
print(reg_results)