In [20]:
# ===============================================================
# 1) IMPORT LIBRARIES
# ===============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
import shap
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [21]:
# ===============================================================
# 1) LOAD DATASETS
# ===============================================================

from pathlib import Path
OUT = Path("../data")
print(OUT)

data = pd.read_csv(OUT / 'Cleaned_Features_for_ML.csv', index_col=0, parse_dates=True)

DATASET_LABEL = "Cleaned_Features_for_ML"   # or "Cleaned_Features_for_ML_20ANOVA"

..\data


In [22]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [23]:
# ===============================================================
# 0) IMPORTS
# ===============================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, fbeta_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier
)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from time import time
from pathlib import Path
import json

# >>> NEW: ARIMA FEATURE <<<
from statsmodels.tsa.arima.model import ARIMA

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ===============================================================
#  SAVE RESULTS HELPERS (JSON + CSV EXPORT)
# ===============================================================
OUT = Path("../data")
OUT.mkdir(parents=True, exist_ok=True)

In [24]:
def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    return obj

def save_results_to_json(results_dict, filename="model_results.json"):
    results_serializable = make_json_serializable(results_dict)
    filepath = OUT / filename
    with open(filepath, "w") as f:
        json.dump(results_serializable, f, indent=4)
    print(f"[INFO] Saved JSON to: {filepath.resolve()}")

def save_results_to_csv(results_dict, filename="model_results.csv"):
    rows = []
    for model_name, res in results_dict.items():
        cm = np.array(res["confusion_matrix"])
        cr = res["classification_report"]
        roc_auc = res.get("roc_auc")
        f2 = res.get("f2_score")
        comp_time = res.get("computation_time_sec")

        acc = cr.get("accuracy")
        pos_key = "1" if "1" in cr else None

        if pos_key:
            precision_1 = cr[pos_key]["precision"]
            recall_1 = cr[pos_key]["recall"]
            f1_1 = cr[pos_key]["f1-score"]
        else:
            precision_1 = recall_1 = f1_1 = None

        tn, fp, fn, tp = cm.ravel()

        rows.append({
            "Dataset": res["dataset_label"],
            "Model": model_name,
            "Accuracy": acc,
            "Precision (class 1)": precision_1,
            "Recall (class 1)": recall_1,
            "F1-score (class 1)": f1_1,
            "F2-score": f2,
            "ROC-AUC": roc_auc,
            "Computation Time (sec)": comp_time,
            "TN": tn, "FP": fp, "FN": fn, "TP": tp,
        })

    df = pd.DataFrame(rows)
    filepath = OUT / filename
    df.to_csv(filepath, index=False)
    print(f"[INFO] Saved CSV to: {filepath.resolve()}")

# ===============================================================
# RESULTS STORAGE
# ===============================================================
results = {}

def save_results(model_name, y_true, y_pred, y_prob, comp_time, dataset_label):
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, output_dict=True)

    entry = {
        "confusion_matrix": cm,
        "classification_report": report,
        "roc_auc": roc_auc_score(y_true, y_prob),
        "f2_score": fbeta_score(y_true, y_pred, beta=2),
        "computation_time_sec": comp_time,
        "dataset_label": dataset_label
    }
    results[model_name] = entry
    print(f"[INFO] Saved model results for: {model_name}")

In [7]:
# ===============================================================
# 1) PREP DATA + TARGET (NO LEAKAGE)
# ===============================================================

# Sort chronologically
data = data.sort_index()

required_cols = ["Apple", "Return", "Direction"]
for c in required_cols:
    if c not in data.columns:
        raise ValueError(f"Missing column: {c}")

# Forward 8-day average → target creation
data["Direction_Forward8"] = data["Direction"].rolling(8).mean().shift(-8)

data["Trend_8D"] = np.where(
    data["Direction_Forward8"] >= 0.6, 1,
    np.where(data["Direction_Forward8"] <= 0.4, 0, np.nan)
)

data = data.dropna(subset=["Trend_8D"]).copy()
data["Trend_8D"] = data["Trend_8D"].astype(int)

print("Data shape after Trend_8D creation:", data.shape)

# ===============================================================
# 1bis) ARIMA FORECAST FEATURE (ON RETURN)
# ===============================================================
print("Fitting ARIMA(2,1,2) on Return to create forecast feature...")

# Work on a simple 0..N-1 index for ARIMA
arima_series = data["Return"].reset_index(drop=True)

# Fit ARIMA(2,1,2) once on the full series
arima_model = ARIMA(
    arima_series,
    order=(2, 1, 2),
    enforce_stationarity=False,
    enforce_invertibility=False
)

arima_fit = arima_model.fit()

# In-sample one-step-ahead predictions:
# prediction at t uses info up to t-1 (no future leak)
pred_obj = arima_fit.get_prediction(start=1, end=len(arima_series) - 1)
arima_pred_mean = pred_obj.predicted_mean  # length N-1

# Align back to the original data index
arima_full = pd.Series(np.nan, index=data.index)
arima_full.iloc[1:] = arima_pred_mean.values

# This column is the ARIMA one-step-ahead forecast of Return_t.
# It will later be shifted with the rest of X_full (X_full.shift(1)),
# so the model effectively uses the *previous day's* forecast.
data["ARIMA_Return_Forecast"] = arima_full

print("ARIMA NaNs (before global feature lagging):",
      data["ARIMA_Return_Forecast"].isna().sum())

# Optional: save ARIMA forecast for inspection / plots
arima_out = pd.DataFrame({
    "ARIMA_Return_Forecast": data["ARIMA_Return_Forecast"]
})
arima_out.to_csv(OUT / "arima_forecasts.csv", index=True)
print(f"[INFO] Saved ARIMA forecasts to: {(OUT / 'arima_forecasts.csv').resolve()}")

# ===============================================================
# 2) FEATURE MATRIX
# ===============================================================

# Columns known or suspected to use future info
future_cols = [
    "Volatility_20d", "MA20", "MA50",
    "Momentum", "RSI"
]

base_remove = ["Apple", "Return", "Direction", "Direction_Forward8"]

remove_cols = base_remove + future_cols

# ARIMA_Return_Forecast is *kept* as a predictive feature here
X_full = data.drop(columns=remove_cols, errors="ignore")
y_full = data["Trend_8D"]

# ENSURE NO FUTURE DATA: we lag ALL features by 1 day
X_full = X_full.shift(1)

# Drop initial lag NaNs (including ARIMA's first NaNs)
valid_idx = X_full.dropna().index
X_full, y_full = X_full.loc[valid_idx], y_full.loc[valid_idx]

print("Shape after feature cleaning (including ARIMA feature):", X_full.shape)

# Train/test split (chronological)
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, shuffle=False
)

DATASET_LABEL = "APPLE_TREND_8D_ARIMA_FEAT"

# ===============================================================
# 3) MODELS
# ===============================================================

base_models = [
    ('LR',  LogisticRegression(max_iter=5000, random_state=RANDOM_STATE)),

    # Scaling required for KNN
    ('KNN', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier(n_neighbors=5))
    ])),

    ('CART', DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)),

    # Scaling required for SVC
    ('SVC', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE))
    ])),

    # Scaling required for MLP
    ('MLP', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", MLPClassifier(hidden_layer_sizes=(64, 32),
                              max_iter=5000,
                              random_state=RANDOM_STATE))
    ])),

    ('ABR',  AdaBoostClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('GBR',  GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('RFR',  RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('ETR',  ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE)),
]

print("\n=========== BASE MODEL EVALUATION (WITH ARIMA FEATURE) ===========\n")

for name, model in base_models:
    print(f"\n--- Training {name} ---")
    t0 = time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    comp_time = time() - t0

    save_results(
        model_name=name,
        y_true=y_test,
        y_pred=y_pred,
        y_prob=y_prob,
        comp_time=comp_time,
        dataset_label=DATASET_LABEL
    )

# ===============================================================
# EXPORT RESULTS
# ===============================================================
save_results_to_json(results)
save_results_to_csv(results)

# ===============================================================
# 5) COMPARISON TABLE
# ===============================================================
rows = []
for m_name, res in results.items():
    cr = res["classification_report"]
    f1_pos = cr["1"]["f1-score"] if "1" in cr else None

    rows.append({
        "Model": m_name,
        "Accuracy": cr["accuracy"],
        "F1 (class 1)": f1_pos,
        "F2": res["f2_score"],
        "ROC-AUC": res["roc_auc"],
        "Time (sec)": res["computation_time_sec"]
    })

df_results = pd.DataFrame(rows).sort_values("ROC-AUC", ascending=False)

def style_results_table(df):
    """Return a styled version of df with blue gradients per metric."""
    metric_cols = ["Accuracy", "F1 (class 1)", "ROC-AUC", "F2", "Time (sec)"]
    
    styler = (
        df.style
        .background_gradient(cmap="Blues", subset=metric_cols)
        .format({
            "Accuracy": "{:.4f}",
            "F1 (class 1)": "{:.4f}",
            "ROC-AUC": "{:.4f}",
            "F2": "{:.4f}",
            "Time (sec)": "{:.4f}"
        })
        .set_properties(**{
            "text-align": "center",
            "border": "1px solid #ccc"
        })
        .set_table_styles([
            {"selector": "th", 
             "props": [
                 ("background-color", "#1f4e79"),
                 ("color", "white"),
                 ("text-align", "center"),
                 ("padding", "6px")
             ]},
            {"selector": "td", 
             "props": [
                 ("padding", "6px")
             ]},
        ])
    )
    return styler

print("\n=========== COMPARISON TABLE (WITH ARIMA FEATURE) ===========\n")
style_results_table(df_results)

Data shape after Trend_8D creation: (2970, 46)
Fitting ARIMA(2,1,2) on Return to create forecast feature...
ARIMA NaNs (before global feature lagging): 1
[INFO] Saved ARIMA forecasts to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT\data\arima_forecasts.csv
Shape after feature cleaning (including ARIMA feature): (2968, 38)



--- Training LR ---
[INFO] Saved model results for: LR

--- Training KNN ---
[INFO] Saved model results for: KNN

--- Training CART ---
[INFO] Saved model results for: CART

--- Training SVC ---
[INFO] Saved model results for: SVC

--- Training MLP ---
[INFO] Saved model results for: MLP

--- Training ABR ---
[INFO] Saved model results for: ABR

--- Training GBR ---
[INFO] Saved model results for: GBR

--- Training RFR ---
[INFO] Saved model results for: RFR

--- Training ETR ---
[INFO] Saved model results for: ETR
[INFO] Saved JSON to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT\data\model_results.json
[INFO] Saved CSV to: C:\Users\dax_a\D

Unnamed: 0,Model,Accuracy,F1 (class 1),F2,ROC-AUC,Time (sec)
8,ETR,0.8923,0.9075,0.9059,0.8958,0.8175
3,SVC,0.8889,0.9041,0.8994,0.8949,0.9366
0,LR,0.8939,0.9091,0.9083,0.8897,0.0157
5,ABR,0.8805,0.897,0.8931,0.8891,2.3286
7,RFR,0.8923,0.9075,0.9059,0.8853,2.3117
6,GBR,0.8822,0.8997,0.9028,0.8844,5.2027
4,MLP,0.6852,0.7823,0.8842,0.8101,3.2132
1,KNN,0.6212,0.5562,0.4554,0.7552,0.5681
2,CART,0.7761,0.8224,0.8603,0.639,0.0544


In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [13]:
# ===============================================================
# 0) IMPORTS
# ===============================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, fbeta_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier,
    VotingClassifier, BaggingClassifier, StackingClassifier
)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from statsmodels.tsa.arima.model import ARIMA
from time import time
from pathlib import Path
import json

import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

# Suppress only non-critical warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ===============================================================
#  SAVE RESULTS HELPERS (JSON + CSV EXPORT)
# ===============================================================
OUT = Path("../data")
OUT.mkdir(parents=True, exist_ok=True)

def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    return obj

def save_results_to_json(results_dict, filename="model_results.json"):
    results_serializable = make_json_serializable(results_dict)
    filepath = OUT / filename
    with open(filepath, "w") as f:
        json.dump(results_serializable, f, indent=4)
    print(f"[INFO] Saved JSON to: {filepath.resolve()}")

def save_results_to_csv(results_dict, filename="model_results.csv"):
    rows = []
    for model_name, res in results_dict.items():
        cm = np.array(res["confusion_matrix"])
        cr = res["classification_report"]
        roc_auc = res.get("roc_auc")
        f2 = res.get("f2_score")
        comp_time = res.get("computation_time_sec")

        acc = cr.get("accuracy")
        pos_key = "1" if "1" in cr else None

        if pos_key:
            precision_1 = cr[pos_key]["precision"]
            recall_1 = cr[pos_key]["recall"]
            f1_1 = cr[pos_key]["f1-score"]
        else:
            precision_1 = recall_1 = f1_1 = None

        tn, fp, fn, tp = cm.ravel()

        rows.append({
            "Dataset": res["dataset_label"],
            "Model": model_name,
            "Accuracy": acc,
            "Precision (class 1)": precision_1,
            "Recall (class 1)": recall_1,
            "F1-score (class 1)": f1_1,
            "F2-score": f2,
            "ROC-AUC": roc_auc,
            "Computation Time (sec)": comp_time,
            "TN": tn, "FP": fp, "FN": fn, "TP": tp,
        })

    df = pd.DataFrame(rows)
    filepath = OUT / filename
    df.to_csv(filepath, index=False)
    print(f"[INFO] Saved CSV to: {filepath.resolve()}")

# ===============================================================
# RESULTS STORAGE
# ===============================================================
results = {}

def save_results(model_name, y_true, y_pred, y_prob, comp_time, dataset_label):
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, output_dict=True)

    entry = {
        "confusion_matrix": cm,
        "classification_report": report,
        "roc_auc": roc_auc_score(y_true, y_prob),
        "f2_score": fbeta_score(y_true, y_pred, beta=2),
        "computation_time_sec": comp_time,
        "dataset_label": dataset_label
    }
    results[model_name] = entry
    print(f"[INFO] Saved model results for: {model_name}")

# ===============================================================
# 1) PREP DATA + TARGET
# ===============================================================
data = data.sort_index()

required_cols = ["Apple", "Return", "Direction"]
for c in required_cols:
    if c not in data.columns:
        raise ValueError(f"Missing column: {c}")

# Forward 8-day average (target helper, removed later)
data["Direction_Forward8"] = data["Direction"].rolling(8).mean().shift(-8)

data["Trend_8D"] = np.where(
    data["Direction_Forward8"] >= 0.6, 1,
    np.where(data["Direction_Forward8"] <= 0.4, 0, np.nan)
)

data = data.dropna(subset=["Trend_8D"]).copy()
data["Trend_8D"] = data["Trend_8D"].astype(int)

print("Data shape after Trend_8D creation:", data.shape)

# ===============================================================
# 1bis) ARIMA FORECAST FEATURE
# ===============================================================
print("Fitting ARIMA(2,1,2) on Return...")

ret_series = data["Return"].reset_index(drop=True)

arima_model = ARIMA(
    ret_series, order=(2,1,2),
    enforce_stationarity=False,
    enforce_invertibility=False
)
arima_fit = arima_model.fit()

pred_obj = arima_fit.get_prediction(start=1, end=len(ret_series)-1)
arima_pred = pred_obj.predicted_mean

arima_full = pd.Series(np.nan, index=data.index)
arima_full.iloc[1:] = arima_pred.values

data["ARIMA_Return_Forecast"] = arima_full
print("ARIMA NaNs:", data["ARIMA_Return_Forecast"].isna().sum())

pd.DataFrame({"ARIMA_Return_Forecast": data["ARIMA_Return_Forecast"]}) \
    .to_csv(OUT / "arima_forecasts.csv")

# ===============================================================
# 2) FEATURE MATRIX
# ===============================================================
# Remove raw price/returns but KEEP ARIMA feature
base_remove = ["Apple", "Return", "Direction", "Direction_Forward8"]
X_full = data.drop(columns=base_remove, errors="ignore")
y_full = data["Trend_8D"]

# Lag all features → prevent leakage
X_full = X_full.shift(1)

valid_idx = X_full.dropna().index
X_full = X_full.loc[valid_idx]
y_full = y_full.loc[valid_idx]

print("Shape after lagging features:", X_full.shape)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, shuffle=False
)

DATASET_LABEL = "APPLE_TREND_8D_ARIMA"

# ===============================================================
# 3) BASE MODELS
# ===============================================================
base_models = [
    ('LR', LogisticRegression(max_iter=5000, random_state=RANDOM_STATE)),

    ('KNN', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier(n_neighbors=5))
    ])),

    ('CART', DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)),

    ('SVC', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE))
    ])),

    ('MLP', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", MLPClassifier(hidden_layer_sizes=(64, 32),
                              max_iter=5000,
                              random_state=RANDOM_STATE))
    ])),

    ('ABR', AdaBoostClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('GBR', GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('RFR', RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('ETR', ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE)),
]

print("\n=========== BASE MODEL EVALUATION ===========\n")

for name, model in base_models:
    print(f"\n--- Training {name} ---")
    t0 = time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    comp_time = time() - t0
    save_results(name, y_test, y_pred, y_prob, comp_time, DATASET_LABEL)

# ===============================================================
# 4) ENSEMBLE & STACKING MODELS
# ===============================================================
print("\n=========== ENSEMBLES & STACKING ===========\n")

pipe_lr = Pipeline([("scaler", StandardScaler()),
                    ("clf", LogisticRegression(max_iter=5000, random_state=RANDOM_STATE))])

pipe_svc = Pipeline([("scaler", StandardScaler()),
                     ("clf", SVC(probability=True, random_state=RANDOM_STATE))])

pipe_mlp = Pipeline([("scaler", StandardScaler()),
                     ("clf", MLPClassifier(hidden_layer_sizes=(64, 32),
                                           max_iter=5000,
                                           random_state=RANDOM_STATE))])

# Soft Voting Ensemble
voting_clf = VotingClassifier(
    estimators=[
        ('lr', pipe_lr),
        ('svc', pipe_svc),
        ('rf', RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE))
    ],
    voting='soft'
)

# Bagging models
bagging_cart = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE),
    n_estimators=200,
    random_state=RANDOM_STATE
)

bagging_lr = BaggingClassifier(
    estimator=LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    n_estimators=200,
    random_state=RANDOM_STATE
)

# Stacking model
stacking_clf = StackingClassifier(
    estimators=[
        ('svc', pipe_svc),
        ('rf', RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
        ('gbr', GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE))
    ],
    final_estimator=LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    stack_method="predict_proba"
)

ensemble_models = [
    ("VOTING_SOFT", voting_clf),
    ("BAGGING_CART", bagging_cart),
    ("BAGGING_LR", bagging_lr),
    ("STACKING", stacking_clf)
]

for name, model in ensemble_models:
    print(f"\n--- Training {name} ---")
    t0 = time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    comp_time = time() - t0
    save_results(name, y_test, y_pred, y_prob, comp_time, DATASET_LABEL)

# ===============================================================
# 5) EXPORT RESULTS
# ===============================================================
save_results_to_json(results)
save_results_to_csv(results)

# ===============================================================
# 6) COMPARISON TABLE (Safe Formatting)
# ===============================================================

rows = []
for m_name, res in results.items():
    cr = res["classification_report"]
    f1_pos = None
    if "1" in cr:
        f1_pos = cr["1"].get("f1-score", None)

    rows.append({
        "Model": m_name,
        "Accuracy": cr.get("accuracy", np.nan),
        "F1 (class 1)": f1_pos,
        "F2": res.get("f2_score", np.nan),
        "ROC-AUC": res.get("roc_auc", np.nan),
        "Time (sec)": res.get("computation_time_sec", np.nan)
    })

df_results = pd.DataFrame(rows)

# Convert all non-numerical values to NaN
for col in ["Accuracy", "F1 (class 1)", "F2", "ROC-AUC", "Time (sec)"]:
    df_results[col] = pd.to_numeric(df_results[col], errors="coerce")

df_results = df_results.sort_values("ROC-AUC", ascending=False)

def style_results_table(df):
    metric_cols = ["Accuracy", "F1 (class 1)", "F2", "ROC-AUC", "Time (sec)"]

    styler = (
        df.style
        .background_gradient(cmap="Blues", subset=metric_cols)
        .format({col: "{:.4f}".format for col in metric_cols})
        .set_properties(**{"text-align": "center", "border": "1px solid #ccc"})
        .set_table_styles([
            {"selector": "th",
             "props": [("background-color", "#1f4e79"),
                       ("color", "white"),
                       ("text-align", "center"),
                       ("padding", "6px")]},
            {"selector": "td",
             "props": [("padding", "6px")]}
        ])
    )
    return styler

print("\n=========== MODEL COMPARISON TABLE ===========\n")
style_results_table(df_results)


Data shape after Trend_8D creation: (829, 47)
Fitting ARIMA(2,1,2) on Return...
ARIMA NaNs: 1
Shape after lagging features: (827, 43)



--- Training LR ---
[INFO] Saved model results for: LR

--- Training KNN ---
[INFO] Saved model results for: KNN

--- Training CART ---
[INFO] Saved model results for: CART

--- Training SVC ---
[INFO] Saved model results for: SVC

--- Training MLP ---
[INFO] Saved model results for: MLP

--- Training ABR ---
[INFO] Saved model results for: ABR

--- Training GBR ---
[INFO] Saved model results for: GBR

--- Training RFR ---
[INFO] Saved model results for: RFR

--- Training ETR ---
[INFO] Saved model results for: ETR



--- Training VOTING_SOFT ---
[INFO] Saved model results for: VOTING_SOFT

--- Training BAGGING_CART ---
[INFO] Saved model results for: BAGGING_CART

--- Training BAGGING_LR ---
[INFO] Saved model results for: BAGGING_LR

--- Training STACKING ---
[INFO] Saved model results for: STACKING
[INFO] Saved JSON to: C:\Users\dax_a\Documents\Git

Unnamed: 0,Model,Accuracy,F1 (class 1),F2,ROC-AUC,Time (sec)
6,GBR,0.6867,0.6977,0.6085,0.9254,1.8859
7,RFR,0.3614,0.0185,0.0117,0.9232,0.793
8,ETR,0.4036,0.1538,0.1027,0.9179,0.381
12,STACKING,0.3916,0.1217,0.0803,0.9132,14.1159
10,BAGGING_CART,0.3976,0.1525,0.1025,0.9086,1.7897
3,SVC,0.3554,0.0,0.0,0.8928,0.1124
9,VOTING_SOFT,0.4398,0.256,0.1794,0.8882,0.9035
0,LR,0.8976,0.9187,0.9057,0.8796,0.0111
11,BAGGING_LR,0.8976,0.9187,0.9057,0.8787,1.7156
5,ABR,0.3554,0.0,0.0,0.8482,1.062
