In [2]:
# ===============================================================
# 1) IMPORT LIBRARIES
# ===============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
import shap
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
# ===============================================================
# 1) LOAD DATASETS
# ===============================================================

from pathlib import Path
OUT = Path("../data")
print(OUT)

data = pd.read_csv(OUT / 'Cleaned_Features_for_ML.csv', index_col=0, parse_dates=True)

DATASET_LABEL = "Cleaned_Features_for_ML"

..\data


In [4]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [5]:
# ===============================================================
# 0) IMPORTS
# ===============================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, fbeta_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier,
    VotingClassifier, BaggingClassifier, StackingClassifier
)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from time import time
from pathlib import Path
import json

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ===============================================================
#  SAVE RESULTS HELPERS (JSON + CSV EXPORT)
# ===============================================================
OUT = Path("../data")
OUT.mkdir(parents=True, exist_ok=True)

def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    return obj

def save_results_to_json(results_dict, filename="model_results.json"):
    results_serializable = make_json_serializable(results_dict)
    filepath = OUT / filename
    with open(filepath, "w") as f:
        json.dump(results_serializable, f, indent=4)
    print(f"[INFO] Saved JSON to: {filepath.resolve()}")

def save_results_to_csv(results_dict, filename="model_results.csv"):
    rows = []
    for model_name, res in results_dict.items():
        cm = np.array(res["confusion_matrix"])
        cr = res["classification_report"]
        roc_auc = res.get("roc_auc")
        f2 = res.get("f2_score")
        comp_time = res.get("computation_time_sec")

        acc = cr.get("accuracy")
        pos_key = "1" if "1" in cr else None

        if pos_key:
            precision_1 = cr[pos_key]["precision"]
            recall_1 = cr[pos_key]["recall"]
            f1_1 = cr[pos_key]["f1-score"]
        else:
            precision_1 = recall_1 = f1_1 = None

        tn, fp, fn, tp = cm.ravel()

        rows.append({
            "Dataset": res["dataset_label"],
            "Model": model_name,
            "Accuracy": acc,
            "Precision (class 1)": precision_1,
            "Recall (class 1)": recall_1,
            "F1-score (class 1)": f1_1,
            "F2-score": f2,
            "ROC-AUC": roc_auc,
            "Computation Time (sec)": comp_time,
            "TN": tn, "FP": fp, "FN": fn, "TP": tp,
        })

    df = pd.DataFrame(rows)
    filepath = OUT / filename
    df.to_csv(filepath, index=False)
    print(f"[INFO] Saved CSV to: {filepath.resolve()}")

# ===============================================================
# RESULTS STORAGE
# ===============================================================
results = {}

def save_results(model_name, y_true, y_pred, y_prob, comp_time, dataset_label):
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, output_dict=True)

    entry = {
        "confusion_matrix": cm,
        "classification_report": report,
        "roc_auc": roc_auc_score(y_true, y_prob),
        "f2_score": fbeta_score(y_true, y_pred, beta=2),
        "computation_time_sec": comp_time,
        "dataset_label": dataset_label
    }
    results[model_name] = entry
    print(f"[INFO] Saved model results for: {model_name}")

# ===============================================================
# 1) PREP DATA + TARGET
# ===============================================================

# Assumes `data` is already loaded as a DataFrame with Date index
data = data.sort_index()

required_cols = ["Apple", "Return", "Direction"]
for c in required_cols:
    if c not in data.columns:
        raise ValueError(f"Missing column: {c}")

# Forward 8-day average of Direction (future info → ONLY for target)
data["Direction_Forward8"] = data["Direction"].rolling(8).mean().shift(-8)

# Binary Trend_8D: 1 if >= 0.6, 0 if <= 0.4, NaN in-between
data["Trend_8D"] = np.where(
    data["Direction_Forward8"] >= 0.6, 1,
    np.where(data["Direction_Forward8"] <= 0.4, 0, np.nan)
)

# Keep only rows with defined Trend_8D
data = data.dropna(subset=["Trend_8D"]).copy()
data["Trend_8D"] = data["Trend_8D"].astype(int)

print("Data shape after Trend_8D creation:", data.shape)

# ===============================================================
# 2) FEATURE MATRIX
# ===============================================================

# Remove also raw price/return/Direction and forward target helper
base_remove = ["Apple", "Return", "Direction", "Direction_Forward8"]
remove_cols = base_remove

# Build feature matrix
X_full = data.drop(columns=remove_cols, errors="ignore")
y_full = data["Trend_8D"]

# ENSURE ONLY PAST INFORMATION (features at t-1, target at t)
X_full = X_full.shift(1)

# Drop rows with NaNs induced by lag
valid_idx = X_full.dropna().index
X_full = X_full.loc[valid_idx]
y_full = y_full.loc[valid_idx]

print("Shape after feature cleaning (lagged features):", X_full.shape)

# Chronological train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, shuffle=False
)

DATASET_LABEL = "APPLE_TREND_8D"

# ===============================================================
# 3) BASE MODELS
# ===============================================================

base_models = [
    ('LR',  LogisticRegression(max_iter=5000, random_state=RANDOM_STATE)),

    ('KNN', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier(n_neighbors=5))
    ])),

    ('CART', DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)),

    ('SVC', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE))
    ])),

    ('MLP', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", MLPClassifier(hidden_layer_sizes=(64, 32),
                              max_iter=5000,
                              random_state=RANDOM_STATE))
    ])),

    ('ABR',  AdaBoostClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('GBR',  GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('RFR',  RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('ETR',  ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE))
]

print("\n=========== BASE MODEL EVALUATION ===========\n")

for name, model in base_models:
    print(f"\n--- Training {name} ---")
    t0 = time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    comp_time = time() - t0

    save_results(
        model_name=name,
        y_true=y_test,
        y_pred=y_pred,
        y_prob=y_prob,
        comp_time=comp_time,
        dataset_label=DATASET_LABEL
    )

# ===============================================================
# 4) ENSEMBLE & STACKING MODELS
# ===============================================================

print("\n=========== ENSEMBLE & STACKING MODELS ===========\n")

# Helper pipelines for ensembles
pipe_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=5000, random_state=RANDOM_STATE))
])

pipe_svc = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(probability=True, random_state=RANDOM_STATE))
])

pipe_knn = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier(n_neighbors=5))
])

pipe_mlp = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(hidden_layer_sizes=(64, 32),
                          max_iter=5000,
                          random_state=RANDOM_STATE))
])

# 1) Soft Voting Ensemble
voting_clf = VotingClassifier(
    estimators=[
        ('lr', pipe_lr),
        ('svc', pipe_svc),
        ('rf', RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE))
    ],
    voting='soft'
)

# 2) Bagging on CART and Logistic Regression
bagging_cart = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE),
    n_estimators=200,
    random_state=RANDOM_STATE
)

bagging_lr = BaggingClassifier(
    estimator=LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    n_estimators=200,
    random_state=RANDOM_STATE
)

# 3) Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('svc', pipe_svc),
        ('rf', RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
        ('gbr', GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE))
    ],
    final_estimator=LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    stack_method='predict_proba',
    passthrough=False
)

ensemble_models = [
    ("VOTING_SOFT", voting_clf),
    ("BAGGING_CART", bagging_cart),
    ("BAGGING_LR", bagging_lr),
    ("STACKING", stacking_clf)
]

for name, model in ensemble_models:
    print(f"\n--- Training {name} ---")
    t0 = time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    comp_time = time() - t0

    save_results(
        model_name=name,
        y_true=y_test,
        y_pred=y_pred,
        y_prob=y_prob,
        comp_time=comp_time,
        dataset_label=DATASET_LABEL
    )

# ===============================================================
# 5) EXPORT RESULTS
# ===============================================================
save_results_to_json(results)
save_results_to_csv(results)

# ===============================================================
# 6) COMPARISON TABLE
# ===============================================================
rows = []
for m_name, res in results.items():
    cr = res["classification_report"]
    f1_pos = cr["1"]["f1-score"] if "1" in cr else None

    rows.append({
        "Model": m_name,
        "Accuracy": cr["accuracy"],
        "F1 (class 1)": f1_pos,
        "F2": res["f2_score"],
        "ROC-AUC": res["roc_auc"],
        "Time (sec)": res["computation_time_sec"]
    })

df_results = pd.DataFrame(rows).sort_values("ROC-AUC", ascending=False)

def style_results_table(df):
    """Return a styled version of df with blue gradients per metric."""
    # Columns to format with heatmap
    metric_cols = ["Accuracy", "F1 (class 1)", "ROC-AUC", "F2", "Time (sec)"]
    
    # Build a Styler
    styler = (
        df.style
        .background_gradient(cmap="Blues", subset=metric_cols)  # blue heatmap
        .format({
            "Accuracy": "{:.4f}",
            "F1 (class 1)": "{:.4f}",
            "ROC-AUC": "{:.4f}",
            "F2": "{:.4f}",
            "Time (sec)": "{:.4f}"
        })
        .set_properties(**{
            "text-align": "center",
            "border": "1px solid #ccc"
        })
        .set_table_styles([
            {"selector": "th", 
             "props": [
                 ("background-color", "#1f4e79"),
                 ("color", "white"),
                 ("text-align", "center"),
                 ("padding", "6px")
             ]},
            {"selector": "td", 
             "props": [
                 ("padding", "6px")
             ]},
        ])
    )
    return styler


print("\n=========== MODEL COMPARISON TABLE ===========\n")
style_results_table(df_results)



Data shape after Trend_8D creation: (2970, 46)
Shape after feature cleaning (lagged features): (2969, 42)



--- Training LR ---
[INFO] Saved model results for: LR

--- Training KNN ---
[INFO] Saved model results for: KNN

--- Training CART ---
[INFO] Saved model results for: CART

--- Training SVC ---
[INFO] Saved model results for: SVC

--- Training MLP ---
[INFO] Saved model results for: MLP

--- Training ABR ---
[INFO] Saved model results for: ABR

--- Training GBR ---
[INFO] Saved model results for: GBR

--- Training RFR ---
[INFO] Saved model results for: RFR

--- Training ETR ---
[INFO] Saved model results for: ETR



--- Training VOTING_SOFT ---
[INFO] Saved model results for: VOTING_SOFT

--- Training BAGGING_CART ---
[INFO] Saved model results for: BAGGING_CART

--- Training BAGGING_LR ---
[INFO] Saved model results for: BAGGING_LR

--- Training STACKING ---
[INFO] Saved model results for: STACKING
[INFO] Saved JSON to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT

Unnamed: 0,Model,Accuracy,F1 (class 1),F2,ROC-AUC,Time (sec)
10,BAGGING_CART,0.8939,0.9091,0.9083,0.9015,5.6472
8,ETR,0.8939,0.9091,0.9083,0.901,0.8292
7,RFR,0.8906,0.9051,0.898,0.9004,2.4917
9,VOTING_SOFT,0.8939,0.9091,0.9083,0.8978,3.5371
3,SVC,0.8788,0.8938,0.8813,0.8934,1.0036
0,LR,0.8939,0.9091,0.9083,0.8911,0.0599
11,BAGGING_LR,0.8939,0.9091,0.9083,0.8887,3.1706
12,STACKING,0.8737,0.8915,0.8891,0.887,47.2689
5,ABR,0.8687,0.8889,0.895,0.8845,2.6951
6,GBR,0.867,0.8873,0.8927,0.8823,6.1772


In [6]:
# ===============================================================
# 1) LOAD DATASETS
# ===============================================================

from pathlib import Path
OUT = Path("../data")
print(OUT)

data = pd.read_csv(OUT / 'Cleaned_Features_for_ML_20ANOVA.csv', index_col=0, parse_dates=True)

DATASET_LABEL = "Cleaned_Features_for_ML_20ANOVA"

..\data


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   HangSeng                 4085 non-null   float64
 1   LQD                      4085 non-null   float64
 2   US10Y                    4085 non-null   float64
 3   BND                      4085 non-null   float64
 4   MA20                     4085 non-null   float64
 5   IEF                      4085 non-null   float64
 6   TLT                      4085 non-null   float64
 7   Imports_GDP_Pct          4085 non-null   float64
 8   Nikkei225                4085 non-null   float64
 9   Exports_GDP_Pct          4085 non-null   float64
 10  Recession_Probability    4085 non-null   float64
 11  Inflation_Annual_Pct     4085 non-null   float64
 12  Volatility_20d           4085 non-null   float64
 13  Fed_Funds_Rate           4085 non-null   float64
 14  Yield_

In [8]:
# ===============================================================
# 0) IMPORTS
# ===============================================================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, fbeta_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier,
    VotingClassifier, BaggingClassifier, StackingClassifier
)

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from time import time
from pathlib import Path
import json

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ===============================================================
#  SAVE RESULTS HELPERS (JSON + CSV EXPORT)
# ===============================================================
OUT = Path("../data")
OUT.mkdir(parents=True, exist_ok=True)

def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (np.integer, np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    elif isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    return obj

def save_results_to_json(results_dict, filename="model_results.json"):
    results_serializable = make_json_serializable(results_dict)
    filepath = OUT / filename
    with open(filepath, "w") as f:
        json.dump(results_serializable, f, indent=4)
    print(f"[INFO] Saved JSON to: {filepath.resolve()}")

def save_results_to_csv(results_dict, filename="model_results.csv"):
    rows = []
    for model_name, res in results_dict.items():
        cm = np.array(res["confusion_matrix"])
        cr = res["classification_report"]
        roc_auc = res.get("roc_auc")
        f2 = res.get("f2_score")
        comp_time = res.get("computation_time_sec")

        acc = cr.get("accuracy")
        pos_key = "1" if "1" in cr else None

        if pos_key:
            precision_1 = cr[pos_key]["precision"]
            recall_1 = cr[pos_key]["recall"]
            f1_1 = cr[pos_key]["f1-score"]
        else:
            precision_1 = recall_1 = f1_1 = None

        tn, fp, fn, tp = cm.ravel()

        rows.append({
            "Dataset": res["dataset_label"],
            "Model": model_name,
            "Accuracy": acc,
            "Precision (class 1)": precision_1,
            "Recall (class 1)": recall_1,
            "F1-score (class 1)": f1_1,
            "F2-score": f2,
            "ROC-AUC": roc_auc,
            "Computation Time (sec)": comp_time,
            "TN": tn, "FP": fp, "FN": fn, "TP": tp,
        })

    df = pd.DataFrame(rows)
    filepath = OUT / filename
    df.to_csv(filepath, index=False)
    print(f"[INFO] Saved CSV to: {filepath.resolve()}")

# ===============================================================
# RESULTS STORAGE
# ===============================================================
results = {}

def save_results(model_name, y_true, y_pred, y_prob, comp_time, dataset_label):
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, output_dict=True)

    entry = {
        "confusion_matrix": cm,
        "classification_report": report,
        "roc_auc": roc_auc_score(y_true, y_prob),
        "f2_score": fbeta_score(y_true, y_pred, beta=2),
        "computation_time_sec": comp_time,
        "dataset_label": dataset_label
    }
    results[model_name] = entry
    print(f"[INFO] Saved model results for: {model_name}")

# ===============================================================
# 1) PREP DATA + TARGET
# ===============================================================

# Assumes `data` is already loaded as a DataFrame with Date index
data = data.sort_index()

required_cols = ["Apple", "Return", "Direction"]
for c in required_cols:
    if c not in data.columns:
        raise ValueError(f"Missing column: {c}")

# Forward 8-day average of Direction (future info → ONLY for target)
data["Direction_Forward8"] = data["Direction"].rolling(8).mean().shift(-8)

# Binary Trend_8D: 1 if >= 0.6, 0 if <= 0.4, NaN in-between
data["Trend_8D"] = np.where(
    data["Direction_Forward8"] >= 0.6, 1,
    np.where(data["Direction_Forward8"] <= 0.4, 0, np.nan)
)

# Keep only rows with defined Trend_8D
data = data.dropna(subset=["Trend_8D"]).copy()
data["Trend_8D"] = data["Trend_8D"].astype(int)

print("Data shape after Trend_8D creation:", data.shape)

# ===============================================================
# 2) FEATURE MATRIX
# ===============================================================

# Remove also raw price/return/Direction and forward target helper
base_remove = ["Apple", "Return", "Direction", "Direction_Forward8"]
remove_cols = base_remove

# Build feature matrix
X_full = data.drop(columns=remove_cols, errors="ignore")
y_full = data["Trend_8D"]

# ENSURE ONLY PAST INFORMATION (features at t-1, target at t)
X_full = X_full.shift(1)

# Drop rows with NaNs induced by lag
valid_idx = X_full.dropna().index
X_full = X_full.loc[valid_idx]
y_full = y_full.loc[valid_idx]

print("Shape after feature cleaning (lagged features):", X_full.shape)

# Chronological train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, shuffle=False
)

DATASET_LABEL = "APPLE_TREND_8D"

# ===============================================================
# 3) BASE MODELS
# ===============================================================

base_models = [
    ('LR',  LogisticRegression(max_iter=5000, random_state=RANDOM_STATE)),

    ('KNN', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier(n_neighbors=5))
    ])),

    ('CART', DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)),

    ('SVC', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE))
    ])),

    ('MLP', Pipeline([
        ("scaler", StandardScaler()),
        ("clf", MLPClassifier(hidden_layer_sizes=(64, 32),
                              max_iter=5000,
                              random_state=RANDOM_STATE))
    ])),

    ('ABR',  AdaBoostClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('GBR',  GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('RFR',  RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('ETR',  ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE))
]

print("\n=========== BASE MODEL EVALUATION ===========\n")

for name, model in base_models:
    print(f"\n--- Training {name} ---")
    t0 = time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    comp_time = time() - t0

    save_results(
        model_name=name,
        y_true=y_test,
        y_pred=y_pred,
        y_prob=y_prob,
        comp_time=comp_time,
        dataset_label=DATASET_LABEL
    )

# ===============================================================
# 4) ENSEMBLE & STACKING MODELS
# ===============================================================

print("\n=========== ENSEMBLE & STACKING MODELS ===========\n")

# Helper pipelines for ensembles
pipe_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=5000, random_state=RANDOM_STATE))
])

pipe_svc = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", SVC(probability=True, random_state=RANDOM_STATE))
])

pipe_knn = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", KNeighborsClassifier(n_neighbors=5))
])

pipe_mlp = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(hidden_layer_sizes=(64, 32),
                          max_iter=5000,
                          random_state=RANDOM_STATE))
])

# 1) Soft Voting Ensemble
voting_clf = VotingClassifier(
    estimators=[
        ('lr', pipe_lr),
        ('svc', pipe_svc),
        ('rf', RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE))
    ],
    voting='soft'
)

# 2) Bagging on CART and Logistic Regression
bagging_cart = BaggingClassifier(
    estimator=DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE),
    n_estimators=200,
    random_state=RANDOM_STATE
)

bagging_lr = BaggingClassifier(
    estimator=LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    n_estimators=200,
    random_state=RANDOM_STATE
)

# 3) Stacking Classifier
stacking_clf = StackingClassifier(
    estimators=[
        ('svc', pipe_svc),
        ('rf', RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
        ('gbr', GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE))
    ],
    final_estimator=LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    stack_method='predict_proba',
    passthrough=False
)

ensemble_models = [
    ("VOTING_SOFT", voting_clf),
    ("BAGGING_CART", bagging_cart),
    ("BAGGING_LR", bagging_lr),
    ("STACKING", stacking_clf)
]

for name, model in ensemble_models:
    print(f"\n--- Training {name} ---")
    t0 = time()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    comp_time = time() - t0

    save_results(
        model_name=name,
        y_true=y_test,
        y_pred=y_pred,
        y_prob=y_prob,
        comp_time=comp_time,
        dataset_label=DATASET_LABEL
    )

# ===============================================================
# 5) EXPORT RESULTS
# ===============================================================
save_results_to_json(results)
save_results_to_csv(results)

# ===============================================================
# 6) COMPARISON TABLE
# ===============================================================
rows = []
for m_name, res in results.items():
    cr = res["classification_report"]
    f1_pos = cr["1"]["f1-score"] if "1" in cr else None

    rows.append({
        "Model": m_name,
        "Accuracy": cr["accuracy"],
        "F1 (class 1)": f1_pos,
        "F2": res["f2_score"],
        "ROC-AUC": res["roc_auc"],
        "Time (sec)": res["computation_time_sec"]
    })

df_results = pd.DataFrame(rows).sort_values("ROC-AUC", ascending=False)

def style_results_table(df):
    """Return a styled version of df with blue gradients per metric."""
    # Columns to format with heatmap
    metric_cols = ["Accuracy", "F1 (class 1)", "ROC-AUC", "F2", "Time (sec)"]
    
    # Build a Styler
    styler = (
        df.style
        .background_gradient(cmap="Blues", subset=metric_cols)  # blue heatmap
        .format({
            "Accuracy": "{:.4f}",
            "F1 (class 1)": "{:.4f}",
            "ROC-AUC": "{:.4f}",
            "F2": "{:.4f}",
            "Time (sec)": "{:.4f}"
        })
        .set_properties(**{
            "text-align": "center",
            "border": "1px solid #ccc"
        })
        .set_table_styles([
            {"selector": "th", 
             "props": [
                 ("background-color", "#1f4e79"),
                 ("color", "white"),
                 ("text-align", "center"),
                 ("padding", "6px")
             ]},
            {"selector": "td", 
             "props": [
                 ("padding", "6px")
             ]},
        ])
    )
    return styler


print("\n=========== MODEL COMPARISON TABLE ===========\n")
style_results_table(df_results)


Data shape after Trend_8D creation: (2970, 25)
Shape after feature cleaning (lagged features): (2969, 21)



--- Training LR ---
[INFO] Saved model results for: LR

--- Training KNN ---
[INFO] Saved model results for: KNN

--- Training CART ---
[INFO] Saved model results for: CART

--- Training SVC ---
[INFO] Saved model results for: SVC

--- Training MLP ---
[INFO] Saved model results for: MLP

--- Training ABR ---
[INFO] Saved model results for: ABR

--- Training GBR ---
[INFO] Saved model results for: GBR

--- Training RFR ---
[INFO] Saved model results for: RFR

--- Training ETR ---
[INFO] Saved model results for: ETR



--- Training VOTING_SOFT ---
[INFO] Saved model results for: VOTING_SOFT

--- Training BAGGING_CART ---
[INFO] Saved model results for: BAGGING_CART

--- Training BAGGING_LR ---
[INFO] Saved model results for: BAGGING_LR

--- Training STACKING ---
[INFO] Saved model results for: STACKING
[INFO] Saved JSON to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT

Unnamed: 0,Model,Accuracy,F1 (class 1),F2,ROC-AUC,Time (sec)
8,ETR,0.8939,0.9091,0.9083,0.8974,0.7099
3,SVC,0.8906,0.9057,0.9017,0.8962,0.8686
9,VOTING_SOFT,0.8939,0.9091,0.9083,0.8961,2.6593
10,BAGGING_CART,0.8939,0.9091,0.9083,0.8921,2.9359
7,RFR,0.8939,0.9091,0.9083,0.8909,1.7983
0,LR,0.8939,0.9091,0.9083,0.8894,0.0112
12,STACKING,0.8923,0.9075,0.9059,0.8894,28.04
11,BAGGING_LR,0.8939,0.9091,0.9083,0.8885,1.7976
1,KNN,0.862,0.8808,0.8762,0.8877,0.3347
6,GBR,0.8838,0.9013,0.9052,0.8744,3.2184
