In [55]:
# ===============================================================
# 1) IMPORT LIBRARIES
# ===============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
import shap
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides convergence and deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
# Matplotlib style sheet applied to all figures created after this point.
plt.style.use('seaborn-v0_8')

# Seed value for NumPy's global random generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [56]:
# ===============================================================
# 2) LOAD DATASETS
# ===============================================================

from pathlib import Path

# Folder holding the input CSV files (relative to the notebook's working directory).
OUT = Path("../data")
print(OUT)

# Name of the feature file, without extension. The same label is written into
# the exported results, so the file that is read is derived from it: switching
# datasets is a one-line change and the label can never disagree with the data.
DATASET_LABEL = "Cleaned_Features_for_ML"   # or "Cleaned_Features_for_ML_20ANOVA"

# The first CSV column becomes the index and is parsed as dates.
data = pd.read_csv(OUT / f"{DATASET_LABEL}.csv", index_col=0, parse_dates=True)

..\data


In [57]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [58]:
# ===============================================================
#  SAVE RESULTS HELPERS (JSON + CSV EXPORT)
# ===============================================================
from pathlib import Path
import json
import numpy as np
import pandas as pd

# ===============================================================
# Define output folder inside project: /data
# ===============================================================

# Folder that receives the exported JSON/CSV files, given relative to the
# current working directory.
OUT = Path("../data")   # <-- for notebooks located inside /notebooks/
# Create the folder (and any missing parents); no error if it already exists.
OUT.mkdir(parents=True, exist_ok=True)

def make_json_serializable(obj):
    """
    Recursively convert an object into JSON-serializable native Python types.

    Handled inputs:
      * dict                 -> dict; values converted recursively, NumPy scalar
                                keys converted to their native Python value
                                (``json`` rejects e.g. ``np.int64`` keys)
      * list / tuple / set   -> list with every element converted recursively
      * np.ndarray           -> (nested) list via ``tolist()``, then converted
                                again so object arrays holding NumPy scalars
                                are covered too
      * NumPy integer / floating / bool scalars -> int / float / bool

    Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): make_json_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple, set, frozenset)):
        return [make_json_serializable(item) for item in obj]
    if isinstance(obj, np.ndarray):
        # tolist() yields native numbers for numeric dtypes; the extra pass
        # only matters for object arrays.
        return make_json_serializable(obj.tolist())
    if isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj

# ===============================================================
# JSON + CSV Saving Utilities
# ===============================================================

def save_results_to_json(results_dict, filename="model_results.json"):
    """Write ``results_dict`` as indented JSON to ``OUT / filename``.

    The dictionary is first passed through ``make_json_serializable`` and the
    resolved destination path is printed once the file has been written.
    """
    destination = OUT / filename
    payload = make_json_serializable(results_dict)

    with destination.open("w") as handle:
        json.dump(payload, handle, indent=4)

    print(f"[INFO] Saved JSON to: {destination.resolve()}")


def save_results_to_csv(results_dict, filename="model_results.csv"):
    """Flatten per-model metrics into a table and write it to ``OUT / filename``.

    Parameters
    ----------
    results_dict : dict
        Maps a model name to a dict that may contain the keys
        ``"dataset_label"``, ``"confusion_matrix"``, ``"classification_report"``
        (dict form), ``"roc_auc"``, ``"f2_score"`` and
        ``"computation_time_sec"``. Missing keys produce empty cells.
    filename : str
        Name of the CSV file created inside ``OUT``.

    Notes
    -----
    * One row is written per model; the header row is written even when
      ``results_dict`` is empty.
    * Precision/recall/F1 are taken from the report entry of the positive
      class: ``"1"`` (or ``"1.0"`` for float labels) when present, otherwise
      the numerically largest label found in the report.
    * TN/FP/FN/TP are only filled for a 2x2 confusion matrix.
    """
    columns = [
        "Dataset", "Model", "Accuracy",
        "Precision (class 1)", "Recall (class 1)", "F1-score (class 1)",
        "F2-score", "ROC-AUC", "Computation Time (sec)",
        "TN", "FP", "FN", "TP",
    ]

    def _label_value(key):
        """Return the numeric value of a report key, or None for non-label keys."""
        try:
            return float(key)
        except (TypeError, ValueError):
            return None

    rows = []

    for model_name, res in results_dict.items():
        cm = np.array(res.get("confusion_matrix"))
        # `or {}` also covers an entry that stores None for the report.
        cr = res.get("classification_report") or {}

        # Positive class key: prefer the label 1, else the largest numeric label.
        pos_key = next((k for k in ("1", "1.0") if k in cr), None)
        if pos_key is None:
            label_keys = [k for k in cr if _label_value(k) is not None]
            if label_keys:
                pos_key = max(label_keys, key=_label_value)

        precision_1 = recall_1 = f1_1 = None
        pos_stats = cr.get(pos_key) if pos_key is not None else None
        if isinstance(pos_stats, dict):
            precision_1 = pos_stats.get("precision")
            recall_1 = pos_stats.get("recall")
            f1_1 = pos_stats.get("f1-score")

        # Confusion matrix cells (binary problems only).
        tn = fp = fn = tp = None
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()

        rows.append({
            "Dataset": res.get("dataset_label", "Unknown"),
            "Model": model_name,
            "Accuracy": cr.get("accuracy", None),
            "Precision (class 1)": precision_1,
            "Recall (class 1)": recall_1,
            "F1-score (class 1)": f1_1,
            "F2-score": res.get("f2_score", None),
            "ROC-AUC": res.get("roc_auc", None),
            "Computation Time (sec)": res.get("computation_time_sec", None),
            "TN": tn,
            "FP": fp,
            "FN": fn,
            "TP": tp,
        })

    # Explicit columns keep the header stable even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)

    filepath = OUT / filename
    df.to_csv(filepath, index=False)

    print(f"[INFO] Saved CSV to: {filepath.resolve()}")


# GLOBAL CONTAINER
# Maps a model name to its dict of stored metrics. Executing this line again
# rebinds the name to a new empty dict, discarding anything stored so far.
results = {}

In [59]:
def save_results(model_name, y_true, y_pred, y_prob=None, comp_time=None, dataset_label=None):
    """Compute evaluation metrics and store them in the global ``results`` dict.

    Parameters
    ----------
    model_name : str
        Key under which the entry is stored (an existing entry is replaced).
    y_true, y_pred : array-like
        True labels and predicted labels.
    y_prob : array-like, optional
        Scores/probabilities for ROC-AUC; ``"roc_auc"`` is only stored when given.
    comp_time : float, optional
        Stored unchanged under ``"computation_time_sec"``.
    dataset_label : str, optional
        Stored under ``"dataset_label"``, the key read by the CSV export.
        Defaults to the notebook-level ``DATASET_LABEL`` when that name is
        defined, else ``"Unknown"``.

    The entry is written to ``results`` only after every metric has been
    computed, so a failing metric does not leave a half-filled entry behind.
    """
    from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, fbeta_score

    if dataset_label is None:
        dataset_label = globals().get("DATASET_LABEL", "Unknown")

    entry = {
        "dataset_label": dataset_label,
        "confusion_matrix": confusion_matrix(y_true, y_pred),
        "classification_report": classification_report(y_true, y_pred, output_dict=True),
    }

    # ROC-AUC needs scores, not hard labels.
    if y_prob is not None:
        entry["roc_auc"] = roc_auc_score(y_true, y_prob)

    # F2 weights recall higher than precision.
    entry["f2_score"] = fbeta_score(y_true, y_pred, beta=2)
    entry["computation_time_sec"] = comp_time

    results[model_name] = entry

    print(f"✓ Saved model results for: {model_name}")

In [61]:
# ===============================================================
# 0) IMPORTS
# ===============================================================
import numpy as np
import pandas as pd

from statsmodels.tsa.arima.model import ARIMA

from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier
)

# Deep learning (LSTM)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# NumPy's seed alone does not control Keras weight initialisation or batch
# shuffling, so the LSTM below would differ from run to run. This helper seeds
# Python's `random`, NumPy and the TensorFlow backend in one call.
# (Some GPU kernels may still be non-deterministic.)
from tensorflow.keras.utils import set_random_seed
set_random_seed(RANDOM_STATE)

# ===============================================================
# 1) PREP DATA + CREATE 8-DAY FORWARD TREND TARGET
# ===============================================================
# Chronological order is needed by the rolling/shift steps below.
data = data.sort_index()

required_cols = ["Apple", "Return", "Direction"]
missing_cols = [col for col in required_cols if col not in data.columns]
if missing_cols:
    # Report the first missing column.
    raise ValueError(f"Missing column: {missing_cols[0]}")

# Share of "up" days over the NEXT 8 rows:
#   rolling(8).mean() at row t covers rows t-7..t; shifting by -8 moves the
#   value of row t+8 onto row t, i.e. the mean of Direction over t+1..t+8.
trailing_up_share = data["Direction"].rolling(window=8).mean()
data["Direction_Forward8"] = trailing_up_share.shift(-8)

# Binary trend label:
#   1   when the forward share is >= 0.6
#   0   when the forward share is <= 0.4
#   NaN otherwise (neutral, or forward window not available) -> dropped below
data["Trend_8D"] = np.select(
    [data["Direction_Forward8"] >= 0.6, data["Direction_Forward8"] <= 0.4],
    [1.0, 0.0],
    default=np.nan,
)

# Keep only rows that received a label, then store the label as integer.
data = data[data["Trend_8D"].notna()].copy()
data["Trend_8D"] = data["Trend_8D"].astype(int)

print("Data shape after building Trend_8D target:", data.shape)

# ===============================================================
# 2) ARIMA FORECAST FEATURE (ON RETURN)
# ===============================================================
print("Fitting ARIMA(2,1,2) on Return...")

# Estimate the ARIMA coefficients on the earliest 80% of the rows only.
# Fitting on the full series lets test-period returns shape the coefficients,
# which is look-ahead bias even though each forecast is one-step-ahead.
# Keep this share in sync with the chronological train/test split used for the
# classifiers (test_size=0.2, shuffle=False).
ARIMA_TRAIN_FRAC = 0.8
arima_train_end = int(ARIMA_TRAIN_FRAC * len(data))

arima_model = ARIMA(data["Return"].iloc[:arima_train_end], order=(2, 1, 2))
arima_fit = arima_model.fit()

# Run the filter over the whole series with the frozen training-period
# coefficients (apply() does not re-estimate unless refit=True).
arima_fit_full = arima_fit.apply(data["Return"])

# Positions 1..len(data): one-step-ahead predictions, the last one being the
# first out-of-sample step. Re-labelled with the data index, row i holds the
# prediction for row i+1.
arima_pred = arima_fit_full.predict(start=1, end=len(data), dynamic=False)
arima_pred.index = data.index

# Shift one step so row i holds the prediction made for row i itself
# (built from returns up to row i-1) — no look-ahead.
data["ARIMA_Return_Forecast"] = arima_pred.shift(1)
# NaN forecasts compare as False here and therefore map to 0.
data["ARIMA_Direction"] = (data["ARIMA_Return_Forecast"] > 0).astype(int)

# ===============================================================
# 3) LSTM FORECAST FEATURE (ON RETURN)
# ===============================================================
print("Preparing LSTM sequences...")

SEQ_LEN = 20
returns_arr = data["Return"].to_numpy().reshape(-1, 1)

# Sample for row k: the SEQ_LEN returns preceding k as input, return k as target.
target_rows = range(SEQ_LEN, len(returns_arr))
X_seq = np.array([returns_arr[k - SEQ_LEN:k] for k in target_rows])
y_seq = np.array([returns_arr[k] for k in target_rows])

print("LSTM input shape:", X_seq.shape)

# Chronological 80/20 split of the sequences (no shuffling).
split_idx = int(0.8 * len(X_seq))
X_train_seq, y_train_seq = X_seq[:split_idx], y_seq[:split_idx]
X_test_seq, y_test_seq = X_seq[split_idx:], y_seq[split_idx:]

# Two stacked LSTM layers followed by a single linear output unit.
model_lstm = Sequential()
model_lstm.add(LSTM(64, return_sequences=True, input_shape=(SEQ_LEN, 1)))
model_lstm.add(LSTM(32))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss="mse")

# Stop after 10 epochs without improvement and restore the best weights.
es = EarlyStopping(patience=10, restore_best_weights=True)

print("Training LSTM...")
model_lstm.fit(
    X_train_seq,
    y_train_seq,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[es],
    verbose=1,
)

print("Forecasting with LSTM...")
# Predict for every sequence (train and test part) and align with target rows.
lstm_pred = model_lstm.predict(X_seq).reshape(-1)
lstm_series = pd.Series(lstm_pred, index=data.index[SEQ_LEN:])

# Shift by 1 to avoid look-ahead
data["LSTM_Return_Forecast"] = lstm_series.shift(1)
# NaN forecasts compare as False here and therefore map to 0.
data["LSTM_Direction"] = data["LSTM_Return_Forecast"].gt(0).astype(int)

# ===============================================================
# 4) DROP NaNs (from ARIMA/LSTM shifts and sequence start)
# ===============================================================
data_ml = data.dropna(subset=["ARIMA_Return_Forecast", "LSTM_Return_Forecast"]).copy()
print("Data shape after ARIMA+LSTM dropna:", data_ml.shape)

# ===============================================================
# 5) FEATURES & TARGET FOR STACKING (Option C)
# ===============================================================
# Use ALL available predictors except:
#  - raw price/return
#  - daily Direction
#  - forward-label helpers
#  - the target itself (leaving Trend_8D in X hands the label to every base
#    model and makes all downstream scores meaningless)
TARGET_COL = "Trend_8D"

X_full = data_ml.drop(columns=[
    "Apple", "Return", "Direction", "Direction_Forward8", TARGET_COL
])
y_full = data_ml[TARGET_COL]

# Guard against the target slipping back into the predictors.
assert TARGET_COL not in X_full.columns, "target column leaked into the feature matrix"

# Chronological split: the last 20% of the rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=0.2, shuffle=False
)

# Keep ARIMA/LSTM forecasts separate for meta-layer (as column vectors)
arima_train = X_train["ARIMA_Return_Forecast"].values.reshape(-1, 1)
lstm_train  = X_train["LSTM_Return_Forecast"].values.reshape(-1, 1)
arima_test  = X_test["ARIMA_Return_Forecast"].values.reshape(-1, 1)
lstm_test   = X_test["LSTM_Return_Forecast"].values.reshape(-1, 1)

# Base feature set excludes ARIMA/LSTM forecasts
base_feature_cols = [
    c for c in X_train.columns
    if c not in ["ARIMA_Return_Forecast", "LSTM_Return_Forecast"]
]
X_train_base = X_train[base_feature_cols]
X_test_base  = X_test[base_feature_cols]

print("Base feature count (excluding ARIMA/LSTM):", len(base_feature_cols))

# ===============================================================
# 6) DEFINE BASE CLASSIFICATION MODELS
# ===============================================================
# Distance- and gradient-based learners (LR, KNN, SVC, MLP) are sensitive to
# feature scale, so each is wrapped in a pipeline that standardises the inputs.
# The scaler is re-fitted on whatever data the pipeline is fitted on, so
# validation/test rows never influence the scaling statistics.
# Tree-based models are scale-invariant and are left unwrapped.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

base_models = [
    ('LR',  make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=5000, random_state=RANDOM_STATE))),
    ('KNN', make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=5))),
    ('CART', DecisionTreeClassifier(max_depth=6, random_state=RANDOM_STATE)),
    ('SVC',  make_pipeline(
        StandardScaler(),
        SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE))),
    ('MLP',  make_pipeline(
        StandardScaler(),
        MLPClassifier(hidden_layer_sizes=(64, 32),
                      max_iter=5000, random_state=RANDOM_STATE))),
    ('ABR',  AdaBoostClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('GBR',  GradientBoostingClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('RFR',  RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE)),
    ('ETR',  ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE))
]

# ===============================================================
# 7) STACKING – BUILD OUT-OF-FOLD META-FEATURES (TRAIN)
# ===============================================================
print("\nBuilding out-of-fold meta-features for Trend_8D...\n")

# Expanding-window splits: each fold trains on the past and validates on the
# block that follows it.
tscv = TimeSeriesSplit(n_splits=5)
n_train = X_train_base.shape[0]
n_models = len(base_models)

# One column of out-of-fold (OOF) class-1 probabilities per base model.
# Rows that never appear in a validation fold stay NaN.
meta_train_base = np.full((n_train, n_models), np.nan)

for m_idx, (name, model) in enumerate(base_models):
    print(f"  {name} OOF predictions...")
    oof_pred = np.full(n_train, np.nan)

    for fold, (tr_idx, val_idx) in enumerate(tscv.split(X_train_base)):
        X_tr, X_val = X_train_base.iloc[tr_idx], X_train_base.iloc[val_idx]
        y_tr, y_val = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        oof_pred[val_idx] = model.predict_proba(X_val)[:, 1]

    meta_train_base[:, m_idx] = oof_pred

# The earliest rows only ever serve as training data, so they have no OOF
# prediction. They are excluded from meta-training: filling them with in-sample
# predictions would mix over-confident values into the meta-features that the
# test-time predictions cannot match.
oof_mask = ~np.isnan(meta_train_base).any(axis=1)

# Append ARIMA & LSTM forecasts to meta-features (OOF rows only)
meta_features_train = np.concatenate(
    [meta_train_base, arima_train, lstm_train],
    axis=1
)[oof_mask]
y_train_meta = y_train.loc[oof_mask]

print("meta_features_train shape:", meta_features_train.shape)

# ===============================================================
# 8) TRAIN META-CLASSIFIER (LOGISTIC REGRESSION)
# ===============================================================
meta_clf = LogisticRegression(max_iter=5000, random_state=RANDOM_STATE)
meta_clf.fit(meta_features_train, y_train_meta)

# ===============================================================
# 9) META-FEATURES FOR TEST
# ===============================================================
print("\nBuilding meta-features for TEST...\n")

# One column of class-1 probabilities per base model.
meta_test_base = np.zeros((X_test_base.shape[0], n_models))

for m_idx, (name, model) in enumerate(base_models):
    print(f"  {name} full-train → test prediction...")
    # Refit on the complete training set, then score the test rows.
    fitted = model.fit(X_train_base, y_train)
    test_proba = fitted.predict_proba(X_test_base)
    meta_test_base[:, m_idx] = test_proba[:, 1]

# Same column layout as the training meta-features: base models, ARIMA, LSTM.
meta_features_test = np.hstack((meta_test_base, arima_test, lstm_test))

print("meta_features_test shape:", meta_features_test.shape)

# ===============================================================
# 10) EVALUATE META-MODEL (RAW PREDICTION)
# ===============================================================

from time import time
from sklearn.metrics import fbeta_score

#print("\n========== META-MODEL PERFORMANCE on Trend_8D (RAW) ==========")

# Time only the meta-classifier's prediction step.
t0 = time()
y_proba_meta = meta_clf.predict_proba(meta_features_test)[:, 1]
# Hard labels at the 0.5 probability threshold.
y_pred_meta  = np.where(y_proba_meta >= 0.5, 1, 0)
elapsed_raw = time() - t0

# Threshold-based metrics use the hard labels; ROC-AUC uses the probabilities.
cm_raw     = confusion_matrix(y_test, y_pred_meta)
report_raw = classification_report(y_test, y_pred_meta, output_dict=True)

acc_raw = accuracy_score(y_test, y_pred_meta)
f1_raw  = f1_score(y_test, y_pred_meta)
f2_raw  = fbeta_score(y_test, y_pred_meta, beta=2)
auc_raw = roc_auc_score(y_test, y_proba_meta)


#print(f"Accuracy = {acc_raw:.4f}")
#print(f"F1       = {f1_raw:.4f}")
#print(f"F2       = {f2_raw:.4f}")
#print(f"ROC-AUC  = {auc_raw:.4f}")
#print("\nConfusion Matrix:\n", cm_raw)

# ---- Save results for dashboard ----
#results["Trend8D_MetaModel_Raw"] = {
#    "dataset_label": DATASET_LABEL,
#    "test_accuracy": float(acc_raw),
#    "roc_auc": float(auc_raw),
#    "f1_score": float(f1_raw),
#    "f2_score": float(f2_raw),
#    "computation_time_sec": float(elapsed_raw),
#    "confusion_matrix": cm_raw.tolist(),
#    "classification_report": report_raw
#}


# ===============================================================
# 11) SMOOTHED META-PREDICTION (8-DAY ROLLING)
# ===============================================================

print("\n========= META-MODEL PERFORMANCE (8-DAY SMOOTHED) =========")

# Mean of the previous 8 raw probabilities: the rolling window ends at the
# current row and the shift moves it back by one row.
proba_series = pd.Series(y_proba_meta, index=X_test.index)
proba_smooth = proba_series.rolling(window=8).mean().shift(1)

# Keep only the rows for which a smoothed value exists.
valid_idx = proba_smooth.index[proba_smooth.notna()]
proba_valid = proba_smooth.loc[valid_idx]
y_test_smooth = y_test.loc[valid_idx]
y_pred_smooth = proba_valid.ge(0.5).astype(int)

# The timer covers the metric computations below only.
t0 = time()
acc_s = accuracy_score(y_test_smooth, y_pred_smooth)
f1_s  = f1_score(y_test_smooth, y_pred_smooth)
f2_s  = fbeta_score(y_test_smooth, y_pred_smooth, beta=2)
auc_s = roc_auc_score(y_test_smooth, proba_valid)
elapsed_smooth = time() - t0

cm_s     = confusion_matrix(y_test_smooth, y_pred_smooth)
report_s = classification_report(y_test_smooth, y_pred_smooth, output_dict=True)

print(f"Accuracy = {acc_s:.4f}")
print(f"F1       = {f1_s:.4f}")
print(f"F2       = {f2_s:.4f}")
print(f"ROC-AUC  = {auc_s:.4f}")
print("\nConfusion Matrix (Smoothed):\n", cm_s)

# ---- Save results for dashboard ----
smoothed_entry = {
    "dataset_label": DATASET_LABEL,
    "test_accuracy": float(acc_s),
    "roc_auc": float(auc_s),
    "f1_score": float(f1_s),
    "f2_score": float(f2_s),
    "computation_time_sec": float(elapsed_smooth),
    "confusion_matrix": cm_s.tolist(),
    "classification_report": report_s,
}
results["Trend8D_MetaModel_Smoothed"] = smoothed_entry


Data shape after building Trend_8D target: (2969, 47)
Fitting ARIMA(2,1,2) on Return...
Preparing LSTM sequences...
LSTM input shape: (2949, 20, 1)
Training LSTM...
Epoch 1/50
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 34ms/step - loss: 1.0087 - val_loss: 1.4584
Epoch 2/50
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1.0084 - val_loss: 1.4468
Epoch 3/50
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 1.0065 - val_loss: 1.4469
Epoch 4/50
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 1.0059 - val_loss: 1.4429
Epoch 5/50
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 1.0061 - val_loss: 1.4424
Epoch 6/50
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 1.0059 - val_loss: 1.4432
Epoch 7/50
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 1.0063 - val_loss: 1.4427
E

In [62]:
# Export everything collected in `results`: once as JSON, once as a flat CSV table.
save_results_to_json(results, filename="model_results.json")
save_results_to_csv(results,  filename="model_results.csv")

[INFO] Saved JSON to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT\data\model_results.json
[INFO] Saved CSV to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT\data\model_results.csv
