In [1]:
# ===============================================================
# 1) IMPORT LIBRARIES
# ===============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
import shap
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [2]:
# ===============================================================
# 1) LOAD DATASETS
# ===============================================================

from pathlib import Path
OUT = Path("../data")
print(OUT)

data = pd.read_csv(OUT / 'Cleaned_Features_for_ML.csv', index_col=0, parse_dates=True)

..\data


In [3]:
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [4]:
# ===============================================================
# 4) FEATURE SELECTION — SelectKBest (ANOVA F-test)
# ===============================================================

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------
# 1) Ensure 'Direction' is the target
# ---------------------------------------------------------------
target = data["Direction"]
features = data.drop(columns=["Direction"])

# Optional cleaning
features = features.replace([np.inf, -np.inf], np.nan).fillna(method="ffill").fillna(method="bfill")

# ---------------------------------------------------------------
# 2) Scaling (ANOVA expects standardized features)
# ---------------------------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# ---------------------------------------------------------------
# 3) Apply SelectKBest (ANOVA F-test) — Keep Top 20 variables
# ---------------------------------------------------------------
k = 20
selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_scaled, target)

# ---------------------------------------------------------------
# 4) Build ranking table
# ---------------------------------------------------------------
scores = selector.scores_
pvalues = selector.pvalues_
feature_names = features.columns

ranking_df = pd.DataFrame({
    "Feature": feature_names,
    "ANOVA_F_score": scores,
    "p_value": pvalues,
    "Selected": selector.get_support()
})

ranking_df = ranking_df.sort_values(by="ANOVA_F_score", ascending=False)

print("Top 20 Features Selected via ANOVA F-test:")
display(ranking_df.head(k))

# ---------------------------------------------------------------
# 5) Filter dataset to keep only the selected features
# ---------------------------------------------------------------
selected_features = feature_names[selector.get_support()]
X_selected = features[selected_features]

print("Selected feature set shape:", X_selected.shape)

Top 20 Features Selected via ANOVA F-test:


Unnamed: 0,Feature,ANOVA_F_score,p_value,Selected
7,HangSeng,5.803928,0.016034,True
14,LQD,5.460275,0.019502,True
9,US10Y,4.374051,0.036552,True
13,BND,3.515548,0.060867,True
39,MA20,2.48119,0.115293,True
12,IEF,2.43247,0.118923,True
11,TLT,1.743286,0.186797,True
33,Imports_GDP_Pct,1.692749,0.193312,True
6,Nikkei225,1.439436,0.2303,True
32,Exports_GDP_Pct,1.174765,0.278488,True


Selected feature set shape: (4085, 20)


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [6]:
# ===============================================================
# STEP — REDUCE DATASET TO TOP 20 FEATURES + REQUIRED COLUMNS
# ===============================================================

# Top 20 selected features

top20_features = [
    "HangSeng",
    "LQD",
    "US10Y",
    "BND",
    "MA20",
    "IEF",
    "TLT",
    "Imports_GDP_Pct",
    "Nikkei225",
    "Exports_GDP_Pct",
    "Recession_Probability",
    "Inflation_Annual_Pct",
    "Volatility_20d",
    "Fed_Funds_Rate",
    "Yield_Curve_Spread",
    "OECD_Unemp_rate_pct_USA",
    "Meta",
    "Unemployment_Rate",
    "MSCIWorld",
    "Google"
]

mandatory_columns = ["Apple", "Return", "Direction"]

for col in mandatory_columns:
    if col not in data.columns:
        print(f"Warning: column '{col}' was not found in data.")

keep_columns = top20_features + mandatory_columns
data = data[keep_columns].copy()

print("New dataset shape:", data.shape)
print("Columns kept:", data.columns.tolist())

New dataset shape: (4085, 23)
Columns kept: ['HangSeng', 'LQD', 'US10Y', 'BND', 'MA20', 'IEF', 'TLT', 'Imports_GDP_Pct', 'Nikkei225', 'Exports_GDP_Pct', 'Recession_Probability', 'Inflation_Annual_Pct', 'Volatility_20d', 'Fed_Funds_Rate', 'Yield_Curve_Spread', 'OECD_Unemp_rate_pct_USA', 'Meta', 'Unemployment_Rate', 'MSCIWorld', 'Google', 'Apple', 'Return', 'Direction']


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   HangSeng                 4085 non-null   float64
 1   LQD                      4085 non-null   float64
 2   US10Y                    4085 non-null   float64
 3   BND                      4085 non-null   float64
 4   MA20                     4085 non-null   float64
 5   IEF                      4085 non-null   float64
 6   TLT                      4085 non-null   float64
 7   Imports_GDP_Pct          4085 non-null   float64
 8   Nikkei225                4085 non-null   float64
 9   Exports_GDP_Pct          4085 non-null   float64
 10  Recession_Probability    4085 non-null   float64
 11  Inflation_Annual_Pct     4085 non-null   float64
 12  Volatility_20d           4085 non-null   float64
 13  Fed_Funds_Rate           4085 non-null   float64
 14  Yield_

In [8]:
data.to_csv(OUT / "Cleaned_Features_for_ML_20ANOVA.csv")

In [11]:
# ---------------------------------------------------------------
# IMPORTS
# ---------------------------------------------------------------
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier
)

from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, fbeta_score
)

from time import time


# ===============================================================
# FROM YOUR RESULTS MANAGER
# ===============================================================
from pathlib import Path
import json

OUT = Path("../data")
OUT.mkdir(parents=True, exist_ok=True)

results = {}


def make_json_serializable(obj):
    if isinstance(obj, dict):
        return {k: make_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [make_json_serializable(i) for i in obj]
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (np.integer, np.int32, np.int64)):
        return int(obj)
    if isinstance(obj, (np.floating, np.float32, np.float64)):
        return float(obj)
    if isinstance(obj, (np.bool_, bool)):
        return bool(obj)
    return obj


def save_results_to_json(results_dict, filename="model_results.json"):
    filepath = OUT / filename
    serializable = make_json_serializable(results_dict)
    with open(filepath, "w") as f:
        json.dump(serializable, f, indent=4)
    print(f"[INFO] Saved JSON to: {filepath.resolve()}")


def save_results_to_csv(results_dict, filename="model_results.csv"):
    rows = []

    for model_name, res in results_dict.items():
        cm = np.array(res.get("confusion_matrix", np.zeros((2, 2))))
        cr = res.get("classification_report", {})
        roc_auc = res.get("roc_auc")
        f2 = res.get("f2_score")
        comp_time = res.get("computation_time_sec")
        dataset_label = res.get("dataset_label", "Unknown")

        accuracy = cr.get("accuracy")

        # Detect positive class
        pos_key = "1" if "1" in cr else None
        if not pos_key:
            numeric_keys = [k for k in cr.keys() if k.isdigit()]
            pos_key = numeric_keys[-1] if numeric_keys else None

        precision = recall = f1_class1 = None
        if pos_key and isinstance(cr.get(pos_key), dict):
            precision = cr[pos_key].get("precision")
            recall = cr[pos_key].get("recall")
            f1_class1 = cr[pos_key].get("f1-score")

        # Confusion matrix split
        tn = fp = fn = tp = None
        if cm.shape == (2, 2):
            tn, fp, fn, tp = cm.ravel()

        rows.append({
            "Dataset": dataset_label,
            "Model": model_name,
            "Accuracy": accuracy,
            "Precision (class 1)": precision,
            "Recall (class 1)": recall,
            "F1-score (class 1)": f1_class1,
            "F2-score": f2,
            "ROC-AUC": roc_auc,
            "Computation Time (sec)": comp_time,
            "TN": tn, "FP": fp, "FN": fn, "TP": tp
        })

    df = pd.DataFrame(rows)
    filepath = OUT / filename
    df.to_csv(filepath, index=False)
    print(f"[INFO] Saved CSV to: {filepath.resolve()}")


def save_results(model_name, y_true, y_pred, y_prob=None, comp_time=None, dataset_label="Unknown"):
    results[model_name] = {}

    cm = confusion_matrix(y_true, y_pred)
    results[model_name]["confusion_matrix"] = cm

    report = classification_report(y_true, y_pred, output_dict=True)
    results[model_name]["classification_report"] = report

    if y_prob is not None:
        try:
            results[model_name]["roc_auc"] = roc_auc_score(y_true, y_prob)
        except:
            results[model_name]["roc_auc"] = None

    results[model_name]["f2_score"] = fbeta_score(y_true, y_pred, beta=2)
    results[model_name]["computation_time_sec"] = comp_time
    results[model_name]["dataset_label"] = dataset_label

    print(f"[INFO] Saved model results for: {model_name}")


# ===============================================================
# TRAIN / TEST SPLIT
# ===============================================================
features = data.drop(columns=['Apple', 'Return', 'Direction'])
target = data["Direction"]

features.replace([np.inf, -np.inf], np.nan, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, shuffle=False
)

print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")


# ===============================================================
# FEATURE TYPES
# ===============================================================
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns


# ===============================================================
# PREPROCESSING PIPELINES
# ===============================================================
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


# ===============================================================
# DEFINE MODELS
# ===============================================================
models = [
    ('LR',  LogisticRegression(max_iter=5000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('CART', DecisionTreeClassifier(max_depth=6, random_state=42)),
    ('SVC', SVC(kernel='rbf', probability=True, random_state=42)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=5000, random_state=42)),
    ('ABR', AdaBoostClassifier(n_estimators=300, random_state=42)),
    ('GBR', GradientBoostingClassifier(n_estimators=300, random_state=42)),
    ('RFR', RandomForestClassifier(n_estimators=300, random_state=42)),
    ('ETR', ExtraTreesClassifier(n_estimators=300, random_state=42))
]


# ===============================================================
# MODEL LOOP + SAVE RESULTS
# ===============================================================
rows_for_display = []
DATASET_LABEL = "APPLE_DIRECTION_8D"   # or dynamic

for name, model in models:

    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("clf", model)
    ])

    t0 = time()
    pipe.fit(X_train, y_train)
    comp_time = time() - t0

    y_pred = pipe.predict(X_test)

    if hasattr(pipe.named_steps["clf"], "predict_proba"):
        y_prob = pipe.predict_proba(X_test)[:, 1]
    else:
        y_prob = None

    # === Use the unified save_results() function ===
    save_results(
        model_name=name,
        y_true=y_test,
        y_pred=y_pred,
        y_prob=y_prob,
        comp_time=comp_time,
        dataset_label=DATASET_LABEL
    )

    # Row for view in notebook
    rows_for_display.append([
        name,
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_prob) if y_prob is not None else None,
        fbeta_score(y_test, y_pred, beta=2),
        comp_time
    ])


# ===============================================================
# EXPORT JSON + CSV FOR DASHBOARD
# ===============================================================
save_results_to_json(results)
save_results_to_csv(results)


# ===============================================================
# FINAL SUMMARY TABLE
# ===============================================================
df_results = pd.DataFrame(
    rows_for_display,
    columns=["Model", "Accuracy", "F1-score", "AUC", "F2-score", "Time (sec)"]
).sort_values("AUC", ascending=False).reset_index(drop=True)

display(df_results.style.background_gradient(cmap="Blues"))


Train shape: (3268, 20) | Test shape: (817, 20)
[INFO] Saved model results for: LR
[INFO] Saved model results for: KNN
[INFO] Saved model results for: CART
[INFO] Saved model results for: SVC
[INFO] Saved model results for: MLP
[INFO] Saved model results for: ABR
[INFO] Saved model results for: GBR
[INFO] Saved model results for: RFR
[INFO] Saved model results for: ETR
[INFO] Saved JSON to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT\data\model_results.json
[INFO] Saved CSV to: C:\Users\dax_a\Documents\GitHub\ESILV-MLproject-AU-BEJOT\data\model_results.csv


Unnamed: 0,Model,Accuracy,F1-score,AUC,F2-score,Time (sec)
0,LR,0.517748,0.677049,0.51924,0.82567,0.131266
1,SVC,0.522644,0.686495,0.518315,0.845545,2.36606
2,ETR,0.544676,0.645038,0.516339,0.725633,1.220021
3,ABR,0.523868,0.68502,0.506533,0.84062,1.963307
4,RFR,0.520196,0.624521,0.50623,0.701075,3.199183
5,GBR,0.523868,0.651121,0.500937,0.757513,4.369032
6,CART,0.485924,0.498807,0.485111,0.493157,0.040912
7,MLP,0.506732,0.578892,0.485012,0.618856,7.647316
8,KNN,0.487148,0.447958,0.477845,0.416667,0.010747
