In [1]:
# ===============================================================
# 1) IMPORT LIBRARIES
# ===============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
import shap
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [2]:
# ===============================================================
# 1) LOAD DATASETS
# ===============================================================

from pathlib import Path
OUT = Path.home() / "ml_outputs"
print(OUT)

data = pd.read_csv(OUT / 'Cleaned_Features_for_ML.csv', index_col=0, parse_dates=True)

C:\Users\dax_a\ml_outputs


In [3]:
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3970 entries, 2010-07-29 to 2025-11-03
Data columns (total 45 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             3970 non-null   float64
 1   NASDAQ                             3970 non-null   float64
 2   DowJones                           3970 non-null   float64
 3   CAC40                              3970 non-null   float64
 4   DAX                                3970 non-null   float64
 5   FTSE100                            3970 non-null   float64
 6   Nikkei225                          3970 non-null   float64
 7   HangSeng                           3970 non-null   float64
 8   MSCIWorld                          3970 non-null   float64
 9   US10Y                              3970 non-null   float64
 10  US2Y                               3970 non-null   float64
 11  TLT                                397

In [7]:
# ===============================================================
# BINARY CLASSIFICATION – K-FOLD EVALUATION (FULL PIPELINE)
# ===============================================================

# ---------------------------------------------------------------
# 1) IMPORTS
# ---------------------------------------------------------------
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier
)

# ===============================================================
# 2) TRAIN / TEST SPLIT
# ===============================================================
features = data.drop(columns=['Apple', 'Return', 'Direction', 'target_index'])
target = data["Direction"] 

# Handle infinities and missing values early
features.replace([np.inf, -np.inf], np.nan, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, shuffle=False
)
print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")

# ===============================================================
# 3) FEATURE TYPES
# ===============================================================
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns

print(f"Numeric features: {len(numeric_features)} | Categorical features: {len(categorical_features)}")

# ===============================================================
# 4) PREPROCESSING PIPELINES
# ===============================================================
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# ===============================================================
# 5) DEFINE MODELS
# ===============================================================
models = [
    ('LR',  LogisticRegression(max_iter=5000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('CART', DecisionTreeClassifier(max_depth=6, random_state=42)),
    ('SVC', SVC(kernel='rbf', probability=True, random_state=42)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=5000, random_state=42)),
    # Boosting
    ('ABR', AdaBoostClassifier(n_estimators=300, random_state=42)),
    ('GBR', GradientBoostingClassifier(n_estimators=300, random_state=42)),
    # Bagging
    ('RFR', RandomForestClassifier(n_estimators=300, random_state=42)),
    ('ETR', ExtraTreesClassifier(n_estimators=300, random_state=42))
]

# ===============================================================
# 6) K-FOLD CONFIGURATION
# ===============================================================
num_folds = 5
seed = 42
scoring = "roc_auc"

names, kfold_results, train_results, test_results, test_auc_scores, test_f1_scores = [], [], [], [], [], []

# ===============================================================
# 7) LOOP THROUGH MODELS
# ===============================================================
for name, model in models:
    names.append(name)

    # ✅ Full pipeline (preprocessing + model)
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("clf", model)
    ])

    # ✅ K-Fold Cross-validation
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1)
    kfold_results.append(cv_results)

    # ✅ Fit full model on training set
    res = pipe.fit(X_train, y_train)

    # ✅ Predict and evaluate
    y_train_pred = res.predict(X_train)
    y_test_pred  = res.predict(X_test)
    y_test_proba = res.predict_proba(X_test)[:, 1] if hasattr(res.named_steps['clf'], "predict_proba") else np.zeros_like(y_test_pred)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc  = accuracy_score(y_test, y_test_pred)
    test_auc  = roc_auc_score(y_test, y_test_proba)
    test_f1   = f1_score(y_test, y_test_pred)

    train_results.append(train_acc)
    test_results.append(test_acc)
    test_auc_scores.append(test_auc)
    test_f1_scores.append(test_f1)

    msg = (f"{name}: CV_AUC={cv_results.mean():.4f} ({cv_results.std():.4f}) | "
           f"Train_ACC={train_acc:.4f} | Test_ACC={test_acc:.4f} | "
           f"Test_AUC={test_auc:.4f} | F1={test_f1:.4f}")
    print(msg)

# ===============================================================
# 8) SUMMARY RESULTS
# ===============================================================
results_df = pd.DataFrame({
    "Model": names,
    "CV_AUC_mean": [np.mean(r) for r in kfold_results],
    "CV_AUC_std": [np.std(r) for r in kfold_results],
    "Train_ACC": train_results,
    "Test_ACC": test_results,
    "Test_AUC": test_auc_scores,
    "Test_F1": test_f1_scores
}).sort_values("CV_AUC_mean", ascending=False)

display(results_df.style.background_gradient(cmap="Blues", subset=["CV_AUC_mean", "Test_ACC", "Test_AUC"]))


Train shape: (3176, 41) | Test shape: (794, 41)
Numeric features: 41 | Categorical features: 0
LR: CV_AUC=0.5295 (0.0141) | Train_ACC=0.5441 | Test_ACC=0.5214 | Test_AUC=0.5101 | F1=0.6050
KNN: CV_AUC=0.4838 (0.0200) | Train_ACC=0.6861 | Test_ACC=0.5277 | Test_AUC=0.5326 | F1=0.5634
CART: CV_AUC=0.5110 (0.0304) | Train_ACC=0.6266 | Test_ACC=0.4950 | Test_AUC=0.4873 | F1=0.4675
SVC: CV_AUC=0.4977 (0.0275) | Train_ACC=0.5173 | Test_ACC=0.5264 | Test_AUC=0.5019 | F1=0.6882
MLP: CV_AUC=0.4987 (0.0161) | Train_ACC=0.9663 | Test_ACC=0.5302 | Test_AUC=0.5309 | F1=0.5457
ABR: CV_AUC=0.5030 (0.0211) | Train_ACC=0.7157 | Test_ACC=0.4950 | Test_AUC=0.4885 | F1=0.5845
GBR: CV_AUC=0.5043 (0.0088) | Train_ACC=0.8980 | Test_ACC=0.5176 | Test_AUC=0.5024 | F1=0.6120
RFR: CV_AUC=0.4886 (0.0189) | Train_ACC=1.0000 | Test_ACC=0.4824 | Test_AUC=0.4717 | F1=0.5366
ETR: CV_AUC=0.4873 (0.0128) | Train_ACC=1.0000 | Test_ACC=0.5202 | Test_AUC=0.5063 | F1=0.5790


Unnamed: 0,Model,CV_AUC_mean,CV_AUC_std,Train_ACC,Test_ACC,Test_AUC,Test_F1
0,LR,0.529496,0.014114,0.544081,0.521411,0.51013,0.60499
2,CART,0.511029,0.030359,0.626574,0.494962,0.487332,0.467463
6,GBR,0.504269,0.00884,0.897985,0.517632,0.502427,0.611955
5,ABR,0.503033,0.021091,0.71568,0.494962,0.488528,0.584456
4,MLP,0.498672,0.016127,0.96631,0.530227,0.530879,0.545676
3,SVC,0.497742,0.027501,0.517317,0.526448,0.501873,0.688226
7,RFR,0.48858,0.018856,1.0,0.482368,0.471745,0.53664
8,ETR,0.48727,0.012823,1.0,0.520151,0.50632,0.579006
1,KNN,0.4838,0.02002,0.686083,0.527708,0.532609,0.563446
