In [2]:
# ===============================================================
# 1) IMPORT LIBRARIES
# ===============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
import shap
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
# ===============================================================
# 1) LOAD DATASETS
# ===============================================================

from pathlib import Path
OUT = Path("../data")
print(OUT)

data = pd.read_csv(OUT / 'Cleaned_Features_for_ML.csv', index_col=0, parse_dates=True)

..\data


In [4]:
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [5]:
# ===============================================================
# 4) FEATURE SELECTION — SelectKBest (ANOVA F-test)
# ===============================================================

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------
# 1) Ensure 'Direction' is the target
# ---------------------------------------------------------------
target = data["Direction"]
features = data.drop(columns=["Direction"])

# Optional cleaning
features = features.replace([np.inf, -np.inf], np.nan).fillna(method="ffill").fillna(method="bfill")

# ---------------------------------------------------------------
# 2) Scaling (ANOVA expects standardized features)
# ---------------------------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# ---------------------------------------------------------------
# 3) Apply SelectKBest (ANOVA F-test) — Keep Top 20 variables
# ---------------------------------------------------------------
k = 20
selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_scaled, target)

# ---------------------------------------------------------------
# 4) Build ranking table
# ---------------------------------------------------------------
scores = selector.scores_
pvalues = selector.pvalues_
feature_names = features.columns

ranking_df = pd.DataFrame({
    "Feature": feature_names,
    "ANOVA_F_score": scores,
    "p_value": pvalues,
    "Selected": selector.get_support()
})

ranking_df = ranking_df.sort_values(by="ANOVA_F_score", ascending=False)

print("Top 20 Features Selected via ANOVA F-test:")
display(ranking_df.head(k))

# ---------------------------------------------------------------
# 5) Filter dataset to keep only the selected features
# ---------------------------------------------------------------
selected_features = feature_names[selector.get_support()]
X_selected = features[selected_features]

print("Selected feature set shape:", X_selected.shape)

# If you want to continue the ML pipeline:
# X_train, X_test, y_train, y_test = train_test_split(X_selected, target, test_size=0.2, shuffle=False)


Top 20 Features Selected via ANOVA F-test:


Unnamed: 0,Feature,ANOVA_F_score,p_value,Selected
9,US10Y,8.00128,0.004697,True
14,LQD,6.829667,0.008998,True
7,HangSeng,6.007374,0.014288,True
6,Nikkei225,2.939533,0.086512,True
13,BND,2.705184,0.1001,True
12,IEF,2.643676,0.10404,True
8,MSCIWorld,2.308848,0.128717,True
39,MA20,2.287361,0.130509,True
11,TLT,2.078979,0.149417,True
2,DowJones,1.945458,0.163153,True


Selected feature set shape: (4085, 20)


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [7]:
# ===============================================================
# STEP — REDUCE DATASET TO TOP 20 FEATURES + REQUIRED COLUMNS
# ===============================================================

# Top 20 selected features
top20_features = [
    "US10Y", "LQD", "HangSeng", "IEF", "DowJones", "BND", "TLT",
    "Nikkei225", "MSCIWorld", "MA20", "Imports_GDP_Pct", "US2Y",
    "Momentum", "S&P500", "Recession_Probability", "Exports_GDP_Pct",
    "DAX", "Inflation_Annual_Pct", "Fed_Funds_Rate", "CAC40"
]

# Columns that MUST remain in the dataset
mandatory_columns = ["Apple", "Return", "Direction"]

# Make sure they exist
for col in mandatory_columns:
    if col not in data.columns:
        print(f"Warning: column '{col}' was not found in data.")

# Build the new reduced dataframe
keep_columns = top20_features + mandatory_columns
data = data[keep_columns].copy()

print("New dataset shape:", data.shape)
print("Columns kept:", data.columns.tolist())

New dataset shape: (4085, 23)
Columns kept: ['US10Y', 'LQD', 'HangSeng', 'IEF', 'DowJones', 'BND', 'TLT', 'Nikkei225', 'MSCIWorld', 'MA20', 'Imports_GDP_Pct', 'US2Y', 'Momentum', 'S&P500', 'Recession_Probability', 'Exports_GDP_Pct', 'DAX', 'Inflation_Annual_Pct', 'Fed_Funds_Rate', 'CAC40', 'Apple', 'Return', 'Direction']


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   US10Y                  4085 non-null   float64
 1   LQD                    4085 non-null   float64
 2   HangSeng               4085 non-null   float64
 3   IEF                    4085 non-null   float64
 4   DowJones               4085 non-null   float64
 5   BND                    4085 non-null   float64
 6   TLT                    4085 non-null   float64
 7   Nikkei225              4085 non-null   float64
 8   MSCIWorld              4085 non-null   float64
 9   MA20                   4085 non-null   float64
 10  Imports_GDP_Pct        4085 non-null   float64
 11  US2Y                   4085 non-null   float64
 12  Momentum               4085 non-null   float64
 13  S&P500                 4085 non-null   float64
 14  Recession_Probability  4085 non-null  

In [9]:
data.to_csv(OUT / "Cleaned_Features_for_ML_20ANOVA.csv")

In [20]:
# ===============================================================
# BINARY CLASSIFICATION – K-FOLD EVALUATION (FULL PIPELINE)
# ===============================================================

# ---------------------------------------------------------------
# 1) IMPORTS
# ---------------------------------------------------------------
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier
)

# ===============================================================
# 2) TRAIN / TEST SPLIT
# ===============================================================
features = data.drop(columns=['Apple', 'Return', 'Direction'])
target = data["Direction"] 

# Handle infinities and missing values early
features.replace([np.inf, -np.inf], np.nan, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, shuffle=False
)
print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")

# ===============================================================
# 3) FEATURE TYPES
# ===============================================================
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns

print(f"Numeric features: {len(numeric_features)} | Categorical features: {len(categorical_features)}")

# ===============================================================
# 4) PREPROCESSING PIPELINES
# ===============================================================
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# ===============================================================
# 5) DEFINE MODELS
# ===============================================================
models = [
    ('LR',  LogisticRegression(max_iter=5000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('CART', DecisionTreeClassifier(max_depth=6, random_state=42)),
    ('SVC', SVC(kernel='rbf', probability=True, random_state=42)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=5000, random_state=42)),
    # Boosting
    ('ABR', AdaBoostClassifier(n_estimators=300, random_state=42)),
    ('GBR', GradientBoostingClassifier(n_estimators=300, random_state=42)),
    # Bagging
    ('RFR', RandomForestClassifier(n_estimators=300, random_state=42)),
    ('ETR', ExtraTreesClassifier(n_estimators=300, random_state=42))
]

# ===============================================================
# 6) K-FOLD CONFIGURATION
# ===============================================================
num_folds = 5
seed = 42
scoring = "roc_auc"

names, kfold_results, train_results, test_results, test_auc_scores, test_f1_scores = [], [], [], [], [], []

# ===============================================================
# 7) LOOP THROUGH MODELS
# ===============================================================
for name, model in models:
    names.append(name)

    # ✅ Full pipeline (preprocessing + model)
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("clf", model)
    ])

    # ✅ K-Fold Cross-validation
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1)
    kfold_results.append(cv_results)

    # ✅ Fit full model on training set
    res = pipe.fit(X_train, y_train)

    # ✅ Predict and evaluate
    y_train_pred = res.predict(X_train)
    y_test_pred  = res.predict(X_test)
    y_test_proba = res.predict_proba(X_test)[:, 1] if hasattr(res.named_steps['clf'], "predict_proba") else np.zeros_like(y_test_pred)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc  = accuracy_score(y_test, y_test_pred)
    test_auc  = roc_auc_score(y_test, y_test_proba)
    test_f1   = f1_score(y_test, y_test_pred)

    train_results.append(train_acc)
    test_results.append(test_acc)
    test_auc_scores.append(test_auc)
    test_f1_scores.append(test_f1)

    msg = (f"{name}: CV_AUC={cv_results.mean():.4f} ({cv_results.std():.4f}) | "
           f"Train_ACC={train_acc:.4f} | Test_ACC={test_acc:.4f} | "
           f"Test_AUC={test_auc:.4f} | F1={test_f1:.4f}")
    print(msg)

# ===============================================================
# 8) SUMMARY RESULTS
# ===============================================================
results_df = pd.DataFrame({
    "Model": names,
    "CV_AUC_mean": [np.mean(r) for r in kfold_results],
    "CV_AUC_std": [np.std(r) for r in kfold_results],
    "Train_ACC": train_results,
    "Test_ACC": test_results,
    "Test_AUC": test_auc_scores,
    "Test_F1": test_f1_scores
}).sort_values("CV_AUC_mean", ascending=False)

display(results_df.style.background_gradient(cmap="Blues", subset=["CV_AUC_mean", "Test_ACC", "Test_AUC"]))


Train shape: (3176, 20) | Test shape: (794, 20)
Numeric features: 20 | Categorical features: 0
LR: CV_AUC=0.5322 (0.0148) | Train_ACC=0.5305 | Test_ACC=0.5214 | Test_AUC=0.5159 | F1=0.6435
KNN: CV_AUC=0.5111 (0.0240) | Train_ACC=0.6943 | Test_ACC=0.4673 | Test_AUC=0.4709 | F1=0.4639
CART: CV_AUC=0.5199 (0.0265) | Train_ACC=0.5904 | Test_ACC=0.5139 | Test_AUC=0.4996 | F1=0.6037
SVC: CV_AUC=0.5054 (0.0208) | Train_ACC=0.5170 | Test_ACC=0.5264 | Test_AUC=0.4991 | F1=0.6882
MLP: CV_AUC=0.4999 (0.0170) | Train_ACC=0.8892 | Test_ACC=0.4685 | Test_AUC=0.4627 | F1=0.4006
ABR: CV_AUC=0.5177 (0.0089) | Train_ACC=0.7047 | Test_ACC=0.5076 | Test_AUC=0.5064 | F1=0.5057
GBR: CV_AUC=0.4961 (0.0172) | Train_ACC=0.8936 | Test_ACC=0.5050 | Test_AUC=0.5045 | F1=0.5498
RFR: CV_AUC=0.5000 (0.0292) | Train_ACC=1.0000 | Test_ACC=0.5151 | Test_AUC=0.5068 | F1=0.5650
ETR: CV_AUC=0.4892 (0.0167) | Train_ACC=1.0000 | Test_ACC=0.4987 | Test_AUC=0.4917 | F1=0.5425


Unnamed: 0,Model,CV_AUC_mean,CV_AUC_std,Train_ACC,Test_ACC,Test_AUC,Test_F1
0,LR,0.532179,0.014831,0.530542,0.521411,0.515893,0.643527
2,CART,0.519868,0.026484,0.590365,0.513854,0.499571,0.603696
5,ABR,0.517678,0.008888,0.70466,0.507557,0.506428,0.505689
1,KNN,0.511096,0.024023,0.69427,0.467254,0.470924,0.463878
3,SVC,0.505354,0.020836,0.517003,0.526448,0.499081,0.688226
7,RFR,0.500029,0.02923,1.0,0.515113,0.506794,0.564972
4,MLP,0.499853,0.017037,0.889169,0.468514,0.462664,0.400568
6,GBR,0.49609,0.017158,0.893577,0.505038,0.504526,0.549828
8,ETR,0.489181,0.016744,1.0,0.498741,0.491715,0.542529
