In [35]:
# ===============================================================
# 1) IMPORT LIBRARIES
# ===============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
import shap
import warnings

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [36]:
# ===============================================================
# 1) LOAD DATASETS
# ===============================================================

from pathlib import Path
OUT = Path("../data")
print(OUT)

data = pd.read_csv(OUT / 'Cleaned_Features_for_ML.csv', index_col=0, parse_dates=True)

..\data


In [37]:
print(data.info())
print(data.head(10))

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [38]:
# ===============================================================
# 4) FEATURE SELECTION — SelectKBest (ANOVA F-test)
# ===============================================================

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------
# 1) Ensure 'Direction' is the target
# ---------------------------------------------------------------
target = data["Direction"]
features = data.drop(columns=["Direction"])

# Optional cleaning
features = features.replace([np.inf, -np.inf], np.nan).fillna(method="ffill").fillna(method="bfill")

# ---------------------------------------------------------------
# 2) Scaling (ANOVA expects standardized features)
# ---------------------------------------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# ---------------------------------------------------------------
# 3) Apply SelectKBest (ANOVA F-test) — Keep Top 20 variables
# ---------------------------------------------------------------
k = 20
selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_scaled, target)

# ---------------------------------------------------------------
# 4) Build ranking table
# ---------------------------------------------------------------
scores = selector.scores_
pvalues = selector.pvalues_
feature_names = features.columns

ranking_df = pd.DataFrame({
    "Feature": feature_names,
    "ANOVA_F_score": scores,
    "p_value": pvalues,
    "Selected": selector.get_support()
})

ranking_df = ranking_df.sort_values(by="ANOVA_F_score", ascending=False)

print("Top 20 Features Selected via ANOVA F-test:")
display(ranking_df.head(k))

# ---------------------------------------------------------------
# 5) Filter dataset to keep only the selected features
# ---------------------------------------------------------------
selected_features = feature_names[selector.get_support()]
X_selected = features[selected_features]

print("Selected feature set shape:", X_selected.shape)

# If you want to continue the ML pipeline:
# X_train, X_test, y_train, y_test = train_test_split(X_selected, target, test_size=0.2, shuffle=False)


Top 20 Features Selected via ANOVA F-test:


Unnamed: 0,Feature,ANOVA_F_score,p_value,Selected
7,HangSeng,5.803928,0.016034,True
14,LQD,5.460275,0.019502,True
9,US10Y,4.374051,0.036552,True
13,BND,3.515548,0.060867,True
39,MA20,2.48119,0.115293,True
12,IEF,2.43247,0.118923,True
11,TLT,1.743286,0.186797,True
33,Imports_GDP_Pct,1.692749,0.193312,True
6,Nikkei225,1.439436,0.2303,True
32,Exports_GDP_Pct,1.174765,0.278488,True


Selected feature set shape: (4085, 20)


In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   S&P500                             4085 non-null   float64
 1   NASDAQ                             4085 non-null   float64
 2   DowJones                           4085 non-null   float64
 3   CAC40                              4085 non-null   float64
 4   DAX                                4085 non-null   float64
 5   FTSE100                            4085 non-null   float64
 6   Nikkei225                          4085 non-null   float64
 7   HangSeng                           4085 non-null   float64
 8   MSCIWorld                          4085 non-null   float64
 9   US10Y                              4085 non-null   float64
 10  US2Y                               4085 non-null   float64
 11  TLT                                408

In [40]:
# ===============================================================
# STEP — REDUCE DATASET TO TOP 20 FEATURES + REQUIRED COLUMNS
# ===============================================================

# Top 20 selected features
#top20_features = [
#    "US10Y", "LQD", "HangSeng", "IEF", "DowJones", "BND", "TLT",
#    "Nikkei225", "MSCIWorld", "MA20", "Imports_GDP_Pct", "US2Y",
#    "Momentum", "S&P500", "Recession_Probability", "Exports_GDP_Pct",
#    "DAX", "Inflation_Annual_Pct", "Fed_Funds_Rate", "CAC40"
#]

top20_features = [
    "HangSeng",
    "LQD",
    "US10Y",
    "BND",
    "MA20",
    "IEF",
    "TLT",
    "Imports_GDP_Pct",
    "Nikkei225",
    "Exports_GDP_Pct",
    "Recession_Probability",
    "Inflation_Annual_Pct",
    "Volatility_20d",
    "Fed_Funds_Rate",
    "Yield_Curve_Spread",
    "OECD_Unemp_rate_pct_USA",
    "Meta",
    "Unemployment_Rate",
    "MSCIWorld",
    "Google"
]

# Columns that MUST remain in the dataset
mandatory_columns = ["Apple", "Return", "Direction"]

# Make sure they exist
for col in mandatory_columns:
    if col not in data.columns:
        print(f"Warning: column '{col}' was not found in data.")

# Build the new reduced dataframe
keep_columns = top20_features + mandatory_columns
data = data[keep_columns].copy()

print("New dataset shape:", data.shape)
print("Columns kept:", data.columns.tolist())

New dataset shape: (4085, 23)
Columns kept: ['HangSeng', 'LQD', 'US10Y', 'BND', 'MA20', 'IEF', 'TLT', 'Imports_GDP_Pct', 'Nikkei225', 'Exports_GDP_Pct', 'Recession_Probability', 'Inflation_Annual_Pct', 'Volatility_20d', 'Fed_Funds_Rate', 'Yield_Curve_Spread', 'OECD_Unemp_rate_pct_USA', 'Meta', 'Unemployment_Rate', 'MSCIWorld', 'Google', 'Apple', 'Return', 'Direction']


In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4085 entries, 2010-03-15 to 2025-11-26
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   HangSeng                 4085 non-null   float64
 1   LQD                      4085 non-null   float64
 2   US10Y                    4085 non-null   float64
 3   BND                      4085 non-null   float64
 4   MA20                     4085 non-null   float64
 5   IEF                      4085 non-null   float64
 6   TLT                      4085 non-null   float64
 7   Imports_GDP_Pct          4085 non-null   float64
 8   Nikkei225                4085 non-null   float64
 9   Exports_GDP_Pct          4085 non-null   float64
 10  Recession_Probability    4085 non-null   float64
 11  Inflation_Annual_Pct     4085 non-null   float64
 12  Volatility_20d           4085 non-null   float64
 13  Fed_Funds_Rate           4085 non-null   float64
 14  Yield_

In [42]:
data.to_csv(OUT / "Cleaned_Features_for_ML_20ANOVA.csv")

In [43]:
# ===============================================================
# BINARY CLASSIFICATION – K-FOLD EVALUATION (FULL PIPELINE)
# ===============================================================

# ---------------------------------------------------------------
# 1) IMPORTS
# ---------------------------------------------------------------
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (
    AdaBoostClassifier, GradientBoostingClassifier,
    RandomForestClassifier, ExtraTreesClassifier
)

# ===============================================================
# 2) TRAIN / TEST SPLIT
# ===============================================================
features = data.drop(columns=['Apple', 'Return', 'Direction'])
target = data["Direction"] 

# Handle infinities and missing values early
features.replace([np.inf, -np.inf], np.nan, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, shuffle=False
)
print(f"Train shape: {X_train.shape} | Test shape: {X_test.shape}")

# ===============================================================
# 3) FEATURE TYPES
# ===============================================================
numeric_features = X_train.select_dtypes(include=[np.number]).columns
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns

print(f"Numeric features: {len(numeric_features)} | Categorical features: {len(categorical_features)}")

# ===============================================================
# 4) PREPROCESSING PIPELINES
# ===============================================================
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# ===============================================================
# 5) DEFINE MODELS
# ===============================================================
models = [
    ('LR',  LogisticRegression(max_iter=5000, random_state=42)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('CART', DecisionTreeClassifier(max_depth=6, random_state=42)),
    ('SVC', SVC(kernel='rbf', probability=True, random_state=42)),
    ('MLP', MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=5000, random_state=42)),
    # Boosting
    ('ABR', AdaBoostClassifier(n_estimators=300, random_state=42)),
    ('GBR', GradientBoostingClassifier(n_estimators=300, random_state=42)),
    # Bagging
    ('RFR', RandomForestClassifier(n_estimators=300, random_state=42)),
    ('ETR', ExtraTreesClassifier(n_estimators=300, random_state=42))
]

# ===============================================================
# 6) K-FOLD CONFIGURATION
# ===============================================================
num_folds = 5
seed = 42
scoring = "roc_auc"

names, kfold_results, train_results, test_results, test_auc_scores, test_f1_scores = [], [], [], [], [], []

# ===============================================================
# 7) LOOP THROUGH MODELS
# ===============================================================
for name, model in models:
    names.append(name)

    # ✅ Full pipeline (preprocessing + model)
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ("clf", model)
    ])

    # ✅ K-Fold Cross-validation
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipe, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1)
    kfold_results.append(cv_results)

    # ✅ Fit full model on training set
    res = pipe.fit(X_train, y_train)

    # ✅ Predict and evaluate
    y_train_pred = res.predict(X_train)
    y_test_pred  = res.predict(X_test)
    y_test_proba = res.predict_proba(X_test)[:, 1] if hasattr(res.named_steps['clf'], "predict_proba") else np.zeros_like(y_test_pred)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc  = accuracy_score(y_test, y_test_pred)
    test_auc  = roc_auc_score(y_test, y_test_proba)
    test_f1   = f1_score(y_test, y_test_pred)

    train_results.append(train_acc)
    test_results.append(test_acc)
    test_auc_scores.append(test_auc)
    test_f1_scores.append(test_f1)

    msg = (f"{name}: CV_AUC={cv_results.mean():.4f} ({cv_results.std():.4f}) | "
           f"Train_ACC={train_acc:.4f} | Test_ACC={test_acc:.4f} | "
           f"Test_AUC={test_auc:.4f} | F1={test_f1:.4f}")
    print(msg)

# ===============================================================
# 8) SUMMARY RESULTS
# ===============================================================
results_df = pd.DataFrame({
    "Model": names,
    "CV_AUC_mean": [np.mean(r) for r in kfold_results],
    "CV_AUC_std": [np.std(r) for r in kfold_results],
    "Train_ACC": train_results,
    "Test_ACC": test_results,
    "Test_AUC": test_auc_scores,
    "Test_F1": test_f1_scores
}).sort_values("CV_AUC_mean", ascending=False)

display(results_df.style.background_gradient(cmap="Blues", subset=["CV_AUC_mean", "Test_ACC", "Test_AUC"]))


Train shape: (3268, 20) | Test shape: (817, 20)
Numeric features: 20 | Categorical features: 0
LR: CV_AUC=0.5124 (0.0175) | Train_ACC=0.5367 | Test_ACC=0.5177 | Test_AUC=0.5192 | F1=0.6770
KNN: CV_AUC=0.4998 (0.0231) | Train_ACC=0.6720 | Test_ACC=0.4871 | Test_AUC=0.4778 | F1=0.4480
CART: CV_AUC=0.5274 (0.0204) | Train_ACC=0.5890 | Test_ACC=0.4859 | Test_AUC=0.4851 | F1=0.4988
SVC: CV_AUC=0.5174 (0.0122) | Train_ACC=0.5125 | Test_ACC=0.5226 | Test_AUC=0.5183 | F1=0.6865
MLP: CV_AUC=0.5016 (0.0237) | Train_ACC=0.9761 | Test_ACC=0.5067 | Test_AUC=0.4850 | F1=0.5789
ABR: CV_AUC=0.5006 (0.0078) | Train_ACC=0.6781 | Test_ACC=0.5239 | Test_AUC=0.5065 | F1=0.6850
GBR: CV_AUC=0.5108 (0.0225) | Train_ACC=0.8871 | Test_ACC=0.5239 | Test_AUC=0.5009 | F1=0.6511
RFR: CV_AUC=0.4958 (0.0218) | Train_ACC=1.0000 | Test_ACC=0.5202 | Test_AUC=0.5062 | F1=0.6245
ETR: CV_AUC=0.4924 (0.0133) | Train_ACC=1.0000 | Test_ACC=0.5447 | Test_AUC=0.5163 | F1=0.6450


Unnamed: 0,Model,CV_AUC_mean,CV_AUC_std,Train_ACC,Test_ACC,Test_AUC,Test_F1
2,CART,0.527355,0.020428,0.589045,0.485924,0.485111,0.498807
3,SVC,0.517441,0.01224,0.512546,0.522644,0.518315,0.686495
0,LR,0.512429,0.01754,0.53672,0.517748,0.51924,0.677049
6,GBR,0.510806,0.022473,0.887087,0.523868,0.500937,0.651121
4,MLP,0.501573,0.023696,0.976132,0.506732,0.485012,0.578892
5,ABR,0.500596,0.00782,0.678091,0.523868,0.506533,0.68502
1,KNN,0.499775,0.02308,0.671971,0.487148,0.477845,0.447958
7,RFR,0.495787,0.021759,1.0,0.520196,0.50623,0.624521
8,ETR,0.492391,0.013262,1.0,0.544676,0.516339,0.645038
