# Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score

import tensorflow as tf
from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
base_path = "/content/drive/MyDrive/wearable_eeg/Data/"

In [None]:
n_channels = 14
fs = int(128)
pre = int(0.1 * 128)   # 0.1s baseline
post = int(2.9 * 128)  # 2.9s after
window_len = pre + post
n_times = window_len
n_times

383

# Load Data

In [None]:
X_feat = pd.read_csv(base_path+"X_features.csv").values
X_final = pd.read_csv(base_path+"X_final.csv", header=0).values
y = pd.read_csv(base_path+"y.csv")["y"].values

print("X_feat shape:", X_feat.shape)
print("X_final shape:", X_final.shape)
print("y shape:", y.shape)

X_feat shape: (900, 350)
X_final shape: (900, 303)
y shape: (900,)


In [None]:
np.sum(np.isnan(X_feat)*1)

np.int64(0)

In [None]:
np.sum(np.isnan(X_final)*1)

np.int64(0)

In [None]:
np.sum(np.isnan(y)*1)

np.int64(0)

## Column names for X_final

In [None]:
def get_feature_names(n_channels, fs=128, n_lpc=10, n_splits=4):
    names = []

    # PSD
    bands = ["delta", "theta", "alpha", "beta", "gamma"]
    for ch in range(n_channels):
        for band in bands:
            names.append(f"PSD_{band}_Ch{ch+1}")

    # Spectral Entropy
    for ch in range(n_channels):
        for q in range(n_splits):
            names.append(f"SE_Q{q+1}_Ch{ch+1}")

    # LPC
    for ch in range(n_channels):
        for k in range(n_lpc):
            names.append(f"LPC_C{k+1}_Ch{ch+1}")

    # Response Time
    names.append("RT")

    # ERP Energy + AUC (all channels)
    for ch in range(n_channels):
        names.append(f"ERP_Energy_Ch{ch+1}")
        names.append(f"ERP_AUC_Ch{ch+1}")

    # P300 & N400 restricted
    target_ch = [3, 10]  # only ch4 and ch11
    for ch in target_ch:
        names.append(f"P300_Energy_Ch{ch+1}")   # only ch4 and ch11
        names.append(f"P300_AUC_Ch{ch+1}")     # only ch4 and ch11
        names.append(f"N400_Energy_Ch{ch+1}")  # only ch4 and ch11
        names.append(f"N400_AUC_Ch{ch+1}")     # only ch4 and ch11

    return names

In [None]:
feature_names = get_feature_names(n_channels=14)
X_final = pd.DataFrame(X_final, columns=feature_names)
X_final

Unnamed: 0,PSD_delta_Ch1,PSD_theta_Ch1,PSD_alpha_Ch1,PSD_beta_Ch1,PSD_gamma_Ch1,PSD_delta_Ch2,PSD_theta_Ch2,PSD_alpha_Ch2,PSD_beta_Ch2,PSD_gamma_Ch2,...,ERP_Energy_Ch14,ERP_AUC_Ch14,P300_Energy_Ch4,P300_AUC_Ch4,N400_Energy_Ch4,N400_AUC_Ch4,P300_Energy_Ch11,P300_AUC_Ch11,N400_Energy_Ch11,N400_AUC_Ch11
0,0.584436,0.163271,0.036161,0.100595,0.066733,0.619334,0.104071,0.048717,0.098789,0.054241,...,-30.433009,-34.904184,0.700491,0.075915,2.192546,-0.185435,0.788279,0.038866,2.673637,-0.013454
1,0.256031,0.269023,0.146727,0.170858,0.033061,0.449877,0.013748,0.011275,0.037521,0.010703,...,64.025221,9.739109,3.540884,1.075746,-6.268833,3.441219,3.679766,1.358077,-6.363026,3.896657
2,0.484580,0.185753,0.041461,0.122867,0.010597,0.339368,0.029923,0.108739,0.225147,0.025827,...,11.356716,4.971700,0.030781,0.018886,0.471681,0.306862,0.074482,0.023619,0.886769,0.488797
3,0.313761,0.183488,0.082385,0.200044,0.030412,0.161684,0.083821,0.130628,0.388175,0.166831,...,17.060054,7.476612,0.039903,0.028046,-0.649837,0.443999,0.052271,0.011611,-0.740622,0.224707
4,0.057024,0.195452,0.208092,0.412442,0.077169,0.221954,0.155731,0.272907,0.196309,0.088272,...,-5.219148,-20.386565,0.021542,0.036446,0.392545,-0.497252,0.047049,0.044633,0.693205,-0.594728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,0.068247,0.197897,0.104256,0.272419,0.281493,0.166003,0.109932,0.064768,0.381894,0.207044,...,8.786373,0.000000,0.065177,0.035417,0.782985,0.309056,0.062687,0.109888,0.787212,1.012633
896,0.211642,0.247173,0.060200,0.332218,0.147624,0.062447,0.228103,0.043030,0.375715,0.255305,...,13.746136,0.000000,0.022139,0.028131,-0.362530,0.226875,0.021387,0.103447,-0.314796,0.941195
897,0.294060,0.042523,0.055180,0.440553,0.107901,0.133693,0.083575,0.100279,0.546778,0.098238,...,-6.738038,0.000000,0.014656,0.020494,-0.298376,-0.346713,0.014528,0.023625,0.261177,-0.114280
898,0.275411,0.095278,0.047260,0.367211,0.155489,0.161011,0.076314,0.040167,0.441518,0.274025,...,-4.059612,0.000000,0.004489,0.012913,-0.159361,-0.156861,0.007763,0.018993,0.125258,0.408365


# Train-test-split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42)

In [None]:
X_train.shape

(720, 303)

## ML Pipelline

## Models

In [None]:
models = {
    "LogReg": LogisticRegression(max_iter=2000,
                                 solver="lbfgs",
                                 penalty='l2',
                                 C=0.1),

    "SVM": SVC(kernel="rbf",
               C=0.1,
               decision_function_shape='ovo',
               gamma=0.001),

    "RandomForest": RandomForestClassifier(n_estimators=250,
                                           criterion='gini'),

    "NB": GaussianNB(),

    "MLP": MLPClassifier(hidden_layer_sizes=(50,250,100,),
                         activation='relu',
                         solver='sgd',
                         max_iter=100,
                         learning_rate_init=0.001,
                         alpha=0.001),

    "LGBM":  LGBMClassifier(objective="multiclass",
                            num_class=3,
                            learning_rate=0.001,
                            num_leaves=15,
                            min_child_samples=10,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            subsample_freq=5,
                            n_estimators=100,
                            random_state=2025,
                            force_col_wise=True)
}

## Feature Selection

### Variance Threshold Feature Selection

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Remove features with variance below a threshold (e.g., 0.01)
selector = VarianceThreshold(threshold=0.01)
X_train_var = selector.fit_transform(X_train)
X_test_var = selector.transform(X_test)

print("Original features:", X_train.shape[1])
print("Selected features:", X_train_var.shape[1])

Original features: 303
Selected features: 259


In [None]:
var_selected_features = X_train.columns[selector.get_support()]
var_selected_features

Index(['PSD_delta_Ch1', 'PSD_theta_Ch1', 'PSD_beta_Ch1', 'PSD_delta_Ch2',
       'PSD_beta_Ch2', 'PSD_delta_Ch3', 'PSD_delta_Ch4', 'PSD_beta_Ch4',
       'PSD_delta_Ch5', 'PSD_beta_Ch5',
       ...
       'ERP_Energy_Ch14', 'ERP_AUC_Ch14', 'P300_Energy_Ch4', 'P300_AUC_Ch4',
       'N400_Energy_Ch4', 'N400_AUC_Ch4', 'P300_Energy_Ch11', 'P300_AUC_Ch11',
       'N400_Energy_Ch11', 'N400_AUC_Ch11'],
      dtype='object', length=259)

### Correlation-Based Feature Selection

In [None]:
def correlation_filter(X, threshold=0.9):
    """
    Removes features with correlation higher than `threshold`.
    X: pandas DataFrame
    """
    corr_matrix = X.corr().abs()

    # Upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    # Find features with correlation > threshold
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

    return X.drop(columns=to_drop)

# # Convert train/test to DataFrame (needed for correlation filtering)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

# Apply correlation filter
X_train_corr = correlation_filter(X_train, threshold=0.9)
X_test_corr = X_test[X_train_corr.columns]   # Keep same columns in test set

print("Original features:", X_train.shape[1])
print("Selected features:", X_train_corr.shape[1])

Original features: 303
Selected features: 169


In [None]:
corr_selected_features = X_train_corr.columns
corr_selected_features

Index(['PSD_delta_Ch1', 'PSD_theta_Ch1', 'PSD_alpha_Ch1', 'PSD_beta_Ch1',
       'PSD_gamma_Ch1', 'PSD_delta_Ch2', 'PSD_theta_Ch2', 'PSD_alpha_Ch2',
       'PSD_beta_Ch2', 'PSD_gamma_Ch2',
       ...
       'ERP_Energy_Ch14', 'ERP_AUC_Ch14', 'P300_Energy_Ch4', 'P300_AUC_Ch4',
       'N400_Energy_Ch4', 'N400_AUC_Ch4', 'P300_Energy_Ch11', 'P300_AUC_Ch11',
       'N400_Energy_Ch11', 'N400_AUC_Ch11'],
      dtype='object', length=169)

### VT + CB Feature Selection

In [None]:
# Step 1: Variance threshold
selector = VarianceThreshold(threshold=0.01)
X_train_sel = selector.fit_transform(X_train)
X_test_sel = selector.transform(X_test)

# Step 2: Correlation filter (on reduced features)
X_train_sel_df = pd.DataFrame(X_train_sel)
X_test_sel_df = pd.DataFrame(X_test_sel)

X_train_final = correlation_filter(X_train_sel_df, threshold=0.9)
X_test_final = X_test_sel_df[X_train_final.columns]

print("Original features:", X_train.shape[1])
print("Selected features:", X_train_final.shape[1])

Original features: 303
Selected features: 125


In [None]:
final_selected_features = X_train.columns[X_train_final.columns]
final_selected_features

Index(['PSD_delta_Ch1', 'PSD_theta_Ch1', 'PSD_alpha_Ch1', 'PSD_beta_Ch1',
       'PSD_gamma_Ch1', 'PSD_delta_Ch2', 'PSD_theta_Ch2', 'PSD_alpha_Ch2',
       'PSD_beta_Ch2', 'PSD_gamma_Ch2',
       ...
       'LPC_C4_Ch13', 'LPC_C5_Ch13', 'LPC_C6_Ch13', 'LPC_C7_Ch13',
       'LPC_C8_Ch13', 'LPC_C9_Ch13', 'LPC_C10_Ch13', 'LPC_C1_Ch14',
       'LPC_C2_Ch14', 'LPC_C3_Ch14'],
      dtype='object', length=125)

## Run models across all the feature selection methods

In [None]:
feature_selections = {
    "None": (X_train, X_test),
    "Variance": (X_train_var, X_test_var),
    "Correlation": (X_train_corr.values, X_test_corr.values),
    "Both": (X_train_final, X_test_final)
}

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

results = []

for fs_name, (Xtr, Xte) in feature_selections.items():
    for name, model in models.items():
        pipe = Pipeline([("scaler", StandardScaler()), ("clf", model)])
        pipe.fit(Xtr, y_train)
        y_pred = pipe.predict(Xte)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average="weighted", zero_division=0)
        rec = recall_score(y_test, y_pred, average="weighted", zero_division=0)
        f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

        results.append({
            "Input": "X_final",
            "Feature_Selection": fs_name,
            "Selected_Features": Xtr.shape[1],
            "Model": name,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1": f1
        })



[LightGBM] [Info] Total Bins 64639
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 303
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Total Bins 54948
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 259
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Total Bins 37842
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 169
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




[LightGBM] [Info] Total Bins 28151
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 125
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




## See results

In [None]:
df_results = pd.DataFrame(results)
df_results.style.highlight_max(color='yellow', subset=['Accuracy','Precision','Recall','F1'])

Unnamed: 0,Input,Feature_Selection,Selected_Features,Model,Accuracy,Precision,Recall,F1
0,X_feat,,350,LogReg,0.338889,0.337043,0.338889,0.337622
1,X_feat,,350,SVM,0.266667,0.226822,0.266667,0.21708
2,X_feat,,350,RandomForest,0.305556,0.307454,0.305556,0.304514
3,X_feat,,350,NB,0.272222,0.259646,0.272222,0.248209
4,X_feat,,350,MLP,0.316667,0.317331,0.316667,0.314328
5,X_feat,,350,LGBM,0.283333,0.281247,0.283333,0.279956
6,X_feat,Variance,255,LogReg,0.316667,0.316673,0.316667,0.315912
7,X_feat,Variance,255,SVM,0.222222,0.205419,0.222222,0.194116
8,X_feat,Variance,255,RandomForest,0.305556,0.311746,0.305556,0.307304
9,X_feat,Variance,255,NB,0.261111,0.243827,0.261111,0.234127


In [None]:
results_path = '/content/drive/MyDrive/wearable_eeg/Results/'
results_path

'/content/drive/MyDrive/wearable_eeg/Results/'

In [None]:
df_results.to_csv(results_path+"ML_FS_results.csv", index=False)

# Cross Validation

In [None]:
n_trials_per_subj = 90
n_subj = 10
users = np.repeat(np.arange(n_subj), n_trials_per_subj)

In [None]:
unique_users = np.unique(users)
all_reports = []
cv_results = []

for test_user in unique_users:
    print(f"\n=== Leave out {test_user+1} ===")

    train_idx = users != test_user
    test_idx  = users == test_user

    X_train, X_test = X_final[train_idx], X_final[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Standar Scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Feature Selection inside each CV iteration
    X_train = correlation_filter(pd.DataFrame(X_train), threshold=0.9)
    X_test = pd.DataFrame(X_test)[X_train.columns]

    # print(X_train.shape, X_test.shape)

    # Classifier
    clf1 = MLPClassifier(hidden_layer_sizes=(50,250,100,),
                         activation='relu',
                         solver='sgd',
                         max_iter=1000,
                         learning_rate_init=0.001,
                         alpha=0.001)

    clf = LogisticRegression(max_iter=2000,
                                 solver="lbfgs",
                                 penalty='l2',
                                 C=0.1)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # print(confusion_matrix(y_test, y_pred))
    # print(classification_report(y_test, y_pred, digits=3))
    all_reports.append(classification_report(y_test, y_pred, output_dict=True))

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

    cv_results.append({
        "Input": "X_final",
        "Model": "LogReg", # change accordingly
        "Test_user": test_user+1,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1
    })

# Average metrics across folds

df_reports = pd.DataFrame(all_reports)
print("\n=== Mean LOSO performance ===")
print(df_reports.mean(numeric_only=True))


=== Leave out 1 ===

=== Leave out 2 ===

=== Leave out 3 ===

=== Leave out 4 ===

=== Leave out 5 ===

=== Leave out 6 ===

=== Leave out 7 ===

=== Leave out 8 ===

=== Leave out 9 ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



=== Leave out 10 ===

=== Mean LOSO performance ===
accuracy    0.353889
dtype: float64


**MLP**
```
=== Mean LOSO performance ===
accuracy    0.354444
dtype: float64
```

**LogReg**
```
=== Mean LOSO performance ===
accuracy    0.353889
dtype: float64
```



In [None]:
df_cv_results = pd.DataFrame(cv_results)
df_cv_results

Unnamed: 0,Input,Model,Test_user,Accuracy,Precision,Recall,F1
0,X_final,MLP,1,0.322222,0.316701,0.322222,0.319003
1,X_final,MLP,2,0.277778,0.240602,0.277778,0.236261
2,X_final,MLP,3,0.344444,0.339506,0.344444,0.340245
3,X_final,MLP,4,0.388889,0.363228,0.388889,0.340714
4,X_final,MLP,5,0.4,0.418129,0.4,0.369881
5,X_final,MLP,6,0.388889,0.388846,0.388889,0.379803
6,X_final,MLP,7,0.388889,0.38273,0.388889,0.381785
7,X_final,MLP,8,0.288889,0.282228,0.288889,0.277081
8,X_final,MLP,9,0.333333,0.111111,0.333333,0.166667
9,X_final,MLP,10,0.411111,0.427051,0.411111,0.408654


In [None]:
df_cv_results.to_csv(results_path+"ML_CV_results.csv", index=False)

## CV per user

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# correlation feature selection function
def corr_feature_selection(X, y, threshold=0.05):
    # returns boolean mask of selected features
    corrs = []
    for i in range(X.shape[1]):
        c = np.corrcoef(X[:, i], y)[0, 1]
        if np.isnan(c):
            c = 0.0
        corrs.append(c)
    corrs = np.array(corrs)
    mask = np.abs(corrs) > threshold

    # if mask is empty, keep top k features to avoid empty set
    if mask.sum() == 0:
        k = min(20, X.shape[1])
        idx = np.argsort(np.abs(corrs))[::-1][:k]
        mask = np.zeros_like(corrs, dtype=bool)
        mask[idx] = True
    return mask

def per_user_cv(X_feats, y, users, model=None, n_splits=5, corr_threshold=0.05, random_state=42):
    """
    X_feats: (n_trials, n_features)
    y: (n_trials,)
    users: (n_trials,) subject ids (strings or ints)
    model: sklearn estimator (if None -> LogisticRegression)
    n_splits: folds per user (StratifiedKFold)
    """
    if model is None:
        model = LogisticRegression(max_iter=2000,
                                 solver="lbfgs",
                                 penalty='l2',
                                 C=0.1)

    unique_users = np.unique(users)
    results = {}

    for user in unique_users:
        idx_user = (users == user)
        X_user = X_feats[idx_user]
        y_user = y[idx_user]
        n_samples = X_user.shape[0]
        print(f"\nUser {user}: {n_samples} trials")

        # if too few samples for k folds, reduce k or use LeaveOneOut
        k = min(n_splits, n_samples)
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)

        fold_metrics = []
        fold_reports = []
        fold_cms = []

        fold_num = 0
        for train_i, test_i in skf.split(X_user, y_user):
            fold_num += 1
            X_tr, X_te = X_user[train_i], X_user[test_i]
            y_tr, y_te = y_user[train_i], y_user[test_i]

            # scale with train scaler
            scaler = StandardScaler()
            X_tr_scaled = scaler.fit_transform(X_tr)
            X_te_scaled = scaler.transform(X_te)

            # correlation-based feature selection on train only
            mask = corr_feature_selection(X_tr_scaled, y_tr, threshold=corr_threshold)
            X_tr_sel = X_tr_scaled[:, mask]
            X_te_sel = X_te_scaled[:, mask]

            # train model
            clf = model
            clf.fit(X_tr_sel, y_tr)
            y_pred = clf.predict(X_te_sel)

            acc = accuracy_score(y_te, y_pred)
            bal_acc = balanced_accuracy_score(y_te, y_pred)
            cm = confusion_matrix(y_te, y_pred)
            report = classification_report(y_te, y_pred, output_dict=True, zero_division=0)

            fold_metrics.append({"accuracy": acc, "balanced_accuracy": bal_acc})
            fold_reports.append(report)
            fold_cms.append(cm)

            print(f" User {user+1} fold {fold_num} acc={acc:.3f} bal_acc={bal_acc:.3f}")

        # aggregate per-user results
        dfm = pd.DataFrame(fold_metrics)
        mean_metrics = dfm.mean().to_dict()
        results[user] = {
            "fold_metrics": fold_metrics,
            "mean_metrics": mean_metrics,
            "fold_reports": fold_reports,
            "fold_confusion_matrices": fold_cms
        }
        print(f" -> User {user+1} mean accuracy: {mean_metrics['accuracy']:.3f}, mean balanced: {mean_metrics['balanced_accuracy']:.3f}")

    return results

## Logistic Regression

In [None]:
# Example usage:
# X_feats shape (900, 351), y shape (900,), users shape (900,)
results = per_user_cv(X_final, y, users, model=None, n_splits=5, corr_threshold=0.1)


User 0: 90 trials
 User 1 fold 1 acc=0.333 bal_acc=0.333
 User 1 fold 2 acc=0.278 bal_acc=0.278
 User 1 fold 3 acc=0.167 bal_acc=0.167
 User 1 fold 4 acc=0.389 bal_acc=0.389
 User 1 fold 5 acc=0.167 bal_acc=0.167
 -> User 1 mean accuracy: 0.267, mean balanced: 0.267

User 1: 90 trials
 User 2 fold 1 acc=0.333 bal_acc=0.333
 User 2 fold 2 acc=0.389 bal_acc=0.389
 User 2 fold 3 acc=0.444 bal_acc=0.444
 User 2 fold 4 acc=0.333 bal_acc=0.333
 User 2 fold 5 acc=0.278 bal_acc=0.278
 -> User 2 mean accuracy: 0.356, mean balanced: 0.356

User 2: 90 trials
 User 3 fold 1 acc=0.333 bal_acc=0.333
 User 3 fold 2 acc=0.500 bal_acc=0.500
 User 3 fold 3 acc=0.167 bal_acc=0.167
 User 3 fold 4 acc=0.444 bal_acc=0.444
 User 3 fold 5 acc=0.500 bal_acc=0.500
 -> User 3 mean accuracy: 0.389, mean balanced: 0.389

User 3: 90 trials
 User 4 fold 1 acc=0.333 bal_acc=0.333
 User 4 fold 2 acc=0.444 bal_acc=0.444
 User 4 fold 3 acc=0.278 bal_acc=0.278
 User 4 fold 4 acc=0.333 bal_acc=0.333
 User 4 fold 5 acc=0.

In [None]:
bal_accs = [res["mean_metrics"]["balanced_accuracy"] for res in results.values()]

bal_accs = np.array(bal_accs)
print(f"Balanced Accuracy (across users): {bal_accs.mean():.3f} ± {bal_accs.std():.3f}")

Balanced Accuracy (across users): 0.326 ± 0.032


## Random Forest

In [None]:
# Example usage:
# X_feats shape (900, 351), y shape (900,), users shape (900,)
rf = RandomForestClassifier(n_estimators=25,criterion='gini')
results = per_user_cv(X_final, y, users, model=rf, n_splits=5, corr_threshold=0.1)


User 0: 90 trials
 User 1 fold 1 acc=0.111 bal_acc=0.111
 User 1 fold 2 acc=0.222 bal_acc=0.222
 User 1 fold 3 acc=0.333 bal_acc=0.333
 User 1 fold 4 acc=0.389 bal_acc=0.389
 User 1 fold 5 acc=0.333 bal_acc=0.333
 -> User 1 mean accuracy: 0.278, mean balanced: 0.278

User 1: 90 trials
 User 2 fold 1 acc=0.278 bal_acc=0.278
 User 2 fold 2 acc=0.278 bal_acc=0.278
 User 2 fold 3 acc=0.111 bal_acc=0.111
 User 2 fold 4 acc=0.500 bal_acc=0.500
 User 2 fold 5 acc=0.278 bal_acc=0.278
 -> User 2 mean accuracy: 0.289, mean balanced: 0.289

User 2: 90 trials
 User 3 fold 1 acc=0.056 bal_acc=0.056
 User 3 fold 2 acc=0.222 bal_acc=0.222
 User 3 fold 3 acc=0.444 bal_acc=0.444
 User 3 fold 4 acc=0.444 bal_acc=0.444
 User 3 fold 5 acc=0.222 bal_acc=0.222
 -> User 3 mean accuracy: 0.278, mean balanced: 0.278

User 3: 90 trials
 User 4 fold 1 acc=0.278 bal_acc=0.278
 User 4 fold 2 acc=0.389 bal_acc=0.389
 User 4 fold 3 acc=0.389 bal_acc=0.389
 User 4 fold 4 acc=0.278 bal_acc=0.278
 User 4 fold 5 acc=0.

In [None]:
bal_accs = [res["mean_metrics"]["balanced_accuracy"] for res in results.values()]
bal_accs = np.array(bal_accs)
print(f"Balanced Accuracy (across users): {bal_accs.mean():.3f} ± {bal_accs.std():.3f}")

Balanced Accuracy (across users): 0.338 ± 0.043
