In [1]:
import sys
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import logging
from datetime import datetime
import traceback

from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, StratifiedKFold, LeaveOneOut
from sklearn.utils.multiclass import unique_labels

from sklearn.inspection import permutation_importance

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    RocCurveDisplay, auc, accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix, classification_report,
    balanced_accuracy_score, roc_auc_score
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
import xgboost as xgb
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.feature_selection import SelectKBest, f_classif
import re
import scipy

In [2]:
# Do not truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Add ../src (resolved against the current working directory) to the module
# search path. The membership check keeps the cell idempotent: re-running it
# no longer stacks duplicate entries on sys.path.
src_dir = os.path.abspath("../src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
# Folder holding the processed CSV tables
processed_path = '../data/processed/'

# Read data: the target names on the left line up one-to-one, in order,
# with the file names on the right.
(
    df_digital_tmt_with_target,
    demographic_df,
    non_digital_df,
    df_digital_hand_and_eye,
    digital_test_less_subjects,
    non_digital_test_less_subjects,
) = [
    pd.read_csv(processed_path + csv_name)
    for csv_name in (
        'df_digital_tmt_with_target.csv',
        'demographic_df.csv',
        'non_digital_df.csv',
        'df_digital_hand_and_eye.csv',
        'digital_test_less_subjects.csv',
        'non_digital_test_less_subjects.csv',
    )
]

# Final checks: print how many rows fall in each 'group' value, per table
for loaded_table in (
    df_digital_tmt_with_target,
    demographic_df,
    non_digital_df,
    df_digital_hand_and_eye,
    digital_test_less_subjects,
    non_digital_test_less_subjects,
):
    print(loaded_table['group'].value_counts())

group
1    43
0    29
Name: count, dtype: int64
group
1    43
0    29
Name: count, dtype: int64
group
1    43
0    29
Name: count, dtype: int64
group
1    43
0    29
Name: count, dtype: int64
group
1    43
0    29
Name: count, dtype: int64
group
1    43
0    29
Name: count, dtype: int64


### Logging setup

In [4]:
# Configure logging
# Ensure the folder that receives the log files exists.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
# Create a fresh log file each run: the timestamp in the name keeps runs apart.
log_filename = os.path.join(log_dir, f"error_log_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.log")
# force=True removes and closes any handlers already attached to the root
# logger before installing the new file handler. Without it, basicConfig()
# silently does nothing when the root logger is already configured (for
# example when this cell is re-run in the same kernel), so the newly named
# log file would never be written to.
logging.basicConfig(filename=log_filename,
                    level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)

### Helper functions

In [6]:
def extract_X_y_features(df):
    """Split a table into a feature matrix, a target vector and feature names.

    The 'subject_id' column is removed first. Of the remaining columns, the
    last one is taken as the target and all the others as features. The
    input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'subject_id' column (a KeyError is raised otherwise);
        the target is expected in the last column.

    Returns
    -------
    X : array
        Values of every remaining column except the last.
    y : array
        Values of the last column.
    feature_names : pandas.Index
        Column labels corresponding to the columns of ``X``.
    """
    without_id = df.drop(columns='subject_id')
    predictor_cols = without_id.columns[:-1]

    # Debug output: reports whether a 'group' or 'suj' column ended up among the features
    print("group in X", 'group' in predictor_cols)
    print("suj in X", 'suj' in predictor_cols)

    predictors = without_id.iloc[:, :-1]
    target = without_id.iloc[:, -1]
    assert 'subject_id' not in without_id.columns , "'subject_id' still in the final dataframe"
    return predictors.values, target.values, predictor_cols

def join_and_reorder(df1, df2):
    """Inner-join two tables on 'subject_id' and place 'group' as the last column.

    The 'group' column of ``df2`` is dropped before merging, so the 'group'
    values in the result are the ones carried by ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left table; must contain 'subject_id' and 'group'.
    df2 : pandas.DataFrame
        Right table; must contain 'subject_id' and 'group'.

    Returns
    -------
    pandas.DataFrame
        Rows whose 'subject_id' occurs in both inputs, with every non-'group'
        column in merge order followed by 'group'.
    """
    right_without_target = df2.drop(columns='group')
    merged = pd.merge(left=df1, right=right_without_target, how='inner', on='subject_id')

    # Keep the existing column order, but push the target to the very end
    ordered_cols = [name for name in merged.columns if name != 'group']
    ordered_cols.append('group')
    merged = merged[ordered_cols]

    assert merged.columns[-1] == 'group', "'group' is not the last column after reordering"
    return merged


### Setup

In [5]:
# --- Random seeds ---
global_seed = 42
inner_cv_seed = 50  # Fixed for reproducibility in inner CV

# --- Cross-validation settings ---
# NOTE(review): 'loo' presumably selects leave-one-out CV; confirm against the
# code that reads type_of_cv (and whether n_splits / n_repeats apply in that mode).
type_of_cv = 'loo'
n_splits = 2
n_repeats = 1

# --- Optional pipeline steps (presumably read by later cells; TODO confirm) ---
perform_pca = False
n_components = 4  # presumably the PCA component count used when perform_pca is True
feature_selection = True
tune_hyperparameters = False