In [52]:
# === Load and clean data ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns






In [54]:
# Load OASIS dataset
df = pd.read_csv("oasis_longitudinal.csv")

In [56]:
# Fill missing SES and MMSE values
df['SES'] = df['SES'].fillna(df['SES'].median())
df['MMSE'] = df['MMSE'].fillna(df['MMSE'].mean())

In [58]:
# Convert labels to binary classification: demented = 1, nondemented = 0
df['Group'] = df['Group'].replace("Converted", "Demented")
df['Group'] = df['Group'].map({"Demented": 1, "Nondemented": 0})

In [60]:
# Define features and dropping missing values
my_features = ['Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
df_clean = df.dropna(subset=my_features + ['Group', 'M/F'])

In [62]:

# === REPEATED TRAIN/TEST EVALUATION ===

def repeated_evaluation(model, X, y, label, n_splits=5, n_repeats=5):
    rskf = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=42)
    metrics = []
    for train_idx, test_idx in rskf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        scaler = StandardScaler
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled) [:, 1]
        metrics.append([
            accuracy_score(y_test, y_pred),
            f1_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
            roc_auc_score(y_test, y_proba)

        ])
    metrics = np.array(metrics)
    return[label,
           metrics[:, 0].mean(),
           metrics[:, 1].mean(),
           metrics[:, 2].mean(),
           metrics[:, 3].mean(),
           metrics[:, 4].mean()
          ]

In [64]:

results = []
X_all = df_clean[my_features]
y_all = df_clean['Group']
results.append(repeated_evaluation(LogisticRegression(max_iter=1000), X_all, y_all, "Gen-LR-All"))
results.append(repeated_evaluation(RandomForestClassifier(n_estimators=100, random_state=42), X_all, y_all, "Gen-RF-All"))


TypeError: TransformerMixin.fit_transform() missing 1 required positional argument: 'X'

In [66]:

# === Sex specific models ===
df_male = df_clean[df_clean['M/F'] == 'M']
X_m, y_m = df_male[my_features], df_male['Group']
results.append(repeated_evaluation(LogisticRegression(max_iter=1000), X_m, y_m, "Male-LR-M"))
results.append(repeated_evaluation(RandomForestClassifier(n_estimators=100, random_state=42), X_m, y_m, "Male-RF-M"))

df_female = df_clean[df_clean['M/F'] == 'F']
X_f, y_f = df_female[my_features], df_female['Group']
results.append(repeated_evaluation(LogisticRegression(max_iter=1000), X_f, y_f, "Female-LR-F"))
results.append(repeated_evaluation(RandomForestClassifier(n_estimators=100, random_state=42), X_f, y_f, "Female-RF-F"))


TypeError: TransformerMixin.fit_transform() missing 1 required positional argument: 'X'

In [68]:

# === Results Table ===

columns =['Model', 'Accuracy', 'F1', 'Precision', 'Recall', 'AUC']
results_table = pd.DataFrame(results, columns=columns)
print(results_table)


Empty DataFrame
Columns: [Model, Accuracy, F1, Precision, Recall, AUC]
Index: []


In [None]:
#### TODO
#from scipy.stats import ttest_rel
#import pandas as pd

# Extract AUC scores for each model from the earlier results
# Define model pairs to compare
# Perform t-tests
# Bonferroni correction

# Evaluate all models with cross validation??