In [1]:
# === 1. LOAD & CLEAN DATA ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,
    confusion_matrix, roc_curve, auc
)
import matplotlib.pyplot as plt
import seaborn as sns

# Load OASIS dataset
df = pd.read_csv("oasis_longitudinal.csv")

# Fill missing SES and MMSE values
df["SES"] = df["SES"].fillna(df["SES"].median())
df["MMSE"] = df["MMSE"].fillna(df["MMSE"].mean())

# Convert labels to binary classification: Demented = 1, Nondemented = 0
df["Group"] = df["Group"].replace("Converted", "Demented")
df["Group"] = df["Group"].map({"Demented": 1, "Nondemented": 0})

# Define features and drop any rows with missing values in them
my_features = ['Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
df_clean = df.dropna(subset=my_features + ['Group', 'M/F'])


# === 2. TRAIN/TEST SPLIT ===
# Split entire dataset into training/testing sets for general model
X = df_clean[my_features]
y = df_clean['Group']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Scale features for general models
scaler_general = StandardScaler()
X_train_scaled = scaler_general.fit_transform(X_train)
X_test_scaled = scaler_general.transform(X_test)

# Also grab metadata for sex-based splitting
train_meta = df_clean.iloc[X_train.index].copy()
test_meta = df_clean.iloc[X_test.index].copy()


# === 3. SEX-SPECIFIC SPLITS ===
# Subset males/females for training
train_m = train_meta[train_meta['M/F'] == 'M']
train_f = train_meta[train_meta['M/F'] == 'F']

# Scale and separate features/labels for each sex
scaler_m = StandardScaler()
X_m_train = scaler_m.fit_transform(train_m[my_features])
y_m_train = train_m['Group']

scaler_f = StandardScaler()
X_f_train = scaler_f.fit_transform(train_f[my_features])
y_f_train = train_f['Group']

# Subset males/females for testing (used for evaluating general model)
test_m = test_meta[test_meta['M/F'] == 'M']
test_f = test_meta[test_meta['M/F'] == 'F']

X_m_test = scaler_general.transform(test_m[my_features])
y_m_test = test_m['Group']

X_f_test = scaler_general.transform(test_f[my_features])
y_f_test = test_f['Group']


# === 4. DEFINE CV SCORING FUNCTION ===
def cross_val_metrics(model, X, y, label):
    scoring = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
    print(f"{label} CV Results:")
    for metric in scoring:
        print(f"  {metric.capitalize()}: {np.mean(scores[f'test_{metric}']):.3f} ± {np.std(scores[f'test_{metric}']):.3f}")
    return [np.mean(scores[f'test_{m}']) for m in scoring]


# === 5. DEFINE ALL MODELS ===
models = {
    'Gen-LR-All': (LogisticRegression(max_iter=1000), X_train_scaled, y_train),
    'Gen-RF-All': (RandomForestClassifier(n_estimators=100, random_state=42), X_train_scaled, y_train),
    'Male-LR-M': (LogisticRegression(max_iter=1000), X_m_train, y_m_train),
    'Male-RF-M': (RandomForestClassifier(n_estimators=100, random_state=42), X_m_train, y_m_train),
    'Female-LR-F': (LogisticRegression(max_iter=1000), X_f_train, y_f_train),
    'Female-RF-F': (RandomForestClassifier(n_estimators=100, random_state=42), X_f_train, y_f_train),
}


# === 6. EVALUATE ALL MODELS WITH CV ===
results = []
for label, (model, X_data, y_data) in models.items():
    scores = cross_val_metrics(model, X_data, y_data, label)
    results.append([label] + scores)


# === 7. FIT GENERAL MODELS TO TEST ON SEX SUBSETS ===
gen_lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
gen_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Test general models on male/female subsets
def test_eval(name, model, X, y):
    y_pred = model.predict(X)
    return [
        name,
        accuracy_score(y, y_pred),
        f1_score(y, y_pred),
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        roc_auc_score(y, y_pred),
    ]

results.append(test_eval('Gen-LR on Males', gen_lr, X_m_test, y_m_test))
results.append(test_eval('Gen-LR on Females', gen_lr, X_f_test, y_f_test))
results.append(test_eval('Gen-RF on Males', gen_rf, X_m_test, y_m_test))
results.append(test_eval('Gen-RF on Females', gen_rf, X_f_test, y_f_test))


# === 8. OPTIONAL CV ON FULL MALE/FEMALE POPS ===
df_male = df_clean[df_clean['M/F'] == 'M']
df_female = df_clean[df_clean['M/F'] == 'F']

scaler_male_eval = StandardScaler()
X_m_cv = scaler_male_eval.fit_transform(df_male[my_features])
y_m_cv = df_male['Group']

scaler_female_eval = StandardScaler()
X_f_cv = scaler_female_eval.fit_transform(df_female[my_features])
y_f_cv = df_female['Group']

# Fit fresh general models for CV
scores = cross_val_metrics(LogisticRegression(max_iter=1000), X_m_cv, y_m_cv, "Gen-LR CV on Males")
results.append(["Gen-LR CV on Males"] + scores)

scores = cross_val_metrics(RandomForestClassifier(n_estimators=100, random_state=42), X_m_cv, y_m_cv, "Gen-RF CV on Males")
results.append(["Gen-RF CV on Males"] + scores)

scores = cross_val_metrics(LogisticRegression(max_iter=1000), X_f_cv, y_f_cv, "Gen-LR CV on Females")
results.append(["Gen-LR CV on Females"] + scores)

scores = cross_val_metrics(RandomForestClassifier(n_estimators=100, random_state=42), X_f_cv, y_f_cv, "Gen-RF CV on Females")
results.append(["Gen-RF CV on Females"] + scores)


# === 9. RESULTS TABLE ===
columns = ["Model", "Accuracy", "F1", "Precision", "Recall", "AUC"]
results_table = pd.DataFrame(results, columns=columns)
display(results_table)


Gen-LR-All CV Results:
  Accuracy: 0.943 ± 0.017
  F1: 0.939 ± 0.018
  Precision: 0.985 ± 0.018
  Recall: 0.897 ± 0.020
  Roc_auc: 0.970 ± 0.009
Gen-RF-All CV Results:
  Accuracy: 0.950 ± 0.010
  F1: 0.947 ± 0.011
  Precision: 0.979 ± 0.017
  Recall: 0.918 ± 0.027
  Roc_auc: 0.980 ± 0.010
Male-LR-M CV Results:
  Accuracy: 0.946 ± 0.031
  F1: 0.956 ± 0.025
  Precision: 0.965 ± 0.045
  Recall: 0.950 ± 0.047
  Roc_auc: 0.999 ± 0.002
Male-RF-M CV Results:
  Accuracy: 0.969 ± 0.015
  F1: 0.975 ± 0.013
  Precision: 0.976 ± 0.029
  Recall: 0.975 ± 0.031
  Roc_auc: 0.996 ± 0.005
Female-LR-F CV Results:
  Accuracy: 0.934 ± 0.049
  F1: 0.903 ± 0.074
  Precision: 1.000 ± 0.000
  Recall: 0.831 ± 0.123
  Roc_auc: 0.935 ± 0.051
Female-RF-F CV Results:
  Accuracy: 0.934 ± 0.036
  F1: 0.907 ± 0.056
  Precision: 0.970 ± 0.036
  Recall: 0.862 ± 0.113
  Roc_auc: 0.952 ± 0.026
Gen-LR CV on Males CV Results:
  Accuracy: 0.956 ± 0.042
  F1: 0.964 ± 0.034
  Precision: 0.980 ± 0.040
  Recall: 0.949 ± 0.032
  

Unnamed: 0,Model,Accuracy,F1,Precision,Recall,AUC
0,Gen-LR-All,0.943051,0.939209,0.985185,0.897471,0.970351
1,Gen-RF-All,0.949718,0.946966,0.978818,0.917931,0.980408
2,Male-LR-M,0.946154,0.95595,0.965278,0.95,0.99875
3,Male-RF-M,0.969231,0.974976,0.976471,0.975,0.99625
4,Female-LR-F,0.933868,0.902606,1.0,0.830769,0.934872
5,Female-RF-F,0.934046,0.906757,0.97033,0.861538,0.951758
6,Gen-LR on Males,0.933333,0.941176,1.0,0.888889,0.944444
7,Gen-LR on Females,0.977778,0.972973,1.0,0.947368,0.973684
8,Gen-RF on Males,0.933333,0.941176,1.0,0.888889,0.944444
9,Gen-RF on Females,0.977778,0.972973,1.0,0.947368,0.973684


In [7]:
from scipy.stats import ttest_rel
import pandas as pd

# Extract AUC scores for each model from the earlier results
auc_general_lr = results_table['Gen-LR']['cv_auc']
auc_general_rf = results['Gen-RF']['cv_auc']
auc_male_lr    = results['Male-LR']['cv_auc']
auc_male_rf    = results['Male-RF']['cv_auc']
auc_female_lr  = results['Female-LR']['cv_auc']
auc_female_rf  = results['Female-RF']['cv_auc']

# Define model pairs to compare
comparisons = {
    "Gen-LR vs Gen-RF": (auc_general_lr, auc_general_rf),
    "Male-LR vs Male-RF": (auc_male_lr, auc_male_rf),
    "Female-LR vs Female-RF": (auc_female_lr, auc_female_rf),
    "Gen-LR vs Male-LR": (auc_general_lr, auc_male_lr),
    "Gen-LR vs Female-LR": (auc_general_lr, auc_female_lr),
    "Gen-RF vs Male-RF": (auc_general_rf, auc_male_rf),
    "Gen-RF vs Female-RF": (auc_general_rf, auc_female_rf),
}

# Perform t-tests
raw_results = []
for name, (group1, group2) in comparisons.items():
    t_stat, p_value = ttest_rel(group1, group2)
    raw_results.append({
        "Comparison": name,
        "Mean AUC 1": np.mean(group1),
        "Mean AUC 2": np.mean(group2),
        "t-stat": t_stat,
        "Raw p-value": p_value
    })

# Bonferroni correction
num_tests = len(raw_results)
for r in raw_results:
    r["Bonferroni-adjusted p"] = min(r["Raw p-value"] * num_tests, 1.0)
    r["Significant (adj p<0.05)"] = r["Bonferroni-adjusted p"] < 0.05

# Display
bonf_df = pd.DataFrame(raw_results)
import caas_jupyter_tools
caas_jupyter_tools.display_dataframe_to_user(name="Bonferroni-Corrected AUC t-tests", dataframe=bonf_df)


KeyError: 'Gen-LR'