In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
!pip install shap
import shap
from itertools import combinations




Collecting shap
  Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (540 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m540.1/540.1 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8


In [25]:
def train_evaluate_and_plot_roc(df, target_column, feature_sets, parameter_grid, n_splits=7):
    # Initialize StandardScaler
    scaler = StandardScaler()

    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize plot
    plt.figure(figsize=(10, 6))

    metrics_list = []

    for feature_columns in feature_sets:
        # Initialize dictionaries to store evaluation metrics
        metrics = {
            'Feature Set': [', '.join(feature_columns)],
            'AUC': [],
            'Accuracy': [],
            'Precision': [],
            'Recall': [],
            'F1 Score': [],
            'Specificity': [],
            'NPV': []
        }

        # Extract features and target
        X = df[feature_columns]
        y = df[target_column]

        # Standardize features
        X_scaled = scaler.fit_transform(X)

        # Initialize lists to store evaluation metrics
        auc_scores = []
        accuracy_scores = []
        precision_scores = []
        recall_scores = []
        f1_scores = []
        specificity_scores = []
        npv_scores = []
        tprs = []
        mean_fpr = np.linspace(0, 1, 100)

        # Iterate through folds
        for train_index, test_index in kf.split(X_scaled):
            X_train, X_test = X_scaled[train_index], X_scaled[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Initialize GridSearchCV
            grid_search = GridSearchCV(MLPClassifier(max_iter=2000), param_grid=parameter_grid, cv=7, scoring='roc_auc')
            grid_search.fit(X_train, y_train)

            # Best MLP model
            best_mlp_model = grid_search.best_estimator_

            # Predict probabilities
            y_pred_prob = best_mlp_model.predict_proba(X_test)[:, 1]

            # Compute ROC curve
            fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
            tprs.append(np.interp(mean_fpr, fpr, tpr))
            tprs[-1][0] = 0.0

            # Compute AUC
            roc_auc = roc_auc_score(y_test, y_pred_prob)
            auc_scores.append(roc_auc)

            # Compute other evaluation metrics
            y_pred = best_mlp_model.predict(X_test)
            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

            tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
            specificity = tn / (tn + fp)
            npv = tn / (tn + fn)
            specificity_scores.append(specificity)
            npv_scores.append(npv)

        # Compute average ROC AUC
        mean_tpr = np.mean(tprs, axis=0)
        mean_auc = np.mean(auc_scores)

        # Plot ROC curve
        plt.plot(mean_fpr, mean_tpr, label=f'{", ".join(feature_columns)} (AUC = {mean_auc:.2f})')

        # Calculate mean scores
        metrics['AUC'].append(mean_auc)
        metrics['Accuracy'].append(np.mean(accuracy_scores))
        metrics['Precision'].append(np.mean(precision_scores))
        metrics['Recall'].append(np.mean(recall_scores))
        metrics['F1 Score'].append(np.mean(f1_scores))
        metrics['Specificity'].append(np.mean(specificity_scores))
        metrics['NPV'].append(np.mean(npv_scores))

        metrics_list.append(metrics)

    # Plot settings
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Random Guessing')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.legend(loc='lower right')
    plt.show()

    # Create DataFrame from metrics dictionary
    evaluation_metrics = pd.concat([pd.DataFrame(m) for m in metrics_list], ignore_index=True)

    # Save evaluation_metrics to Excel file
    evaluation_metrics.to_excel('evaluation_metrics.xlsx', index=False)

    return evaluation_metrics


In [36]:
# Define the different feature sets
feature_sets = [
    #['p-tau217', 'NfL', 'Aβ42/Aβ40 ratio','APOE risk', 'GFAP'],
    ['p-tau217', 'NfL', 'Aβ42/Aβ40 ratio','APOE risk', 'GFAP', 'Age', 'Sex', 'Education'],
    #['NfL'],
    #['p-tau217'],
    #['Aβ42/Aβ40 ratio'],
    #['GFAP'],
    #['APOE risk']
]

parameter_grid = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant']
}



In [37]:
# Assuming df_cn_vs and feature_sets have been defined as in your earlier code.

# Initialize StandardScaler
scaler = StandardScaler()

# Extract the first feature set (you can loop through others similarly)
X = df_cn_vs[feature_sets[0]]
y = df_cn_vs['Diagnosis']

# Standardize the features
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (e.g., using a KFold split)
kf = KFold(n_splits=7, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model (using the same parameters as before)
    grid_search = GridSearchCV(MLPClassifier(max_iter=2000), param_grid=parameter_grid, cv=7, scoring='roc_auc')
    grid_search.fit(X_train, y_train)

    # Best MLP model
    best_mlp_model = grid_search.best_estimator_

    # Calculate SHAP values
    explainer = shap.KernelExplainer(best_mlp_model.predict_proba, X_train)
    shap_values = explainer.shap_values(X_test)

    # Break after the first fold to focus on SHAP
    break




  0%|          | 0/8 [00:00<?, ?it/s]

In [38]:
print("Shape of shap_values[1]:", shap_values[1].shape)
print("Shape of X_test:", X_test.shape)

Shape of shap_values[1]: (8, 2)
Shape of X_test: (8, 8)


In [None]:
evaluation_metrics.to_excel('evaluation_metrics.xlsx', index=False)


# Create an Excel writer object
#with pd.ExcelWriter('evaluation_metrics.xlsx', engine='openpyxl') as writer:
 #   for sheet_name, df in evaluation_metrics.items():
  #      df.to_excel(writer, sheet_name=sheet_name)

# Download the Excel file to your local machine
from google.colab import files
files.download('evaluation_metrics.xlsx')
