In [1]:
import os
import math
import shap
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.metrics import RocCurveDisplay, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
import time
import winsound

In [2]:
def class_label(df):
    """Build a binary label vector from the second column of ``df``.

    Every entry of ``df.iloc[:, 1]`` is searched for a class marker:
    an entry containing ``'H'`` is encoded as 0 and an entry containing
    ``'C'`` is encoded as 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose second column holds the string labels.

    Returns
    -------
    numpy.ndarray
        Exactly one 0/1 value per row of ``df``, in row order.

    Raises
    ------
    ValueError
        If a label contains both markers or neither of them. Such a row
        would otherwise contribute two labels or none, leaving the label
        vector a different length from (or shifted against) the feature
        matrix built from the same frame.
    """
    classlb = []
    for position, raw_label in enumerate(df.iloc[:, 1]):
        has_h = 'H' in raw_label
        has_c = 'C' in raw_label
        # Exactly one marker must match so that labels stay aligned with rows.
        if has_h == has_c:
            raise ValueError(
                f"Row {position}: label {raw_label!r} must contain exactly one "
                "of the markers 'H' or 'C'"
            )
        classlb.append(1 if has_c else 0)

    return np.array(classlb)

In [None]:
# Drop every column that contains a NaN and return the result as a new dataset
def droplist(df):
    """Return a new DataFrame without the columns that hold any missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the columns in which no entry is NaN/None.
    """
    # Vectorised equivalent of scanning each column cell by cell for a NaN.
    df_new = df.dropna(axis=1, how='any')
    return df_new

In [None]:
def endo_assign(df):
    """Return the feature block of ``df`` as a NumPy array.

    The first two columns are skipped; every column from the third one to
    the last is kept, in order.
    """
    # An open-ended slice reaches the final column, same as slicing to len(df.columns).
    feature_block = df.iloc[:, 2:]
    return feature_block.to_numpy()

In [3]:
def test_train_split(df, size):
    """Split ``df`` into train and test features/labels.

    Features come from ``endo_assign(df)`` and labels from
    ``class_label(df)``. The split is shuffled with a fixed seed
    (``random_state=0``), so repeated calls with the same arguments yield
    the same partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    size
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` -- note that this order
        differs from the order ``train_test_split`` itself returns.
    """
    features, labels = endo_assign(df), class_label(df)

    partition = train_test_split(
        features,
        labels,
        test_size=size,
        random_state=0,
        shuffle=True,
    )
    train_X, test_X, train_y, test_y = partition

    return train_X, train_y, test_X, test_y

In [None]:
def best_clf(df, size):
    """Run the hyper-parameter search on the training split and return the winner.

    Relies on a module-level ``grid_search`` object that must already exist
    when this function is called (it is not created here).

    Parameters
    ----------
    df : pandas.DataFrame
        Source table, split via ``test_train_split``.
    size
        Test-set size forwarded to ``test_train_split``.

    Returns
    -------
    estimator
        ``grid_search.best_estimator_`` after fitting on the training split.
        Returned so the search result is not lost once the function exits.
    """
    start_time = time.time()
    print('Time start: ', start_time)

    X_train_all, y_train_all, _, _ = test_train_split(df, size)
    grid_search.fit(X_train_all, y_train_all)

    # Use a distinct local name: rebinding ``best_clf`` here would shadow
    # this function inside its own body.
    best_estimator = grid_search.best_estimator_
    print('Best hyperparameters:',  grid_search.best_params_)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Execution time: {elapsed_time} seconds")

    return best_estimator

In [4]:
def model_accuracy(df, clf, size):
    """Print the accuracy of an already-fitted ``clf`` on the test split of ``df``.

    The split is obtained from ``test_train_split(df, size)``; nothing is
    returned.
    """
    _, _, X_test, y_test = test_train_split(df, size)
    predictions = clf.predict(X_test)
    score = metrics.accuracy_score(y_test, predictions)
    print("MODEL ACCURACY: ", score)

In [None]:
def report_score(df, clf, size):
    """Print a classification report for ``clf`` on the test split of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table, split via ``test_train_split``.
    clf
        Fitted classifier exposing ``predict``.
    size
        Test-set size forwarded to ``test_train_split``.
    """
    _, _, X_test, y_test = test_train_split(df, size)
    y_pred = clf.predict(X_test)
    # classification_report expects (y_true, y_pred); passing them the other
    # way round swaps precision with recall and reports the wrong support.
    print(classification_report(y_test, y_pred))

In [5]:
def clf_fit(df, clf, size):
    """Fit ``clf`` on the training split and return its test-set ROC curve.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table, split via ``test_train_split``.
    clf
        Classifier exposing ``fit`` and ``predict_proba``. It is fitted in
        place, so the caller's object is modified.
    size
        Test-set size forwarded to ``test_train_split``.

    Returns
    -------
    tuple
        ``(fpr, tpr)`` as produced by ``roc_curve`` from the predicted
        probability of the second class (column 1 of ``predict_proba``).
    """
    X_train, y_train, X_test, y_test = test_train_split(df, size)

    # No prediction is made before fitting: the result was never used and
    # the call fails outright on an estimator that has not been fitted yet.
    clf.fit(X_train, y_train)
    y_pred_all = clf.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred_all)

    return fpr, tpr

In [7]:
# Plot weeks 4, 5 and 6 in the same figure
def plot_roc_allwks(df1, df2, df3, clf1, clf2, clf3, size):
    """Draw the ROC curves of three dataset/classifier pairs on one figure.

    Each ``(df, clf)`` pair is handed to ``clf_fit`` to obtain ``(fpr, tpr)``;
    the curves are labelled W4, W5 and W6 together with their AUC, and a
    dashed diagonal marks the random classifier. The figure is shown and
    nothing is returned.
    """
    pairs = ((df1, clf1), (df2, clf2), (df3, clf3))
    # (legend template, line styling) for each curve. The W6 entry sets no
    # explicit line width, so it uses whatever matplotlib is configured with.
    looks = (
        ('W4 (AUC: %0.2f)', {'color': 'hotpink', 'lw': 1.5}),
        ('W5 (AUC: %0.2f)', {'color': 'darkmagenta', 'lw': 1.5}),
        ('W6 (AUC: %0.2f)', {'color': 'midnightblue'}),
    )

    # All fits first, then all AUCs, before any figure is created.
    curves = [clf_fit(frame, model, size) for frame, model in pairs]
    areas = [auc(fpr, tpr) for fpr, tpr in curves]

    plt.figure(figsize=(3, 3), dpi=160)
    plt.rc('font', family='Arial')

    for (fpr, tpr), area, (template, line_style) in zip(curves, areas, looks):
        plt.plot(fpr, tpr, label=template % area, **line_style)

    # Chance-level reference line.
    plt.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.3, label='Random classifier')

    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.xlabel('1 - Specificity', fontsize=13, labelpad=7)
    plt.ylabel('Sensitivity', fontsize=13, labelpad=7)

    plt.legend(
        bbox_to_anchor=(0.29, 0.4),
        loc='best',
        facecolor="None",
        edgecolor="None",
        fontsize=10,
    )

    plt.show()

In [1]:
def plot_single_wk(df, clf, size):
    """Draw the test-set ROC curve of a single dataset/classifier pair.

    ``clf_fit`` supplies ``(fpr, tpr)``; the curve is labelled ``EA`` with
    its AUC and drawn next to a dashed random-classifier diagonal. The
    figure is shown and nothing is returned.
    """
    fpr, tpr = clf_fit(df, clf, size)
    area = auc(fpr, tpr)
    curve_label = 'EA (AUC: %0.2f)' % area

    plt.figure(figsize=(3, 3), dpi=160)
    plt.rc('font', family='Arial')

    plt.plot(fpr, tpr, color='black', lw=1.5, label=curve_label)
    # Chance-level reference line.
    plt.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.3, label='Random classifier')

    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.xlabel('1 - Specificity', fontsize=13, labelpad=7)
    plt.ylabel('Sensitivity', fontsize=13, labelpad=7)

    plt.legend(loc='best', facecolor="None", edgecolor="None", fontsize=10)

    plt.show()

In [None]:
def get_feature_mz(df):
    """Return positional feature ids and the matching column labels of ``df``.

    Returns
    -------
    tuple of (list of str, list)
        ``feature_names`` are the stringified positions ``"0"``, ``"1"``, ...
        of the feature columns; ``feature_mz`` are the labels of those same
        columns in ``df`` (the features start at the third column).
    """
    n_features = endo_assign(df).shape[1]
    positions = range(n_features)

    feature_names = [str(pos) for pos in positions]
    # Features begin after the two leading non-feature columns.
    feature_mz = [df.columns[pos + 2] for pos in positions]

    return feature_names, feature_mz

In [2]:
def shap_summary_plot(clf, df, size, max_display):
    """Draw a SHAP summary plot for ``clf`` on the test split of ``df``.

    Parameters
    ----------
    clf
        Fitted model passed to ``shap.Explainer``.
    df : pandas.DataFrame
        Source table, split via ``test_train_split``.
    size
        Test-set size forwarded to ``test_train_split``.
    max_display
        Forwarded to ``shap.summary_plot`` as ``max_display``.
    """
    # Build the split once and reuse the test features for both the
    # explanation and the plot instead of recomputing the same split.
    X_test = test_train_split(df, size)[2]
    feature_mz = get_feature_mz(df)[1]

    explainer = shap.Explainer(clf)
    shap_values = explainer(X_test)

    shap.summary_plot(
        shap_values,
        X_test,
        max_display=max_display,
        feature_names=feature_mz,
        plot_size=(6.5, 8),
    )


In [None]:
def feature_importance(clf, df, size, head):
    """Rank features by mean absolute SHAP value on the test split.

    Parameters
    ----------
    clf
        Fitted model passed to ``shap.Explainer``.
    df : pandas.DataFrame
        Source table, split via ``test_train_split``.
    size
        Test-set size forwarded to ``test_train_split``.
    head : int
        Number of top-ranked rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``'m/z'`` (the feature's column label in ``df``) and
        ``'feature_vals'`` (mean absolute SHAP value), sorted by
        ``'feature_vals'`` in descending order and truncated to ``head`` rows.
    """
    explainer = shap.Explainer(clf)
    X_test = test_train_split(df, size)[2]
    shap_values = explainer(X_test)

    # One call yields both lists; no need to derive them twice.
    feature_names, feature_mz = get_feature_mz(df)

    importance = pd.DataFrame(shap_values.values, columns=feature_names)
    # Average |SHAP| over samples (axis 0) to get one score per feature.
    vals = np.abs(importance.values).mean(0)

    shap_importance = pd.DataFrame(
        list(zip(feature_mz, vals)),
        columns=['m/z', 'feature_vals'],
    )
    # Sort into a new frame rather than mutating in place.
    ranked = shap_importance.sort_values(by=['feature_vals'], ascending=False)

    return ranked.head(head)

In [None]:
# Cross-validated ROC curves with per-fold F1 score
def cross_val_with_f1(classifier, X, y, n_splits=5, random_state=42):
    """Cross-validate ``classifier``, plot per-fold and mean ROC, and report AUC/F1.

    The data are split with a shuffled ``KFold``. ``classifier`` is refitted
    on every training fold (so the caller's object is modified). Folds whose
    test part holds a single class, or whose AUC is NaN, contribute nothing
    to the statistics.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit`` and ``predict`` that is accepted by
        ``RocCurveDisplay.from_estimator``.
    X, y
        Features and labels; both are indexed with the integer index arrays
        produced by ``KFold.split`` (``X[train]``, ``y[test]``).
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    dict
        Keys ``"mean_auc"``, ``"std_auc"``, ``"mean_f1"`` and ``"std_f1"``.
        All four are ``None`` when no fold produced a usable ROC curve; in
        that case the mean curve is simply not drawn.
    """
    cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    tprs = []
    aucs = []
    f1_scores = []
    mean_fpr = np.linspace(0, 1, 100)

    plt.rc('font', family='Arial')
    fig, ax = plt.subplots(figsize=(4, 4), dpi=160)

    for fold, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], y[train])

        # A ROC curve is undefined when the test fold contains one class only.
        if len(np.unique(y[test])) <= 1:
            continue

        viz = RocCurveDisplay.from_estimator(
            classifier,
            X[test],
            y[test],
            name=f"Fold {fold}",
            alpha=0.4,
            lw=1,
            ax=ax)

        # Ignore folds whose AUC could not be computed.
        if math.isnan(viz.roc_auc):
            continue

        # Resample the fold's curve onto the shared FPR grid so that the
        # curves can be averaged point by point.
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

        # F1 score of the same fold.
        y_pred = classifier.predict(X[test])
        f1_scores.append(f1_score(y[test], y_pred))

    if tprs:
        # Mean ROC curve across the usable folds.
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)

        ax.plot(mean_fpr, mean_tpr,
                color="black",
                lw=1.5,
                label=r"Mean (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc))
    else:
        # Averaging an empty list yields a scalar NaN that cannot be indexed,
        # so report "no result" instead of failing on mean_tpr[-1].
        mean_auc = None
        std_auc = None

    ax.set_xlabel('1 - Specificity', fontsize=12)
    ax.set_ylabel('Sensitivity', fontsize=12)
    ax.grid(alpha=0.3)

    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(loc='lower right', fontsize=10)
    plt.show()

    # Return metrics
    return {
        "mean_auc": mean_auc,
        "std_auc": std_auc,
        "mean_f1": np.mean(f1_scores) if f1_scores else None,
        "std_f1": np.std(f1_scores) if f1_scores else None
    }