In [1]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import shap
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, KFold

In [2]:
def class_label(df):
    """Derive binary class labels from the first column of ``df``.

    Each entry of the first column is mapped to ``0`` when it contains the
    letter 'H' and to ``1`` when it contains the letter 'E' (case-sensitive).
    NOTE(review): presumably 'H' marks healthy samples and 'E' marks
    Eml4-Alk samples -- confirm against the input sheets.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the sample identifiers.

    Returns
    -------
    numpy.ndarray
        Integer array with exactly one label per row of ``df``.

    Raises
    ------
    ValueError
        If an identifier contains both 'H' and 'E', or neither. Accepting such
        rows would make the label array a different length from the feature
        columns and silently shift every later label.
    """
    classlb = []
    for position, raw in enumerate(df.iloc[:, 0]):
        name = str(raw)
        has_h = 'H' in name
        has_e = 'E' in name
        if has_h == has_e:
            # Either both markers or none: the row cannot be classified.
            raise ValueError(
                "Cannot assign a class to row %d (%r): expected exactly one "
                "of 'H' or 'E' in the sample name" % (position, raw)
            )
        classlb.append(1 if has_e else 0)

    return np.array(classlb, dtype=int)

In [None]:
def ns_assign(df):
    """Extract the five feature columns of ``df`` as NumPy arrays.

    Columns at positions 1 to 5 are read (position 0 is skipped).

    Returns
    -------
    tuple
        Six arrays: each of the five columns on its own, reshaped to
        ``(n_rows, 1)``, followed by one ``(n_rows, 5)`` array holding all
        five columns side by side.
    """
    # One column-vector per feature, in column order.
    per_column = tuple(
        df.iloc[:, position].to_numpy().reshape(-1, 1)
        for position in range(1, 6)
    )
    combined = df.iloc[:, 1:6].to_numpy()

    return (*per_column, combined)

In [3]:
def test_train_split(df, size):
    """Split every feature set of ``df`` into train and test partitions.

    The six feature arrays from ``ns_assign`` (five single columns, then all
    five combined) are each split against the labels from ``class_label``.
    Every split uses ``random_state=0`` and ``shuffle=True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; first column holds sample names, columns 1-5 the features.
    size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        24 arrays laid out as six consecutive groups of
        ``(X_train, y_train, X_test, y_test)``: indices 0-3 for column 1,
        4-7 for column 2, ..., 16-19 for column 5, 20-23 for all columns.
    """
    # Extract the feature arrays once instead of re-running ns_assign per column.
    feature_sets = ns_assign(df)
    classlb = class_label(df)

    flat = []
    for features in feature_sets:
        X_train, X_test, y_train, y_test = train_test_split(
            features, classlb, test_size=size, random_state=0, shuffle=True)
        # Callers index this tuple as train pair first, then test pair.
        flat.extend((X_train, y_train, X_test, y_test))

    return tuple(flat)

In [None]:
def best_clf(df, size, search=None):
    """Run a hyper-parameter search on the all-features training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table passed to ``test_train_split``.
    size : float or int
        Test-set size passed to ``test_train_split``.
    search : optional
        Search object exposing ``fit``, ``best_params_`` and
        ``best_estimator_``. When omitted, the notebook-level variable
        ``grid_search`` is used, so that variable must already exist
        (presumably a ``GridSearchCV`` -- confirm where it is defined).

    Returns
    -------
    The ``best_estimator_`` of the fitted search.
    """
    if search is None:
        # Fall back to the notebook-level search object.
        search = grid_search

    result = test_train_split(df, size)
    search.fit(result[20], result[21])  # X_train_all and y_train_all
    print('Best hyperparameters:',  search.best_params_)

    # Hand the winner back to the caller; previously it was bound to a local
    # that shadowed this function's name and was then discarded.
    return search.best_estimator_

In [4]:
def model_accuracy(df, clf, size):
    """Print the accuracy of ``clf`` on the all-features test split.

    ``clf`` is only used for prediction here; it is not fitted by this
    function.
    """
    splits = test_train_split(df, size)
    # Indices 22 and 23 hold the test features and test labels of the
    # all-columns split.
    X_test_all, y_test_all = splits[22], splits[23]
    predicted = clf.predict(X_test_all)
    print("MODEL ACCURACY: ", metrics.accuracy_score(y_test_all, predicted))

In [None]:
def report_score(df, clf, size):
    """Print a classification report for ``clf`` on the all-features test split.

    ``clf`` is only used for prediction here; it is not fitted by this
    function.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table passed to ``test_train_split``.
    clf : estimator
        Classifier exposing ``predict``.
    size : float or int
        Test-set size passed to ``test_train_split``.
    """
    result = test_train_split(df, size)
    X_test_all, y_test_all = result[22], result[23]
    y_pred = clf.predict(X_test_all)
    # classification_report expects (y_true, y_pred). With the arguments
    # reversed, precision and recall swap and support counts the predictions.
    print(classification_report(y_test_all, y_pred))

In [5]:
def clf_fit(df, clf, size):
    """Fit ``clf`` on each feature set in turn and compute its test ROC curve.

    The same estimator object is refitted six times: once per single feature
    column, then once on all five columns. After this function returns,
    ``clf`` is therefore left fitted on the all-columns training split.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table passed to ``test_train_split``.
    clf : estimator
        Classifier exposing ``fit`` and ``predict_proba``.
    size : float or int
        Test-set size passed to ``test_train_split``.

    Returns
    -------
    tuple
        Twelve arrays: ``(fpr, tpr)`` for columns 1-5 (indices 0-9), followed
        by ``(fpr, tpr)`` for the all-columns model (indices 10-11).
    """
    result = test_train_split(df, size)

    curves = []
    # result holds six consecutive (X_train, y_train, X_test, y_test) groups.
    # The all-columns group is last, so the final fit is the multiplex model.
    for start in range(0, 24, 4):
        X_train, y_train, X_test, y_test = result[start:start + 4]
        clf.fit(X_train, y_train)
        # Probability of the positive class (label 1) drives the ROC curve.
        positive_scores = clf.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, positive_scores)
        curves.extend((fpr, tpr))

    return tuple(curves)

In [6]:
def plot_roc_allns(df, clf, size):
    """Plot test ROC curves for each single feature and for all features.

    Calls ``clf_fit`` (which refits ``clf``) and draws one curve per feature
    column plus the combined "Multiplex" curve and the chance diagonal. Each
    legend entry reports the AUC of the curve it labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table passed to ``clf_fit``.
    clf : estimator
        Classifier passed to ``clf_fit``.
    size : float or int
        Test-set size passed to ``clf_fit``.
    """
    pr = clf_fit(df, clf, size)

    # clf_fit returns (fpr, tpr) pairs: columns 1-5, then all columns combined.
    col1, col2, col3, col4, col5, multiplex = [
        (pr[i], pr[i + 1]) for i in range(0, 12, 2)
    ]

    # Column mapping as originally noted:
    # vABN1: PP01-HFA1 (col1, E1), vABN2: PP13-HFA3 (col4, E2), vABN2: PP12-d5eth (col2, E3),
    # vABN3: PP10-d7isop (col3, E4), vABN5: PP04-d3but (col5, E5)
    # NOTE(review): the legend strings below label col2 as PP10-d7isop and
    # col3 as PP12-d5eth, and use 'PP09-d3but' rather than 'PP04-d3but'.
    # Either the note above or the legend text is wrong -- confirm against the
    # data sheet. The strings are left unchanged here.
    plt.figure(figsize=(3, 3), dpi=160)
    plt.rc('font', family='Arial')

    # Each entry ties a curve to its own AUC. Previously the col4 and col5
    # curves were drawn with each other's AUC in the legend.
    styled_curves = [
        (col1, 'red', 'PP01-HFA1 (AUC: %0.2f)'),
        (col4, 'saddlebrown', 'PP13-HFA3 (AUC: %0.2f)'),
        (col2, 'gold', 'PP10-d7isop (AUC: %0.2f)'),
        (col3, 'blue', 'PP12-d5eth (AUC: %0.2f)'),
        (col5, 'green', 'PP09-d3but (AUC: %0.2f)'),
        (multiplex, 'black', 'Multiplex (AUC = %0.2f)'),
    ]
    for (fpr, tpr), colour, template in styled_curves:
        plt.plot(fpr, tpr, color=colour, lw=1.5, label=template % auc(fpr, tpr))

    plt.plot([0, 1], [0, 1], 'k--', alpha=0.2, lw=1.5, label='Random classifier')

    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.xlabel('1 - Specificity', fontsize=15)
    plt.ylabel('Sensitivity', fontsize=15)
    plt.legend(bbox_to_anchor=(1.05, 0.9), loc='upper left', edgecolor="None", fontsize=13)

    plt.show()

In [7]:
# Plot the all-features ROC curves for weeks 4, 5 and 6 in the same figure
def plot_roc_allwks(df1, df2, df3, clf1, clf2, clf3, size):
    """Overlay the all-features ROC curves of three datasets in one figure.

    Each ``(df, clf)`` pair is passed through ``clf_fit`` (which refits the
    classifier); only the all-columns curve (indices 10 and 11 of its result)
    is drawn. The curves are labelled W4, W5 and W6 in argument order.
    """
    fpr_w4, tpr_w4 = clf_fit(df1, clf1, size)[10:12]
    fpr_w5, tpr_w5 = clf_fit(df2, clf2, size)[10:12]
    fpr_w6, tpr_w6 = clf_fit(df3, clf3, size)[10:12]

    area_w4 = auc(fpr_w4, tpr_w4)
    area_w5 = auc(fpr_w5, tpr_w5)
    area_w6 = auc(fpr_w6, tpr_w6)

    plt.figure(figsize=(3, 3), dpi=160)
    plt.rc('font', family='Arial')

    # The W6 curve intentionally carries no explicit line width, as before.
    weekly_curves = (
        (fpr_w4, tpr_w4, {'color': 'hotpink', 'lw': 1.5, 'label': 'W4 (AUC: %0.2f)' % area_w4}),
        (fpr_w5, tpr_w5, {'color': 'darkmagenta', 'lw': 1.5, 'label': 'W5 (AUC: %0.2f)' % area_w5}),
        (fpr_w6, tpr_w6, {'color': 'midnightblue', 'label': 'W6 (AUC: %0.2f)' % area_w6}),
    )
    for fpr, tpr, line_style in weekly_curves:
        plt.plot(fpr, tpr, **line_style)

    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.3, label='Random classifier')

    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.xlabel('1 - Specificity', fontsize=13, labelpad=7)
    plt.ylabel('Sensitivity', fontsize=13, labelpad=7)

    plt.legend(bbox_to_anchor=(0.29, 0.4), loc='best', facecolor="None", edgecolor="None", fontsize=10)

    plt.show()

In [1]:
def plot_single_wk(df, clf, size):
    """Plot the all-features ROC curve of a single dataset.

    ``clf_fit`` refits ``clf``; only the all-columns curve (indices 10 and 11
    of its result) is drawn, next to the chance diagonal.
    """
    fpr_all, tpr_all = clf_fit(df, clf, size)[10:12]
    area = auc(fpr_all, tpr_all)

    plt.figure(figsize=(3, 3), dpi=160)
    plt.rc('font', family='Arial')

    curve_label = 'Eml4-Alk (AUC: %0.2f)' % area
    plt.plot(fpr_all, tpr_all, color='black', lw=1.5, label=curve_label)
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--', lw=1, alpha=0.3, label='Random classifier')

    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=11)
    plt.xlabel('1 - Specificity', fontsize=13, labelpad=7)
    plt.ylabel('Sensitivity', fontsize=13, labelpad=7)

    plt.legend(loc='best', facecolor="None", edgecolor="None", fontsize=10)

    plt.show()

In [8]:
def plot_cm(y_test, y_pred):
    """Draw a confusion matrix of ``y_pred`` against ``y_test``.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.

    Notes
    -----
    The tick labels are fixed to 'Healthy' and 'Cancer'.
    NOTE(review): this assumes the labels sort as healthy first, then
    cancer (e.g. 0 and 1) -- confirm against the label encoding.
    The call to ``plt.rcParams.update`` changes the global default font
    size, so it also affects figures created afterwards.
    """
    fig, ax = plt.subplots(figsize=(3, 3), dpi=160)
    plt.rc('font', family='Arial')
    ConfusionMatrixDisplay.from_predictions(
        y_test,
        y_pred,
        display_labels=['Healthy', 'Cancer'],
        cmap=plt.cm.Reds,
        ax=ax)
    plt.rcParams.update({'font.size': 11})

    label_font = {'size': '12'}  # Adjust to fit
    ax.set_xlabel('Predicted labels', fontdict=label_font, labelpad=10)
    ax.set_ylabel('True labels', fontdict=label_font, labelpad=10)

In [None]:
# K-fold cross-validation: per-fold ROC curves, mean AUC and F1 score
def cross_val_with_f1(classifier, X, y, n_splits=5, random_state=42):
    """Cross-validate ``classifier`` with shuffled K-fold, plotting ROC curves.

    For every fold the classifier is refitted on the training part. Folds
    whose test part contains a single class, or whose AUC is NaN, are left out
    of the statistics. The per-fold curves and the mean curve are drawn in one
    figure.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit`` and ``predict``; it is refitted in place.
    X : array-like
        Feature matrix. pandas objects are converted to NumPy arrays so that
        the fold indices select rows by position.
    y : array-like
        Class labels, one per row of ``X``.
    n_splits : int, default 5
        Number of folds.
    random_state : int, default 42
        Seed for the fold shuffling.

    Returns
    -------
    dict
        Keys ``mean_auc``, ``std_auc``, ``mean_f1`` and ``std_f1``.

    Raises
    ------
    ValueError
        If no fold produced a usable ROC curve.
    """
    # Fold indices are positional. On a DataFrame, X[train] would be a column
    # lookup, so pandas inputs are converted to plain arrays first.
    if hasattr(X, "to_numpy"):
        X = X.to_numpy()
    if hasattr(y, "to_numpy"):
        y = y.to_numpy()

    cv = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    tprs = []
    aucs = []
    f1_scores = []
    # Common FPR grid so the per-fold TPR curves can be averaged point-wise.
    mean_fpr = np.linspace(0, 1, 100)

    plt.rc('font', family='Arial')
    fig, ax = plt.subplots(figsize=(4, 4), dpi=160)

    for fold, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], y[train])

        # Skip single-class folds: a ROC curve is undefined for them.
        if len(np.unique(y[test])) > 1:
            viz = RocCurveDisplay.from_estimator(
                classifier,
                X[test],
                y[test],
                name=f"Fold {fold}",
                alpha=0.4,
                lw=1,
                ax=ax)

            # Check if AUC is valid
            if not math.isnan(viz.roc_auc):
                interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
                interp_tpr[0] = 0.0  # pin the curve to the origin
                tprs.append(interp_tpr)
                aucs.append(viz.roc_auc)

                # F1 score calculation
                y_pred = classifier.predict(X[test])
                f1_scores.append(f1_score(y[test], y_pred))

    if not tprs:
        # Without this guard np.mean([]) yields a scalar NaN and the item
        # assignment below fails with an unrelated-looking TypeError.
        plt.close(fig)
        raise ValueError(
            "No cross-validation fold produced a valid ROC curve; every test "
            "fold contained a single class or had an undefined AUC"
        )

    # Mean ROC Curve
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # pin the curve to (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)

    ax.plot(
        mean_fpr,
        mean_tpr,
        color="black",
        label=r"Mean (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
        lw=1.5,
        alpha=1.0)

    ax.set_xlabel('1 - Specificity', fontsize=12)
    ax.set_ylabel('Sensitivity', fontsize=12)
    ax.grid(alpha=0.3)

    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(loc='lower right', fontsize=10)
    plt.show()

    # Return metrics
    return {
        "mean_auc": mean_auc,
        "std_auc": std_auc,
        "mean_f1": np.mean(f1_scores) if f1_scores else None,
        "std_f1": np.std(f1_scores) if f1_scores else None
    }