## Load Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import plot_roc_curve, roc_auc_score

In [None]:
# Global matplotlib text defaults applied to every figure drawn below.
plt.rcParams.update({'font.family': 'Arial', 'font.size': 17})

## Load Data

In [None]:
# Load the expression table and derive human-readable labels.
pre_data = pd.read_csv('supp_data_3_rounded_deid_w_omicron.csv')

# Strip stray whitespace from the header *before* any column is looked up by
# name, so a padded header cannot cause a KeyError in the lookups below.
pre_data.columns = [str(column).strip() for column in pre_data.columns]

# Collapse the fine-grained sample categories into three broader classes.
# Series.map leaves unmapped/missing categories as NaN (np.vectorize(dict.get)
# would have turned them into the literal string 'None').
CATEGORY_TO_CLASS = {
    'DELTA': 'COVID-19',
    'OMICRON': 'COVID-19',
    'NEGATIVE': 'No-Virus',
    'N_GENE': 'COVID-19',
    'OTHER_VIRUS': 'Other-Virus',
    'POSITIVE': 'COVID-19',
    'SPIKE_IN': 'No-Virus',
}
pre_data['Class'] = pre_data['Category'].map(CATEGORY_TO_CLASS)

# Replace the numeric experiment codes with their names.
EXPERIMENT_NAMES = {1: 'Primary', 2: 'Variants', 3: 'Contamination', 4: 'Omicron'}
pre_data['Experiment'] = pre_data['Experiment'].map(EXPERIMENT_NAMES)

In [None]:
# Show the rows whose Category is OTHER_VIRUS (notebook display expression).
pre_data.loc[pre_data['Category'].eq('OTHER_VIRUS')]

## Make ROC Curves

In [None]:
def plot_curve(genes):
    """Plot and save the mean cross-validated ROC curve for a gene panel.

    A linear SVM is trained to separate POSITIVE from NEGATIVE samples of the
    'Primary' experiment, using the columns named in ``genes`` as features.
    Performance is estimated with 5-fold stratified cross-validation: each
    fold's ROC curve is interpolated onto a common 100-point FPR grid, the
    curves are averaged, and the mean is drawn with a +/- 1 standard
    deviation band. The figure is saved as
    ``'COVID+ vs. COVID- (<gene>+<gene>...) no title.pdf'`` in the working
    directory.

    Parameters
    ----------
    genes : list of str
        Feature column names in the module-level ``pre_data`` frame. Rows
        with a missing value in any of these columns are dropped. Any number
        of genes is accepted; all of them appear in the output file name.

    Returns
    -------
    None
    """
    genes = list(genes)

    data = pre_data[['Category', 'Experiment'] + genes].dropna(axis=0, subset=genes)
    in_training_set = (data['Experiment'] == 'Primary') & data['Category'].isin(['POSITIVE', 'NEGATIVE'])
    filt = data[in_training_set]
    X = filt[genes].values
    y = (filt['Category'] == 'POSITIVE').values
    label = 'COVID+ vs. COVID-'

    cv = StratifiedKFold(n_splits=5)
    seed = 8675309

    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []
    # Out-of-fold probability for every sample; only consumed by the optional
    # np.savetxt call at the end of the function.
    cv_scores = np.zeros(len(y))

    fig, ax = plt.subplots(figsize=(6, 6))
    for train, test in cv.split(X, y):
        # Fit the scaler on the training fold only so that no information from
        # the held-out fold leaks into the model.
        scaler = StandardScaler().fit(X[train])
        classifier = SVC(kernel='linear', probability=True, random_state=seed)
        classifier.fit(scaler.transform(X[train]), y[train])

        clf_scores = classifier.predict_proba(scaler.transform(X[test]))[:, 1]
        cv_scores[test] = clf_scores

        # Compute the fold ROC directly instead of going through
        # plot_roc_curve (removed in scikit-learn 1.2), which drew a line that
        # was deleted again straight away.
        fpr, tpr, _ = roc_curve(y[test], clf_scores)
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0  # force every curve through the origin
        tprs.append(interp_tpr)
        aucs.append(auc(fpr, tpr))

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'ROC (AUC = %0.3f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    # Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1].
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    # No title is set on purpose (the output file name says "no title").
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
    ax.legend(loc="lower right", prop={"size": 13}, labelspacing=0.1)
    ax.set_xlabel('False Positive Rate', labelpad=10)
    ax.set_ylabel('True Positive Rate', labelpad=10)

    ax.set_aspect('equal', adjustable='box')
    fig.tight_layout()

    # The figure is always saved; uncomment the np.savetxt line to also write
    # the out-of-fold scores to a text file.
    panel_name = '+'.join(genes)
    fig.savefig('%s (%s) no title.pdf' % (label, panel_name), dpi=200)
    #np.savetxt('%s (%s).txt' % (label, panel_name), cv_scores)

In [None]:
# One ROC figure per two-gene panel: each first gene combined with each
# second gene, in the same order as a nested loop would visit them.
gene_pairs = [
    [first_gene, second_gene]
    for first_gene in ('IFI6', 'IFI27', 'IFI44', 'IFI44L')
    for second_gene in ('GBP5', 'CCL3')
]
for gene_pair in gene_pairs:
    plot_curve(gene_pair)

In [None]:
def plot_curve_1g(comp, gene):
    """Plot and save the mean cross-validated ROC curve for a single gene.

    A linear SVM is trained to separate POSITIVE from NEGATIVE samples of the
    'Primary' experiment, using the column(s) named in ``gene`` as features.
    Performance is estimated with 5-fold stratified cross-validation: each
    fold's ROC curve is interpolated onto a common 100-point FPR grid, the
    curves are averaged, and the mean is drawn with a +/- 1 standard
    deviation band. The figure is saved as
    ``'COVID+ vs. COVID- (<gene>) no title.pdf'`` in the working directory.

    Parameters
    ----------
    comp : object
        Unused; kept so existing calls such as ``plot_curve_1g('nv', [...])``
        keep working.
    gene : list of str or str
        Feature column name(s) in the module-level ``pre_data`` frame. A bare
        string is treated as a one-element list. Rows with a missing value in
        any of these columns are dropped.

    Returns
    -------
    None
    """
    gene = [gene] if isinstance(gene, str) else list(gene)

    data = pre_data[['Category', 'Experiment'] + gene].dropna(axis=0, subset=gene)
    in_training_set = (data['Experiment'] == 'Primary') & data['Category'].isin(['POSITIVE', 'NEGATIVE'])
    filt = data[in_training_set]
    X = filt[gene].values
    y = (filt['Category'] == 'POSITIVE').values
    label = 'COVID+ vs. COVID-'

    cv = StratifiedKFold(n_splits=5)
    seed = 8675309

    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []
    # Out-of-fold probability for every sample; only consumed by the optional
    # np.savetxt call at the end of the function.
    cv_scores = np.zeros(len(y))

    fig, ax = plt.subplots(figsize=(6, 6))
    for train, test in cv.split(X, y):
        # Fit the scaler on the training fold only so that no information from
        # the held-out fold leaks into the model.
        scaler = StandardScaler().fit(X[train])
        classifier = SVC(kernel='linear', probability=True, random_state=seed)
        classifier.fit(scaler.transform(X[train]), y[train])

        clf_scores = classifier.predict_proba(scaler.transform(X[test]))[:, 1]
        cv_scores[test] = clf_scores

        # Compute the fold ROC directly instead of going through
        # plot_roc_curve (removed in scikit-learn 1.2), which drew a line that
        # was deleted again straight away.
        fpr, tpr, _ = roc_curve(y[test], clf_scores)
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0  # force every curve through the origin
        tprs.append(interp_tpr)
        aucs.append(auc(fpr, tpr))

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'ROC (AUC = %0.3f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    # Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1].
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    # No title is set on purpose (the output file name says "no title").
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05])
    ax.legend(loc="lower right", prop={"size": 13}, labelspacing=0.1)
    ax.set_xlabel('False Positive Rate', labelpad=10)
    ax.set_ylabel('True Positive Rate', labelpad=10)

    ax.set_aspect('equal', adjustable='box')
    fig.tight_layout()

    # The figure is always saved; uncomment the np.savetxt line to also write
    # the out-of-fold scores to a text file.
    panel_name = '+'.join(gene)
    fig.savefig('%s (%s) no title.pdf' % (label, panel_name), dpi=200)
    #np.savetxt('%s (%s).txt' % (label, panel_name), cv_scores)

In [None]:
# One ROC figure per single-gene panel.
single_gene_panels = [[gene_name] for gene_name in ('IFI6', 'IFI27', 'IFI44', 'IFI44L')]
for panel in single_gene_panels:
    plot_curve_1g('nv', panel)

## Calculate Metrics at Youden's Index

In [None]:
from sklearn.metrics import recall_score, precision_score

# Per-fold result rows for the summary table. Deliberately not named
# `display`, which would shadow the IPython/Jupyter `display` helper for the
# rest of the session.
fold_rows = []

# Training set for the IFI6 + GBP5 model: Primary-experiment POSITIVE and
# NEGATIVE samples with both genes measured. `data` and `filt` are read again
# by later cells, so their names must not change.
data = pre_data
data = data.dropna(axis=0, subset=['IFI6', 'GBP5'])
filt = data[(data['Experiment'] == 'Primary') & data['Category'].isin(['NEGATIVE', 'POSITIVE'])]

X = filt[['IFI6', 'GBP5']].values
y = (filt['Category'] == 'POSITIVE').values

cv = StratifiedKFold(n_splits=5)
seed = 8675309

for i, (train, test) in enumerate(cv.split(X, y)):

    # Scale with statistics from the training fold only.
    scaler = StandardScaler()
    scaler.fit(X[train])
    X_sc = scaler.transform(X)
    classifier = SVC(kernel='linear', probability=True, random_state=seed)
    classifier.fit(X_sc[train], y[train])

    y_true = y[test]
    y_pred = classifier.predict_proba(X_sc[test])[:, 1]

    # Youden's index: the threshold that maximises TPR - FPR on this fold.
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    cutoff = thresholds[np.argmax(tpr - fpr)]

    called_positive = y_pred >= cutoff

    # NPV and specificity are precision and recall with the negative class
    # treated as the class of interest, hence the inverted labels and calls.
    cur_scores = [
        roc_auc_score(y_true, y_pred),                 # AUC
        precision_score(y_true, called_positive),      # PPV
        precision_score(~y_true, ~called_positive),    # NPV
        recall_score(y_true, called_positive),         # Sensitivity
        recall_score(~y_true, ~called_positive),       # Specificity
    ]

    fold_rows.append(['Fold ' + str(i + 1)] + ["{:.3f}".format(score) for score in cur_scores])

pd.DataFrame(fold_rows, columns = ['Fold', 'AUC', 'PPV', 'NPV', 'Sensitivity', 'Specificity'])
    
    
#    cv_scores[test] = clf_scores
#    
#    viz = plot_roc_curve(classifier, X_sc[test], y[test], name='ROC fold {}'.format(i), alpha=0.3, lw=1)
#    viz.line_.remove()
#    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
#    interp_tpr[0] = 0.0
#    tprs.append(interp_tpr)
#    aucs.append(viz.roc_auc)
#
#mean_tpr = np.mean(tprs, axis=0)
#mean_tpr[-1] = 1.0
#mean_auc = auc(mean_fpr, mean_tpr)
#std_auc = np.std(aucs)
#std_tpr = np.std(tprs, axis=0)

## Apply Models to Variants and Spike-ins

In [None]:
# Re-runs the 5-fold CV loop used for the IFI6 + GBP5 model (same fold
# splitter, scaler and SVM settings as plot_curve) to get COVID probability
# estimates for additional expression data.

def external_data(comp, ext):
    """Score the training set out-of-fold and score extra samples with every fold model.

    The training data come from the module-level ``filt`` frame (columns
    'IFI6' and 'GBP5', label ``Category == 'POSITIVE'``). For each of the five
    stratified folds a scaler and a linear SVM are fitted on the training
    part; the held-out part receives its out-of-fold probability and ``ext``
    is scored by that fold's model. The per-fold scores for ``ext`` are then
    averaged. The averaged external scores are also printed.

    Parameters
    ----------
    comp : object
        Unused; kept so existing calls such as ``external_data('nv', ...)``
        keep working.
    ext : array-like of shape (n_samples, 2)
        IFI6 and GBP5 values (in that column order) of the samples to score.
        May be a DataFrame or an array; may be empty.

    Returns
    -------
    cv_scores : numpy.ndarray of shape (len(filt),)
        Out-of-fold positive-class probability for every row of ``filt``.
    ext_scores : numpy.ndarray of shape (n_samples,)
        Positive-class probability for every row of ``ext``, averaged over
        the five fold models (empty when ``ext`` is empty).
    """
    X = filt[['IFI6', 'GBP5']].values
    y = (filt['Category'] == 'POSITIVE').values

    # Work on a plain array: the scalers are fitted on arrays, so handing them
    # a DataFrame would raise scikit-learn's feature-name mismatch warning on
    # every fold.
    ext_values = np.asarray(ext)
    # The scaler rejects zero-row input, so skip external scoring entirely
    # when there is nothing to score.
    has_ext_rows = ext_values.shape[0] > 0

    cv = StratifiedKFold(n_splits=5)
    seed = 8675309
    cv_scores = np.zeros(len(y))
    ext_all_scores = []
    for train, test in cv.split(X, y):

        # Scale with statistics from the training fold only.
        scaler = StandardScaler().fit(X[train])

        classifier = SVC(kernel='linear', probability=True, random_state=seed)
        classifier.fit(scaler.transform(X[train]), y[train])

        if has_ext_rows:
            ext_all_scores.append(classifier.predict_proba(scaler.transform(ext_values))[:, 1])

        cv_scores[test] = classifier.predict_proba(scaler.transform(X[test]))[:, 1]

    # Average the fold models' probabilities for each external sample.
    ext_mean_scores = np.average(ext_all_scores, axis=0) if has_ext_rows else np.zeros(0)
    print(ext_mean_scores)
    return cv_scores, ext_mean_scores

In [None]:
#Calculate shifts for different sample categories

def _paired_negative_shift(frame, experiment, genes=('IFI6', 'GBP5')):
    """Median per-gene offset of an experiment's NEGATIVE samples vs. Primary.

    Each NEGATIVE row of ``experiment`` is paired with the Primary-experiment
    row that has the same ``Swab_ID``; the offset is the median, per gene, of
    (experiment value - Primary value) over those pairs.

    Rows are paired by ``Swab_ID`` rather than by position, so the result does
    not depend on the two experiments listing their swabs in the same order.
    Pairs with a missing value in a gene are ignored for that gene.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with 'Experiment', 'Category', 'Swab_ID' and the gene columns.
    experiment : str
        Name of the experiment to compare against 'Primary'.
    genes : sequence of str
        Gene columns to compute the offset for.

    Returns
    -------
    primary_matched : pandas.DataFrame
        Primary rows whose Swab_ID occurs among the experiment's negatives.
    experiment_negs : pandas.DataFrame
        NEGATIVE rows of ``experiment``.
    shift : numpy.ndarray of shape (len(genes),)
        Median offset per gene.

    Raises
    ------
    pandas.errors.MergeError
        If a Swab_ID occurs more than once among the matched Primary rows,
        because the reference value would then be ambiguous.
    """
    genes = list(genes)
    primary = frame[frame['Experiment'] == 'Primary']
    experiment_negs = frame[(frame['Experiment'] == experiment) & (frame['Category'] == 'NEGATIVE')]
    primary_matched = primary[primary['Swab_ID'].isin(experiment_negs['Swab_ID'])]

    paired = experiment_negs[['Swab_ID'] + genes].merge(
        primary_matched[['Swab_ID'] + genes],
        on='Swab_ID',
        suffixes=('_rerun', '_primary'),
        validate='many_to_one',
    )
    rerun_values = paired[[gene + '_rerun' for gene in genes]].values
    primary_values = paired[[gene + '_primary' for gene in genes]].values
    shift = np.nanmedian(rerun_values - primary_values, axis=0)
    return primary_matched, experiment_negs, shift


pri_negs_data_c, con_negs_data, pri_con_med_shift = _paired_negative_shift(pre_data, 'Contamination')

pri_negs_data_v, var_negs_data, pri_var_med_shift = _paired_negative_shift(pre_data, 'Variants')

pri_negs_data_o, omi_negs_data, pri_omi_med_shift = _paired_negative_shift(pre_data, 'Omicron')




In [None]:
# Rows to be scored as external data: everything outside the Primary
# experiment, plus any N_GENE or OTHER_VIRUS rows.
external_rows = pre_data['Experiment'].ne('Primary') | pre_data['Category'].isin(['N_GENE', 'OTHER_VIRUS'])
shift_data = pre_data.loc[external_rows].copy()

# Subtract each experiment's median offset from that experiment's rows.
for experiment_name, median_offset in [
    ('Contamination', pri_con_med_shift),
    ('Variants', pri_var_med_shift),
    ('Omicron', pri_omi_med_shift),
]:
    shift_data.loc[shift_data['Experiment'] == experiment_name, ['IFI6', 'GBP5']] -= median_offset





In [None]:
#Get COVID probabilities

# One output row per sample that has both IFI6 and GBP5 measured (`data` is
# the NaN-filtered frame built in the metrics cell).
full_out = data[['Swab_ID', 'Class', 'Category', 'Experiment']].copy()

ext_data = shift_data.dropna(subset = ['IFI6', 'GBP5'])
int_scores, ext_scores = external_data('nv', ext_data[['IFI6', 'GBP5']])

full_out['COVID Probability'] = np.nan
# Assign with a single .loc[rows, column] call on the frame itself. The
# chained form frame[column].loc[rows] = ... writes to a temporary object
# under pandas copy-on-write, so the values would be silently lost.
full_out.loc[ext_data.index, 'COVID Probability'] = ext_scores   # fold-averaged scores
full_out.loc[filt.index, 'COVID Probability'] = int_scores       # out-of-fold scores
full_out.to_csv('COVID_Probability_Table.csv')