## Load Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

## Load Data

In [2]:
# Load the first cohort's swab gene-count CSV and transpose it, so the CSV's
# columns become the rows of the frame.
first_gene_counts_df = pd.read_csv(
    "../data/swab_gene_counts.csv",
    low_memory=False,
).transpose()
# After the transpose, row 0 holds the CSV's first column: keep it as the
# header, drop it from the data rows, and use it as the column labels.
first_headers = first_gene_counts_df.iloc[0]
first_gene_counts_df = first_gene_counts_df.iloc[1:]
first_gene_counts_df.columns = first_headers

In [3]:
# Load the adult validation cohort's swab gene-count CSV and transpose it, so
# the CSV's columns become the rows of the frame.
second_gene_counts_df = pd.read_csv(
    "../data/Adult-validation-cohort/adult_validation_swab_gene_counts.csv",
    low_memory=False,
).transpose()
# After the transpose, row 0 holds the CSV's first column: keep it as the
# header, drop it from the data rows, and use it as the column labels.
second_headers = second_gene_counts_df.iloc[0]
second_gene_counts_df = second_gene_counts_df.iloc[1:]
second_gene_counts_df.columns = second_headers
# This file carries an extra 'gene_symbol' row that is not sample data; remove it.
second_gene_counts_df = second_gene_counts_df.drop(index=['gene_symbol'])

In [4]:
# Gene annotation table: tab-separated, no header row, first field used as the
# index; transposed so that those index values become the column labels.
ENSG_trans = pd.read_csv(
    '../annotation/gene2name.txt',
    sep='\t',
    header=None,
    index_col=0,
    low_memory=False,
).transpose()
# Column labels present in both cohort tables (np.intersect1d returns them
# sorted and de-duplicated).
# NOTE(review): the [1:] slices skip the first column label of each table —
# confirm that first column is not a gene that should be kept.
sharedgenes = np.intersect1d(
    first_gene_counts_df.columns[1:],
    second_gene_counts_df.columns[1:],
)

In [5]:
# Read the two per-cohort count tables and keep only the shared gene columns,
# in `sharedgenes` order, as plain NumPy arrays.
# NOTE(review): presumably test.csv / test2.csv are counts exported from R
# (the R_ prefix suggests so) — confirm the provenance of these files.
R_first_gene_counts = (
    pd.read_csv('../code/test.csv', low_memory=False)
    .loc[:, sharedgenes]
    .values
)
R_second_gene_counts = (
    pd.read_csv('../data/Adult-validation-cohort/test2.csv', low_memory=False)
    .loc[:, sharedgenes]
    .values
)


In [6]:
# Stack the two cohorts row-wise: first-cohort rows, then second-cohort rows.
R_first_second_gene_counts = np.concatenate(
    [R_first_gene_counts, R_second_gene_counts],
    axis=0,
)

In [7]:
# Values of the 'Unnamed: 0' column of the GEO raw-counts file (read from the working directory).
IDs = pd.read_csv('GSE152075_raw_counts_GEO.csv').loc[:, 'Unnamed: 0']

# Boolean label per sample: True where the metatable's viral_status matches the positive label.
# NOTE(review): the positive label is 'SC2' for the first cohort but lowercase 'sc2'
# for the second — confirm each matches the spelling used in its metatable.
first_SC2 = pd.read_csv(
    "../../covid19-swab-host-expression-master-1/data/metatable_with_viral_status.csv",
    low_memory=False,
).loc[:, 'viral_status'].values == 'SC2'
second_SC2 = pd.read_csv(
    "../data/Adult-validation-cohort/adult_validation_metatable_with_viral_status.csv",
    low_memory=False,
).loc[:, 'viral_status'].values == 'sc2'

In [8]:
# Pooled labels: first-cohort labels followed by second-cohort labels.
first_second_SC2 = np.concatenate([first_SC2, second_SC2], axis=0)

In [9]:
# Per-sample rpM column from each cohort's metatable.
# Note the column name differs in case between the files ('SC2_rpm' vs 'sc2_rpm').
first_rpM = pd.read_csv(
    "../../covid19-swab-host-expression-master-1/data/metatable_with_viral_status.csv",
    low_memory=False,
).loc[:, 'SC2_rpm']
second_rpM = pd.read_csv(
    "../data/Adult-validation-cohort/adult_validation_metatable_with_viral_status.csv",
    low_memory=False,
).loc[:, 'sc2_rpm']




In [10]:
# Pooled rpM values: first-cohort values followed by second-cohort values.
first_second_rpM = np.concatenate([first_rpM, second_rpM], axis=0)


## Split Data into Training and Testing Cohorts

In [11]:
# Fixed random seed; passed as random_state to train_test_split so the split is reproducible.
seed = 8_675_309

In [12]:
# Cohort-of-origin indicator for the pooled samples: 0.0 for every first-cohort
# sample, followed by 1.0 for every second-cohort sample.
comb_data_src = np.concatenate([
    np.zeros(len(first_SC2)),
    np.ones(len(second_SC2)),
])

In [13]:
# Standardize every gene column using statistics computed on the pooled matrix.
# NOTE(review): the scaler is fit on all pooled samples, i.e. before the
# train/test split performed later in this notebook, so held-out rows contribute
# to the scaling statistics — confirm this is acceptable for the reported scores.
scaler = StandardScaler()
scaler.fit(R_first_second_gene_counts)
scaled_R_first_second_gene_counts = scaler.transform(R_first_second_gene_counts)

In [14]:
# Split the pooled samples into training and testing cohorts (70:30).
#
# The split is done on row positions so the same indices can be reused for the
# features, the labels and the rpM values. It is stratified on the joint
# (label, cohort-of-origin) class 2*label + cohort, so both parts keep the same
# mix of positives/negatives from each source cohort.

# The feature matrix, labels and cohort indicator must describe the same samples.
assert (
    scaled_R_first_second_gene_counts.shape[0]
    == first_second_SC2.shape[0]
    == comb_data_src.shape[0]
), "features, labels and cohort indicator have different numbers of samples"

# Derive the number of samples from the data instead of hard-coding it.
train_ind, test_ind = train_test_split(
    np.arange(first_second_SC2.shape[0]),
    random_state=seed,
    test_size=0.3,
    stratify=2 * first_second_SC2 + comb_data_src,
)

TRcomb_counts = scaled_R_first_second_gene_counts[train_ind]
TEcomb_counts = scaled_R_first_second_gene_counts[test_ind]

TRcomb_SC2 = first_second_SC2[train_ind]
TEcomb_SC2 = first_second_SC2[test_ind]


In [15]:
# Append the rpM values as one extra, last column to the scaled gene features
# of the training and testing cohorts (so column -1 is rpM).
TRcomb_counts_rpM = np.column_stack((TRcomb_counts, first_second_rpM[train_ind]))
TEcomb_counts_rpM = np.column_stack((TEcomb_counts, first_second_rpM[test_ind]))

## Generate Scores for Gene Sets

In [16]:
# Scoring of a gene set with an SVC classifier (ROC AUC); see rpMandnorpMAUCs.


def _gene_set_column(gene):
    """Return the position in `sharedgenes` of the column annotated with `gene`.

    Raises IndexError (the exception type the original nested lookup raised)
    with an explicit message when the name is unknown.
    """
    # Columns of the annotation table whose value equals the requested name.
    annot_cols = np.where(ENSG_trans == gene)[1]
    if annot_cols.size == 0:
        raise IndexError("gene {!r} was not found in ENSG_trans".format(gene))
    # First matching annotation column label = identifier used in `sharedgenes`.
    gene_id = ENSG_trans.columns[annot_cols[0]]
    feature_cols = np.where(sharedgenes == gene_id)[0]
    if feature_cols.size == 0:
        raise IndexError(
            "gene {!r} ({!r}) is not among sharedgenes".format(gene, gene_id)
        )
    return feature_cols[0]


def _repeated_cv_auc(features, labels, n_repeats, rng):
    """Return (mean, std) over `n_repeats` of the mean 5-fold CV ROC AUC of an SVC."""
    repeat_means = []
    for _ in range(n_repeats):
        clf = SVC(gamma='auto', probability=True, random_state=rng)
        # A fresh shuffled split per repeat; the spread over repeats is reported.
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng)
        result = cross_validate(clf, features, labels, scoring='roc_auc', cv=folds)
        repeat_means.append(np.average(result['test_score']))
    return np.average(repeat_means), np.std(repeat_means)


def _holdout_auc(train_X, train_y, test_X, test_y, rng):
    """Fit an SVC on the training data and return its ROC AUC on the test data."""
    clf = SVC(gamma='auto', probability=True, random_state=rng)
    clf.fit(train_X, train_y)
    return roc_auc_score(test_y, clf.predict_proba(test_X)[:, 1])


def rpMandnorpMAUCs(gene_set, n_repeats=10000, random_state=None):
    """Print ROC AUC scores of an SVC restricted to `gene_set`, with and without rpM.

    Prints `gene_set`, then five lines in this order:

      1. CV score on the training cohort (mean and std over the repeats)
      2. CV score on the testing cohort (mean and std over the repeats)
      3. Score on the testing cohort when trained on the training cohort
      4. CV score on the training cohort with rpM (mean and std over the repeats)
      5. Score on the testing cohort when trained on the training cohort with rpM

    Reads the notebook-level arrays TRcomb_counts_rpM / TEcomb_counts_rpM
    (features, rpM in the last column) and TRcomb_SC2 / TEcomb_SC2 (labels).

    Parameters
    ----------
    gene_set : sequence of gene names as they appear in ENSG_trans.
    n_repeats : number of repeated shuffled 5-fold cross-validations per CV
        score (default 10000, the value previously hard-coded).
    random_state : optional int seed. The default None keeps the previous
        unseeded behaviour (results vary slightly between runs); an int makes
        the fold shuffling and the SVC probability calibration reproducible.

    Returns
    -------
    None. Results are printed.

    Raises
    ------
    ValueError : if `n_repeats` is smaller than 1 or `gene_set` is empty.
    IndexError : if a gene is missing from ENSG_trans or from sharedgenes.
    """
    print(gene_set)

    if n_repeats < 1:
        raise ValueError("n_repeats must be at least 1")

    gene_cols = [_gene_set_column(gene) for gene in gene_set]
    if not gene_cols:
        raise ValueError("gene_set must contain at least one gene")

    # Build a separate list (no aliasing of gene_cols); -1 selects the last
    # column of the *_counts_rpM matrices.
    gene_cols_rpM = gene_cols + [-1]

    # One generator shared by every repeat, so repeats differ from each other
    # but the whole run is reproducible. None falls back to NumPy's global state.
    rng = None if random_state is None else np.random.RandomState(random_state)

    train_X, test_X = TRcomb_counts_rpM[:, gene_cols], TEcomb_counts_rpM[:, gene_cols]
    print(*_repeated_cv_auc(train_X, TRcomb_SC2, n_repeats, rng))
    print(*_repeated_cv_auc(test_X, TEcomb_SC2, n_repeats, rng))
    print(_holdout_auc(train_X, TRcomb_SC2, test_X, TEcomb_SC2, rng))

    train_X_rpM = TRcomb_counts_rpM[:, gene_cols_rpM]
    test_X_rpM = TEcomb_counts_rpM[:, gene_cols_rpM]
    print(*_repeated_cv_auc(train_X_rpM, TRcomb_SC2, n_repeats, rng))
    print(_holdout_auc(train_X_rpM, TRcomb_SC2, test_X_rpM, TEcomb_SC2, rng))

In [17]:
# Score the IFI6 + PTAFR gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI6', 'PTAFR'])

['IFI6', 'PTAFR']
0.8602914909290271 0.007508648877131656
0.853766011904762 0.013182312012917351
0.8693379790940766
0.8686702850129584 0.00741060798696929
0.9243902439024391


In [18]:
# Score the IFI6 + GBP5 gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI6', 'GBP5'])

['IFI6', 'GBP5']
0.8867830061802233 0.005490651155151024
0.8758208531746033 0.011687080091482905
0.8954703832752613
0.8788577571271929 0.007451145442708754
0.9285714285714286


In [19]:
# Score the IFI6 + GRINA gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI6', 'GRINA'])

['IFI6', 'GRINA']
0.8531939004685007 0.007806904800266287
0.9022496031746032 0.010973935606340779
0.8989547038327527
0.8838121406997609 0.007047141898642161
0.9400696864111499


In [20]:
# Score the IFI44 + TPM4 gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI44', 'TPM4'])

['IFI44', 'TPM4']
0.8644896300837321 0.004855151806441586
0.838130615079365 0.012631596144732759
0.8501742160278745
0.9131138171102472 0.006535019077060335
0.9177700348432055


In [21]:
# Score the IFI44 + BAZ1A gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI44', 'BAZ1A'])

['IFI44', 'BAZ1A']
0.8554000949461722 0.005748161907463926
0.8664335119047619 0.013225584508841693
0.8989547038327526
0.8667258199013158 0.010248356088173661
0.913588850174216


In [22]:
# Score the IFI44 + VCAN gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI44', 'VCAN'])

['IFI44', 'VCAN']
0.8406831003289473 0.008624329730927385
0.7926942658730158 0.014279575156844625
0.8289198606271777
0.8583748612938596 0.00916255063506019
0.8843205574912892


In [23]:
# Score the IFI44L + GBP5 gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI44L', 'GBP5'])

['IFI44L', 'GBP5']
0.8794445910586124 0.004681357404995641
0.8515173809523808 0.012052278188902957
0.8801393728222996
0.883833426659689 0.008600751404836442
0.9278745644599303


In [24]:
# Score the IFI44L + BAZ1A gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI44L', 'BAZ1A'])

['IFI44L', 'BAZ1A']
0.8712538101076556 0.006025738003142466
0.8233347222222223 0.013586212057115462
0.8642857142857143
0.8731348511263953 0.011193967159935087
0.9066202090592335


In [25]:
# Score the IFI44L + SH3BP2 gene pair (prints the gene set and its AUC summaries).
rpMandnorpMAUCs(gene_set=['IFI44L', 'SH3BP2'])

['IFI44L', 'SH3BP2']
0.8679272436204146 0.005511421657995489
0.8483792857142857 0.013379937391836834
0.8627177700348432
0.8827739939443779 0.007838985773384405
0.8878048780487805
