In [None]:
# Input files for the adult validation cohort (paths are relative, so they
# resolve against the notebook's working directory).
# Sample metadata; its 'viral_status' and 'sc2_rpm' columns are read further down.
metatable_csv = "../data/Adult-validation-cohort/adult_validation_metatable_with_viral_status.csv"
# Loaded below as `norm_gene_counts`; presumably the normalized gene counts
# for this cohort -- TODO confirm what test2.csv actually contains.
norm_gene_counts_csv = "../data/Adult-validation-cohort/test2.csv"
# Gene-count table; its 'Unnamed: 0' and 'gene_symbol' columns are used later
# to translate gene IDs into gene symbols.
gene_counts_csv = "../data/Adult-validation-cohort/adult_validation_swab_gene_counts.csv"

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

In [None]:
# Shared 5-fold stratified splitter handed to the cross_validate calls below.
# No random_state is given, so the shuffled fold assignment is drawn afresh
# every time the splitter is asked for folds; the repeated-CV loops depend on that.
SKF = StratifiedKFold(shuffle=True, n_splits=5)

In [None]:
#Sets the seed for generating training and validation datasets
#
# The seed is passed explicitly to train_test_split further down. It is also
# used to seed NumPy's global random generator, which scikit-learn falls back
# on whenever a splitter or estimator is created without a random_state (as
# SKF and the SVC models in this notebook are). With the global generator
# seeded here, a Restart & Run All should reproduce the same fold draws and
# scores, while each cross_validate call still gets a different shuffle.

seed = 1602028882
np.random.seed(seed)

In [None]:
#Various dataframes

def _read_table(path):
    """Read one CSV file into a DataFrame with low_memory disabled."""
    return pd.read_csv(path, low_memory=False)


# Tables whose paths are set in the configuration cell at the top.
metatable_df = _read_table(metatable_csv)
norm_gene_counts = _read_table(norm_gene_counts_csv)
gene_counts_df = _read_table(gene_counts_csv)

# Second data source (the one the variable names call "old").
norm_df = _read_table('../code/test.csv')
full_old_metatable_df = _read_table("../data/metatable_with_viral_status.csv")
# Only the sample ID and viral-status columns of the old metatable.
old_metatable_df = full_old_metatable_df[['CZB_ID', 'viral_status']]

In [None]:
#Various numpy arrays

# Gene columns present in both count tables (the first column of each table is skipped).
sharedgenes = np.intersect1d(norm_df.columns[1:], norm_gene_counts.columns[1:])

# Every combined array stacks its two sources in the same order:
# the adult-validation tables first, then the "old" tables.
new_counts = norm_gene_counts[sharedgenes]
old_counts = norm_df[sharedgenes]
comb_counts = np.concatenate((new_counts, old_counts))

# Boolean label per row; the two metatables spell the positive status differently.
is_sc2_new = metatable_df['viral_status'].values == 'sc2'
is_sc2_old = old_metatable_df['viral_status'].values == 'SC2'
comb_SC2 = np.concatenate((is_sc2_new, is_sc2_old))

# Source flag per row: 0.0 for rows of metatable_df, 1.0 for rows of old_metatable_df.
n_new = metatable_df.shape[0]
n_old = old_metatable_df.shape[0]
comb_data_src = np.concatenate((np.zeros(n_new), np.ones(n_old)))

# The rpm column is named 'sc2_rpm' in one metatable and 'SC2_rpm' in the other.
new_rpm = metatable_df['sc2_rpm']
old_rpm = full_old_metatable_df['SC2_rpm']
comb_rpm = np.concatenate((new_rpm, old_rpm))

In [None]:
#Splits the data into training and validation datasets (70:30)

# Index every row of the combined matrix rather than assuming a fixed cohort
# size, so the split keeps working if the input tables change.
n_samples = comb_counts.shape[0]

# Stratification key: 2*label + source takes a distinct value for each
# (SC2 status, data source) pair, so both splits keep the same mix of the four groups.
strata = 2*comb_SC2 + comb_data_src

train_ind, test_ind = train_test_split(
    np.arange(n_samples),
    random_state=seed,
    test_size=0.3,
    stratify=strata,
)

# Row subsets of the count matrix and of the labels for each split.
TRcomb_counts = comb_counts[train_ind]
TEcomb_counts = comb_counts[test_ind]
TRcomb_SC2 = comb_SC2[train_ind]
TEcomb_SC2 = comb_SC2[test_ind]

In [None]:
#Beginning of gene set generation code
#Each run of the code identifies an additional gene.  Each new gene identified during one round is added to other_genes list prior to next round

# Stage 1: score every gene column once. Each candidate is evaluated together
# with the genes already in other_genes and the rpm column, using the mean
# ROC AUC over one pass of SKF's 5 folds on the training split.
# NOTE(review): the roc_auc scorer likely ranks with decision_function, in which
# case probability=True only adds fitting cost -- confirm before removing it.
other_genes = []
all_scores = []
y = TRcomb_SC2
# The rpm feature is identical for every candidate, so it is built once.
rpm_column = comb_rpm[train_ind].reshape(-1, 1)
for i in range(comb_counts.shape[1]):
    X = np.concatenate((TRcomb_counts[:, [i]+other_genes], rpm_column), axis = 1)
    if not i%100: print(i)  # progress marker every 100 genes
    clf = SVC(gamma = 'auto', probability = True)
    score = cross_validate(clf, X, y, scoring='roc_auc', cv=SKF)
    av = np.average(score['test_score'])
    all_scores.append(av)

In [None]:
#For maximum time optimization, the number 13000 can be increased if 0 or 1 genes have already been selected.

# `bar` is the score found at position 13000 of the ascending-sorted stage-1
# scores; only the genes at that position and above are kept for the next stage.
bar = np.sort(all_scores)[13000]
# Negative offset counted from the end of the score list. It is derived from
# the actual number of scored genes so that `barset` always holds exactly the
# genes scoring at or above `bar`, whatever the size of the gene table.
x = 13000 - len(all_scores)
# argpartition places the largest -x scores in the last -x slots (unordered).
barset = np.argpartition(all_scores, x)[x:]
bar

In [None]:
# Stage 2: re-score each gene in barset up to n_repeats times (each repeat is
# one cross_validate pass over SKF). A gene is dropped as soon as one repeat
# scores below `bar`; dropped genes get an all-zero row so the next cell can
# recognise them by a mean of exactly 0.
n_repeats = 10  # also the length of the zero row, so all rows have equal length
all_scores = []
y = TRcomb_SC2
# The rpm feature is identical for every candidate, so it is built once.
rpm_column = comb_rpm[train_ind].reshape(-1, 1)

for i in barset:
    cur_scores = []
    print(i)
    # The feature matrix does not change between repeats; build it once per gene.
    X = np.concatenate((TRcomb_counts[:, [i]+other_genes], rpm_column), axis = 1)
    for j in range(n_repeats):
        clf = SVC(gamma = 'auto', probability = True)
        score = cross_validate(clf, X, y, scoring='roc_auc', cv=SKF)
        av = np.average(score['test_score'])
        if av < bar:
            cur_scores = np.zeros(n_repeats)
            break
        cur_scores.append(av)
    all_scores.append(cur_scores)

In [None]:
# Genes dropped in the previous cell carry an all-zero score row, so a
# non-zero row mean marks the genes that completed every repeat.
mean_scores = np.average(all_scores, axis = 1)
survived = mean_scores != 0
onlyset = np.nonzero(survived)
barset[onlyset]

In [None]:
# Stage 3: score each surviving gene 100 times (one cross_validate pass over
# SKF per repeat) and keep every repeat's mean ROC AUC.
y = TRcomb_SC2
all_scores = []
rpm_column = comb_rpm[train_ind].reshape(-1, 1)
for i in barset[onlyset]:
    print(i)
    # Candidate gene + previously selected genes + rpm; unchanged across repeats.
    X = np.concatenate((TRcomb_counts[:, [i] + other_genes], rpm_column), axis=1)
    cur_scores = []
    for j in range(100):
        clf = SVC(probability=True, gamma='auto')
        score = cross_validate(clf, X, y, cv=SKF, scoring='roc_auc')
        av = np.average(score['test_score'])
        cur_scores.append(av)
    all_scores.append(cur_scores)



In [None]:
# Row positions (within all_scores) of the three highest mean scores.
# Partitioning on both -3 and -2 fixes those two slots, which in turn fixes
# the last one, so the three indices come out in ascending order of score.
mean_scores = np.average(all_scores, axis = 1)
best = np.argpartition(mean_scores, [-3, -2])[-3:]


In [None]:
# `best` indexes the rows scored in stage 3, i.e. the genes in barset[onlyset];
# map it back to column indices of the combined count matrix.
surviving_genes = barset[onlyset]
best_gene_index = surviving_genes[best]
best_gene_index

In [None]:
# Look up the gene symbol for each selected gene ID: take the first row of
# gene_counts_df whose 'Unnamed: 0' value equals the ID.
best_gene_ids = sharedgenes[best_gene_index]
[
    gene_counts_df.loc[gene_counts_df['Unnamed: 0'] == gene_id, 'gene_symbol'].values[0]
    for gene_id in best_gene_ids
]


In [None]:
#End of gene set generation code

In [None]:
#Plug in indices generated by barset[onlyset] to find the corresponding Ensembl IDs
#(sharedgenes holds the gene column names shared by both count tables)

sharedgenes[best_gene_index]

In [None]:
#Plug in an Ensembl ID to find its index in sharedgenes
#(np.nonzero returns a tuple holding the array of matching positions)

np.nonzero(sharedgenes == 'ENSG00000168743')

In [None]:
#Saves the patient IDs for the training and validation sets as CSVs

#np.savetxt("train.csv", np.concatenate((norm_gene_counts['Unnamed: 0'], norm_df['Unnamed: 0']))[train_ind], delimiter=",", fmt='%s')
#np.savetxt("test.csv", np.concatenate((norm_gene_counts['Unnamed: 0'], norm_df['Unnamed: 0']))[test_ind], delimiter=",", fmt='%s')

In [None]:
#Cross-validation performance on 257-patient training dataset

# Features: the gene columns listed in gene_set, without the rpm column.
# NOTE(review): gene_set is empty as written, which selects zero columns --
# presumably the selected gene indices are filled in before running; confirm.
gene_set = []
y = TRcomb_SC2
X = TRcomb_counts[:, gene_set]

# One mean ROC AUC per repeat; each repeat is one cross_validate pass over SKF.
full = []
for j in range(10000):
    clf = SVC(probability=True, gamma='auto')
    score = cross_validate(clf, X, y, cv=SKF, scoring='roc_auc')
    av = np.average(score['test_score'])
    full.append(av)
    #print(np.average(full), np.std(full)) #Uncomment to view live averages (useful if you think the average will converge early)

# Mean and standard deviation across all repeats.
print(np.average(full), np.std(full))

In [None]:
#Cross-validation performance on 111-patient validation dataset

# Features: the gene columns listed in gene_set, without the rpm column.
# The cross-validation here is run within the held-out split itself.
# NOTE(review): gene_set is empty as written, which selects zero columns --
# presumably the selected gene indices are filled in before running; confirm.
gene_set = []
y = TEcomb_SC2
X = TEcomb_counts[:, gene_set]

# One mean ROC AUC per repeat; each repeat is one cross_validate pass over SKF.
full = []
for j in range(10000):
    clf = SVC(probability=True, gamma='auto')
    score = cross_validate(clf, X, y, cv=SKF, scoring='roc_auc')
    av = np.average(score['test_score'])
    full.append(av)
    #print(np.average(full), np.std(full)) #Uncomment to view live averages (useful if you think the average will converge early)

# Mean and standard deviation across all repeats.
print(np.average(full), np.std(full))

In [None]:
#Cross-validation performance on 257-patient training dataset

# Features: the gene columns listed in gene_set plus the rpm column.
# With gene_set left empty, only the rpm column remains.
gene_set = []
y = TRcomb_SC2
gene_block = TRcomb_counts[:, gene_set]
rpm_column = comb_rpm[train_ind].reshape(-1, 1)
X = np.concatenate((gene_block, rpm_column), axis=1)

# One mean ROC AUC per repeat; each repeat is one cross_validate pass over SKF.
full = []
for j in range(10000):
    clf = SVC(probability=True, gamma='auto')
    score = cross_validate(clf, X, y, cv=SKF, scoring='roc_auc')
    av = np.average(score['test_score'])
    full.append(av)
    #print(np.average(full), np.std(full)) #Uncomment to view live averages (useful if you think the average will converge early)

# Mean and standard deviation across all repeats.
print(np.average(full), np.std(full))

In [None]:
#Cross-validation performance on 111-patient validation dataset

# Features: the gene columns listed in gene_set plus the rpm column.
# With gene_set left empty, only the rpm column remains.
# The cross-validation here is run within the held-out split itself.
gene_set = []
y = TEcomb_SC2
gene_block = TEcomb_counts[:, gene_set]
rpm_column = comb_rpm[test_ind].reshape(-1, 1)
X = np.concatenate((gene_block, rpm_column), axis=1)

# One mean ROC AUC per repeat; each repeat is one cross_validate pass over SKF.
full = []
for j in range(10000):
    clf = SVC(probability=True, gamma='auto')
    score = cross_validate(clf, X, y, cv=SKF, scoring='roc_auc')
    av = np.average(score['test_score'])
    full.append(av)
    #print(np.average(full), np.std(full)) #Uncomment to view live averages (useful if you think the average will converge early)

# Mean and standard deviation across all repeats.
print(np.average(full), np.std(full))

In [None]:
#Only SC2-rpm Cross-validation performance on 257-patient training dataset

# Single-feature baseline: the rpm values of the training rows as one column.
y = TRcomb_SC2
X = comb_rpm[train_ind].reshape(-1, 1)

# One mean ROC AUC per repeat; each repeat is one cross_validate pass over SKF.
full = []
for j in range(10000):
    clf = SVC(probability=True, gamma='auto')
    score = cross_validate(clf, X, y, cv=SKF, scoring='roc_auc')
    av = np.average(score['test_score'])
    full.append(av)
    #print(np.average(full), np.std(full)) #Uncomment to view live averages (useful if you think the average will converge early)

# Mean and standard deviation across all repeats.
print(np.average(full), np.std(full))

In [None]:
#Only SC2-rpm Cross-validation performance on 111-patient validation dataset

# Single-feature baseline: the rpm values of the held-out rows as one column.
# The cross-validation here is run within the held-out split itself.
y = TEcomb_SC2
X = comb_rpm[test_ind].reshape(-1, 1)

# One mean ROC AUC per repeat; each repeat is one cross_validate pass over SKF.
full = []
for j in range(10000):
    clf = SVC(probability=True, gamma='auto')
    score = cross_validate(clf, X, y, cv=SKF, scoring='roc_auc')
    av = np.average(score['test_score'])
    full.append(av)
    #print(np.average(full), np.std(full)) #Uncomment to view live averages (useful if you think the average will converge early)

# Mean and standard deviation across all repeats.
print(np.average(full), np.std(full))