# In addition to experimenting with SVD vector rankings, I look at the most predictive features by AUC.

I run the algorithm described in the project writeup, computing the AUCs and measuring their distance from the worst case, 0.5.

In [1]:
from sklearn.metrics import roc_curve, auc
import pandas as pd
import numpy as np
from scipy import sparse
from dataloaders import *
from tqdm import tqdm

In [2]:
# Dense multi-omics table and the names of its target columns, as returned by
# the project-local loader.
multiomics, target_tags = import_data(
    "../data_csv/rna.csv",
    '../data_csv/motif.csv',
    "../data_csv/atac_mat.csv",
)

# Raw inputs: ATAC and RNA arrive as (i, j, x) triplet tables; the motif table
# is read with its first column as index and then transposed.
atac = pd.read_csv("../data_csv/atac_sparse_csv.csv")
rna = pd.read_csv("../data_csv/rna_sparse_csv.csv")
motif = pd.read_csv("../data_csv/motif.csv", index_col=[0]).T

# Build the sparse matrices from the triplets. The indices are shifted by one
# (presumably they are 1-based in the CSV export — TODO confirm) and each
# matrix is transposed after construction, giving 3900 rows.
atac_sparse = sparse.csc_matrix(
    (
        atac["x"].to_numpy(),
        (atac["i"].to_numpy() - 1, atac["j"].to_numpy() - 1),
    ),
    shape=(71201, 3900),
).transpose()
rna_sparse = sparse.csc_matrix(
    (
        rna["x"].to_numpy(),
        (rna["i"].to_numpy() - 1, rna["j"].to_numpy() - 1),
    ),
    shape=(36597, 3900),
).transpose()

# The last six RNA columns are split off as `targets`; the remaining columns
# stay in `rna_sparse` as features.
targets = rna_sparse[:, -6:]
rna_sparse = rna_sparse[:, :-6]

# Stack all feature blocks side by side (ATAC, RNA, motif) into one CSR matrix.
motif_sparse = sparse.csc_matrix(motif.to_numpy())
multiomics_sparse = sparse.hstack((atac_sparse, rna_sparse, motif_sparse)).tocsr()

Index(['AAACAGCCAATAACGA-4', 'AAACATGCAAGCGATG-1', 'AAACCGAAGGGCTAAA-3',
       'AAACCGCGTACTTCAC-3', 'AAACCGGCACTCGCTC-1', 'AAACGCGCAATGAAGC-2',
       'AAACGCGCAGGAATCG-4', 'AAACGGATCCGCCTCA-2', 'AAACGGATCTAGCGTG-3',
       'AAACGTACAAGGATTA-2',
       ...
       'TTTGTGAAGGCGGATG-3', 'TTTGTGAAGTAAGTGG-1', 'TTTGTGGCACACAATT-3',
       'TTTGTGGCAGGTTATT-3', 'TTTGTGTTCAACCAAC-4', 'TTTGTGTTCAATGACC-4',
       'TTTGTGTTCCCATAAA-1', 'TTTGTGTTCGCCTGTT-4', 'TTTGTGTTCTAAGGAG-1',
       'TTTGTTGGTCCTAAGA-1'],
      dtype='object', length=3900)


# Computing the AUCs

Originally, I had two loops and computed AUCs for each individual HIV gene in addition to the total target goal. However, computing roughly 100,000 × 7 ROC-AUC scores takes multiple hours on my CPU, so I removed the per-gene loop for the report.

In [None]:
# Score every feature column by how well its binarised value (> 0) alone
# separates the binarised "Target" column, using one ROC curve per column.
aucdict = dict()
aucs = []        # ROC-AUC of each feature column
l1vertdist = []  # sum of |TPR - FPR| over the ROC points of each column
fprs = []        # false-positive-rate array of each column's ROC curve
tprs = []        # true-positive-rate array of each column's ROC curve

# The binary target is the same for every feature column, so build it once
# instead of once per iteration.
bin_target = (multiomics["Target"].to_numpy() > 0).astype(int)

# `multiomics_sparse` is stored as CSR, where extracting a single column is
# slow; convert once to CSC so each per-column slice is cheap.
multiomics_csc = multiomics_sparse.tocsc()

for i in tqdm(range(multiomics_csc.shape[1])):
    # Densify just this column and flatten it to 1-D before thresholding.
    bin_col = (multiomics_csc[:, i].toarray().ravel() > 0).astype(int)
    fpr, tpr, _ = roc_curve(bin_target, bin_col)
    aucs.append(auc(fpr, tpr))
    l1vertdist.append(np.sum(np.abs(tpr - fpr)))
    fprs.append(fpr)
    tprs.append(tpr)

# Key 6 presumably identifies the combined target among the per-target runs
# that used to be computed here — TODO confirm.
aucdict[6] = aucs

# Show a short summary rather than dumping one float per feature column.
print("computed {} AUCs; first 10: {}".format(len(aucs), aucs[:10]))

 75%|████████████████████████████████████████████████████████▎                  | 81332/108425 [56:08<19:06, 23.63it/s]

In [None]:
# Collect, over every target stored in `aucdict`, the positions of the
# feature columns with the largest ROC separation.
topvals = 100  # number of feature columns kept per target
indset = set()
for auckey in aucdict.keys():
    print(np.array(aucdict[auckey]).shape)

    # Ranking criterion: the summed |TPR - FPR| distance of each column.
    # NOTE(review): `l1vertdist` is a single flat list, not keyed by target,
    # so every key in `aucdict` selects the same columns. With the one key
    # currently stored this is harmless; revisit if more targets are added.
    scores = np.asarray(l1vertdist)

    # argpartition raises when kth is out of bounds, so never ask for more
    # columns than exist.
    k = min(topvals, scores.size)
    ind = np.argpartition(scores, -k)[-k:]

    indset.update(ind.tolist())
    print(len(indset))

In [None]:
# Draw one ROC curve per selected feature column.
# `plt` is expected to be in scope already (presumably matplotlib.pyplot,
# brought in by `from dataloaders import *`) — TODO confirm.
for ind in indset:
    plt.figure()
    # Index the per-column lists `fprs` / `tprs`; `fpr` / `tpr` only hold the
    # arrays of the last column processed by the AUC loop.
    plt.plot(fprs[ind], tprs[ind])
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.title("ROC curve, feature column {}".format(ind))

In [9]:
# Assemble the reduced table: the selected feature columns first, then the
# target columns, then the "dataset" column.
#
# NOTE(review): the positions in `indset` were computed on the columns of
# `multiomics_sparse`; this assumes the dense frame, once its target and
# "dataset" columns are dropped, has the same column order — TODO confirm.
multiomics_clean = pd.concat(
    (
        multiomics.drop(columns=target_tags)
        .drop(columns=["dataset"])
        .iloc[:, list(indset)],
        multiomics[target_tags],
        multiomics["dataset"],
    ),
    axis=1,
)
print(multiomics_clean.shape)

(3900, 108)


In [10]:
# Write the reduced table to disk; the file name records how many feature
# columns were selected.
out_path = "../data_csv/multiomics_auc_dist_{}.csv".format(len(indset))
multiomics_clean.to_csv(out_path, index=None)