In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import umap
import os
from os.path import join
from tqdm import tqdm
from disease_ontology import sorted_cancer_subtypes

from scipy.sparse import csr_matrix
from sklearn.svm import SVC, OneClassSVM
from sklearn.model_selection import cross_validate, train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler

# import warnings
# warnings.filterwarnings("ignore")

In [2]:
# --- Latent embeddings ------------------------------------------------------
# Column 0 is dropped and the remaining columns are used as features
# (presumably column 0 is a sample identifier -- TODO confirm against the CSV).
embeddings_data = pd.read_csv(join("embeddings", "mmd_vae_embeddings.csv"))
embeddings = embeddings_data.to_numpy()[:, 1:]

# --- Per-sample subtype encodings -------------------------------------------
# "ID_sample" is kept as strings; every column after the first is treated as
# one subtype indicator column.
sample_encodings = pd.read_csv(join("data", "sample_subtype_encodings.csv"))
subtype_encodings = sample_encodings.to_numpy()[:, 1:]
sample_ids = sample_encodings["ID_sample"].astype(str).values

# --- Ordered mutation vocabulary --------------------------------------------
with open(join("data", "sorted_mutations.json")) as mutations_file:
    sorted_mutations = json.load(mutations_file)

In [3]:
# Standardize every embedding dimension; the fitted scaler is kept around in
# `scaler` so the same transformation can be re-applied later.
scaler = StandardScaler()
scaler.fit(embeddings)
standardized_embeddings = scaler.transform(embeddings)

In [7]:
# Evaluate how well a linear SVM separates each cancer subtype (one-vs-rest)
# using the standardized embeddings as features.
#
# Protocol: all samples are used, class imbalance is handled through
# class_weight='balanced', and metrics are the median over 3 CV folds.
# The previous version also drew a balanced subsample of indices here, but the
# subsample was never applied to X / y, so that dead code has been removed
# (the scores are unaffected).
#
# NOTE(review): the mutation-profile baselines later in this notebook are
# evaluated on a balanced subsample with 5-fold CV, so the AUCs compared in the
# final table are not produced under an identical protocol.
embeddings_evaluation = []  # formatted strings, for display / the markdown table
raw_evaluations = []        # unformatted floats
# Retained from the original cell: seeds NumPy's global RNG. No step in this
# cell draws from it any more.
np.random.seed(100)

scoring = ['accuracy', 'precision_macro', 'recall_macro', 'roc_auc']

# Wrapping the iterable (not the enumerate object) lets tqdm show the total.
for i, st in enumerate(tqdm(sorted_cancer_subtypes)):
    labels = subtype_encodings[:, i]
    n_positive_samples = int(np.count_nonzero(labels == 1))

    clf = SVC(C=3, kernel="linear", random_state=42, class_weight='balanced')
    scores = cross_validate(clf, standardized_embeddings, labels, scoring=scoring, cv=3)

    accuracy = np.median(scores["test_accuracy"])
    precision = np.median(scores["test_precision_macro"])
    recall = np.median(scores["test_recall_macro"])
    auc = np.median(scores["test_roc_auc"])
    raw_evaluations.append((st, n_positive_samples, auc, accuracy, precision, recall))

    # Same tuple layout as the raw results, rendered as strings.
    embeddings_evaluation.append((
        st,
        n_positive_samples,
        str(round(auc, 2)),
        "{:.2f} %".format(accuracy*100),
        "{:.2f} %".format(precision*100),
        "{:.2f} %".format(recall*100),
    ))


56it [2:00:41, 129.32s/it]


In [8]:
# Display the formatted results: one (subtype, n_positive_samples, AUC,
# accuracy, precision, recall) tuple per cancer subtype.
embeddings_evaluation

[('hepatocellular_carcinoma', 849, '0.73', '54.40 %', '52.10 %', '62.54 %'),
 ('liver_cancer', 1315, '0.67', '54.89 %', '52.34 %', '58.96 %'),
 ('colon_adenocarcinoma', 575, '0.78', '66.44 %', '52.13 %', '65.89 %'),
 ('colon_carcinoma', 582, '0.78', '66.48 %', '52.23 %', '65.74 %'),
 ('colon_cancer', 633, '0.79', '64.61 %', '52.34 %', '65.87 %'),
 ('colorectal_cancer', 633, '0.79', '64.61 %', '52.34 %', '65.87 %'),
 ('intestinal_cancer', 1814, '0.81', '72.73 %', '58.41 %', '66.31 %'),
 ('esophageal_carcinoma', 969, '0.8', '62.63 %', '53.44 %', '72.70 %'),
 ('stomach_cancer', 906, '0.81', '56.94 %', '53.23 %', '63.67 %'),
 ('biliary_tract_cancer', 613, '0.62', '60.07 %', '50.77 %', '57.51 %'),
 ('gastrointestinal_system_cancer',
  6255,
  '0.8',
  '69.35 %',
  '67.46 %',
  '62.16 %'),
 ('integumentary_system_cancer', 668, '0.63', '67.00 %', '51.02 %', '58.94 %'),
 ('diffuse_large_B_cell_lymphoma',
  256,
  '0.68',
  '59.22 %',
  '50.74 %',
  '59.50 %'),
 ('lymphoma', 760, '0.74', '57.64

## Comparison to SVMs trained on the raw (non-embedded) mutation profiles

In [18]:
with open(join("data", "mutations_mapping_split.json")) as mapping_file:
    mutations_mapping = json.load(mapping_file)

# Every value of the mapping is indexed positionally: element 0 feeds the
# deleterious dictionary, element 1 the non-deleterious one.
deleterious_mutations = {}
non_deleterious_mutations = {}
for mapping_key, mutation_split in mutations_mapping.items():
    deleterious_mutations[mapping_key] = mutation_split[0]
    non_deleterious_mutations[mapping_key] = mutation_split[1]

# Mutation name -> position in `sorted_mutations` (used as a column index).
mutations_lookup = dict(zip(sorted_mutations, range(len(sorted_mutations))))

In [19]:
def _build_profile_matrix(mutations_by_sample, ordered_sample_ids, column_of, n_columns):
    """Build a (samples x mutations) sparse count matrix.

    Row ``r`` always corresponds to ``ordered_sample_ids[r]``. A sample that has
    no entry in ``mutations_by_sample`` (or an empty list) yields an all-zero
    row instead of being skipped, so the rows stay aligned with any label array
    that follows the same sample order.

    Parameters
    ----------
    mutations_by_sample : dict mapping sample id -> list of mutation names.
    ordered_sample_ids : sequence of sample ids defining the row order.
    column_of : dict mapping mutation name -> column index.
    n_columns : total number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (len(ordered_sample_ids), n_columns).
    A mutation listed several times for one sample is summed by the constructor.

    Raises
    ------
    KeyError if a mutation name is not present in ``column_of``.
    """
    rows = []
    cols = []
    for row_idx, sid in enumerate(ordered_sample_ids):
        sample_cols = [column_of[m] for m in mutations_by_sample.get(sid, ())]
        cols.extend(sample_cols)
        rows.extend([row_idx] * len(sample_cols))
    data = np.ones(len(rows), dtype=int)
    # An explicit shape replaces the old trick of storing placeholder zeros to
    # force trailing rows into existence.
    return csr_matrix(
        (data, (np.array(rows, dtype=int), np.array(cols, dtype=int))),
        shape=(len(ordered_sample_ids), n_columns),
    )


# Previously, samples missing from `mutations_mapping` were skipped, which
# shifted every following row and silently broke the row <-> label alignment.
# They are now kept as all-zero rows and reported.
missing_ids = [sid for sid in sample_ids if sid not in mutations_mapping]
if missing_ids:
    print("{} of {} samples have no entry in mutations_mapping; "
          "their profile rows are left all-zero".format(len(missing_ids), len(sample_ids)))

# Both matrices share the full mutation vocabulary as columns, so the column
# count equals len(sorted_mutations) rather than (largest used index + 1).
del_mutations_profiles = _build_profile_matrix(
    deleterious_mutations, sample_ids, mutations_lookup, len(sorted_mutations))
nd_mutations_profiles = _build_profile_matrix(
    non_deleterious_mutations, sample_ids, mutations_lookup, len(sorted_mutations))

100%|██████████| 25645/25645 [00:01<00:00, 16820.16it/s]


In [21]:
# Sanity check: (number of sample rows, number of mutation columns) of the
# deleterious-mutation profile matrix.
del_mutations_profiles.shape

(25645, 12449)

In [24]:
def _cv_median_metrics(features, targets, n_folds=5):
    """Cross-validate a linear SVC and summarise the test metrics.

    Returns the tuple ``(auc, accuracy, precision, recall)``, each being the
    median over the ``n_folds`` folds; precision and recall are macro-averaged.
    """
    clf = SVC(C=3, kernel="linear", random_state=42)
    scoring = ['accuracy', 'precision_macro', 'recall_macro', 'roc_auc']
    scores = cross_validate(clf, features, targets, scoring=scoring, cv=n_folds)
    return (
        np.median(scores["test_roc_auc"]),
        np.median(scores["test_accuracy"]),
        np.median(scores["test_precision_macro"]),
        np.median(scores["test_recall_macro"]),
    )


def _format_metrics(auc, accuracy, precision, recall):
    """Render raw metrics as strings: AUC rounded to 2 decimals, the rest as percentages."""
    return (
        str(round(auc, 2)),
        "{:.2f} %".format(accuracy*100),
        "{:.2f} %".format(precision*100),
        "{:.2f} %".format(recall*100),
    )


# Baseline: the same one-vs-rest subtype task, but with the sparse mutation
# profiles as features. Each subtype is evaluated on a class-balanced subsample
# (all positives plus an equal number of randomly drawn negatives) with 5-fold
# cross-validation; the deleterious and non-deleterious profiles are scored on
# exactly the same sampled rows.
del_mut_profiles_evaluation = []
raw_del_evaluation = []
nd_mut_profiles_evaluation = []
raw_nd_evaluation = []
np.random.seed(42)

# (feature matrix, list of raw results, list of formatted results)
feature_sets = [
    (del_mutations_profiles, raw_del_evaluation, del_mut_profiles_evaluation),
    (nd_mutations_profiles, raw_nd_evaluation, nd_mut_profiles_evaluation),
]

# Wrapping the iterable (not the enumerate object) lets tqdm show the total.
for i, st in enumerate(tqdm(sorted_cancer_subtypes)):
    labels = subtype_encodings[:,i]
    positive_samples = np.where(labels==1)[0]
    negative_samples = np.where(labels==0)[0]
    n_positive_samples = len(positive_samples)
    if len(positive_samples)>=len(negative_samples):
        # Not enough negatives to match the positives: use every sample.
        selected_idxs = list(range(len(labels)))
    else:
        sampled_negative_examples = np.random.choice(negative_samples, len(positive_samples), replace=False)
        selected_idxs = np.concatenate((positive_samples, sampled_negative_examples))
    np.random.shuffle(selected_idxs)
    y = labels[selected_idxs]

    for profiles, raw_results, formatted_results in feature_sets:
        metrics = _cv_median_metrics(profiles[selected_idxs,:], y)
        raw_results.append((st, n_positive_samples) + metrics)
        formatted_results.append((st, n_positive_samples) + _format_metrics(*metrics))

56it [3:07:24, 200.80s/it] 


In [26]:
# Persist the formatted baseline results so the multi-hour evaluation does not
# have to be repeated. The output directory is created on demand: without it a
# fresh checkout would fail here with FileNotFoundError.
results_dir = "ignore"
os.makedirs(results_dir, exist_ok=True)

with open(join(results_dir, "del_mut_profiles_evaluation_linear.json"), "w") as f:
    json.dump(del_mut_profiles_evaluation, f)
with open(join(results_dir, "nd_mut_profiles_evaluation_linear.json"), "w") as f:
    json.dump(nd_mut_profiles_evaluation, f)

In [28]:
# Display the formatted results for the non-deleterious mutation profiles: one
# (subtype, n_positive_samples, AUC, accuracy, precision, recall) tuple per subtype.
nd_mut_profiles_evaluation

[('hepatocellular_carcinoma', 849, '0.65', '61.95 %', '62.53 %', '61.91 %'),
 ('liver_cancer', 1315, '0.68', '65.21 %', '65.67 %', '65.21 %'),
 ('colon_adenocarcinoma', 575, '0.76', '69.57 %', '70.31 %', '69.57 %'),
 ('colon_carcinoma', 582, '0.76', '71.67 %', '71.99 %', '71.64 %'),
 ('colon_cancer', 633, '0.76', '71.15 %', '71.64 %', '71.11 %'),
 ('colorectal_cancer', 633, '0.75', '70.75 %', '71.75 %', '70.70 %'),
 ('intestinal_cancer', 1814, '0.74', '70.11 %', '70.18 %', '70.11 %'),
 ('esophageal_carcinoma', 969, '0.76', '72.35 %', '72.62 %', '72.34 %'),
 ('stomach_cancer', 906, '0.75', '68.60 %', '68.93 %', '68.61 %'),
 ('biliary_tract_cancer', 613, '0.65', '58.37 %', '59.69 %', '58.29 %'),
 ('gastrointestinal_system_cancer',
  6255,
  '0.66',
  '63.71 %',
  '63.72 %',
  '63.71 %'),
 ('integumentary_system_cancer', 668, '0.55', '50.56 %', '50.69 %', '50.62 %'),
 ('diffuse_large_B_cell_lymphoma',
  256,
  '0.75',
  '57.84 %',
  '63.49 %',
  '57.84 %'),
 ('lymphoma', 760, '0.74', '63.

In [31]:
# Write a markdown table comparing the AUC of the three feature sets per
# subtype. The best value(s) in each row are wrapped in <b>...</b>, and
# `counters` tallies how often each feature set is (co-)best, in the order
# [embeddings, Xdel, Xnd]. AUCs are compared after parsing the already-rounded
# strings, so ties are possible and every tied column is bolded and counted.
counters = [0, 0, 0]
with open(join("embeddings", "svm_evaluation.txt"), "w") as f:
    f.write("| Cancer Subtype | N. Pos. Samples | Embeddings AUC | Xdel AUC | Xnd AUC |\n")
    f.write("| --- | --- | --- | --- | --- |\n")

    for row_idx, embedding_row in enumerate(embeddings_evaluation):
        subtype_name, positives = embedding_row[0], embedding_row[1]
        auc_values = [
            float(embedding_row[2]),
            float(del_mut_profiles_evaluation[row_idx][2]),
            float(nd_mut_profiles_evaluation[row_idx][2]),
        ]
        best_auc = max(auc_values)

        auc_cells = []
        for column, value in enumerate(auc_values):
            if value == best_auc:
                counters[column] += 1
                auc_cells.append("<b>{}</b>".format(value))
            else:
                auc_cells.append(value)

        f.write("| {} | {} | {} | {} | {} |\n".format(subtype_name, positives, *auc_cells))

In [32]:
# Number of subtypes on which each feature set reached the top (rounded) AUC,
# in the order [embeddings, Xdel, Xnd]; a tie increments every tied entry, so
# the sum can exceed the number of subtypes.
counters

[24, 25, 11]