In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import umap
import os
from os.path import join
from tqdm import tqdm
from disease_ontology import sorted_cancer_subtypes

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

import warnings
warnings.filterwarnings("ignore")

In [2]:
sample_encodings = pd.read_csv(join("data", "sample_subtype_encodings.csv"))
sample_ids = sample_encodings["ID_sample"].astype(str).values
subtype_encodings = sample_encodings.values[:,1:]
with open(join("data", "sorted_mutations.json"), "r") as f:
    sorted_mutations = json.load(f)

embeddings_data = pd.read_csv(join("embeddings", "vae_embeddings.csv"))
embeddings = embeddings_data.values[:,1:] 

In [12]:
embeddings_evaluation = []
raw_evaluations = []
np.random.seed(42)

for i, st in tqdm(enumerate(sorted_cancer_subtypes)):
    labels = subtype_encodings[:,i]
    positive_samples = np.where(labels==1)[0]
    negative_samples = np.where(labels==0)[0]
    n_positive_samples = len(positive_samples)
    if len(positive_samples)>=len(negative_samples):
        selected_idxs = list(range(len(labels)))
    else:
        sampled_negative_examples = np.random.choice(negative_samples, len(positive_samples), replace=False)
        selected_idxs = np.concatenate((positive_samples, sampled_negative_examples))
    np.random.shuffle(selected_idxs)
    clf = SVC(C=3, kernel="rbf", random_state=42)
    scoring = ['accuracy', 'precision_macro', 'recall_macro']
    X = embeddings[selected_idxs,:]
    y = labels[selected_idxs]
    scores = cross_validate(clf, X, y, scoring=scoring, cv=10)

    accuracy = np.median(scores["test_accuracy"])
    precision = np.median(scores["test_precision_macro"])
    recall = np.median(scores["test_recall_macro"])
    raw_evaluations.append((st, n_positive_samples, accuracy, precision, recall))
    accuracy = "{:.2f} %".format(accuracy*100)
    precision = "{:.2f} %".format(precision*100)
    recall = "{:.2f} %".format(recall*100)
    embeddings_evaluation.append((st, n_positive_samples,accuracy, precision, recall))
    
    
with open(join("embeddings", "svm_classification.txt"), "w") as f:
    f.write("| Cancer Subtype | N. Pos. Samples | Accuracy | Precision | Recall |\n")
    f.write("| --- | --- | --- | --- | --- |\n")
    for st, n_positive_samples, accuracy, precision, recall in embeddings_evaluation:
        f.write("| {} | {} | {} | {} | {} |\n".format(st, n_positive_samples, accuracy, precision, recall))

8it [00:20,  2.60s/it]


KeyboardInterrupt: 

In [4]:
np.median([ee[1] for ee in raw_evaluations])

0.6331553536485836

In [5]:
np.median([ee[1] for ee in raw_evaluations])

0.6331553536485836

In [6]:
with open(join("embeddings", "svm_classification.txt"), "w") as f:
    f.write("| Cancer Subtype | Accuracy | Precision | Recall |\n")
    f.write("| --- | --- | --- | --- | --- |\n")
    for st, accuracy, precision, recall in embeddings_evaluation:
        f.write("| {} | {} | {} | {} |\n".format(st, accuracy, precision, recall))

In [7]:
scores

{'fit_time': array([0.04221463, 0.0404973 , 0.0383091 , 0.03911734, 0.0386579 ]),
 'score_time': array([0.01020503, 0.0101397 , 0.0107758 , 0.00963354, 0.00997567]),
 'test_accuracy': array([0.58682635, 0.56287425, 0.56024096, 0.57831325, 0.54819277]),
 'test_precision_macro': array([0.58759019, 0.5668175 , 0.56175595, 0.57872465, 0.55030303]),
 'test_recall_macro': array([0.58706254, 0.56353987, 0.56024096, 0.57831325, 0.54819277])}

In [8]:
roc_curve()

NameError: name 'roc_curve' is not defined