In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import glob
from pprint import pprint
from collections import Counter

In [None]:
# Font sizes (in points) shared by every figure in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 22

# Default text size.
plt.rc('font', size=SMALL_SIZE)
# Axes: title font size and x/y label font size.
plt.rc('axes', titlesize=SMALL_SIZE, labelsize=MEDIUM_SIZE)
# Tick labels on both axes.
plt.rc('xtick', labelsize=SMALL_SIZE)
plt.rc('ytick', labelsize=SMALL_SIZE)
# Legend entries.
plt.rc('legend', fontsize=SMALL_SIZE)
# Figure-level title.
plt.rc('figure', titlesize=BIGGER_SIZE)

In [8]:
# Drug identifiers as they appear in the 'D' column of the test/prediction
# tables; create_survival_plot uses them as lookup keys.
# EVEROLIMUS is compared for samples flagged 'mTOR Inhibitor Overall' and
# RIBOCICLIB for samples flagged 'CDK4/6 Inhibitor Overall'.
# NOTE(review): these look like SMILES strings for the named compounds -- confirm
# they match the entries in GENIE_drug2ind.txt character for character.
EVEROLIMUS = "C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)OCCO)C)/C)O)OC)C)C)/C)OC"
RIBOCICLIB = "CN(C)C(=O)C1=CC2=CN=C(N=C2N1C3CCCC3)NC4=NC=C(C=C4)N5CCNCC5"

In [4]:
def generate_test_data(genie_data, mutation_data, drugs):
    """Build the cell-line index, mutation matrix and placeholder test table.

    Parameters
    ----------
    genie_data : pandas.DataFrame
        Clinical table; must contain 'Sample ID', 'CDK4/6 Inhibitor Overall'
        and 'mTOR Inhibitor Overall'.
    mutation_data : pandas.DataFrame
        Per-sample matrix with a 'sampleId' column and an 'Altered' column;
        every other column is kept as-is.
    drugs : iterable
        Drug identifiers to pair with every selected sample.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(cell_line_df, filtered_mut_data, test_data_df)`` where
        ``cell_line_df`` has one column 'C' with the sorted sample IDs,
        ``filtered_mut_data`` holds the matching rows of ``mutation_data``
        sorted by 'sampleId' with 'sampleId' and 'Altered' removed, and
        ``test_data_df`` has columns 'C', 'D', 'AUC' with one row per
        (drug, sample) pair and a constant AUC of 0.5.
    """
    # Samples flagged "Yes" for at least one of the two inhibitor classes.
    treated = genie_data.query('`CDK4/6 Inhibitor Overall` == "Yes" or `mTOR Inhibitor Overall` == "Yes"')
    cell_lines = sorted(treated['Sample ID'])
    cell_line_df = pd.DataFrame(cell_lines, columns=['C'])

    # Sorting by sampleId keeps the mutation rows in the same order as cell_lines
    # (as long as each selected sample has exactly one row in mutation_data).
    filtered_mut_data = (
        mutation_data
        .query('sampleId in @cell_lines')
        .sort_values(by=['sampleId'])
        .drop(columns=['sampleId', 'Altered'])
    )

    # Drug-major ordering: all samples for the first drug, then the next drug, ...
    test_data = [(c, d, 0.5) for d in drugs for c in cell_lines]
    test_data_df = pd.DataFrame(test_data, columns=['C', 'D', 'AUC'])

    return cell_line_df, filtered_mut_data, test_data_df

In [9]:
def generate_cell_mutation_file(all_genie_genes, other_genes, genie_data, all_mutation_data):
    """Project the GENIE mutation matrix onto another model's gene set.

    Parameters
    ----------
    all_genie_genes : list
        Gene names present as columns in ``all_mutation_data``.
    other_genes : list
        Gene names expected by the target model.
    genie_data : pandas.DataFrame
        Clinical table; must contain 'Sample ID', 'CDK4/6 Inhibitor Overall'
        and 'mTOR Inhibitor Overall'.
    all_mutation_data : pandas.DataFrame
        Per-sample matrix with 'sampleId' and 'Altered' columns plus one
        column per gene. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows for the samples flagged "Yes" for either inhibitor class, sorted
        by sample ID. Genes only in ``all_genie_genes`` are dropped, genes only
        in ``other_genes`` are added as all-zero columns, and the columns are
        sorted by name.

    Raises
    ------
    KeyError
        If 'sampleId'/'Altered' or a gene listed only in ``all_genie_genes``
        is not a column of ``all_mutation_data``.
    """
    cell_lines = genie_data.query('`CDK4/6 Inhibitor Overall` == "Yes" or `mTOR Inhibitor Overall` == "Yes"')
    cell_lines = sorted(cell_lines['Sample ID'])

    filtered_mut_data = all_mutation_data.query('sampleId in @cell_lines')
    filtered_mut_data = filtered_mut_data.sort_values(by=['sampleId'])
    filtered_mut_data = filtered_mut_data.drop(columns=['sampleId', 'Altered'])

    # Set membership keeps the overlap computation linear; the gene lists can
    # hold thousands of names, so list-in-list lookups were quadratic.
    genie_gene_set = set(all_genie_genes)
    other_gene_set = set(other_genes)

    # dict.fromkeys de-duplicates while preserving the caller's order.
    non_overlapping_genes_AnotB = [g for g in dict.fromkeys(all_genie_genes) if g not in other_gene_set]  # A-B
    non_overlapping_genes_BnotA = [g for g in dict.fromkeys(other_genes) if g not in genie_gene_set]  # B-A

    filtered_mut_data = filtered_mut_data.drop(columns=non_overlapping_genes_AnotB)

    if non_overlapping_genes_BnotA:
        # A column that exists in the matrix but is not listed in all_genie_genes
        # used to be overwritten with zeros; dropping it first keeps that result
        # and avoids duplicate column labels after the concat.
        filtered_mut_data = filtered_mut_data.drop(columns=non_overlapping_genes_BnotA, errors='ignore')
        # Add every missing gene in a single block instead of one column at a
        # time, which fragments the frame and triggers a PerformanceWarning.
        zero_block = pd.DataFrame(0, index=filtered_mut_data.index, columns=non_overlapping_genes_BnotA)
        filtered_mut_data = pd.concat([filtered_mut_data, zero_block], axis=1)

    return filtered_mut_data.sort_index(axis=1)

In [50]:
def create_survival_plot(g_data, test_data, predict_data):
    """Split treated samples into predicted responders and non-responders.

    Despite the name, nothing is plotted here: the function returns the two
    groups' survival values so the caller can summarise or plot them.

    Parameters
    ----------
    g_data : pandas.DataFrame
        Clinical table with 'Sample ID', 'CDK4/6 Inhibitor Overall',
        'mTOR Inhibitor Overall' and 'Overall Survival (Months)'.
    test_data : pandas.DataFrame
        Table with columns 'C' (sample) and 'D' (drug); combined column-wise
        (by index) with ``predict_data``.
    predict_data : array-like
        Predicted values, one per row of ``test_data``.

    Returns
    -------
    tuple of dict
        ``(dc_p, dc_n)`` mapping sample ID to 'Overall Survival (Months)'.
        A sample goes to ``dc_p`` when its prediction is at or below the
        per-drug median for every inhibitor class it is flagged "Yes" for,
        otherwise to ``dc_n``.
    """
    treated = g_data.query('`CDK4/6 Inhibitor Overall` == "Yes" or `mTOR Inhibitor Overall` == "Yes"')
    treated = treated.sort_values(by=['Sample ID'])

    predictions = pd.concat([test_data, pd.Series(predict_data, name='P_AUC')], axis=1)
    predictions = predictions[['C', 'D', 'P_AUC']]

    # Per-drug median over every row of the prediction table.
    pred_median = predictions.groupby('D')['P_AUC'].median().to_dict()
    print(pred_median)

    dc_p, dc_n = {}, {}

    for _, patient in treated.iterrows():
        cell_line = patient['Sample ID']
        per_sample = predictions[predictions['C'] == cell_line]
        dp_map = dict(zip(per_sample['D'], per_sample['P_AUC']))

        got_cdk = patient['CDK4/6 Inhibitor Overall'] == 'Yes'
        got_mtor = patient['mTOR Inhibitor Overall'] == 'Yes'

        if not (got_cdk or got_mtor):
            # The query above should already have excluded these rows.
            print('This code should have been unreachable')
            continue

        # Drugs to check, in the same order as the original comparisons so the
        # short-circuit behaviour of all() matches.
        relevant_drugs = []
        if got_cdk:
            relevant_drugs.append(RIBOCICLIB)
        if got_mtor:
            relevant_drugs.append(EVEROLIMUS)

        is_positive = all(dp_map[drug] <= pred_median[drug] for drug in relevant_drugs)

        target = dc_p if is_positive else dc_n
        target[cell_line] = patient['Overall Survival (Months)']

    return dc_p, dc_n


In [12]:
# Clinical annotations, one row per sample.
genie_data = pd.read_csv('../data/GENIE/brca_akt1_genie_2019_clinical_data.tsv', sep='\t')

# Mutation matrices: the 'studyID:sampleId' key is split on ":" and only the
# part after the colon is kept, as a new 'sampleId' column.
nest_mutation_data = pd.read_csv('../data/GENIE/sample_matrix_nest.txt', sep='\t')
nest_mutation_data = nest_mutation_data.assign(
    sampleId=nest_mutation_data['studyID:sampleId'].str.split(":", expand=True)[1]
).drop(columns=['studyID:sampleId'])

all_mutation_data = pd.read_csv('../data/GENIE/sample_matrix_all_genes.txt', sep='\t')
all_mutation_data = all_mutation_data.assign(
    sampleId=all_mutation_data['studyID:sampleId'].str.split(":", expand=True)[1]
).drop(columns=['studyID:sampleId'])

# Index/name tables: only the name column is used downstream.
drugs = pd.read_csv(
    '../data/GENIE/GENIE_drug2ind.txt', sep='\t', header=None, names=['I', 'D']
)['D']

predict_data = np.loadtxt('../result/predict_genie_clinical_trial.txt')
# NOTE: GENIE_test.txt is also written by a later cell of this notebook, so on a
# fresh checkout this read needs the file to exist already.
test_data = pd.read_csv(
    '../data/GENIE/GENIE_test.txt', sep='\t', header=None, names=['C', 'D', 'AUC']
)

all_genie_genes = pd.read_csv(
    '../data/GENIE/GENIE_gene_list.txt', header=None, names=['G']
)['G']

drugcell_genes = pd.read_csv(
    '../../drugcell/data/gene2ind.txt', sep='\t', header=None, names=['I', 'G']
)['G']

In [13]:
# Build the model inputs from the NeST mutation matrix.
# Note: this rebinds `test_data`, replacing the frame read from GENIE_test.txt.
cell_lines, filtered_mut_data, test_data = generate_test_data(
    genie_data=genie_data,
    mutation_data=nest_mutation_data,
    drugs=drugs,
)

# The row index is written as the first column of the cell2ind file.
cell_lines.to_csv(
    '../data/GENIE/GENIE_cell2ind.txt', sep='\t', index=True, header=False
)
# Comma-separated matrix with neither header nor index.
filtered_mut_data.to_csv(
    '../data/GENIE/GENIE_cell2mutation.txt', index=False, header=False
)
test_data.to_csv(
    '../data/GENIE/GENIE_test.txt', sep='\t', index=False, header=False
)

In [14]:
# Same samples, but with the columns aligned to the gene list in drugcell's gene2ind.txt.
# Note: this rebinds `filtered_mut_data` from the previous cell.
filtered_mut_data = generate_cell_mutation_file(
    all_genie_genes=list(all_genie_genes),
    other_genes=list(drugcell_genes),
    genie_data=genie_data,
    all_mutation_data=all_mutation_data,
)
filtered_mut_data.to_csv(
    '../../drugcell/data/GENIE_cell2mutation.txt', index=False, header=False
)

In [36]:
# Split the treated samples using the predictions loaded into `predict_data`.
dc_p, dc_n = create_survival_plot(
    g_data=genie_data,
    test_data=test_data,
    predict_data=predict_data,
)

# Median 'Overall Survival (Months)' and group size: positives first, then negatives.
print(np.median(list(dc_p.values())), len(dc_p))
print(np.median(list(dc_n.values())), len(dc_n))

{'CN(C)C(=O)C1=CC2=CN=C(N=C2N1C3CCCC3)NC4=NC=C(C=C4)N5CCNCC5': 0.67694, 'C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)OCCO)C)/C)O)OC)C)C)/C)OC': 0.62833, 'C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)[C@@H](CC4=C3C=CC(=C4)O)CCCCCCCCCS(=O)CCCC(C(F)(F)F)(F)F': 0.91865}
43.125 111
37.828947365000005 110


In [51]:
# Repeat the survival split with the predictions from the drugcell result file.
drugcell_predict_data = np.loadtxt('../../drugcell/result/predict_genie_drugcell.txt')
dc_p, dc_n = create_survival_plot(
    g_data=genie_data,
    test_data=test_data,
    predict_data=drugcell_predict_data,
)

# Median 'Overall Survival (Months)' and group size: positives first, then negatives.
print(np.median(list(dc_p.values())), len(dc_p))
print(np.median(list(dc_n.values())), len(dc_n))

{'CN(C)C(=O)C1=CC2=CN=C(N=C2N1C3CCCC3)NC4=NC=C(C=C4)N5CCNCC5': 0.70896, 'C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)OCCO)C)/C)O)OC)C)C)/C)OC': 0.63519, 'C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)[C@@H](CC4=C3C=CC(=C4)O)CCCCCCCCCS(=O)CCCC(C(F)(F)F)(F)F': 0.91576}
36.858552630000005 104
43.84868421 117
