In [None]:
from collections import Counter
import glob
from lifelines import KaplanMeierFitter
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import researchpy as rp
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
# Font sizes shared by every figure drawn in this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 16
BIGGER_SIZE = 22

# One (rc group, settings) entry per matplotlib rc group that is customised.
_FONT_RC_SETTINGS = (
    ('font', {'size': SMALL_SIZE}),             # default text size
    ('axes', {'titlesize': SMALL_SIZE,          # axes title
              'labelsize': MEDIUM_SIZE}),       # x and y axis labels
    ('xtick', {'labelsize': SMALL_SIZE}),       # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),       # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),       # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),     # figure title
)
for _rc_group, _rc_kwargs in _FONT_RC_SETTINGS:
    plt.rc(_rc_group, **_rc_kwargs)

In [None]:
# SMILES strings identifying each drug; they are the keys of both maps below and
# are compared against the 'smiles' column of the prediction table.
EVEROLIMUS = "C[C@@H]1CC[C@H]2C[C@@H](/C(=C/C=C/C=C/[C@H](C[C@H](C(=O)[C@@H]([C@@H](/C(=C/[C@H](C(=O)C[C@H](OC(=O)[C@@H]3CCCCN3C(=O)C(=O)[C@@]1(O2)O)[C@H](C)C[C@@H]4CC[C@H]([C@@H](C4)OC)OCCO)C)/C)O)OC)C)C)/C)OC"
RIBOCICLIB = "CN(C)C(=O)C1=CC2=CN=C(N=C2N1C3CCCC3)NC4=NC=C(C=C4)N5CCNCC5"
PALBOCICLIB = "CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCNCC4)C5CCCC5)C(=O)C"
CAPIVASERTIB = "C1CN(CCC1(C(=O)N[C@@H](CCO)C2=CC=C(C=C2)Cl)N)C3=NC=NC4=C3C=CN4"
FULVESTRANT = "C[C@]12CC[C@H]3[C@H]([C@@H]1CC[C@@H]2O)[C@@H](CC4=C3C=CC(=C4)O)CCCCCCCCCS(=O)CCCC(C(F)(F)F)(F)F"

# Drug -> clinical-data column holding "Yes"/"No" for treatment with that drug class.
# FULVESTRANT has no entry here.
drug_func_map = {
    RIBOCICLIB: 'CDK4_6_Inhibitor_Overall',
    EVEROLIMUS: 'mTOR_Inhibitor_Overall',
    PALBOCICLIB: 'CDK4_6_Inhibitor_Overall',
    CAPIVASERTIB: 'AKT_Inhibitor_Overall',
}

# Drug -> display name; also used as the name of the per-drug call column.
drug_name_map = {
    EVEROLIMUS: 'Everolimus',
    RIBOCICLIB: 'Ribociclib',
    PALBOCICLIB: 'Palbociclib',
    CAPIVASERTIB: 'Capivasertib',
    FULVESTRANT: 'Fulvestrant',
}

In [None]:
def generate_test_data(genie_data, drugs):
    """Build the sample list and the (sample, drug) table used as model test input.

    Args:
        genie_data: DataFrame with a 'Sample_ID' column.
        drugs: iterable of drug keys; each must be a key of `drug_name_map`.

    Returns:
        (cell_line_df, test_data_df):
          * cell_line_df - one column 'C' holding the sample IDs in sorted order.
          * test_data_df - one row per drug/sample pair (drug-major order) with
            columns cell_line, smiles, auc (always 0.0), dataset (always 'GENIE')
            and drug (display name from `drug_name_map`).
    """
    sample_ids = list(genie_data['Sample_ID'])
    sample_ids.sort()

    # Drug-major ordering: every sample for the first drug, then the second, ...
    records = [
        (sample_id, smiles, 0.0, 'GENIE', drug_name_map[smiles])
        for smiles in drugs
        for sample_id in sample_ids
    ]

    cell_line_df = pd.DataFrame(sample_ids, columns=['C'])
    test_data_df = pd.DataFrame(
        records, columns=['cell_line', 'smiles', 'auc', 'dataset', 'drug']
    )
    return cell_line_df, test_data_df

In [None]:
def generate_cell_mutation_file(all_genie_genes, other_genes, genie_data, all_mutation_data):
    """Build the sample-by-gene mutation matrix restricted to the samples in `genie_data`.

    Rows are the rows of `all_mutation_data` whose 'sampleId' appears in
    genie_data['Sample_ID'], ordered by 'sampleId'. Genes listed only in
    `all_genie_genes` are dropped, genes listed only in `other_genes` are added
    as all-zero columns, and the columns are returned in sorted order.

    Args:
        all_genie_genes: gene names of the GENIE panel (set "A").
        other_genes: gene names expected in the output (set "B").
        genie_data: DataFrame with a 'Sample_ID' column.
        all_mutation_data: DataFrame with 'sampleId', 'Altered' and per-gene columns.
            It is not modified.

    Returns:
        A new DataFrame without the 'sampleId' and 'Altered' columns.

    Raises:
        KeyError: if 'sampleId', 'Altered' or a gene that must be dropped is not a
            column of `all_mutation_data`.
    """
    sample_ids = set(genie_data['Sample_ID'])

    # Set lookups keep the two difference computations linear in the gene counts.
    genie_gene_set = set(all_genie_genes)
    other_gene_set = set(other_genes)
    genes_only_in_genie = [g for g in all_genie_genes if g not in other_gene_set]  # A - B
    # dict.fromkeys removes repeats while keeping first-seen order.
    genes_only_in_other = list(
        dict.fromkeys(g for g in other_genes if g not in genie_gene_set)
    )  # B - A

    filtered_mut_data = (
        all_mutation_data[all_mutation_data['sampleId'].isin(sample_ids)]
        .sort_values(by=['sampleId'])
        .drop(columns=['sampleId', 'Altered'])
        .drop(columns=genes_only_in_genie)
    )

    if genes_only_in_other:
        # A gene outside the GENIE list may still be present as a column; it is
        # replaced by zeros, exactly as a plain column assignment would do.
        stale = [g for g in genes_only_in_other if g in filtered_mut_data.columns]
        # Add all zero columns in one concat rather than inserting them one by one.
        zero_columns = pd.DataFrame(
            0, index=filtered_mut_data.index, columns=genes_only_in_other
        )
        filtered_mut_data = pd.concat(
            [filtered_mut_data.drop(columns=stale), zero_columns], axis=1
        )

    return filtered_mut_data.sort_index(axis=1)

In [None]:
def create_confusion_matrix(table, entity):
    """Draw `table` as an annotated blue heatmap on a new 7x6 figure.

    The colour scale is fixed to the range 0-60, both axes are inverted, and the
    x axis ticks and label are placed at the top of the plot.

    Args:
        table: 2-D table of values to display (passed straight to seaborn.heatmap).
        entity: text prefixed to the axis labels ("<entity> True" on the y axis,
            "<entity> Prediction" on the x axis).

    Returns:
        None. The figure is left as matplotlib's current figure.
    """
    y_label = entity + ' True'
    x_label = entity + ' Prediction'

    _, heat_ax = plt.subplots(figsize=(7, 6))
    sns.heatmap(table, ax=heat_ax, cmap='Blues', annot=True, vmin=0, vmax=60)

    # Reverse the direction of both axes.
    heat_ax.invert_yaxis()
    heat_ax.invert_xaxis()
    plt.yticks(rotation=0)  # keep the row labels horizontal

    # Put the x axis ticks and label above the heatmap.
    top_axis = heat_ax.xaxis
    top_axis.tick_top()
    top_axis.set_label_position('top')

    heat_ax.set_ylabel(y_label)
    heat_ax.set_xlabel(x_label)

In [None]:
def create_kaplan_meier(true_positive, false_negative, false_positive, true_negative, entity):
    """Overlay Kaplan-Meier survival curves for the four outcome groups of one drug.

    Args:
        true_positive, false_negative, false_positive, true_negative: survival
            durations (months) of the patients in each group.
        entity: name shown in the plot title ("<entity> Survival plot").

    Returns:
        None. Nothing is drawn when any of the four groups is empty; otherwise the
        curves are drawn on matplotlib's current axes.

    NOTE(review): no `event_observed` is passed to `fit`, so no censoring
    information is supplied to the estimator - confirm that is intended.
    """
    groups = (
        ('True Sensitive(TP)', true_positive),
        ('False Resistive(FN)', false_negative),
        ('False Sensitive(FP)', false_positive),
        ('True Resistive(TN)', true_negative),
    )
    if any(len(durations) == 0 for _, durations in groups):
        return

    # Draw every curve on the same, explicitly chosen axes.
    ax = plt.gca()
    for label, durations in groups:
        KaplanMeierFitter().fit(durations, label=label).plot(ci_show=False, ax=ax)

    # The fit labels replace any label given to the fitter, so the drug name is
    # shown as the title instead.
    ax.set_title(entity + " Survival plot")
    ax.set_xlim(0, 120)
    ax.set_xlabel('Overall_Survival_Months')
    ax.set_ylabel('Surviving Fractions')

In [None]:
def create_drug_combo_survival_plot(genie_data, pred_df, pred_median_dict, d_list):
    """Print group sizes/medians and plot survival curves for a set of drugs.

    Patients are split by treatment history and by model prediction:

    * treated (CDK4/6 or mTOR inhibitor == "Yes"): "True Sensitive" when, for at
      least one drug in `d_list`, the patient received that drug's class and the
      prediction is at or below the drug's median; otherwise "False Resistive".
    * untreated (both == "No"): "False Sensitive" when the prediction for at least
      one drug in `d_list` is at or below its median; otherwise "True Resistive".

    Args:
        genie_data: clinical DataFrame with 'Sample_ID', 'Overall_Survival_Months',
            'CDK4_6_Inhibitor_Overall' and 'mTOR_Inhibitor_Overall' columns.
        pred_df: predictions with 'smiles' and 'pred' columns and the sample ID in
            'cell_line' (the name used by the prediction frames built in this
            notebook); a column named 'C' is used if 'cell_line' is absent.
        pred_median_dict: drug -> median prediction used as the threshold.
        d_list: drugs to consider; each must be a key of `drug_func_map`.

    Returns:
        None. Group sizes are always printed; the medians and the plot are only
        produced when all four groups are non-empty.

    Raises:
        KeyError: if a needed prediction for a sample/drug pair is missing.
    """
    survival_col = 'Overall_Survival_Months'

    # The sample column was previously hard-coded to 'C', which does not exist in
    # a frame that also carries the 'smiles'/'pred' columns read below.
    sample_col = 'cell_line' if 'cell_line' in pred_df.columns else 'C'

    # Index the predictions once instead of filtering the whole frame per patient.
    preds_by_sample = {
        sample_id: dict(zip(sample_preds['smiles'], sample_preds['pred']))
        for sample_id, sample_preds in pred_df.groupby(sample_col, sort=False)
    }

    def _predicted_sensitive(row, require_treatment):
        """True if any drug in d_list is at/below its median (and was given, if required)."""
        drug_preds = preds_by_sample.get(row['Sample_ID'], {})
        hits = [
            (not require_treatment or row[drug_func_map[d]] == 'Yes')
            and drug_preds[d] <= pred_median_dict[d]
            for d in d_list
        ]
        return any(hits)

    pos_ctrl_df = genie_data.query(
        'CDK4_6_Inhibitor_Overall == "Yes" or mTOR_Inhibitor_Overall == "Yes"'
    ).sort_values(by=['Sample_ID'])
    neg_ctrl_df = genie_data.query(
        'CDK4_6_Inhibitor_Overall == "No" and mTOR_Inhibitor_Overall == "No"'
    ).sort_values(by=['Sample_ID'])

    true_positive, false_negative = [], []
    for _, row in pos_ctrl_df.iterrows():
        bucket = true_positive if _predicted_sensitive(row, True) else false_negative
        bucket.append(row[survival_col])

    print('True Sensitive:', len(true_positive))
    print('False Resistive:', len(false_negative))

    false_positive, true_negative = [], []
    for _, row in neg_ctrl_df.iterrows():
        bucket = false_positive if _predicted_sensitive(row, False) else true_negative
        bucket.append(row[survival_col])

    print('False Sensitive:', len(false_positive))
    print('True Resistive:', len(true_negative))

    groups = (
        ('True Sensitive', 'True Sensitive(TP)', true_positive),
        ('False Resistive', 'False Resistive(FN)', false_negative),
        ('False Sensitive', 'False Sensitive(FP)', false_positive),
        ('True Resistive', 'True Resistive(TN)', true_negative),
    )
    # Curves are only drawn when every group has at least one patient.
    if any(len(durations) == 0 for _, _, durations in groups):
        return

    kmf_d = KaplanMeierFitter(label="Survival plot")
    for name, label, durations in groups:
        kmf_d.fit(durations, label=label)
        print(name + ' median:', kmf_d.median_survival_time_)
        a = kmf_d.plot(ci_show=False)

    a.set_xlim(0, 120)
    a.set_xlabel('Overall_Survival_Months')
    a.set_ylabel('Surviving Fractions')
    return

In [None]:
def create_drug_survival_plot(genie_drug_df, drug):
    """Summarise survival by treatment status vs. predicted response for one drug.

    Each patient is assigned to one of four groups from two columns of
    `genie_drug_df`: the treatment column `drug_func_map[drug]` ("Yes" means
    treated) and the call column `drug_name_map[drug]` ("Sensitive" means
    predicted sensitive). The function then prints the per-group median and
    descriptive statistics of 'Overall_Survival_Months', prints a type-2 two-way
    ANOVA table (treatment, prediction and their interaction), and draws the
    median-survival heatmap via `create_confusion_matrix`.

    Args:
        genie_drug_df: clinical DataFrame holding the two columns above plus
            'Overall_Survival_Months'. A 'Drug_Function' column with the group
            label is written to it in place (overwritten on every call).
        drug: drug key present in both `drug_func_map` and `drug_name_map`.

    Returns:
        (true_positive, false_negative, false_positive, true_negative): four Series
        with the survival months of the patients in each group.
    """
    drug_function = drug_func_map[drug]
    drug_name = drug_name_map[drug]
    survival_col = 'Overall_Survival_Months'

    treated = (genie_drug_df[drug_function] == 'Yes').to_numpy()
    predicted_sensitive = (genie_drug_df[drug_name] == 'Sensitive').to_numpy()

    # Vectorised form of the four-way labelling; callers keep seeing the column.
    genie_drug_df['Drug_Function'] = np.where(
        treated,
        np.where(predicted_sensitive, 'True Sensitive(TP)', 'False Resistive(FN)'),
        np.where(predicted_sensitive, 'False Sensitive(FP)', 'True Resistive(TN)'),
    )

    # Select the survival column before aggregating: a median over the whole frame
    # fails on the text columns in recent pandas versions.
    print(genie_drug_df.groupby(by=['Drug_Function'])[survival_col].median())
    print(rp.summary_cont(genie_drug_df[survival_col].groupby(genie_drug_df['Drug_Function'])))

    group_label = genie_drug_df['Drug_Function']
    survival = genie_drug_df[survival_col]
    true_positive = survival[group_label == 'True Sensitive(TP)']
    false_negative = survival[group_label == 'False Resistive(FN)']
    false_positive = survival[group_label == 'False Sensitive(FP)']
    true_negative = survival[group_label == 'True Resistive(TN)']

    formula = (
        f'Overall_Survival_Months ~ C({drug_function}) + C({drug_name})'
        f' + C({drug_function}):C({drug_name})'
    )
    model = ols(formula, data=genie_drug_df).fit()
    print(sm.stats.anova_lm(model, typ=2))

    # The string alias is the non-deprecated spelling of the median aggregation.
    table = pd.pivot_table(genie_drug_df, values=survival_col,
                           index=drug_function, columns=[drug_name], aggfunc='median')
    create_confusion_matrix(table, drug_name)

    return true_positive, false_negative, false_positive, true_negative

In [None]:
#Common data

# Clinical table. Column names are normalised (spaces and '/' -> '_', parentheses
# removed) so that later cells can reference them inside DataFrame.query strings,
# e.g. CDK4_6_Inhibitor_Overall. regex=False makes every pattern, including
# '(' and ')', a literal on all pandas versions.
genie_data = pd.read_csv('../data/GENIE/brca_akt1_genie_2019_clinical_data.tsv', sep='\t')
genie_data.columns = (
    genie_data.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Gene names, one per line.
all_genie_genes = pd.read_csv('../data/GENIE/GENIE_gene_list.txt', header=None, names=['G'])['G']

# Second column of the tab-separated drug index file.
drugs = pd.read_csv('../data/GENIE/GENIE_all_drug2ind.txt', sep='\t', header=None, names=['I', 'D'])['D']

# (sample, drug) rows; the prediction file is later attached to these rows by position.
test_data = pd.read_csv('../data/GENIE/GENIE_test_zscore.txt', sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset', 'drug'])

# Mutation matrix; keep only the part after ':' of 'studyID:sampleId' as the sample ID.
all_mutation_data = pd.read_csv('../data/GENIE/sample_matrix_all_genes.txt', sep='\t')
all_mutation_data['sampleId'] = all_mutation_data['studyID:sampleId'].str.split(":", expand=True)[1]
all_mutation_data = all_mutation_data.drop(columns=['studyID:sampleId'])

In [None]:
# Generating test data

#drugcell_cg_genes = pd.read_csv('../data/gene2ind_cg.txt', sep='\t', header=None, names=['I', 'G'])['G']

#filtered_mut_data = generate_cell_mutation_file(list(all_genie_genes), list(drugcell_cg_genes), genie_data, all_mutation_data)

#cell_lines, test_data = generate_test_data(genie_data, drugs)

#cell_lines.to_csv('../data/GENIE/GENIE_all_cell2ind.txt', sep='\t', header=False, index=True)
#filtered_mut_data.to_csv('../data/GENIE/GENIE_cell2mutation_cg.txt', header=False, index=False)
#test_data.to_csv("../data/GENIE/GENIE_test_zscore.txt", sep='\t', header=False, index=False)

In [None]:
nest_predict_data = np.loadtxt('../model_cg_4_auc/predict_genie.txt')

nest_pred_series = pd.Series(nest_predict_data, name='pred')
# Predictions are attached to the test rows purely by position; a length mismatch
# would otherwise be padded with NaN silently and mislabel samples.
if len(nest_pred_series) != len(test_data):
    raise ValueError(
        f'prediction file has {len(nest_pred_series)} rows but test_data has {len(test_data)}'
    )
nest_pred_df = pd.concat([test_data, nest_pred_series], axis=1)[['cell_line', 'smiles', 'pred']]

# Per-drug median prediction; used as the sensitive/resistive threshold.
nest_pred_median_dict = nest_pred_df.groupby('smiles')['pred'].median().to_dict()
for key, median_pred in nest_pred_median_dict.items():
    print(drug_name_map[key], ':', median_pred)

In [None]:
# Exclude patients recorded as having received both a CDK4/6 and an mTOR inhibitor.
# drop=True discards the old row labels instead of inserting them as an 'index'
# column, which also keeps this cell safe to re-run.
genie_data = genie_data.query('not (CDK4_6_Inhibitor_Overall == "Yes" and mTOR_Inhibitor_Overall == "Yes")').reset_index(drop=True)

In [None]:
# Start from a copy of the clinical table and add one (initially empty) call
# column per drug, named after the drug.
genie_drug_data = genie_data.copy()
for drug_label in drug_name_map.values():
    genie_drug_data[drug_label] = ""

# A prediction at or below the drug's median is called "Sensitive", otherwise "Resistive".
for i, row in genie_data.iterrows():
    sample_preds = nest_pred_df[nest_pred_df['cell_line'] == row['Sample_ID']]
    for d, drug_label in drug_name_map.items():
        # .item() raises unless exactly one prediction exists for this sample/drug pair.
        p_auc = sample_preds.loc[sample_preds['smiles'] == d, 'pred'].item()
        drug_call = "Sensitive" if p_auc <= nest_pred_median_dict[d] else "Resistive"
        genie_drug_data.at[i, drug_label] = drug_call


In [None]:
# Survival analysis for PALBOCICLIB: prints the per-group survival summaries and the
# ANOVA table, draws the median-survival heatmap, and returns the survival months of
# the TP / FN / FP / TN groups.

tp, fn, fp, tn = create_drug_survival_plot(genie_drug_data, PALBOCICLIB)

In [None]:
# Kaplan-Meier curves for the four Palbociclib groups (nothing is drawn if any group is empty).
create_kaplan_meier(tp, fn, fp, tn, drug_name_map[PALBOCICLIB])

In [None]:
# Survival analysis for EVEROLIMUS: prints the per-group survival summaries and the
# ANOVA table, draws the median-survival heatmap, and returns the survival months of
# the TP / FN / FP / TN groups (tp, fn, fp, tn are rebound here).

tp, fn, fp, tn = create_drug_survival_plot(genie_drug_data, EVEROLIMUS)

In [None]:
# Kaplan-Meier curves for the four Everolimus groups (nothing is drawn if any group is empty).
create_kaplan_meier(tp, fn, fp, tn, drug_name_map[EVEROLIMUS])

In [None]:
#Common data

# Re-reads the same input files as the earlier "Common data" cell, so genie_data is
# the unfiltered clinical table again before the next prediction file is analysed.
# Column names are normalised (spaces and '/' -> '_', parentheses removed) so that
# later cells can reference them inside DataFrame.query strings. regex=False makes
# every pattern, including '(' and ')', a literal on all pandas versions.
genie_data = pd.read_csv('../data/GENIE/brca_akt1_genie_2019_clinical_data.tsv', sep='\t')
genie_data.columns = (
    genie_data.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Gene names, one per line.
all_genie_genes = pd.read_csv('../data/GENIE/GENIE_gene_list.txt', header=None, names=['G'])['G']

# Second column of the tab-separated drug index file.
drugs = pd.read_csv('../data/GENIE/GENIE_all_drug2ind.txt', sep='\t', header=None, names=['I', 'D'])['D']

# (sample, drug) rows; the prediction file is later attached to these rows by position.
test_data = pd.read_csv('../data/GENIE/GENIE_test_zscore.txt', sep='\t', header=None, names=['cell_line', 'smiles', 'auc', 'dataset', 'drug'])

# Mutation matrix; keep only the part after ':' of 'studyID:sampleId' as the sample ID.
all_mutation_data = pd.read_csv('../data/GENIE/sample_matrix_all_genes.txt', sep='\t')
all_mutation_data['sampleId'] = all_mutation_data['studyID:sampleId'].str.split(":", expand=True)[1]
all_mutation_data = all_mutation_data.drop(columns=['studyID:sampleId'])

In [None]:
nest_predict_data = np.loadtxt('../model_cg_4_zscore/predict_genie.txt')

nest_pred_series = pd.Series(nest_predict_data, name='pred')
# Predictions are attached to the test rows purely by position; a length mismatch
# would otherwise be padded with NaN silently and mislabel samples.
if len(nest_pred_series) != len(test_data):
    raise ValueError(
        f'prediction file has {len(nest_pred_series)} rows but test_data has {len(test_data)}'
    )
nest_pred_df = pd.concat([test_data, nest_pred_series], axis=1)[['cell_line', 'smiles', 'pred']]

# Per-drug median prediction; used as the sensitive/resistive threshold.
nest_pred_median_dict = nest_pred_df.groupby('smiles')['pred'].median().to_dict()
for key, median_pred in nest_pred_median_dict.items():
    print(drug_name_map[key], ':', median_pred)

In [None]:
# Exclude patients recorded as having received both a CDK4/6 and an mTOR inhibitor.
# drop=True discards the old row labels instead of inserting them as an 'index'
# column, which also keeps this cell safe to re-run.
genie_data = genie_data.query('not (CDK4_6_Inhibitor_Overall == "Yes" and mTOR_Inhibitor_Overall == "Yes")').reset_index(drop=True)

In [None]:
# Start from a copy of the clinical table and add one (initially empty) call
# column per drug, named after the drug.
genie_drug_data = genie_data.copy()
for drug_label in drug_name_map.values():
    genie_drug_data[drug_label] = ""

# A prediction at or below the drug's median is called "Sensitive", otherwise "Resistive".
for i, row in genie_data.iterrows():
    sample_preds = nest_pred_df[nest_pred_df['cell_line'] == row['Sample_ID']]
    for d, drug_label in drug_name_map.items():
        # .item() raises unless exactly one prediction exists for this sample/drug pair.
        p_auc = sample_preds.loc[sample_preds['smiles'] == d, 'pred'].item()
        drug_call = "Sensitive" if p_auc <= nest_pred_median_dict[d] else "Resistive"
        genie_drug_data.at[i, drug_label] = drug_call


In [None]:
# Survival analysis for PALBOCICLIB on the current genie_drug_data: prints the per-group
# survival summaries and the ANOVA table, draws the median-survival heatmap, and returns
# the survival months of the TP / FN / FP / TN groups.

tp, fn, fp, tn = create_drug_survival_plot(genie_drug_data, PALBOCICLIB)

In [None]:
# Kaplan-Meier curves for the four Palbociclib groups (nothing is drawn if any group is empty).
create_kaplan_meier(tp, fn, fp, tn, drug_name_map[PALBOCICLIB])

In [None]:
# Survival analysis for EVEROLIMUS on the current genie_drug_data: prints the per-group
# survival summaries and the ANOVA table, draws the median-survival heatmap, and returns
# the survival months of the TP / FN / FP / TN groups (tp, fn, fp, tn are rebound here).

tp, fn, fp, tn = create_drug_survival_plot(genie_drug_data, EVEROLIMUS)

In [None]:
# Kaplan-Meier curves for the four Everolimus groups (nothing is drawn if any group is empty).
create_kaplan_meier(tp, fn, fp, tn, drug_name_map[EVEROLIMUS])

In [None]:
#Survival plot for CAPIVASERTIB

#create_drug_survival_plot(genie_drug_data, CAPIVASERTIB)

In [None]:
#create_gene_survival_plot(genie_drug_data, nest_pred_median_dict, all_mutation_data, 'AKT1', CAPIVASERTIB)

In [None]:
#For DrugCell_Classic

#dc_predict_data = np.loadtxt('../../drugcell/result/predict_genie_all_drugcell.txt')

#dc_pred_df = pd.Series(dc_predict_data, name='P_AUC')
#dc_pred_df = pd.concat([test_data, dc_pred_df], axis=1)[['C', 'D', 'P_AUC']]
    
#dc_pred_median_dict = dc_pred_df.groupby('D')['P_AUC'].median().to_dict()
#for key in dc_pred_median_dict.keys():
#    print(drug_name_map[key], ':', dc_pred_median_dict[key])

In [None]:
#dc_genie_drug_data = genie_data.copy()
#for d in drug_name_map.keys():
#    dc_genie_drug_data[drug_name_map[d]] = ""

#for i,row in genie_data.iterrows():
#    sampleId = row['Sample_ID']
#    df = dc_pred_df.query('C == @sampleId')
#    for d in drug_name_map.keys():
#        p_auc = df[df.D == d]['P_AUC'].item()
#        if p_auc <= dc_pred_median_dict[d]:
#            dc_genie_drug_data.at[i, drug_name_map[d]] = "Sensitive"
#        else:
#            dc_genie_drug_data.at[i, drug_name_map[d]] = "Resistive"

In [None]:
#Survival plot for RIBOCICLIB, EVEROLIMUS

#create_drug_combo_survival_plot(genie_data, dc_pred_df, dc_pred_median_dict, [RIBOCICLIB, EVEROLIMUS])

In [None]:
#Survival plot for RIBOCICLIB

#tp, fn, fp, tn = create_drug_survival_plot(dc_genie_drug_data, RIBOCICLIB)

In [None]:
#create_kaplan_meier(tp, fn, fp, tn, drug_name_map[RIBOCICLIB])

In [None]:
#Survival plot for EVEROLIMUS

#tp, fn, fp, tn = create_drug_survival_plot(dc_genie_drug_data, EVEROLIMUS)

In [None]:
#create_kaplan_meier(tp, fn, fp, tn, drug_name_map[EVEROLIMUS])

In [None]:
#print(genie_data.groupby(by=['Primary_Race']).median()['Overall_Survival_Months'])

#print(rp.summary_cont(genie_data['Overall_Survival_Months'].groupby(genie_data['Primary_Race'])))