In [1]:
from scipy.cluster.hierarchy import dendrogram, linkage  
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

import os
import numpy as np
from copy import deepcopy
import py3Dmol
from ipywidgets import interact, IntSlider
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', 500)

import plotly as pl
import matplotlib.pyplot as plt
from matplotlib import interactive
interactive(True)
import seaborn as sns
sns.set(style="whitegrid")

import ast
import math
import random as random
import copy
np.random.seed(7678)
from sklearn.metrics import auc, roc_curve

import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

pl.offline.init_notebook_mode(connected=True)


from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [2]:
import sys
sys.path.insert(0, '../MutCat/')
#import ccal # Original repository is in https://github.com/UCSD-CCAL/ccal
#from ancillary_v1_0 import *

In [3]:
CCLE_pathways = pd.read_table('/home/ebc/variome/CT2/ccal_PT/Inference/data/CCLE_19Q3/CCLE_expression.PATHWAYS.c2_c5_h.gct',sep='\t', skiprows=2,index_col=0).drop('Description',1)

sig = CCLE_pathways.loc['SINGH_NFE2L2_TARGETS',:].sort_values(ascending = False)
sig_mean = sig.mean()
sig_std = sig.std()

sig_Act = pd.Series(np.where(sig >= sig_mean + sig_std, 1, 0), 
                     index = sig.index, name='Active_sample')

sig_Null = pd.Series(np.where(sig < sig_mean - sig_std, 1, 0), 
                     index = sig.index, name='Unactive_sample')

del CCLE_pathways

In [4]:
DF_CCLE = pd.read_csv('/media/ebc/F47A-25A8/PDBMap_Backup/CCLE/XYZ/GW_Full_Gene_XYZ.csv',index_col=0)

In [5]:
DF_prob_map = pd.read_csv('~/variome/GIT/MutCat/analysis/NRF2/dataframes/GW/DF_Prob_cluster_size_gt_2_MAP_TO_TUMORBARCODE.csv',index_col=0)
DF_Prob_All_dist = pd.read_csv('~/variome/GIT/MutCat/analysis/NRF2/dataframes/GW/DF_Prob_cluster_size_gt_2_MAP_TO_CLUSTERING.csv',index_col=0)

In [6]:
ROC_thres = 0.5
sig_thres = sig.mean()+sig.std()
ii = 0

In [7]:
df_sig = pd.DataFrame(sig)
df_sig = df_sig.reset_index()
df_sig.rename(columns={'index':'Tumor_Sample_Barcode'},inplace=True)


sig_thres_act = sig.mean()+sig.std()
sig_thres_unact = sig.mean()-sig.std()

In [8]:
# only predict variants where we have signature data for
for_test_train = DF_prob_map[DF_prob_map.Tumor_Sample_Barcode.isin(sig_Act.index)][['Hugo_Symbol','Tumor_Sample_Barcode','var_id']].drop_duplicates()

print("Num Genes: %s, Num variants: %s"%(len(for_test_train.Hugo_Symbol.unique()),len(for_test_train.var_id.unique())))

Num Genes: 4877, Num variants: 93464


In [9]:
test_train_split = 0.8

# get test/train data
msk = np.random.rand(len(df_sig)) < test_train_split
train = df_sig[msk]
test = df_sig[~msk]

print(len(train),len(test))

923 287


In [10]:
# separate pre-computed probabilities for train/test sets

train = for_test_train[for_test_train.Tumor_Sample_Barcode.isin(train.Tumor_Sample_Barcode)]
test = for_test_train[for_test_train.Tumor_Sample_Barcode.isin(test.Tumor_Sample_Barcode)]

test = test.sort_values('Tumor_Sample_Barcode',ascending=True)
train = train.sort_values('Tumor_Sample_Barcode',ascending=True)

print("Num Genes in train: %s, Num variants in train: %s"%(len(train.Hugo_Symbol.unique()),len(train.var_id.unique())))
print("Num tumor samples in train: %s"%(len(train.Tumor_Sample_Barcode.unique())))
print()
      
print("Num Genes in test: %s, Num variants in test: %s"%(len(test.Hugo_Symbol.unique()),len(test.var_id.unique())))
print("Num tumor samples in test: %s"%(len(test.Tumor_Sample_Barcode.unique())))

Num Genes in train: 4876, Num variants in train: 77302
Num tumor samples in train: 850

Num Genes in test: 4405, Num variants in test: 18229
Num tumor samples in test: 259


In [11]:
def get_naive_bayes_probability(c,sig_Act):

    N = len(sig_Act)-1

    prob_c = pd.Series(c['act'].value_counts()/N, name = 'prob_c')
    prob_Act = pd.Series(sig_Act.value_counts()/N, name = 'prob_Act')
    joint_counts_Act_vs_c = pd.crosstab(sig_Act, c['act'])
    joint_prob_Act_vs_c = joint_counts_Act_vs_c/N
    cond_prob_Act_given_c = joint_prob_Act_vs_c.div(prob_c, axis='columns')

    cond_prob_sum = np.mean(cond_prob_Act_given_c.sum(axis=0).values)

    try:
        cond_prob_1_1 = np.round(cond_prob_Act_given_c, decimals=3).values[1][1]
        #cond_prob_1_0 = np.round(cond_prob_Act_given_c, decimals=3).values[1][0]
        #cond_prob_0_0 = np.round(cond_prob_Act_given_c, decimals=3).values[0][0]
        cond_prob_0_1 = np.round(cond_prob_Act_given_c, decimals=3).values[0][1]
        
    except IndexError:
        cond_prob_1_1 = np.nan
        cond_prob_0_1 = np.nan
        LOR = np.nan
        
    if pd.notnull(cond_prob_1_1) & pd.notnull(cond_prob_0_1):
        nom = np.true_divide(cond_prob_1_1,cond_prob_0_1)
        denom = np.true_divide(prob_Act.values[1],prob_Act[0])
        
        LOR = np.log(np.true_divide(nom,denom))
        

    return(LOR)

## Reduce Redundancy (similar pdb structures, similar clusters etc.)

In [12]:
DF_prob_map['structureID'] = [i.split('.')[0] for i in DF_prob_map.structureChainId.tolist()]

In [80]:
DF_prob_map_reduced = DF_prob_map[['Hugo_Symbol','dist','P(Act=1|c=1)','Log_CI_OR','structureID']].drop_duplicates()  # removes chainID redundancy

ind = DF_prob_map_reduced.index.tolist()

print(len(DF_prob_map), len(DF_prob_map_reduced))

DF_prob_map_reduced_2 = DF_prob_map[DF_prob_map.index.isin(ind)]

DF_prob_map_reduced_2['orig_index'] = DF_prob_map_reduced_2.index.tolist()

5741864 383437


In [79]:
len(DF_prob_map_reduced_2[['Hugo_Symbol','P(Act=1|c=1)','Log_Odds_Ratio']].drop_duplicates())

53379

In [91]:
DF_indd = DF_prob_map_reduced_2[['Hugo_Symbol','P(Act=1|c=1)','Log_Odds_Ratio']].drop_duplicates()  # removes PDB structure redundancy

print(len(DF_indd))

DF_indd['dist'] = DF_prob_map_reduced_2[DF_prob_map_reduced_2.index.isin(DF_indd.index)]['dist']
DF_indd['itemset'] = DF_prob_map_reduced_2[DF_prob_map_reduced_2.index.isin(DF_indd.index)]['ID']
DF_indd['orig_index'] = DF_indd.index.tolist()
DF_indd['structureID'] = DF_prob_map_reduced_2[DF_prob_map_reduced_2.index.isin(DF_indd.index)]['structureID']
DF_indd['structureChainId'] = DF_prob_map_reduced_2[DF_prob_map_reduced_2.index.isin(DF_indd.index)]['structureChainId']
DF_indd['cluster'] = [i.split('_')[3] for i in DF_indd.itemset]

print(len(DF_indd))

DF_indd.head(2)

53379
53379


Unnamed: 0,Hugo_Symbol,P(Act=1|c=1),Log_Odds_Ratio,dist,itemset,orig_index,structureID,structureChainId,cluster
0,A1CF,0.0,-0.167129,6.229596,A1CF_2CPD.A_c_10_,0,2CPD,2CPD.A,10
3,A1CF,0.25,0.937231,6.49346,A1CF_2CPD.A_c_12_,3,2CPD,2CPD.A,12


In [95]:
DF_indd = DF_indd[DF_indd.dist<17]  # removes all clusters with diameters larger than 17 Angstrom

print(len(DF_indd))

19074


In [96]:
sample = 'NCIH2172_LUNG'

DF_LOR = pd.DataFrame()

cls = DF_prob_map[(DF_prob_map.Tumor_Sample_Barcode==sample)&(DF_prob_map.index.isin(DF_indd.index.tolist()))].ID.unique().tolist()

NB = 0

for item in cls:

    clear_output(wait=True)
    print(cls.index(item),"out of",len(cls))

    c = pd.DataFrame(sig)
    c.rename(columns={'SINGH_NFE2L2_TARGETS':'act'},inplace=True)
    c['act'] = [0 for i in c.index]

    tt = DF_prob_map[(DF_prob_map.var_id.isin(test.var_id))&(DF_prob_map.ID==item)&(DF_prob_map.Tumor_Sample_Barcode!=sample)].Tumor_Sample_Barcode.tolist()
    c.loc[((c.index.isin(tt))),'act'] = 1

    LOR = get_naive_bayes_probability(c,sig_Act)

    NB = NB + np.round(LOR, decimals=3)

    DF_LOR = pd.concat([DF_LOR, pd.DataFrame([{'itemset':item,'LOR':LOR, 'sample':sample}])])
    
DF_LOR = DF_LOR[(DF_LOR.LOR != np.inf)&(DF_LOR.LOR != -np.inf)&(pd.notnull(DF_LOR.LOR))]



45 out of 46


In [105]:

DF_LOR = pd.DataFrame()

for s in sig.index:
    
    clear_output(wait=True)
    print(sig.index.tolist().index(s),"out of",len(sig.index))

    cls = DF_prob_map[(DF_prob_map.Tumor_Sample_Barcode==s)&(DF_prob_map.index.isin(DF_indd.index.tolist()))].ID.unique().tolist()

    NB = 0

    for item in cls:

        c = pd.DataFrame(sig)
        c.rename(columns={'SINGH_NFE2L2_TARGETS':'act'},inplace=True)
        c['act'] = [0 for i in c.index]

        tt = DF_prob_map[(DF_prob_map.var_id.isin(test.var_id))&(DF_prob_map.ID==item)&(DF_prob_map.Tumor_Sample_Barcode!=sample)].Tumor_Sample_Barcode.tolist()
        c.loc[((c.index.isin(tt))),'act'] = 1

        LOR = get_naive_bayes_probability(c,sig_Act)

        NB = NB + np.round(LOR, decimals=3)

        DF_LOR = pd.concat([DF_LOR, pd.DataFrame([{'itemset':item,'LOR':LOR, 'sample':s}])])
    
DF_LOR = DF_LOR[(DF_LOR.LOR != np.inf)&(DF_LOR.LOR != -np.inf)&(pd.notnull(DF_LOR.LOR))]



1209 out of 1210


In [106]:
DF_LOR

Unnamed: 0,LOR,itemset,sample
0,1.784067,CTTN_2D1X.D_c_9_,SNU878_LIVER
0,1.784067,CTTN_2D1X.B_c_9_,SNU878_LIVER
0,1.784067,USP8_1WHB.A_c_5_,SNU878_LIVER
0,1.089420,EPHA5_2R2P.A_c_55_,CAKI2_KIDNEY
0,0.685455,CEACAM1_2GK2.B_c_3_,NCIH1944_LUNG
0,0.397773,CEACAM1_2GK2.B_c_4_,NCIH1944_LUNG
0,0.177027,CEACAM1_2GK2.B_c_5_,NCIH1944_LUNG
0,-0.006526,CEACAM1_2GK2.B_c_6_,NCIH1944_LUNG
0,-0.161843,CEACAM1_2GK2.B_c_7_,NCIH1944_LUNG
0,-0.296500,CEACAM1_2GK2.B_c_8_,NCIH1944_LUNG


In [101]:
DF_LOR

Unnamed: 0,LOR,itemset,sample
0,1.784067,CD22_5VKJ.A_c_24_,NCIH2172_LUNG
0,1.08942,CD22_5VKJ.A_c_28_,NCIH2172_LUNG
0,1.784067,HIST1H4C_5JA4.B_c_14_,NCIH2172_LUNG
0,0.685455,IRS1_1QQG.B_c_25_,NCIH2172_LUNG
0,1.08942,RNASE3_4X08.A_c_13_,NCIH2172_LUNG
0,0.685455,XDH_2E1Q.D_c_111_,NCIH2172_LUNG
0,0.177027,XDH_2CKJ.C_c_117_,NCIH2172_LUNG


In [102]:
np.sum(DF_LOR.LOR.unique())

3.73596882006622

In [69]:
DF_indd = DF_ind[(DF_ind.LOR != np.inf)&(DF_ind.LOR != -np.inf)]

DF_indd['Hugo_Symbol'] = [i.split('_')[0] for i in DF_indd.itemset]
DF_indd['cluster'] = [i.split('_')[3] for i in DF_indd.itemset]
DF_indd['structureChainID'] = [i.split('_')[1] for i in DF_indd.itemset]
DF_indd['structureID'] = [i.split('.')[0] for i in DF_indd.structureChainID]
DF_indd['orig_index'] = DF_indd.index.tolist()

DF_indd = pd.merge(DF_indd, DF_prob_map[['ID','dist']].drop_duplicates(),left_on='itemset',right_on='ID')

DF_indd['dist'] = [np.round(i,decimals=2) for i in DF_indd.dist]

DF_indd.head(2)

Unnamed: 0,LOR,itemset,sample,Hugo_Symbol,cluster,structureChainID,structureID,orig_index,ID,dist
0,1.784067,ANKRD27_4CZ2.D_c_5_,NCIH2172_LUNG,ANKRD27,5,4CZ2.D,4CZ2,0,ANKRD27_4CZ2.D_c_5_,23.99
1,1.784067,ANKRD27_4CYM.D_c_5_,NCIH2172_LUNG,ANKRD27,5,4CYM.D,4CYM,0,ANKRD27_4CYM.D_c_5_,24.0


In [73]:
DF_ind_red.drop('dist',1).drop_duplicates().sort_values('LOR',ascending=False)

Unnamed: 0,Hugo_Symbol,LOR,cluster
6,CD22,1.784067,24
66,HIST1H4C,1.784067,14
67,HIST1H4C,1.784067,13
7,CD22,1.08942,28
220,RB1,1.08942,45
229,RNASE3,1.08942,13
293,XDH,1.08942,70
73,IRS1,0.685455,25
74,IRS1,0.685455,30
77,IRS1,0.685455,9


159.72729506542697

In [75]:
np.sum(DF_ind_red[['Hugo_Symbol','LOR']].drop_duplicates())


Hugo_Symbol    CD22CD22CLCN1FBP1FCN1FIS1HGFHIST1H4CIL7RIRS1NA...
LOR                                                      9.47375
dtype: object