In [1]:
import pandas as pd
import numpy as np
import functools as ft
import itertools as it

import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from copy import copy
from scipy.stats import hypergeom

from tools import networkHelpers as nh

%matplotlib inline

In [23]:
hgnc_mapper = nh.fetch_hgnc_mapper()

In [3]:
understudied = set(pd.read_csv('../data/ref/dark_kinases.csv').symbol.tolist())

def test_understudied(clust, understudied, M):
    N = len(clust)
    
    rv = hypergeom(M, len(understudied), N)
    
    k = sum(map(lambda x: x in understudied, clust))
    
    return rv.pmf(k)

In [11]:
understudied_p_vals = []
louv = pd.read_csv('../results/weighted/louvain_clusters.txt', sep='\t', index_col=0)
louv.columns = ['cluster_super']
louv['cluster_sub'] = pd.read_csv('../results/weighted/louvain_small_clusters.txt', sep='\t', index_col=0)['cluster']
louv['understudied'] = pd.Series(louv.index.to_list()).apply(lambda x: x in understudied).to_list()

louv.head()

Unnamed: 0_level_0,cluster_super,cluster_sub,understudied
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MST1R,3,11,False
YES1,3,11,False
TYRO3,3,11,False
FGR,3,11,False
SRC,3,11,False


In [12]:
understudied_super_p_vals = {}
for i in sorted(louv.cluster_super.unique()):
    understudied_super_p_vals[i]=test_understudied(louv[louv['cluster_super']==i].index.to_list(), understudied, M=len(louv))
    
understudied_sub_p_vals = {}
for i in sorted(louv.cluster_sub.unique()):
    understudied_sub_p_vals[i]=test_understudied(louv[louv['cluster_sub']==i].index.to_list(), understudied, M=len(louv))

In [15]:
sub_super_dict = louv.groupby('cluster_sub')['cluster_super'].agg('mean').to_dict()
print(sub_super_dict)

{1: 4, 2: 2, 3: 7, 4: 6, 5: 8, 6: 6, 7: 1, 8: 1, 9: 2, 10: 6, 11: 3, 12: 8, 13: 5, 14: 6, 15: 5, 16: 4, 17: 6, 18: 6, 19: 4, 20: 8, 21: 1, 22: 6, 23: 6, 24: 2, 25: 6, 26: 9}


In [16]:
understudied_super_clusts = []
understudied_sub_clusts = []

for clust, p_val in understudied_super_p_vals.items():
    if(p_val < 0.05):
        understudied_super_clusts += [clust]
        
for clust, p_val in understudied_sub_p_vals.items():
    if(p_val < 0.05):
        understudied_sub_clusts += [clust]
        
print(understudied_super_clusts)
print(list(zip(understudied_sub_clusts, [sub_super_dict[x] for x in understudied_sub_clusts])))

[2, 3, 4, 5, 7]
[(1, 4), (2, 2), (3, 7), (7, 1), (11, 3), (13, 5), (19, 4)]


In [20]:
super_under_map = (louv.groupby('cluster_super')['understudied'].agg(sum)/louv.groupby('cluster_super')['understudied'].agg(len)).to_dict()
sub_under_map = (louv.groupby('cluster_sub')['understudied'].agg(sum)/louv.groupby('cluster_sub')['understudied'].agg(len)).to_dict()
louv['super_prop_under'] = louv.cluster_super.apply(super_under_map.get)
louv['super_under_pval'] = louv.cluster_super.apply(understudied_super_p_vals.get)
louv['sub_prop_under'] = louv.cluster_sub.apply(sub_under_map.get)
louv['sub_under_pval'] = louv.cluster_sub.apply(understudied_sub_p_vals.get)

louv.head()

Unnamed: 0_level_0,cluster_super,cluster_sub,understudied,super_prop_under,super_under_pval,sub_prop_under,sub_under_pval
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MST1R,3,11,False,0.12,2.5e-05,0.12,2.5e-05
YES1,3,11,False,0.12,2.5e-05,0.12,2.5e-05
TYRO3,3,11,False,0.12,2.5e-05,0.12,2.5e-05
FGR,3,11,False,0.12,2.5e-05,0.12,2.5e-05
SRC,3,11,False,0.12,2.5e-05,0.12,2.5e-05


### Drug Data Analysis

In [32]:
drug_data = pd.read_excel('../data/ref/Klaeger_Science_2017 Supplementary Table 6 Selectivities.xlsx', sheet_name='CATDS target')

# a number of drugs have very similar targets listed in the same row
# we parse to separate these out 
double_drug_targets = drug_data[drug_data['Target'].apply(lambda x: ';' in x)]

# get the first and second gene
first_genes = double_drug_targets['Target'].apply(lambda x: x.split(';')[0])
second_genes = double_drug_targets['Target'].apply(lambda x: x.split(';')[1])

# fix the double targets
repaired_double_targets = double_drug_targets.drop('Target', axis=1).assign(**{'Target':first_genes})
repaired_double_targets = repaired_double_targets.append(double_drug_targets.drop('Target', axis=1).assign(**{'Target':second_genes})) 
drug_data = drug_data.drop(double_drug_targets.index).append(repaired_double_targets, sort=False)

# filter to only hgnc keys
drug_data = drug_data[drug_data.Target.isin(hgnc_mapper.keys())]

# convert to hgnc
drug_data.Target = drug_data.Target.apply(lambda x: hgnc_mapper[x])

# save total drug list information
drug_list = list(sorted(set(drug_data.Drug)))
print(len(drug_list), ' drugs present for our analysis')

# sort columns
drug_data = drug_data[['Target', 'Drug', 'At', 'CATDS']]
drug_data.head()

222  drugs present for our analysis


Unnamed: 0,Target,Drug,At,CATDS
0,AURKA,MK-5108,0.120921,0.657165
1,CHEK1,PF-477736,0.243725,0.733257
2,FLT3,Dovitinib,0.26524,0.860803
4,MET,Capmatinib,0.337514,1.0
5,MAPKAPK2,VX-702,0.342705,0.743594


## Formatting and LaTeX Output

In [21]:
agg_dict = {'understudied':len}
agg_dict.update({x:np.mean for x in ['super_prop_under','super_under_pval','sub_prop_under','sub_under_pval']})

understudied_table = louv.groupby(['cluster_super', 'cluster_sub']).agg(agg_dict)
understudied_table['super_prop_under'] = understudied_table['super_prop_under'].apply(lambda x: np.round(x*100,1))
understudied_table['sub_prop_under'] = understudied_table['sub_prop_under'].apply(lambda x: np.round(x*100,1))

raw_columns = ['Subclust\nSize', 'Superclust\n% Understud', 'Superclust \np-value', 'Subclust\n% Understud', 'Subclust \np-value']
multindex_column_tuples = [tuple(col.split('\n')) for col in raw_columns]

understudied_table.columns = pd.MultiIndex.from_tuples(multindex_column_tuples)

for col in understudied_table.columns:
    understudied_table[col] = understudied_table[col].apply(lambda x: np.round(x, 4))
understudied_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Subclust,Superclust,Superclust,Subclust,Subclust
Unnamed: 0_level_1,Unnamed: 1_level_1,Size,% Understud,p-value,% Understud,p-value
cluster_super,cluster_sub,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,7,17,22.5,0.0697,0.0,0.0016
1,8,19,22.5,0.0697,36.8,0.1651
1,21,4,22.5,0.0697,50.0,0.2766
2,2,123,43.7,0.0001,43.1,0.0005
2,9,14,43.7,0.0001,42.9,0.139
2,24,5,43.7,0.0001,60.0,0.1429
3,11,75,12.0,0.0,12.0,0.0
4,1,52,17.2,0.0056,13.5,0.0013
4,16,3,17.2,0.0056,0.0,0.3256
4,19,3,17.2,0.0056,100.0,0.0298


In [22]:
understudied_filled_table = copy(understudied_table)
to_keep = np.zeros_like(understudied_filled_table.index.get_level_values(0))

understudied_filled_table = understudied_filled_table[understudied_filled_table.columns[[1,2,0,3,4]]]

for sup_ in understudied_filled_table.index.get_level_values(0):
    local_tab = understudied_filled_table.loc[sup_]
    first_sub = local_tab.index.to_list()[0]
    to_keep = np.logical_or(to_keep, [x == first_sub for x in understudied_filled_table.index.get_level_values(1).to_list()])
    
understudied_filled_table[understudied_filled_table.columns[0]][~to_keep] = ''
understudied_filled_table[understudied_filled_table.columns[1]][~to_keep] = ''

display(understudied_filled_table)
print(understudied_filled_table.to_latex())

Unnamed: 0_level_0,Unnamed: 1_level_0,Superclust,Superclust,Subclust,Subclust,Subclust
Unnamed: 0_level_1,Unnamed: 1_level_1,% Understud,p-value,Size,% Understud,p-value
cluster_super,cluster_sub,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,7,22.5,0.0697,17,0.0,0.0016
1,8,,,19,36.8,0.1651
1,21,,,4,50.0,0.2766
2,2,43.7,0.0001,123,43.1,0.0005
2,9,,,14,42.9,0.139
2,24,,,5,60.0,0.1429
3,11,12.0,0.0,75,12.0,0.0
4,1,17.2,0.0056,52,13.5,0.0013
4,16,,,3,0.0,0.3256
4,19,,,3,100.0,0.0298


\begin{tabular}{llllrrr}
\toprule
  &    &  Superclust & Superclust  & \multicolumn{2}{l}{Subclust} & Subclust  \\
  &    & \% Understud &     p-value &     Size & \% Understud &   p-value \\
cluster\_super & cluster\_sub &             &             &          &             &           \\
\midrule
1 & 7  &        22.5 &      0.0697 &       17 &         0.0 &    0.0016 \\
  & 8  &             &             &       19 &        36.8 &    0.1651 \\
  & 21 &             &             &        4 &        50.0 &    0.2766 \\
2 & 2  &        43.7 &      0.0001 &      123 &        43.1 &    0.0005 \\
  & 9  &             &             &       14 &        42.9 &    0.1390 \\
  & 24 &             &             &        5 &        60.0 &    0.1429 \\
3 & 11 &          12 &           0 &       75 &        12.0 &    0.0000 \\
4 & 1  &        17.2 &      0.0056 &       52 &        13.5 &    0.0013 \\
  & 16 &             &             &        3 &         0.0 &    0.3256 \\
  & 19 &             &    