## Understudied Hypergeometric Test and Drug Target Table Creation

In [1]:
import pandas as pd
import numpy as np
import functools as ft
import itertools as it

import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from copy import copy
from scipy.stats import hypergeom

from tools import networkHelpers as nh

%matplotlib inline

### Get HGNC Mapper 

In [2]:
hgnc_mapper = nh.fetch_hgnc_mapper()

In [3]:
understudied = set(pd.read_csv('../data/ref/dark_kinases.csv').symbol.tolist())

def test_understudied(clust, understudied, M):
    N = len(clust)
    
    rv = hypergeom(M, len(understudied), N)
    
    k = sum(map(lambda x: x in understudied, clust))
    
    return rv.pmf(k)

### Read in Louvain clustering

In [4]:
louv = pd.read_csv('../results/weighted/louvain_clusters.txt', sep='\t', index_col=0)
louv.columns = ['cluster_super']
louv['cluster_sub'] = pd.read_csv('../results/weighted/louvain_small_clusters.txt', sep='\t', index_col=0)['cluster']
louv['understudied'] = pd.Series(louv.index.to_list()).apply(lambda x: x in understudied).to_list()

louv.head()

Unnamed: 0_level_0,cluster_super,cluster_sub,understudied
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MST1R,3,11,False
YES1,3,11,False
TYRO3,3,11,False
FGR,3,11,False
SRC,3,11,False


In [5]:
#calculate super and sub p-values under the hypergeometric
understudied_super_p_vals = {}
for i in sorted(louv.cluster_super.unique()):
    understudied_super_p_vals[i]=test_understudied(louv[louv['cluster_super']==i].index.to_list(), understudied, M=len(louv))
    
understudied_sub_p_vals = {}
for i in sorted(louv.cluster_sub.unique()):
    understudied_sub_p_vals[i]=test_understudied(louv[louv['cluster_sub']==i].index.to_list(), understudied, M=len(louv))

In [6]:
sub_super_dict = louv.groupby('cluster_sub')['cluster_super'].agg('mean').to_dict()
print(sub_super_dict)

{1: 4, 2: 2, 3: 7, 4: 6, 5: 8, 6: 6, 7: 1, 8: 1, 9: 2, 10: 6, 11: 3, 12: 8, 13: 5, 14: 6, 15: 5, 16: 4, 17: 6, 18: 6, 19: 4, 20: 8, 21: 1, 22: 6, 23: 6, 24: 2, 25: 6, 26: 9}


In [7]:
understudied_super_clusts = []
understudied_sub_clusts = []

for clust, p_val in understudied_super_p_vals.items():
    if(p_val < 0.05):
        understudied_super_clusts += [clust]
        
for clust, p_val in understudied_sub_p_vals.items():
    if(p_val < 0.05):
        understudied_sub_clusts += [clust]
        
print(understudied_super_clusts)
print(list(zip(understudied_sub_clusts, [sub_super_dict[x] for x in understudied_sub_clusts])))

[2, 3, 4, 5, 7]
[(1, 4), (2, 2), (3, 7), (7, 1), (11, 3), (13, 5), (19, 4)]


In [8]:
super_under_map = (louv.groupby('cluster_super')['understudied'].agg(sum)/louv.groupby('cluster_super')['understudied'].agg(len)).to_dict()
sub_under_map = (louv.groupby('cluster_sub')['understudied'].agg(sum)/louv.groupby('cluster_sub')['understudied'].agg(len)).to_dict()
louv['super_prop_under'] = louv.cluster_super.apply(super_under_map.get)
louv['super_under_pval'] = louv.cluster_super.apply(understudied_super_p_vals.get)
louv['sub_prop_under'] = louv.cluster_sub.apply(sub_under_map.get)
louv['sub_under_pval'] = louv.cluster_sub.apply(understudied_sub_p_vals.get)

louv.head()

Unnamed: 0_level_0,cluster_super,cluster_sub,understudied,super_prop_under,super_under_pval,sub_prop_under,sub_under_pval
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MST1R,3,11,False,0.12,2.5e-05,0.12,2.5e-05
YES1,3,11,False,0.12,2.5e-05,0.12,2.5e-05
TYRO3,3,11,False,0.12,2.5e-05,0.12,2.5e-05
FGR,3,11,False,0.12,2.5e-05,0.12,2.5e-05
SRC,3,11,False,0.12,2.5e-05,0.12,2.5e-05


### Drug Data Analysis

In [9]:
drug_data = pd.read_excel('../data/ref/Klaeger_Science_2017 Supplementary Table 6 Selectivities.xlsx', sheet_name='CATDS target')

# a number of drugs have very similar targets listed in the same row
# we parse to separate these out 
double_drug_targets = drug_data[drug_data['Target'].apply(lambda x: ';' in x)]

# get the first and second gene
first_genes = double_drug_targets['Target'].apply(lambda x: x.split(';')[0])
second_genes = double_drug_targets['Target'].apply(lambda x: x.split(';')[1])

# fix the double targets
repaired_double_targets = double_drug_targets.drop('Target', axis=1).assign(**{'Target':first_genes})
repaired_double_targets = repaired_double_targets.append(double_drug_targets.drop('Target', axis=1).assign(**{'Target':second_genes})) 
drug_data = drug_data.drop(double_drug_targets.index).append(repaired_double_targets, sort=False)

# filter to only hgnc keys
drug_data = drug_data[drug_data.Target.isin(hgnc_mapper.keys())]

# convert to hgnc
drug_data.Target = drug_data.Target.apply(lambda x: hgnc_mapper[x])

# save total drug list information
drug_list = list(sorted(set(drug_data.Drug)))
print(len(drug_list), ' drugs present for our analysis')

# sort columns
drug_data = drug_data[['Target', 'Drug', 'At', 'CATDS']]
drug_data.head()

222  drugs present for our analysis


Unnamed: 0,Target,Drug,At,CATDS
0,AURKA,MK-5108,0.120921,0.657165
1,CHEK1,PF-477736,0.243725,0.733257
2,FLT3,Dovitinib,0.26524,0.860803
4,MET,Capmatinib,0.337514,1.0
5,MAPKAPK2,VX-702,0.342705,0.743594


In [10]:
kinase_num_drug_hits = drug_data.groupby('Target')['Drug'].agg(len).to_dict()
kinase_drug_sets = drug_data.groupby('Target')['Drug'].agg(list).to_dict()
louv['drug_hits'] = [kinase_num_drug_hits[k] if k in kinase_num_drug_hits.keys() else 0 for k in louv.index]
louv['drug_sets'] = [kinase_drug_sets[k] if k in kinase_drug_sets.keys() else set() for k in louv.index]
louv.head()

Unnamed: 0_level_0,cluster_super,cluster_sub,understudied,super_prop_under,super_under_pval,sub_prop_under,sub_under_pval,drug_hits,drug_sets
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
MST1R,3,11,False,0.12,2.5e-05,0.12,2.5e-05,16,"[Golvatinib, Crizotinib, Foretinib, MK-2461, O..."
YES1,3,11,False,0.12,2.5e-05,0.12,2.5e-05,31,"[TG-100572, Dasatinib, TAK-901, Ibrutinib, Dan..."
TYRO3,3,11,False,0.12,2.5e-05,0.12,2.5e-05,0,{}
FGR,3,11,False,0.12,2.5e-05,0.12,2.5e-05,5,"[Dasatinib, Axitinib, Saracatinib, AT-9283, Da..."
SRC,3,11,False,0.12,2.5e-05,0.12,2.5e-05,23,"[TG-100572, Dasatinib, Foretinib, Danusertib, ..."


In [11]:
drug_super_unique_hits = louv.groupby('cluster_super')['drug_sets'].agg(lambda x: len(set(it.chain.from_iterable(x))))
drug_super_strengths = (louv.groupby('cluster_super')['drug_hits'].agg(sum)/drug_super_unique_hits).fillna(0).to_dict()

drug_sub_unique_hits = louv.groupby('cluster_sub')['drug_sets'].agg(lambda x: len(set(it.chain.from_iterable(x))))
drug_sub_strengths = (louv.groupby('cluster_sub')['drug_hits'].agg(sum)/drug_sub_unique_hits).fillna(0).to_dict()

# convert to dictionaries for mapping
drug_super_unique_hits = drug_super_unique_hits.to_dict()
drug_sub_unique_hits = drug_sub_unique_hits.to_dict()

display(drug_super_unique_hits)
display(drug_super_strengths)
display(drug_sub_unique_hits)
display(drug_sub_strengths)
#louv.groupby['cluster_super']['drug_hits']

{1: 91, 2: 171, 3: 165, 4: 94, 5: 113, 6: 137, 7: 159, 8: 49, 9: 0}

{1: 2.9450549450549453,
 2: 7.0,
 3: 4.351515151515152,
 4: 4.053191489361702,
 5: 2.566371681415929,
 6: 2.6934306569343067,
 7: 4.949685534591195,
 8: 2.142857142857143,
 9: 0.0}

{1: 93,
 2: 171,
 3: 159,
 4: 34,
 5: 49,
 6: 28,
 7: 41,
 8: 80,
 9: 59,
 10: 20,
 11: 165,
 12: 0,
 13: 104,
 14: 46,
 15: 41,
 16: 0,
 17: 119,
 18: 4,
 19: 9,
 20: 0,
 21: 0,
 22: 0,
 23: 1,
 24: 26,
 25: 0,
 26: 0}

{1: 3.924731182795699,
 2: 6.333333333333333,
 3: 4.949685534591195,
 4: 1.1470588235294117,
 5: 2.142857142857143,
 6: 1.7142857142857142,
 7: 3.707317073170732,
 8: 1.45,
 9: 1.4915254237288136,
 10: 1.0,
 11: 4.351515151515152,
 12: 0.0,
 13: 1.9423076923076923,
 14: 1.673913043478261,
 15: 2.1463414634146343,
 16: 0.0,
 17: 1.5126050420168067,
 18: 1.0,
 19: 1.7777777777777777,
 20: 0.0,
 21: 0.0,
 22: 0.0,
 23: 1.0,
 24: 1.0,
 25: 0.0,
 26: 0.0}

In [12]:
louv['super_hits'] = louv.cluster_super.map(drug_super_unique_hits)
louv['super_strength'] = louv.cluster_super.map(drug_super_strengths)
louv['sub_hits'] = louv.cluster_sub.map(drug_sub_unique_hits)
louv['sub_strength'] = louv.cluster_sub.map(drug_sub_strengths)

super_sizes = louv.groupby('cluster_super')['understudied'].agg(len).to_dict()
louv['super_size'] = louv.cluster_super.map(super_sizes)
sub_sizes = louv.groupby('cluster_sub')['understudied'].agg(len).to_dict()
louv['sub_size'] = louv.cluster_super.map(sub_sizes)

# convert the avg drug hit to percentage
louv['super_strength'] = louv['super_strength']/louv['super_size']*100
louv['sub_strength'] = louv['sub_strength']/louv['sub_size']*100


louv.head()

Unnamed: 0_level_0,cluster_super,cluster_sub,understudied,super_prop_under,super_under_pval,sub_prop_under,sub_under_pval,drug_hits,drug_sets,super_hits,super_strength,sub_hits,sub_strength,super_size,sub_size
names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
MST1R,3,11,False,0.12,2.5e-05,0.12,2.5e-05,16,"[Golvatinib, Crizotinib, Foretinib, MK-2461, O...",165,5.80202,165,6.907167,75,63
YES1,3,11,False,0.12,2.5e-05,0.12,2.5e-05,31,"[TG-100572, Dasatinib, TAK-901, Ibrutinib, Dan...",165,5.80202,165,6.907167,75,63
TYRO3,3,11,False,0.12,2.5e-05,0.12,2.5e-05,0,{},165,5.80202,165,6.907167,75,63
FGR,3,11,False,0.12,2.5e-05,0.12,2.5e-05,5,"[Dasatinib, Axitinib, Saracatinib, AT-9283, Da...",165,5.80202,165,6.907167,75,63
SRC,3,11,False,0.12,2.5e-05,0.12,2.5e-05,23,"[TG-100572, Dasatinib, Foretinib, Danusertib, ...",165,5.80202,165,6.907167,75,63


## Formatting and LaTeX Output

In [13]:
agg_dict = {'understudied':len}
agg_dict.update({x:np.mean for x in ['super_prop_under','super_under_pval','sub_prop_under','sub_under_pval', 'super_hits', 'super_strength', 'sub_hits', 'sub_strength', 'super_size']})

# convert understudied and strengths to percentages
understudied_table = louv.groupby(['cluster_super', 'cluster_sub']).agg(agg_dict)
understudied_table['super_prop_under'] = understudied_table['super_prop_under'].apply(lambda x: np.round(x*100,1))
understudied_table['sub_prop_under'] = understudied_table['sub_prop_under'].apply(lambda x: np.round(x*100,1))
understudied_table['sub_strength'] = understudied_table['sub_strength'].apply(lambda x: np.round(x,1))
understudied_table['super_strength'] = understudied_table['super_strength'].apply(lambda x: np.round(x,1))

raw_columns = ['Subclust\nSize', 'Superclust\n% Understud', 'Superclust \np-value', 'Subclust\n% Understud', 'Subclust\np-value', 'Superclust\nUnique Drug Hits',  'Superclust Avg.\n% Hit per Drug','Subclust\nUnique Drug Hits', 'Subclust Avg.\n% Hit per Drug', 'Superclust\nSize']
multindex_column_tuples = [tuple(col.split('\n')) for col in raw_columns]

understudied_table.columns = pd.MultiIndex.from_tuples(multindex_column_tuples)

for col in understudied_table.columns:
    understudied_table[col] = understudied_table[col].apply(lambda x: np.round(x, 4))
    

understudied_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Subclust,Superclust,Superclust,Subclust,Subclust,Superclust,Superclust Avg.,Subclust,Subclust Avg.,Superclust
Unnamed: 0_level_1,Unnamed: 1_level_1,Size,% Understud,p-value,% Understud,p-value,Unique Drug Hits,% Hit per Drug,Unique Drug Hits,% Hit per Drug,Size
cluster_super,cluster_sub,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
1,7,17,22.5,0.0697,0.0,0.0016,91,7.4,41,7.1,40
1,8,19,22.5,0.0697,36.8,0.1651,91,7.4,80,2.8,40
1,21,4,22.5,0.0697,50.0,0.2766,91,7.4,0,0.0,40
2,2,123,43.7,0.0001,43.1,0.0005,171,4.9,171,5.1,142
2,9,14,43.7,0.0001,42.9,0.139,171,4.9,59,1.2,142
2,24,5,43.7,0.0001,60.0,0.1429,171,4.9,26,0.8,142
3,11,75,12.0,0.0,12.0,0.0,165,5.8,165,6.9,75
4,1,52,17.2,0.0056,13.5,0.0013,94,7.0,93,43.6,58
4,16,3,17.2,0.0056,0.0,0.3256,94,7.0,0,0.0,58
4,19,3,17.2,0.0056,100.0,0.0298,94,7.0,9,19.8,58


In [14]:
understudied_super_table = copy(understudied_table.reset_index(level=1))
understudied_super_table = understudied_super_table[understudied_table.columns[[-1, 1,2,5,6]]].groupby(understudied_super_table.index).agg(np.mean)

# rename the index
understudied_super_table.index = understudied_super_table.index.rename('Supercluster No.') 


display(understudied_super_table)
print(understudied_super_table.to_latex())

Unnamed: 0_level_0,Superclust,Superclust,Superclust,Superclust,Superclust Avg.
Unnamed: 0_level_1,Size,% Understud,p-value,Unique Drug Hits,% Hit per Drug
Supercluster No.,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,40,22.5,0.0697,91,7.4
2,142,43.7,0.0001,171,4.9
3,75,12.0,0.0,165,5.8
4,58,17.2,0.0056,94,7.0
5,40,15.0,0.0091,113,6.4
6,64,25.0,0.0624,137,4.2
7,63,14.3,0.0007,159,7.9
8,28,28.6,0.1617,49,7.7
9,7,14.3,0.232,0,0.0


\begin{tabular}{lrrrrr}
\toprule
{} & \multicolumn{2}{l}{Superclust} & Superclust  &       Superclust & Superclust Avg. \\
{} &       Size & \% Understud &     p-value & Unique Drug Hits &  \% Hit per Drug \\
Supercluster No. &            &             &             &                  &                 \\
\midrule
1                &         40 &        22.5 &      0.0697 &               91 &             7.4 \\
2                &        142 &        43.7 &      0.0001 &              171 &             4.9 \\
3                &         75 &        12.0 &      0.0000 &              165 &             5.8 \\
4                &         58 &        17.2 &      0.0056 &               94 &             7.0 \\
5                &         40 &        15.0 &      0.0091 &              113 &             6.4 \\
6                &         64 &        25.0 &      0.0624 &              137 &             4.2 \\
7                &         63 &        14.3 &      0.0007 &              159 &             7.9 \

In [15]:
understudied_sub_table = copy(understudied_table.reset_index(level=0))

# rename superclust in acceptable multicolumn format
cols = list(understudied_sub_table.columns.get_level_values(0))
cols[0] = 'Superclust'
cols = pd.MultiIndex.from_arrays([cols, list(understudied_sub_table.columns.get_level_values(1))])
understudied_sub_table.columns = cols

understudied_sub_table = understudied_sub_table[understudied_sub_table.columns[[0,4,5,8,9]]].sort_values(by=cols[0])

# rename the index
understudied_sub_table.index = understudied_sub_table.index.rename('Subcluster No.') 

display(understudied_sub_table)
print(understudied_sub_table.to_latex())

Unnamed: 0_level_0,Superclust,Subclust,Subclust,Subclust,Subclust Avg.
Unnamed: 0_level_1,Unnamed: 1_level_1,% Understud,p-value,Unique Drug Hits,% Hit per Drug
Subcluster No.,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
7,1,0.0,0.0016,41,7.1
8,1,36.8,0.1651,80,2.8
21,1,50.0,0.2766,0,0.0
2,2,43.1,0.0005,171,5.1
9,2,42.9,0.139,59,1.2
24,2,60.0,0.1429,26,0.8
11,3,12.0,0.0,165,6.9
1,4,13.5,0.0013,93,43.6
16,4,0.0,0.3256,0,0.0
19,4,100.0,0.0298,9,19.8


\begin{tabular}{lrrrrr}
\toprule
{} & Superclust & \multicolumn{3}{l}{Subclust} &  Subclust Avg. \\
{} & \% Understud & p-value & Unique Drug Hits & \% Hit per Drug \\
Subcluster No. &            &             &         &                  &                \\
\midrule
7              &          1 &         0.0 &  0.0016 &               41 &            7.1 \\
8              &          1 &        36.8 &  0.1651 &               80 &            2.8 \\
21             &          1 &        50.0 &  0.2766 &                0 &            0.0 \\
2              &          2 &        43.1 &  0.0005 &              171 &            5.1 \\
9              &          2 &        42.9 &  0.1390 &               59 &            1.2 \\
24             &          2 &        60.0 &  0.1429 &               26 &            0.8 \\
11             &          3 &        12.0 &  0.0000 &              165 &            6.9 \\
1              &          4 &        13.5 &  0.0013 &               93 &           43.6 \\
16  

### Pure Understudied table

In [16]:
understudied_filled_table = copy(understudied_table)
to_keep = np.zeros_like(understudied_filled_table.index.get_level_values(0))

understudied_filled_table = understudied_filled_table[understudied_filled_table.columns[[1,2,0,3,4]]]

for sup_ in understudied_filled_table.index.get_level_values(0):
    local_tab = understudied_filled_table.loc[sup_]
    first_sub = local_tab.index.to_list()[0]
    to_keep = np.logical_or(to_keep, [x == first_sub for x in understudied_filled_table.index.get_level_values(1).to_list()])
    
understudied_filled_table[understudied_filled_table.columns[0]][~to_keep] = ''
understudied_filled_table[understudied_filled_table.columns[1]][~to_keep] = ''

display(understudied_filled_table)
print(understudied_filled_table.to_latex())

Unnamed: 0_level_0,Unnamed: 1_level_0,Superclust,Superclust,Subclust,Subclust,Subclust
Unnamed: 0_level_1,Unnamed: 1_level_1,% Understud,p-value,Size,% Understud,p-value
cluster_super,cluster_sub,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,7,22.5,0.0697,17,0.0,0.0016
1,8,,,19,36.8,0.1651
1,21,,,4,50.0,0.2766
2,2,43.7,0.0001,123,43.1,0.0005
2,9,,,14,42.9,0.139
2,24,,,5,60.0,0.1429
3,11,12.0,0.0,75,12.0,0.0
4,1,17.2,0.0056,52,13.5,0.0013
4,16,,,3,0.0,0.3256
4,19,,,3,100.0,0.0298


\begin{tabular}{llllrrr}
\toprule
  &    &  Superclust & Superclust  & \multicolumn{3}{l}{Subclust} \\
  &    & \% Understud &     p-value &     Size & \% Understud & p-value \\
cluster\_super & cluster\_sub &             &             &          &             &         \\
\midrule
1 & 7  &        22.5 &      0.0697 &       17 &         0.0 &  0.0016 \\
  & 8  &             &             &       19 &        36.8 &  0.1651 \\
  & 21 &             &             &        4 &        50.0 &  0.2766 \\
2 & 2  &        43.7 &      0.0001 &      123 &        43.1 &  0.0005 \\
  & 9  &             &             &       14 &        42.9 &  0.1390 \\
  & 24 &             &             &        5 &        60.0 &  0.1429 \\
3 & 11 &          12 &           0 &       75 &        12.0 &  0.0000 \\
4 & 1  &        17.2 &      0.0056 &       52 &        13.5 &  0.0013 \\
  & 16 &             &             &        3 &         0.0 &  0.3256 \\
  & 19 &             &             &        3 &       100.0 