In [1]:
import pandas as pd
import numpy as np
import functools as ft
import itertools as it
import multiprocessing as mp
import matplotlib.pyplot as plt
from collections import Counter
%matplotlib inline

In [2]:
df = pd.read_csv('../data/go_synonym_data.txt', header=None, sep='\t', low_memory=False,)
df.columns = ['ID', 'Gene/Product', 'Name', 'GO Class Labels', 'Synonyms']
print(df.shape)
df.head(3)

(19712, 5)


Unnamed: 0,ID,Gene/Product,Name,GO Class Labels,Synonyms
0,UniProtKB:P25788,PSMA3,Proteasome subunit alpha type-3,protein deubiquitination|endopeptidase activit...,HC8|PSC8
1,UniProtKB:P25787,PSMA2,Proteasome subunit alpha type-2,protein deubiquitination|endopeptidase activit...,HC3|PSC3
2,UniProtKB:P25786,PSMA1,Proteasome subunit alpha type-1,protein deubiquitination|endopeptidase activit...,HC2|NU|PROS30|PSC2


In [3]:
print(len(df['ID'].unique()))

19712


In [4]:
print(len(df['Gene/Product'].unique()))

19659


In [5]:
def splitter(x):
    try:
        temp = x.split('|')
    except:
        temp = []
    return temp
        

In [6]:
df['Synonyms'] = df['Synonyms'].apply(splitter)

In [7]:
kinase_network_df = pd.read_csv('../data/Full_Kinome_Network_Compiled_no_header.txt', header=None, sep='\t')
known_kinases = list(set(kinase_network_df[0]) | set(kinase_network_df[1]))
print(len(known_kinases))

473


In [8]:
df[df['Gene/Product']=='RAF1']

Unnamed: 0,ID,Gene/Product,Name,GO Class Labels,Synonyms
18381,UniProtKB:P04049,RAF1,RAF proto-oncogene serine/threonine-protein ki...,nuclear speck|Golgi apparatus|face development...,[RAF]


In [9]:
df.dtypes

ID                 object
Gene/Product       object
Name               object
GO Class Labels    object
Synonyms           object
dtype: object

In [10]:
print(len(list(filter(lambda x: x != 'nan', it.chain.from_iterable(df['Synonyms'].apply(lambda x: str(x).split('|')))))))
print(len(set(filter(lambda x: x != 'nan', it.chain.from_iterable(df['Synonyms'].apply(lambda x: str(x).split('|')))))))

19712
13365


In [11]:
set_of_synonyms_present = set(filter(lambda x: x != 'nan', it.chain.from_iterable(df['Synonyms'].apply(lambda x: str(x).split('|')))))
set_of_gene_products = set(df['Gene/Product'].values)

In [12]:
missing = []

for k in known_kinases:
    if k not in set_of_synonyms_present and k not in set_of_gene_products:
        missing.append(k)

In [13]:
len(missing)

187

In [18]:
alias = pd.read_excel('../data/KINASESmasterlist_w_Aliases.xlsx', header = 0)

In [19]:
kin_map = alias.set_index('Uniprot Protein')['MS Gene'].to_dict()

In [20]:
all_aliases = alias.set_index('Uniprot Protein')['Aliases (Conservative)'].dropna().apply(lambda x: x.split(',')).to_dict()

In [21]:
go_dat = {}

for k in known_kinases:
    temp = df[df['Gene/Product']==k]
    if(temp.shape[0] == 0):
        temp = df[df['Synonyms'].apply(lambda x: k in x)]
        if(temp.shape[0] == 0):
            temp = df[df['Gene/Product']==kin_map[k]]
            if(temp.shape[0] == 0):
                temp = df[df['Synonyms'].apply(lambda x: kin_map[k] in x)]
                if(temp.shape[0]==0):
                    r = all_aliases.get(k)
                    if(r is not None):
                        for a in r:
                            temp = df[df['Gene/Product']==a]
                            if(temp.shape[0] ==0):
                                temp = df[df['Synonyms'].apply(lambda x: a in x)]
                                if(temp.shape[0]==0):
                                    pass
                                else:
                                    go_dat[k] = temp
                                    break
                            else:
                                go_dat[k] = temp
                else:
                    go_dat[k] = temp
            else:
                go_dat[k] = temp
        else:
            go_dat[k] = temp
    else:
        go_dat[k] = temp

In [22]:
go_dat[list(go_dat.keys())[0]]

Unnamed: 0,ID,Gene/Product,Name,GO Class Labels,Synonyms
7671,UniProtKB:P00558,PGK1,Phosphoglycerate kinase 1,gluconeogenesis|negative regulation of angioge...,"[PGKA, MIG10, OK/SW-cl.110]"


In [23]:
for x in go_dat.keys():
    try:
        go_dat[x]['GO Class Labels'].values[0]
    except:
        print(x)

In [24]:
go_dat[list(go_dat.keys())[0]]['GO Class Labels'].values[0]

'gluconeogenesis|negative regulation of angiogenesis|muscle cell cellular homeostasis|glycolytic process|myoblast fusion|chemical synaptic transmission|ADP binding|canonical glycolysis|somatic muscle development|epithelial cell differentiation|ATP binding|extracellular space|protein binding|extracellular exosome|phosphoglycerate kinase activity|plasminogen activation|protein-disulfide reductase activity|membrane|membrane raft|cytosol|phosphorylation|cellular response to hypoxia'

In [25]:
# find any kinases where multiple gene/product IDs were returned
for x in go_dat.keys():
    if(go_dat[x].shape[0] > 1):
        print(x)

P4K2B


In [26]:
go_dat['P4K2B'].groupby('Gene/Product').apply(sum)['GO Class Labels'][0]

'ATP binding|trans-Golgi network|endosome|endosome organization|Golgi organization|membrane|cytosol|phosphatidylinositol phosphorylation|1-phosphatidylinositol 4-kinase activity|phosphatidylinositol biosynthetic process|plasma membranetrans-Golgi network|endosome|endosome organization|Golgi organization|phosphatidylinositol phosphorylation|1-phosphatidylinositol 4-kinase activity|phosphorylation|kinase activity|plasma membrane'

In [27]:
go_dat['P4K2B'].iloc[0]['GO Class Labels']

'ATP binding|trans-Golgi network|endosome|endosome organization|Golgi organization|membrane|cytosol|phosphatidylinositol phosphorylation|1-phosphatidylinositol 4-kinase activity|phosphatidylinositol biosynthetic process|plasma membrane'

In [28]:
temp = go_dat['P4K2B'].iloc[0]
temp['GO Class Labels']='|'.join(list(set(go_dat['P4K2B']['GO Class Labels'].iloc[0].split('|')) | set(go_dat['P4K2B']['GO Class Labels'].iloc[1].split('|'))))
go_dat['P4K2B'] = temp
#print(np.array(temp['GO Class Labels']))#.values)

In [29]:
def helper(x):
    try:
        temp = go_dat[x]['GO Class Labels'].values[0].split('|')
    except:
        temp = go_dat[x]['GO Class Labels'].split('|')
    return temp

In [30]:
just_labels = {x:helper(x) for x in go_dat.keys()}

In [35]:
agg_labels = [x for y in just_labels.values() for x in y]

In [36]:
from collections import Counter

c = Counter(agg_labels)

In [37]:
c.most_common(n=50)

[('ATP binding', 460),
 ('protein binding', 402),
 ('cytoplasm', 306),
 ('protein phosphorylation', 299),
 ('protein serine/threonine kinase activity', 290),
 ('cytosol', 278),
 ('nucleus', 229),
 ('plasma membrane', 177),
 ('protein kinase activity', 156),
 ('protein autophosphorylation', 148),
 ('nucleoplasm', 140),
 ('intracellular signal transduction', 133),
 ('peptidyl-serine phosphorylation', 113),
 ('signal transduction', 108),
 ('metal ion binding', 97),
 ('peptidyl-tyrosine phosphorylation', 97),
 ('negative regulation of apoptotic process', 95),
 ('protein tyrosine kinase activity', 80),
 ('cell differentiation', 80),
 ('identical protein binding', 73),
 ('membrane', 72),
 ('MAPK cascade', 70),
 ('receptor complex', 68),
 ('integral component of plasma membrane', 68),
 ('positive regulation of ERK1 and ERK2 cascade', 65),
 ('magnesium ion binding', 61),
 ('kinase activity', 58),
 ('extracellular exosome', 57),
 ('apoptotic process', 57),
 ('negative regulation of signal trans

In [38]:
stopwords = ['ATP binding','protein binding', 'protein phosphorylation','protein serine/threonine kinase activity']

In [56]:
processes = set(pd.read_csv('./go_biological_processes.txt', sep='\t', header=None).set_index(0)[1].tolist())
print(len(processes))

29691


In [57]:
def stophelper_plus_is_process(x):
    try:
        temp = go_dat[x]['GO Class Labels'].values[0].split('|')
    except:
        temp = go_dat[x]['GO Class Labels'].split('|')
    return list(filter(lambda x: x in processes, filter(lambda x: x not in stopwords, temp)))

In [62]:
kinase_labels = {x:stophelper_plus_is_process(x) for x in go_dat.keys()}
labeled_kinases = pd.Series(kinase_labels)

In [63]:
len(labeled_kinases)

473

In [64]:
labeled_kinases.to_csv('kinase_go_processes.csv')