In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Get HGNC Mapper

In [228]:
# Load the HGNC alias table (tab-separated text file).
hgnc = pd.read_csv('../data/hgnc_alias_list.txt', sep='\t')

# Keep only rows whose approved symbol does not contain the text 'withdrawn'.
still_approved = hgnc['Approved symbol'].map(lambda symbol: 'withdrawn' not in symbol)
hgnc = hgnc.loc[still_approved]

# Remember every approved symbol so identity mappings can be added later.
hgnc_original_keys = hgnc['Approved symbol'].unique()

# Keep the first six columns only, minus 'Status', to keep the frame small.
hgnc = hgnc.iloc[:, 0:6].drop('Status', axis=1)

# Rows with a missing alias cell carry no information for the mapper.
hgnc_syn_list = hgnc[hgnc['Synonyms'].notna()]
hgnc_prev_symb_list = hgnc[hgnc['Previous symbols'].notna()]

# Show shape, columns and a preview: first the synonym rows, then the
# previous-symbol rows.
for alias_rows in (hgnc_syn_list, hgnc_prev_symb_list):
    print(alias_rows.shape)
    print(alias_rows.columns)
    display(alias_rows.head())

def _melt_alias_column(alias_rows, gene_table, alias_col, kept_col):
    """Expand a comma-separated alias column into one row per (gene, alias).

    The steps are: split every cell of ``alias_col`` on commas, spread the
    pieces over integer-named columns, join the gene annotations back on by
    index, drop the original ``alias_col`` and melt the integer columns into
    rows.

    Parameters
    ----------
    alias_rows : pandas.DataFrame
        Rows whose ``alias_col`` is a non-null, comma-separated string.
    gene_table : pandas.DataFrame
        Table joined back on by index; must contain 'HGNC ID',
        'Approved symbol', 'Approved name', ``alias_col`` and ``kept_col``.
    alias_col : str
        Column holding the comma-separated aliases to expand.
    kept_col : str
        The other alias column, carried along unchanged as an id column.

    Returns
    -------
    pandas.DataFrame
        Columns: the four id columns, 'variable' (position of the alias in
        its original cell) and 'synonym' (the alias with all spaces removed).
        Rows with a missing alias are dropped.
    """
    id_columns = ['HGNC ID', 'Approved symbol', 'Approved name', kept_col]
    melted = (
        alias_rows[alias_col]
        .apply(lambda cell: cell.split(','))
        .apply(pd.Series)
        .merge(gene_table, left_index=True, right_index=True)
        .drop([alias_col], axis=1)
        .melt(id_vars=id_columns, value_name='synonym')
    )
    # Genes with fewer aliases than the widest row produce NaN padding.
    # .copy() makes the filtered frame independent, so the column assignment
    # below is not a chained assignment on a slice.
    melted = melted[~melted['synonym'].isna()].copy()
    melted['synonym'] = melted['synonym'].apply(lambda alias: alias.replace(' ', ''))
    return melted


# One row per (gene, current synonym) and per (gene, previous symbol).
current_syn_list = _melt_alias_column(hgnc_syn_list, hgnc, 'Synonyms', 'Previous symbols')
prev_symb_list = _melt_alias_column(hgnc_prev_symb_list, hgnc, 'Previous symbols', 'Synonyms')

# alias -> approved symbol, taken from the current synonyms
hgnc_mapper = dict(zip(current_syn_list['synonym'], current_syn_list['Approved symbol']))
# HGNC ID -> approved symbol. This is built from the full `hgnc` table:
# `current_syn_list` only holds genes that have at least one synonym, so
# building the ID map from it would leave every other gene without an entry.
hgnc_mapper.update(dict(zip(hgnc['HGNC ID'], hgnc['Approved symbol'])))
# previous symbol -> approved symbol
hgnc_mapper_previous = dict(zip(prev_symb_list['synonym'], prev_symb_list['Approved symbol']))

# Keys present in both mappers that resolve to different approved symbols.
trouble_list = [
    key
    for key in set(hgnc_mapper.keys()) & set(hgnc_mapper_previous.keys())
    if hgnc_mapper[key] != hgnc_mapper_previous[key]
]
trouble_set = set(trouble_list)  # constant-time membership for the loop below

# Report which names from the clustering results fall on a conflicting key.
louvain_tier1 = pd.read_csv('../results/louvain_clusters.txt', sep='\t')
for n in louvain_tier1.names:
    if n in trouble_set:
        print(n)

# Manual resolutions, applied last so that they win over everything else.
hand_coded = {'RAGE':'MOK', 'SGK2':'SGK2', 'SGK196':'SGK196', 'MAPK3':'MAPK3'}

hgnc_mapper_previous.update(hgnc_mapper) # current synonyms overwrite previous-symbol conflicts

hgnc_mapper = hgnc_mapper_previous
hgnc_mapper.update({x:x for x in hgnc_original_keys}) # keep the identity maps
hgnc_mapper.update(hand_coded) # overwrite the trouble list

(21435, 5)
Index(['HGNC ID', 'Approved symbol', 'Approved name', 'Previous symbols',
       'Synonyms'],
      dtype='object')


Unnamed: 0,HGNC ID,Approved symbol,Approved name,Previous symbols,Synonyms
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,"NCRNA00181, A1BGAS, A1BG-AS",FLJ23569
2,HGNC:24086,A1CF,APOBEC1 complementation factor,,"ACF, ASP, ACF64, ACF65, APOBEC1CF"
4,HGNC:7,A2M,alpha-2-macroglobulin,,"FWP007, S863-7, CPAMD5"
6,HGNC:23336,A2ML1,alpha-2-macroglobulin like 1,CPAMD9,"FLJ25179, p170"
12,HGNC:30005,A3GALT2,"alpha 1,3-galactosyltransferase 2",A3GALT2P,"IGBS3S, IGB3S"


(11702, 5)
Index(['HGNC ID', 'Approved symbol', 'Approved name', 'Previous symbols',
       'Synonyms'],
      dtype='object')


Unnamed: 0,HGNC ID,Approved symbol,Approved name,Previous symbols,Synonyms
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,"NCRNA00181, A1BGAS, A1BG-AS",FLJ23569
6,HGNC:23336,A2ML1,alpha-2-macroglobulin like 1,CPAMD9,"FLJ25179, p170"
9,HGNC:8,A2MP1,alpha-2-macroglobulin pseudogene 1,A2MP,
12,HGNC:30005,A3GALT2,"alpha 1,3-galactosyltransferase 2",A3GALT2P,"IGBS3S, IGB3S"
13,HGNC:18149,A4GALT,"alpha 1,4-galactosyltransferase (P blood group)",P1,"A14GALT, Gb3S, P(k)"


RAGE


### Get UniProt mapper

In [307]:
# Spot check of the accession -> gene-name map for P27361.
# NOTE(review): `uni_mapper` is assigned in the next cell, so on a fresh
# kernel this cell only works after that one has run.
uni_mapper['P27361']

'MAPK3'

In [231]:
# UniProt id-mapping table: three tab-separated columns, no header row.
# Column 0 is the accession, column 1 the kind of identifier, column 2 its value.
idmapping = pd.read_csv('../data/network/HUMAN_9606_idmapping.dat', sep='\t', header=None)

# accession -> gene name; only the 'Gene_Name' rows are kept.
uni_mapper = idmapping[idmapping[1] == 'Gene_Name'].set_index(0)[2].to_dict()

# Pin P27361 to MAPK3. dict.update needs key/value pairs; the former
# `update({'P27361'})` passed a set holding one 6-character string and
# raised ValueError.
uni_mapper.update({'P27361': 'MAPK3'})

# Peek at the first (accession, gene name) pair.
next(iter(uni_mapper.items()))

('P31946', 'YWHAB')

In [232]:
# Reverse lookup: gene name -> accession. When several accessions share a
# gene name, the one seen last in `uni_mapper` is kept.
uni_rev = dict(zip(uni_mapper.values(), uni_mapper.keys()))

In [233]:
# Size of the forward map, then of the reverse map; the gap between the two
# is the number of accessions that share a gene name with another accession.
for lookup in (uni_mapper, uni_rev):
    print(len(lookup))

146363
26468


### Extra mappers from UniProt, e.g. ENSG

In [372]:
# "Selected" UniProt id-mapping table: tab-separated, no header row.
t = pd.read_csv('../data/network/HUMAN_9606_idmapping_selected.tab', sep='\t', header=None, low_memory=False)
display(t.shape)
display(t.head())


def _multi_id_to_accession(table, id_col, accession_col=0, sep=';'):
    """Build an {identifier: accession} lookup from a multi-valued column.

    Cells of ``id_col`` may hold several identifiers separated by ``sep``
    (e.g. 'ENSG00000108953; ENSG00000274474'). Each identifier becomes its
    own key, so a lookup by a single identifier succeeds. Rows with a missing
    identifier cell are skipped. When an identifier occurs in several rows,
    the last row wins.

    Parameters
    ----------
    table : pandas.DataFrame
    id_col : column label holding the (possibly multi-valued) identifiers
    accession_col : column label holding the accession (default 0)
    sep : str, separator between identifiers inside one cell

    Returns
    -------
    dict mapping each identifier (as a stripped string) to an accession.
    """
    lookup = {}
    pairs = table[[id_col, accession_col]].dropna()
    for id_cell, accession in zip(pairs[id_col], pairs[accession_col]):
        for single_id in str(id_cell).split(sep):
            single_id = single_id.strip()
            if single_id:
                lookup[single_id] = accession
    return lookup


# Column 2 is used as the Entrez gene id and column 18 as the Ensembl gene id.
entrez_to_uniprot_number = _multi_id_to_accession(t, 2)
ensg_to_uniprot_number = _multi_id_to_accession(t, 18)
#ensg_mapper = t[]
#t = uni_mapper[uni_mapper[1]                                   == 'Gene_Name'].set_index(0)[2].to_dict()

#next(iter(uni_mapper.items()))



(169389, 22)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,P31946,1433B_HUMAN,7529,NP_003395.1; NP_647539.1; XP_016883528.1,4507949; 377656702; 67464628; 1345590; 1034625...,2BQ0:A; 2BQ0:B; 2C23:A; 4DNK:A; 4DNK:B; 5N10:A...,GO:0005737; GO:0005829; GO:0070062; GO:0005925...,UniRef100_P31946,UniRef90_P31946,UniRef50_P31946,...,9606,601289,Hs.643544,8515476; 14702039; 11780052; 15489334; 2357255...,X57346; AK292717; AL008725; CH471077; CH471077...,CAA40621.1; BAF85406.1; -; EAW75893.1; EAW7589...,ENSG00000166913,ENST00000353703; ENST00000372839,ENSP00000300161; ENSP00000361930,11996670; 12364343; 12437930; 12468542; 124825...
1,P62258,1433E_HUMAN,7531,NP_006752.1,374074368; 62131678; 67464424; 194379794; 2210...,2BR9:A; 3UAL:A; 3UBW:A; 6EIH:A,GO:0090724; GO:0005737; GO:0005829; GO:0070062...,UniRef100_P62258,UniRef90_P62258,UniRef50_P62258,...,9606,605066,Hs.513851,7644510; 8858348; 8684458; 20417184; 14702039;...,U20972; U54778; U43399; U43430; U28936; AB0171...,AAC50175.1; AAC50710.1; AAC50625.1; AAD00026.1...,ENSG00000108953; ENSG00000274474,ENST00000264335; ENST00000571732; ENST00000616...,ENSP00000264335; ENSP00000461762; ENSP00000481...,15838597; 11782387; 12657644; 14966136; 153649...
2,Q04917,1433F_HUMAN,7533,NP_003396.1,4507951; 437363; 83754686; 83754699; 83754700;...,2C63:A; 2C63:B; 2C63:C; 2C63:D; 2C74:A; 2C74:B,GO:0005737; GO:0005829; GO:0070062; GO:0098978...,UniRef100_Q04917,UniRef90_Q04917,UniRef50_P62258,...,9606,113508,Hs.226755,8218406; 1578511; 8561965; 8812417; 15461802; ...,L20422; X80536; X78138; X57345; D78577; S80794...,AAA35483.1; CAA56676.1; CAA55017.1; CAA40620.1...,ENSG00000128245,ENST00000248975,ENSP00000248975,10206237; 11121172; 11996670; 12176995; 124801...
3,P61981,1433G_HUMAN,7532,NP_036611.2,6016838; 380764684; 635576381; 82407956; 82407...,2B05:A; 2B05:B; 2B05:C; 2B05:D; 2B05:E; 2B05:F...,GO:0005829; GO:0070062; GO:0005925; GO:0016020...,UniRef100_P61981,UniRef90_P61981,UniRef50_P61981,...,9606,605356; 617665,Hs.744840,10433554; 10486217; 12853948; 15489334; 235725...,AF142498; AB024334; CR541904; CR541925; AC0063...,AAD48408.1; BAA85184.1; CAG46702.1; CAG46723.1...,ENSG00000170027,ENST00000307630,ENSP00000306330,11824616; 11996670; 12364343; 12482592; 150572...
4,P31947,1433S_HUMAN,2810,NP_006133.1,436408756; 306991738; 969812714; 350610438; 35...,1YWT:A; 1YWT:B; 1YZ5:A; 1YZ5:B; 3IQJ:A; 3IQU:A...,GO:0005829; GO:0070062; GO:0005615; GO:0005739...,UniRef100_P31947,UniRef90_P31947,UniRef50_P61981,...,9606,601290,Hs.523718,1390337; 8515476; 9659898; 16710414; 15489334;...,M93010; X57348; AF029081; AF029082; CR541905; ...,AAA59546.1; CAA40623.1; AAC52029.1; AAC52030.1...,ENSG00000175793,ENST00000339276,ENSP00000340989,10969776; 12582028; 12730237; 12787309; 145172...


### Kinase Master List (to filter)

In [234]:
# Kinase master list (Excel workbook), used to restrict the mappers to kinases.
kmast = pd.read_excel(io='../data/KINASESmasterlist_w_Aliases.xlsx')
kmast.head(n=5)

Unnamed: 0,Uniprot Protein,MS Gene,RNAseq Gene,RNAseq Accession,Family,Mouse Uniprot Protein,Mouse RNAseq gene,Mouse RNAseq Accession,Kinome Render Tree Name,Aliases (Conservative),...,Aliases,description,other_designations,Entrez_Symbol,Old_Name,Entrez_Synonyms,Entrez_other_designations,Unnamed: 18,Gene Symbol,Gene Symbol and Synonyms
0,AAK1,AAK1,AAK1,22848.0,Other,Aak1,Aak1,269774,AAK1,"AAK1,",...,,AP2 associated kinase 1,adaptor-associated kinase 1,AAK1,AAK1,KIAA1048|MGC138170,adaptor-associated kinase 1,,AAK1,AAK1
1,AAPK1,PRKAA1,PRKAA1,5562.0,CAMK,Aapk1,Prkaa1,105787,AMPK[alpha]1,"PRKAA1, AMPKa1",...,"AMPK, AMPKa1","protein kinase, AMP-activated, alpha 1 catalyt...","5'-AMP-activated protein kinase, catalytic alp...",PRKAA1,AMPKa1,AMPK|AMPKa1|MGC33776|MGC57364,"5'-AMP-activated protein kinase, catalytic alp...",,PRKAA1,"AMPK, PRKAA1, AMPKa1"
2,AAPK2,PRKAA2,PRKAA2,5563.0,CAMK,Aapk2,Prkaa2,108079,AMPK[alpha]2,"PRKAA2, AMPK2, AMPKa2, PRKAA",...,"AMPK, AMPK2, AMPKa2, PRKAA","protein kinase, AMP-activated, alpha 2 catalyt...","5'-AMP-activated protein kinase, catalytic alp...",PRKAA2,AMPKa2,AMPK|AMPK2|PRKAA,"5'-AMP-activated protein kinase, catalytic alp...",,PRKAA2,"AMPK2, AMPK, PRKAA, AMPKa2, PRKAA2"
3,ABL1,ABL1,ABL1,25.0,TK,Abl1,Abl1,11350,Abl,"ABL1,ABL, JTK7, bcr/abl, c-ABL, c-ABL1, v-abl",...,"ABL, JTK7, bcr/abl, c-ABL, c-ABL1, p150, v-abl","ABL proto-oncogene 1, non-receptor tyrosine ki...",Abelson tyrosine-protein kinase 1|bcr/c-abl on...,ABL1,ABL,ABL|JTK7|c-ABL|p150|v-abl,Abelson murine leukemia viral (v-abl) oncogene...,,ABL1,"c-ABL1, p150, ABL, JTK7, v-abl, bcr/abl, c-ABL..."
4,ABL2,ABL2,ABL2,27.0,TK,Abl2,Abl2,11352,Arg,"ABL2,ABLL, ARG",...,"ABLL, ARG","ABL proto-oncogene 2, non-receptor tyrosine ki...","abelson-related gene protein|c-abl oncogene 2,...",ABL2,ARG,ABLL|ARG,Abelson murine leukemia viral (v-abl) oncogene...,,ABL2,"ABL2, ABLL, ARG"


In [235]:
# Every key of the mapper whose approved symbol contains 'SGK'.
[alias for alias, symbol in hgnc_mapper.items() if 'SGK' in symbol]

['SGK', 'SGK2', 'SGKL', 'C8orf44-SGK3', 'SGK1', 'SGK3', 'SGK196']

In [236]:
# Spot check: the key 'MAPK3' should resolve to itself.
hgnc_mapper['MAPK3']

'MAPK3'

In [237]:
# 'MS Gene' -> 'Uniprot Protein' for every row of the kinase master list
# (a repeated MS Gene keeps the value from its last row).
{ms_gene: protein for ms_gene, protein in zip(kmast['MS Gene'], kmast['Uniprot Protein'])}

{'AAK1': 'AAK1',
 'PRKAA1': 'AAPK1',
 'PRKAA2': 'AAPK2',
 'ABL1': 'ABL1',
 'ABL2': 'ABL2',
 'TNK2': 'ACK1',
 'ACVR1B': 'ACV1B',
 'ACVR1C': 'ACV1C',
 'ACVRL1': 'ACVL1',
 'ACVR1': 'ACVR1',
 'ADCK1': 'ADCK1',
 'ADCK2': 'ADCK2',
 'ADCK3': 'ADCK3',
 'ADCK4': 'ADCK4',
 'ADCK5': 'ADCK5',
 'ADK': 'ADK',
 'AGK': 'AGK',
 'AKT1': 'AKT1',
 'AKT2': 'AKT2',
 'AKT3': 'AKT3',
 'ALK': 'ALK',
 'ALPK1': 'ALPK1',
 'ALPK2': 'ALPK2',
 'ALPK3': 'ALPK3',
 'AMHR2': 'AMHR2',
 'ANKK1': 'ANKK1',
 'NPR1': 'ANPRA',
 'NPR2': 'ANPRB',
 'ARAF': 'ARAF',
 'ADRBK1': 'ARBK1',
 'ADRBK2': 'ARBK2',
 'ATM': 'ATM',
 'ATR': 'ATR',
 'AURKB': 'AURKB',
 'AURKC': 'AURKC',
 'ACVR2A': 'AVR2A',
 'ACVR2B': 'AVR2B',
 'BCKDK': 'BCKD',
 'BLK': 'BLK',
 'BMP2K': 'BMP2K',
 'BMPR2': 'BMPR2',
 'BMPR1A': 'BMR1A',
 'BMPR1B': 'BMR1B',
 'BMX': 'BMX',
 'BRAF': 'BRAF',
 'BRD2': 'BRD2',
 'BRD3': 'BRD3',
 'BRD4': 'BRD4',
 'BRSK1': 'BRSK1',
 'BRSK2': 'BRSK2',
 'BTK': 'BTK',
 'BUB1': 'BUB1',
 'BUB1B': 'BUB1B',
 'CAMKV': 'CAMKV',
 'CDK11A': 'CD11A',
 'CD

In [267]:
# Manual alias fixes for the kinase master list; added to the shared mapper.
hard_coded = {'ACV1B':'ACVR1B', 'PDPK2':'PDK2', 'SGK110':'SBK3', 'SGK223':'PRAG1'}
hgnc_mapper.update(hard_coded)

# One pass over the master list builds both
#   kinome_set                 - approved symbols of all kinases, and
#   uniprot_to_hgnc_map_helper - 'Uniprot Protein' name -> approved symbol.
# 'GYK' and 'SGK494' are skipped; every other 'MS Gene' must be a key of
# hgnc_mapper (a missing key raises KeyError).
kinome_set = set()
uniprot_to_hgnc_map_helper = {}
for ms_gene, uniprot_name in zip(kmast['MS Gene'], kmast['Uniprot Protein']):
    if ms_gene == 'GYK' or ms_gene == 'SGK494':
        continue
    approved = hgnc_mapper[ms_gene]
    kinome_set.add(approved)
    uniprot_to_hgnc_map_helper[uniprot_name] = approved

uniprot_to_hgnc_map_helper['RAC1'] = 'RAC1'
#hgnc_mapper.update(uniprot_to_hgnc_map_helper)
# ^ this causes a lot of collision issues e.g. mapk3 and mapkapk3

len(kinome_set)

567

### Filter our UniProt queries down

In [240]:
# Distinct 'Uniprot Protein' names, distinct 'Symbol' values, and the size
# of their union.
for id_pool in (
    set(kmast['Uniprot Protein']),
    set(kmast['Symbol']),
    set(kmast['Symbol']) | set(kmast['Uniprot Protein']),
):
    print(len(id_pool))

570
499
744


In [241]:
# Distinct 'Uniprot Protein' names from the kinase master list.
uniprot_targets = {protein for protein in kmast['Uniprot Protein']}
uniprot_targets

{'AAK1',
 'AAPK1',
 'AAPK2',
 'ABL1',
 'ABL2',
 'ACK1',
 'ACV1B',
 'ACV1C',
 'ACVL1',
 'ACVR1',
 'ADCK1',
 'ADCK2',
 'ADCK3',
 'ADCK4',
 'ADCK5',
 'ADK',
 'AGK',
 'AKT1',
 'AKT2',
 'AKT3',
 'ALK',
 'ALPK1',
 'ALPK2',
 'ALPK3',
 'AMHR2',
 'ANKK1',
 'ANPRA',
 'ANPRB',
 'ARAF',
 'ARBK1',
 'ARBK2',
 'ATM',
 'ATR',
 'AURKB',
 'AURKC',
 'AVR2A',
 'AVR2B',
 'BCKD',
 'BLK',
 'BMP2K',
 'BMPR2',
 'BMR1A',
 'BMR1B',
 'BMX',
 'BRAF',
 'BRD2',
 'BRD3',
 'BRD4',
 'BRSK1',
 'BRSK2',
 'BTK',
 'BUB1',
 'BUB1B',
 'CAMKV',
 'CD11A',
 'CD11B',
 'CDC7',
 'CDK1',
 'CDK10',
 'CDK12',
 'CDK13',
 'CDK14',
 'CDK15',
 'CDK16',
 'CDK17',
 'CDK18',
 'CDK19',
 'CDK2',
 'CDK20',
 'CDK3',
 'CDK4',
 'CDK5',
 'CDK6',
 'CDK7',
 'CDK8',
 'CDK9',
 'CDKL1',
 'CDKL2',
 'CDKL3',
 'CDKL4',
 'CDKL5',
 'CHK1',
 'CHK2',
 'CHKA',
 'CHKB',
 'CLK1',
 'CLK2',
 'CLK3',
 'CLK4',
 'CSF1R',
 'CSK',
 'CSK21',
 'CSK22',
 'CSK23',
 'CSKP',
 'CTRO',
 'DAPK1',
 'DAPK2',
 'DAPK3',
 'DCK',
 'DCLK1',
 'DCLK2',
 'DCLK3',
 'DDR1',
 'DDR2',
 'DGKQ

In [306]:
# Number of accessions currently held in the accession -> gene-name map.
print(len(uni_mapper))

146363


In [58]:
#uni_mapper = {x:y for x,y in uni_mapper.items() if y in uniprot_targets}

#uni_mapper.update({x:y for x,y in uni})

### Start importing sources

In [318]:
# Every accession whose gene name is exactly 'RAC1'.
[accession for accession, gene_name in uni_mapper.items() if gene_name == 'RAC1']

['P63000', 'A4D2P2', 'A4D2P0', 'A4D2P1', 'W0UV93']

In [447]:
# Registries filled by the loading cells below:
#   data_sources[name] -> raw table
#   data_preproc[name] -> {'columns': [...], 'maps': [...]}
data_sources = {}
data_preproc = {}


def id_dict(x):
    """Resolve an identifier to an approved symbol, or return None.

    The lookups are tried in this order and the first hit wins:
      1. ``hgnc_mapper[x]``
      2. the literal accession 'P27361', which is pinned to 'MAPK3'
      3. ``uniprot_to_hgnc_map_helper[x]``
      4. ``uni_mapper[x]`` translated through ``uniprot_to_hgnc_map_helper``
      5. ``uni_mapper[x]`` translated through ``hgnc_mapper``

    The three mappers are read from the notebook's global namespace at call
    time.
    """
    if x in hgnc_mapper:
        return hgnc_mapper[x]

    # hardcode MAPK3 due to MAPKAPK3 issues
    if x == 'P27361':
        return 'MAPK3'

    if x in uniprot_to_hgnc_map_helper:
        return uniprot_to_hgnc_map_helper[x]

    if x in uni_mapper:
        gene_name = uni_mapper[x]
        if gene_name in uniprot_to_hgnc_map_helper:
            return uniprot_to_hgnc_map_helper[gene_name]
        if gene_name in hgnc_mapper:
            return hgnc_mapper[gene_name]

    return None

#else uniprot_to_hgnc_map_helper[uni_mapper[entrez_to_uniprot_number[x]]] if x in entrez_to_uniprot_number.keys() and entrez_to_uniprot_number[x] in uni_mapper.keys() and uni_mapper[entrez_to_uniprot_number[x]] in uniprot_to_hgnc_map_helper.keys()\
#else uniprot_to_hgnc_map_helper[uni_mapper[ensg_to_uniprot_number[x]]] if x in ensg_to_uniprot_number.keys() and ensg_to_uniprot_number[x] in uni_mapper.keys() and uni_mapper[ensg_to_uniprot_number[x]] in uniprot_to_hgnc_map_helper.keys()\
                                        

In [448]:
# HIPPIE interaction table: tab-separated, no header row.
hippie = pd.read_csv('../data/network/hippie_current.txt', sep='\t', header=None)
data_sources['hippie'] = hippie


def _hippie_symbol(entry_name):
    """Resolve a HIPPIE entry such as 'GRB7_HUMAN' via the text before the first '_'."""
    prefix = entry_name.split('_')[0]
    return id_dict(prefix)


# Columns 0 and 2 hold the two interactors; both get the same mapping.
data_preproc['hippie'] = dict(
    columns=[0, 2],
    maps=[_hippie_symbol, _hippie_symbol],
)

print(hippie.columns)
hippie.head()

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')


Unnamed: 0,0,1,2,3,4,5
0,AL1A1_HUMAN,216,AL1A1_HUMAN,216,0.76,"experiments:in vivo,Two-hybrid;pmids:12081471,..."
1,ITA7_HUMAN,3679,ACHA_HUMAN,1134,0.73,"experiments:in vivo,Affinity Capture-Western,a..."
2,NEB1_HUMAN,55607,ACTG_HUMAN,71,0.65,"experiments:in vitro,in vivo;pmids:9362513,120..."
3,SRGN_HUMAN,5552,CD44_HUMAN,960,0.63,"experiments:in vivo;pmids:9334256,16189514,167..."
4,GRB7_HUMAN,2886,ERBB2_HUMAN,2064,0.9,"experiments:in vitro,in vivo,Reconstituted Com..."


In [449]:
def _phosphosite_symbol(gene):
    """Resolve a gene name after upper-casing it (the table mixes 'Pak2' and 'EIF2AK1' styles)."""
    return id_dict(gene.upper())


# Kinase -> substrate table; the first two lines of the file are skipped.
phosphosite_substrate = pd.read_csv('../data/network/Kinase_Substrate_Dataset.txt', sep='\t', skiprows=2)
data_sources['phosphosite_substrate'] = phosphosite_substrate
data_preproc['phosphosite_substrate'] = dict(
    columns=['GENE', 'SUB_GENE'],
    maps=[_phosphosite_symbol, _phosphosite_symbol],
)

print(phosphosite_substrate.columns)
display(phosphosite_substrate.head())


# Regulatory-sites table; the first three lines are skipped and malformed
# lines are dropped instead of raising.
phosphosite_regulatory = pd.read_csv('../data/network/Regulatory_sites.txt', sep='\t', skiprows=3, error_bad_lines=False)

# The table itself is not registered as a source (line left disabled); only
# its preprocessing recipe is recorded.
#data_sources['phosphosite_regulatory'] =  phosphosite_regulatory
data_preproc['phosphosite_regulatory'] = dict(
    columns=['GENE', 'SUBSTRATE'],
    maps=[_phosphosite_symbol, _phosphosite_symbol],
)

#print(phosphosite_regulatory.columns)
#display(phosphosite_regulatory.head())

Index(['GENE', 'KINASE', 'KIN_ACC_ID', 'KIN_ORGANISM', 'SUBSTRATE',
       'SUB_GENE_ID', 'SUB_ACC_ID', 'SUB_GENE', 'SUB_ORGANISM', 'SUB_MOD_RSD',
       'SITE_GRP_ID', 'SITE_+/-7_AA', 'DOMAIN', 'IN_VIVO_RXN', 'IN_VITRO_RXN',
       'CST_CAT#'],
      dtype='object')


Unnamed: 0,GENE,KINASE,KIN_ACC_ID,KIN_ORGANISM,SUBSTRATE,SUB_GENE_ID,SUB_ACC_ID,SUB_GENE,SUB_ORGANISM,SUB_MOD_RSD,SITE_GRP_ID,SITE_+/-7_AA,DOMAIN,IN_VIVO_RXN,IN_VITRO_RXN,CST_CAT#
0,Pak2,PAK2,Q64303,rat,MEK1,170851.0,Q01986,Map2k1,rat,S298,448284,RtPGRPLsSYGMDSR,Pkinase,,X,9128; 98195
1,Pak2,PAK2,Q64303,rat,PRKD1,85421.0,Q9WTQ1,Prkd1,rat,S203,449896,GVRRRRLsNVsLTGL,,X,,
2,Pak2,PAK2,Q64303,rat,prolactin,5617.0,P01236,PRL,human,S207,451732,LHCLRRDsHKIDNYL,Hormone_1,,X,
3,Pak2,PAK2,Q64303,rat,prolactin,24683.0,P01237,Prl,rat,S206,451732,IRCLRRDsHKVDNYL,Hormone_1,,X,
4,EIF2AK1,HRI,Q9BQI3,human,eIF2-alpha,54318.0,P68101,Eif2s1,rat,S52,447635,MILLSELsRRRIRSI,S1,,X,3597; 9721; 3398; 5199


In [450]:
# HPRD binary protein-protein interactions: tab-separated, no header row.
hprd = pd.read_csv('../data/network/HPRD_ALL_BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt', sep='\t', header=None)
data_sources['hprd'] = hprd


def _hprd_symbol(gene):
    """Resolve an HPRD gene symbol after upper-casing it."""
    return id_dict(gene.upper())


# Columns 0 and 3 hold the two interactors' gene symbols.
data_preproc['hprd'] = dict(
    columns=[0, 3],
    maps=[_hprd_symbol, _hprd_symbol],
)

print(hprd.columns)
hprd.head()

Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')


Unnamed: 0,0,1,2,3,4,5,6,7
0,ALDH1A1,1,NP_000680.2,ALDH1A1,1,NP_000680.2,in vivo;yeast 2-hybrid,1208147116189510
1,ITGA7,2761,NP_001138468.1,CHRNA1,7,NP_001034612.1,in vivo,10910772
2,PPP1R9A,16000,NP_060120.2,ACTG1,17,NP_001605.1,in vitro;in vivo,936251312052877
3,SRGN,1513,NP_002718.2,CD44,115,NP_000601.3,in vivo,9334256
4,GRB7,3311,NP_005301.2,ERBB2,1281,NP_004439.2,in vitro;in vivo,9079677


In [451]:
# Spot check: which accession the key '31' (a string, not an int) maps to.
entrez_to_uniprot_number['31']

'Q7Z5W8'

In [452]:
# Rows whose first-interactor identifier is shorter than five characters.
# NOTE(review): no active code in the visible notebook assigns
# `reactome_missing`; its only definition sits inside a disabled triple-quoted
# string in the Reactome loading cell. Confirm where it comes from before a
# Restart & Run All.
reactome_missing[reactome_missing['# Interactor 1 uniprot id'].apply(lambda x: len(x) < 5)]

Unnamed: 0,# Interactor 1 uniprot id,Interactor 1 Ensembl gene id,Interactor 1 Entrez Gene id,Interactor 2 uniprot id,Interactor 2 Ensembl gene id,Interactor 2 Entrez Gene id,Interaction type,Interaction context,Pubmed references


In [453]:
# (rows, columns) of the leftover Reactome rows.
# NOTE(review): depends on `reactome_missing` from earlier kernel state.
reactome_missing.shape

(19956, 9)

In [454]:
# Print every field of the first row only, split on '|', to see which cells
# hold several identifiers.
for _, first_row in reactome_missing.head(1).iterrows():
    for field in first_row:
        print(field.split('|'))

['uniprotkb:Q13085']
['ENSEMBL:ENSG00000275176', 'ENSEMBL:ENSG00000278540']
['entrezgene/locuslink:31']
['ChEBI:29035']
['-']
['-']
['physical association']
['reactome:R-HSA-8876883']
['24243840']


In [455]:
# Display the leftover Reactome rows.
# NOTE(review): depends on `reactome_missing` from earlier kernel state.
reactome_missing

Unnamed: 0,# Interactor 1 uniprot id,Interactor 1 Ensembl gene id,Interactor 1 Entrez Gene id,Interactor 2 uniprot id,Interactor 2 Ensembl gene id,Interactor 2 Entrez Gene id,Interaction type,Interaction context,Pubmed references
11,uniprotkb:Q13085,ENSEMBL:ENSG00000275176|ENSEMBL:ENSG00000278540,entrezgene/locuslink:31,ChEBI:29035,-,-,physical association,reactome:R-HSA-8876883,24243840
13,ChEBI:29035,-,-,uniprotkb:O00763,ENSEMBL:ENSG00000076555,entrezgene/locuslink:32,physical association,reactome:R-HSA-8867341,24243840
16,uniprotkb:O00763,ENSEMBL:ENSG00000076555,entrezgene/locuslink:32,ChEBI:29035,-,-,physical association,reactome:R-HSA-8876893,15060529
322,uniprotkb:Q9C000,ENSEMBL:ENSG00000091592,entrezgene/locuslink:22861|entrezgene/locuslin...,ChEBI:59414,-,-,physical association,reactome:R-HSA-1296412,17349957
325,ChEBI:46661,-,-,uniprotkb:P05067,ENSEMBL:ENSG00000142192,entrezgene/locuslink:351,physical association,reactome:R-NUL-1296409,17703304|17349957
329,ChEBI:16336,-,-,uniprotkb:P09616,-,-,physical association,reactome:R-NUL-1296409,17703304|17349957
331,ChEBI:46661,-,-,uniprotkb:P08238,ENSEMBL:ENSG00000096384,entrezgene/locuslink:3326,physical association,reactome:R-NUL-1296409,17703304|17349957
332,ChEBI:30563,-,-,uniprotkb:Q9Y2Z0,ENSEMBL:ENSG00000165416,entrezgene/locuslink:10910,physical association,reactome:R-NUL-1296409,17703304|17349957
333,ChEBI:16336,-,-,uniprotkb:Q9Y2Z0,ENSEMBL:ENSG00000165416,entrezgene/locuslink:10910,physical association,reactome:R-NUL-1296409,17703304|17349957
336,ChEBI:30563,-,-,uniprotkb:P09616,-,-,physical association,reactome:R-NUL-1296409,17703304|17349957


In [456]:
def _reactome_symbol(tagged_id):
    """Resolve a 'prefix:identifier' string via the part after the first ':'."""
    return id_dict(tagged_id.split(':')[1])


# Resolved symbol (or None) for each side of every Reactome interaction.
r1 = reactome['# Interactor 1 uniprot id'].apply(_reactome_symbol)
r2 = reactome['Interactor 2 uniprot id'].apply(_reactome_symbol)

In [457]:
# Pair up the resolved interactors and keep only the rows in which both
# ends belong to the kinome.
both_in_kinome = r1.isin(kinome_set) & r2.isin(kinome_set)
new_react = pd.DataFrame({'Node1': r1, 'Node2': r2}, columns=['Node1', 'Node2'])
new_react = new_react[both_in_kinome]

display(new_react.shape)
new_react.head()

(3067, 2)

Unnamed: 0,Node1,Node2
546,RIPK1,RIPK1
547,RIPK3,RIPK3
548,RIPK3,RIPK1
550,MLKL,MLKL
638,MLKL,RIPK3


In [458]:
# Treat interactions as undirected: order the two symbols of every row
# alphabetically so that A-B and B-A become the same row, then keep the first
# occurrence of each pair. The sort is done on the whole array at once rather
# than by writing rows back one at a time through .loc, and the result is a
# fresh frame, so nothing is modified in place on a filtered slice.
ordered_pairs = np.sort(new_react.values, axis=1)
new_react = pd.DataFrame(
    ordered_pairs, index=new_react.index, columns=new_react.columns
).drop_duplicates()

display(new_react.shape)
new_react.head()

(554, 2)

Unnamed: 0,Node1,Node2
546,RIPK1,RIPK1
547,RIPK3,RIPK3
548,RIPK1,RIPK3
550,MLKL,MLKL
638,MLKL,RIPK3


In [459]:
# NOTE(review): `copy` is only referenced inside the disabled block below.
from copy import copy
# Reactome human interactions (tab-separated, header row present).
reactome = pd.read_csv('../data/network/reactome.homo_sapiens.interactions.tab-delimited.txt', sep='\t')

# Register the table and how to turn each interactor column into a symbol:
# the text after the first ':' is passed to id_dict.
data_sources['reactome'] = reactome
data_preproc['reactome'] = {
    'columns': ['# Interactor 1 uniprot id','Interactor 2 uniprot id'],
    'maps': [
        lambda x: id_dict(x.split(':')[1]),
        lambda x: id_dict(x.split(':')[1])
    ]
}

# The triple-quoted string below is disabled code: it is a bare string
# expression and is never executed. It is the only place in the visible
# notebook where `reactome_combined` and `reactome_missing` are assigned.
"""
reactome_combined = copy(reactome[reactome['# Interactor 1 uniprot id'].apply(lambda x: 'uniprot' in x)&reactome['Interactor 2 uniprot id'].apply(lambda x: 'uniprot' in x)][['# Interactor 1 uniprot id', 'Interactor 2 uniprot id']])
reactome_combined.head()

present = reactome_combined.index

reactome_missing = copy(reactome[~ pd.Series(reactome.index).isin(present)])



#reactome_ensembl = reactome[~ pd.Series(reactome.index).isin(present)]"""

# combine multiple columns for ease
#reactome_combined = copy(reactome)

# Show the column names and a preview of the raw table.
print(reactome.columns)
reactome.head()

Index(['# Interactor 1 uniprot id', 'Interactor 1 Ensembl gene id',
       'Interactor 1 Entrez Gene id', 'Interactor 2 uniprot id',
       'Interactor 2 Ensembl gene id', 'Interactor 2 Entrez Gene id',
       'Interaction type', 'Interaction context', 'Pubmed references'],
      dtype='object')


Unnamed: 0,# Interactor 1 uniprot id,Interactor 1 Ensembl gene id,Interactor 1 Entrez Gene id,Interactor 2 uniprot id,Interactor 2 Ensembl gene id,Interactor 2 Entrez Gene id,Interaction type,Interaction context,Pubmed references
0,uniprotkb:Q9Y287,ENSEMBL:ENSG00000136156,entrezgene/locuslink:9445,uniprotkb:Q9Y287,ENSEMBL:ENSG00000136156,entrezgene/locuslink:9445,physical association,reactome:R-HSA-976871,14690516|10391242
1,uniprotkb:P37840,ENSEMBL:ENSG00000145335,entrezgene/locuslink:6622,uniprotkb:P37840,ENSEMBL:ENSG00000145335,entrezgene/locuslink:6622,physical association,reactome:R-HSA-1247852,24243840
2,uniprotkb:P0DJI8,ENSEMBL:ENSG00000173432,entrezgene/locuslink:6288,uniprotkb:P0DJI8,ENSEMBL:ENSG00000173432,entrezgene/locuslink:6288,physical association,reactome:R-HSA-976898,19393650|103558
3,uniprotkb:P06727,ENSEMBL:ENSG00000110244,entrezgene/locuslink:337,uniprotkb:P06727,ENSEMBL:ENSG00000110244,entrezgene/locuslink:337,physical association,reactome:R-HSA-976889,15146166
4,uniprotkb:P01160,ENSEMBL:ENSG00000175206,entrezgene/locuslink:4878,uniprotkb:P01160,ENSEMBL:ENSG00000175206,entrezgene/locuslink:4878,physical association,reactome:R-HSA-976987,2142465|2945573


In [460]:
# BioGRID interaction table; low_memory=False makes pandas read the file in one pass.
biogrid = pd.read_csv(
    '../data/network/BIOGRID-ORGANISM-Homo_sapiens-3.5.168.tab2.txt',
    sep='\t',
    low_memory=False,
)

# Register the table; both official-symbol columns are passed unchanged to id_dict.
data_sources['biogrid'] = biogrid
data_preproc['biogrid'] = dict(
    columns=['Official Symbol Interactor A', 'Official Symbol Interactor B'],
    maps=[
        lambda symbol: id_dict(symbol),
        lambda symbol: id_dict(symbol),
    ],
)

print(biogrid.columns)
biogrid.head()

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Pubmed ID',
       'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score',
       'Modification', 'Phenotypes', 'Qualifications', 'Tags',
       'Source Database'],
      dtype='object')


Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [461]:
# I2D interaction table (tab-separated).
i2d = pd.read_csv('../data/network/i2d.2_9.txt', sep='\t')

# Register the table; the two SwissProt columns are passed unchanged to id_dict.
data_sources['i2d'] = i2d
data_preproc['i2d'] = dict(
    columns=['SwissProt1', 'SwissProt2'],
    maps=[
        lambda accession: id_dict(accession),
        lambda accession: id_dict(accession),
    ],
)

print(i2d.columns)
i2d.head()

Index(['Dataset', 'SwissProt1', 'SwissProt2'], dtype='object')


Unnamed: 0,Dataset,SwissProt1,SwissProt2
0,SOURAV_MAPK_LOW,P63000,A0AUZ9
1,IntAct,Q96CV9,A0AUZ9
2,BioGrid,P0CG48,A0AV96
3,IntAct_Mouse,P62258,A0AV96
4,IntAct_Mouse,P63104,A0AV96


In [462]:
# MINT export without a header row, so columns are addressed by position.
mint = pd.read_csv('../data/network/species_human.txt', sep='\t', header=None)

# Keep only rows whose two interactor fields (columns 0 and 1) contain a ':';
# the mappers below index the text after it.
for interactor_col in (0, 1):
    mint = mint[mint[interactor_col].apply(lambda field: ':' in field)]

data_sources['mint'] = mint
data_preproc['mint'] = dict(
    columns=[0, 1],
    maps=[
        lambda tagged: id_dict(tagged.split(':')[1]),
        lambda tagged: id_dict(tagged.split(':')[1]),
    ],
)

print(mint.columns)
mint.head()

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,uniprotkb:Q72547,uniprotkb:Q72547,intact:EBI-7484755|intact:MINT-8208544,intact:EBI-7484755|intact:MINT-8208544,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,"psi-mi:""MI:0030""(cross-linking study)",Nishitsuji et al. (2011),pubmed:22004763|imex:IM-16791|mint:MINT-820698...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-7484777|mint:MINT-8206988|imex:IM-1...,intact-miscore:0.40
1,uniprotkb:Q72547,uniprotkb:Q72547,intact:EBI-7484755|intact:MINT-8208544,intact:EBI-7484755|intact:MINT-8208544,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,"psi-mi:""MI:0030""(cross-linking study)",Nishitsuji et al. (2011),pubmed:22004763|imex:IM-16791|mint:MINT-820698...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-7484916|mint:MINT-8207003|imex:IM-1...,intact-miscore:0.40
2,uniprotkb:Q72547,uniprotkb:Q72547,intact:EBI-7484755|intact:MINT-8208544,intact:EBI-7484755|intact:MINT-8208544,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,"psi-mi:""MI:0030""(cross-linking study)",Nishitsuji et al. (2011),pubmed:22004763|imex:IM-16791|mint:MINT-820698...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-7484965|mint:MINT-8207016|imex:IM-1...,intact-miscore:0.40
3,uniprotkb:Q90VU7,uniprotkb:Q900A7,intact:EBI-7460704|intact:MINT-5281653,intact:EBI-7460739|intact:MINT-8208185,psi-mi:q90vu7_9hiv1(display_long)|uniprotkb:ne...,psi-mi:q900a7_9hiv1(display_long)|uniprotkb:ta...,"psi-mi:""MI:0416""(fluorescence microscopy)",Sugiyama et al. (2011),pubmed:21970979|imex:IM-16787|mint:MINT-820620...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0403""(colocalization)","psi-mi:""MI:0471""(MINT)",intact:EBI-7460732|mint:MINT-8206271|imex:IM-1...,intact-miscore:0.27
4,uniprotkb:Q2Q067,uniprotkb:Q13015,intact:EBI-9675545,intact:EBI-6269719,psi-mi:q2q067_9dela(display_long)|uniprotkb:HB...,psi-mi:af1q_human(display_long)|uniprotkb:MLLT...,"psi-mi:""MI:0397""(two hybrid array)",Simonis et al. (2012),imex:IM-22977|pubmed:22458338,taxid:11908(humt-)|taxid:11908(Human T-lymphot...,taxid:9606(human)|taxid:9606(Homo sapiens),"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-9675551|imex:IM-22977-4,intact-miscore:0.49


## Network Assembly

In [463]:
network = pd.DataFrame(columns=['Node1', 'Node2'])
network.head()

Unnamed: 0,Node1,Node2


In [464]:
# Assemble one undirected edge list (Node1, Node2) from every registered source.
# For each source, data_preproc names the two interactor columns and the mapper
# applied to each; rows where either mapped value is missing are dropped.
network_frames = []

for source_name, data in data_sources.items():

    print('Adding', source_name, 'to network')

    left_col, right_col = data_preproc[source_name]['columns']
    left_map, right_map = data_preproc[source_name]['maps']

    # Only the two interactor columns are needed; drop rows missing either one
    # before mapping so the mappers never see NaN.
    pairs = data[[left_col, right_col]].dropna(axis=0)

    to_add = pd.DataFrame(
        {
            'Node1': pairs[left_col].apply(left_map),
            'Node2': pairs[right_col].apply(right_map),
        },
        columns=['Node1', 'Node2'],
    ).dropna(axis=0)

    network_frames.append(to_add)

# Concatenate once instead of growing the frame inside the loop.
if network_frames:
    network = pd.concat(network_frames, ignore_index=True)
else:
    network = pd.DataFrame(columns=['Node1', 'Node2'])

# drop obvious duplicates
network = network.drop_duplicates()

# drop items not in the target kinome set
network = network[network['Node1'].isin(kinome_set) & network['Node2'].isin(kinome_set)]

# drop a -> b / b -> a redundancies: sort each row so both orientations become
# the same pair, then deduplicate again. The original row labels are kept.
network = pd.DataFrame(
    np.sort(network[['Node1', 'Node2']].values, axis=1),
    index=network.index,
    columns=['Node1', 'Node2'],
).drop_duplicates()

network

Adding hippie to network
Adding phosphosite_substrate to network
Adding hprd to network
Adding reactome to network
Adding biogrid to network
Adding i2d to network
Adding mint to network


Unnamed: 0,Node1,Node2
2,ERBB2,PAK1
45,YES1,YES1
47,MST1R,YES1
49,TYRO3,YES1
72,KIT,KIT
73,KIT,LYN
76,KIT,PRKCA
78,KIT,SRC
79,KIT,MATK
82,KIT,TEC


### Hippie Analysis


In [109]:
# Reference edge list for the HIPPIE source (first three lines of the file are skipped).
# NOTE(review): the path is absolute and user-specific; it will not resolve on another machine.
kyla_hippie = pd.read_csv('/Users/isrobson/Documents/MATLAB/hippie_compiled.txt', sep='\t', skiprows=3, header=None)

# Translate both columns: hgnc_mapper first, uniprot_to_hgnc_map_helper as the
# fallback (a KeyError is raised when neither knows the identifier).
for col in (0, 1):
    kyla_hippie[col] = kyla_hippie[col].apply(
        lambda x: hgnc_mapper[x] if x in hgnc_mapper else uniprot_to_hgnc_map_helper[x]
    )

# Sort each row so every edge is stored in one canonical orientation
# (vectorised replacement for a per-row `.loc` assignment loop).
kyla_hippie = pd.DataFrame(
    np.sort(kyla_hippie.values, axis=1),
    index=kyla_hippie.index,
    columns=kyla_hippie.columns,
)

kyla_hippie

Unnamed: 0,0,1
0,ERBB2,PAK1
1,MST1R,YES1
2,TYRO3,YES1
3,KIT,LYN
4,KIT,PRKCA
5,KIT,SRC
6,KIT,MATK
7,KIT,TEC
8,FGR,SRC
9,FGR,SYK


In [110]:
# Edge sets as (node, node) tuples so the two edge lists can be compared with set operations.
# NOTE(review): `new_hippie_edges` is built from the whole `network` frame, not from a
# HIPPIE-only subset — presumably the assembly cell was run with the other sources
# skipped before this comparison; confirm before trusting the counts below.
new_hippie_edges = set(zip(network['Node1'], network['Node2']))
kyla_hippie_edges = set(zip(kyla_hippie[0], kyla_hippie[1]))

In [111]:
print('kyla: ', len(kyla_hippie_edges))
print('new: ', len(new_hippie_edges))
print('overlap: ',len(new_hippie_edges & kyla_hippie_edges))

kyla:  2384
new:  3134
overlap:  2373


In [112]:
new_hippie_tot = set(network['Node1']) | set(network['Node2'])
kyla_hippie_tot = set(kyla_hippie[0]) | set(kyla_hippie[1])

In [113]:
print('kyla: ', len(kyla_hippie_tot))
print('new: ', len(new_hippie_tot))
print(len(new_hippie_tot & kyla_hippie_tot))

kyla:  461
new:  496
457


###  Phosphosite Substrate Analysis

In [120]:
# Reference edge list for the phosphosite-substrate source (first three lines of the file are skipped).
# NOTE(review): the path is absolute and user-specific; it will not resolve on another machine.
kyla_phosphosite_substrate = pd.read_csv('/Users/isrobson/Documents/MATLAB/phosphosite_substrate_compiled.txt', sep='\t', skiprows=3, header=None)

# Translate both columns: hgnc_mapper first, uniprot_to_hgnc_map_helper as the
# fallback (a KeyError is raised when neither knows the identifier).
for col in (0, 1):
    kyla_phosphosite_substrate[col] = kyla_phosphosite_substrate[col].apply(
        lambda x: hgnc_mapper[x] if x in hgnc_mapper else uniprot_to_hgnc_map_helper[x]
    )

# Sort each row so every edge is stored in one canonical orientation
# (vectorised replacement for a per-row `.loc` assignment loop).
kyla_phosphosite_substrate = pd.DataFrame(
    np.sort(kyla_phosphosite_substrate.values, axis=1),
    index=kyla_phosphosite_substrate.index,
    columns=kyla_phosphosite_substrate.columns,
)

kyla_phosphosite_substrate

Unnamed: 0,0,1
0,PRKCD,RPS6KB2
1,PRKCD,PRKD1
2,LIMK2,PRKCD
3,GSK3A,PRKCD
4,GSK3B,PRKCD
5,MET,PRKCD
6,PRKCD,SRC
7,CSNK2A1,PAK1
8,CSNK2A1,RPS6KA5
9,CSNK2A1,MAPK9


In [121]:
new_edges = set(zip(network['Node1'], network['Node2']))
kyla_edges = set(zip(kyla_phosphosite_substrate[0], kyla_phosphosite_substrate[1]))

In [122]:
kyla_edges - new_edges

{('AKT1', 'MAPK1'),
 ('CAMKK1', 'SLC12A7'),
 ('LIMK1', 'MAPK1'),
 ('MAPK1', 'MAPK14'),
 ('MAPK1', 'PLK1'),
 ('MAPK1', 'TRIM28')}

In [124]:
aurka_hits = set(phosphosite_substrate[phosphosite_substrate['SUB_GENE']=='AURKA']['GENE'].apply(lambda x: hgnc_mapper[x] if x in hgnc_mapper.keys() and hgnc_mapper[x] in kinome_set else None))
aurka_hits

{'AURKA', 'PAK1', 'PRKACA', 'SRC'}

In [125]:
[x for x,y in hgnc_mapper.items() if y == 'MAPK3'] 

['PRKM3', 'ERK1', 'p44mapk', 'p44erk1', 'HGNC:6877', 'MAPK3']

In [126]:
set(phosphosite_substrate[phosphosite_substrate['GENE']=='MAPK3']['SUB_ACC_ID'].apply(lambda x: uni_mapper[x] if x in uni_mapper.keys() else None))

{'BRAF',
 'CDK2',
 'DAPK1',
 'EGFR',
 'ERBB2',
 'FGFR1',
 'GSK3B',
 'LCK',
 'MAPK3',
 'MKNK1',
 'MYLK',
 None,
 'RAF1',
 'TTK'}

In [127]:
print('kyla: ', len(kyla_edges))
print('new: ', len(new_edges))
print('overlap: ',len(new_edges & kyla_edges))

kyla:  487
new:  1171
overlap:  481


In [128]:
new_tot = set(network['Node1']) | set(network['Node2'])
kyla_tot = set(kyla_phosphosite_substrate[0]) | set(kyla_phosphosite_substrate[1])

In [129]:
print('kyla: ', len(kyla_tot))
print('new: ', len(new_tot))
print(len(new_tot & kyla_tot))

kyla:  218
new:  370
217


#### Most of Kyla's edges missing from the new network involve MAPK1 — check which MAPK1 aliases appear among the new nodes

In [132]:
e = [x for x,y in hgnc_mapper.items() if y == 'MAPK1']
[x for x in e if x in new_tot]

['MAPK1']

### HPRD Analysis

In [150]:
# Reference edge list for the HPRD source (first three lines of the file are skipped).
# NOTE(review): the path is absolute and user-specific; it will not resolve on another machine.
kyla_hprd = pd.read_csv('/Users/isrobson/Documents/MATLAB/hprd_compiled.txt', sep='\t', skiprows=3, header=None)

# Translate both columns: hgnc_mapper first, uniprot_to_hgnc_map_helper as the
# fallback (a KeyError is raised when neither knows the identifier).
for col in (0, 1):
    kyla_hprd[col] = kyla_hprd[col].apply(
        lambda x: hgnc_mapper[x] if x in hgnc_mapper else uniprot_to_hgnc_map_helper[x]
    )

# Sort each row so every edge is stored in one canonical orientation
# (vectorised replacement for a per-row `.loc` assignment loop).
kyla_hprd = pd.DataFrame(
    np.sort(kyla_hprd.values, axis=1),
    index=kyla_hprd.index,
    columns=kyla_hprd.columns,
)

kyla_hprd

Unnamed: 0,0,1
0,ERBB2,PAK1
1,MST1R,YES1
2,TYRO3,YES1
3,KIT,LYN
4,KIT,PRKCA
5,KIT,SRC
6,KIT,MATK
7,KIT,TEC
8,FGR,SRC
9,FGR,SYK


In [153]:
new_edges = set(zip(network['Node1'], network['Node2']))
kyla_edges = set(zip(kyla_hprd[0], kyla_hprd[1]))

In [154]:
kyla_edges - new_edges

set()

In [155]:
print('kyla: ', len(kyla_edges))
print('new: ', len(new_edges))
print('overlap: ',len(new_edges & kyla_edges))

kyla:  247
new:  1039
overlap:  247


In [161]:
new_tot = set(network['Node1']) | set(network['Node2'])
kyla_tot = set(kyla_hprd[0]) | set(kyla_hprd[1])

In [162]:
print('kyla: ', len(kyla_tot))
print('new: ', len(new_tot))
print(len(new_tot & kyla_tot))

kyla:  102
new:  363
102


### I2D analysis

In [329]:
# Reference edge list for the I2D source (first three lines of the file are skipped).
# NOTE(review): the path is absolute and user-specific; it will not resolve on another machine.
kyla_i2d = pd.read_csv('/Users/isrobson/Documents/MATLAB/i2d_compiled.txt', sep='\t', skiprows=3, header=None)

# Translate both columns: hgnc_mapper first, uniprot_to_hgnc_map_helper as the
# fallback (a KeyError is raised when neither knows the identifier).
for col in (0, 1):
    kyla_i2d[col] = kyla_i2d[col].apply(
        lambda x: hgnc_mapper[x] if x in hgnc_mapper else uniprot_to_hgnc_map_helper[x]
    )

# Sort each row so every edge is stored in one canonical orientation, then drop
# the rows that became identical (vectorised replacement for a per-row loop).
kyla_i2d = pd.DataFrame(
    np.sort(kyla_i2d.values, axis=1),
    index=kyla_i2d.index,
    columns=kyla_i2d.columns,
).drop_duplicates()

kyla_i2d

Unnamed: 0,0,1
0,BMPR1B,MAP2K7
1,BMPR1B,BRAF
2,ACVR2A,BMPR1B
3,BMPR1A,BMPR1B
4,CDC7,CDK4
5,CDC7,CDK2
6,CDC7,CDK9
7,CDC7,ICK
8,EEF2K,MAPK13
9,EEF2K,PRKACA


In [330]:
new_edges = set(zip(network['Node1'], network['Node2']))
kyla_edges = set(zip(kyla_i2d[0], kyla_i2d[1]))

In [331]:
r = [x for x,y in uni_mapper.items() if y == 'MAPK3']
r

['P27361',
 'L7RXH5',
 'B3KR49',
 'E9PJF0',
 'E9PBK7',
 'E9PQW4',
 'H0YEX6',
 'H0YDH9',
 'E9PRH7',
 'Q9BWJ1']

In [332]:
uniprot_to_hgnc_map_helper['MAPK3']

'MAPKAPK3'

In [333]:
[id_dict(x) for x in r]

['MAPK3',
 'MAPKAPK3',
 'MAPKAPK3',
 'MAPKAPK3',
 'MAPKAPK3',
 'MAPKAPK3',
 'MAPKAPK3',
 'MAPKAPK3',
 'MAPKAPK3',
 'MAPKAPK3']

In [334]:
new_edges - kyla_edges

{('ABL1', 'ABL1'),
 ('ABL1', 'CDK1'),
 ('ABL1', 'MTOR'),
 ('ABL1', 'UHMK1'),
 ('ABL2', 'ABL2'),
 ('ACVR1', 'ACVR1'),
 ('ACVR1', 'CDK14'),
 ('ACVR1', 'TGFBR2'),
 ('ACVR1', 'UHMK1'),
 ('ACVR1B', 'ACVR1B'),
 ('ACVR2A', 'ACVR2A'),
 ('ACVRL1', 'TGFBR2'),
 ('AGK', 'HCK'),
 ('AGK', 'RYK'),
 ('AK1', 'AKT1'),
 ('AK1', 'BRD4'),
 ('AK2', 'AK2'),
 ('AK2', 'CAMKK2'),
 ('AK2', 'CDK5'),
 ('AK2', 'CHEK2'),
 ('AK2', 'IKBKE'),
 ('AK2', 'NME2'),
 ('AK2', 'PAK2'),
 ('AK2', 'PFKP'),
 ('AK2', 'PRKAA2'),
 ('AK2', 'STK11'),
 ('AKT1', 'AKT1'),
 ('AKT1', 'HK1'),
 ('AKT1', 'MAPKAPK2'),
 ('AKT1', 'MTOR'),
 ('AKT1', 'PI4K2B'),
 ('AKT1', 'PIK3CA'),
 ('AKT1', 'PRKCB'),
 ('AKT1', 'PRKCQ'),
 ('AKT2', 'AKT2'),
 ('AKT2', 'MAPKAPK2'),
 ('AKT2', 'PIK3CD'),
 ('AKT2', 'SIK2'),
 ('ALK', 'ALK'),
 ('ALK', 'CDK13'),
 ('ALK', 'JAK3'),
 ('ALK', 'PIK3CB'),
 ('AMHR2', 'BMPR1B'),
 ('AMHR2', 'TGFBR1'),
 ('ARAF', 'ARAF'),
 ('ARAF', 'BRAF'),
 ('ARAF', 'CAMKK2'),
 ('ARAF', 'HIPK4'),
 ('ARAF', 'IRAK2'),
 ('ARAF', 'MAP2K1'),
 ('ARAF', 'MA

In [335]:
kyla_edges - new_edges

{('AKT1', 'MAPK1'),
 ('AKT2', 'MAPK1'),
 ('CAMKK1', 'SLC12A7'),
 ('CAMKK2', 'SLC12A7'),
 ('CHEK1', 'MAPK1'),
 ('CHEK2', 'TSSK1A'),
 ('EEF2K', 'MAPK1'),
 ('IRAK2', 'MAPK1'),
 ('LIMK1', 'MAPK1'),
 ('MAPK1', 'MAPK11'),
 ('TGFBR1', 'TSSK1A')}

In [336]:
print('kyla: ', len(kyla_edges))
print('new: ', len(new_edges))
print('overlap: ',len(new_edges & kyla_edges))

kyla:  1751
new:  2643
overlap:  1740


In [337]:
new_tot = set(network['Node1']) | set(network['Node2'])
kyla_tot = set(kyla_i2d[0]) | set(kyla_i2d[1])

In [338]:
print('kyla: ', len(kyla_tot))
print('new: ', len(new_tot))
print(len(new_tot & kyla_tot))

kyla:  355
new:  479
353


In [305]:
uniprot_to_hgnc_map_helper[uni_mapper['P27361']]

'MAPKAPK3'

In [303]:
'MAPK3' in new_tot

False

In [177]:
np.sum(network['Node1'] == network['Node2'])

209

In [None]:
t = network[network['Node1'] != network['Node2']]
t

In [None]:
len(set(t['Node1'].unique()) | set(t['Node2'].unique()))

In [35]:
[x for x,y in hgnc_mapper.items() if 'PRKCQ' in y]

['ENST00000414894.1', 'HGNC:44689', 'PRKCQ', 'PRKCQ-AS1']

## Full Comparison

In [340]:
# Reference network: tab-separated edge list without a header row.
kyla_network = pd.read_csv('../data/KIN_edges_no_weights.txt', sep='\t', header=None)

# Lookup from the 'RNAseq Gene' name to the 'MS Gene' name, taken from kmast.
rna_to_MS = kmast.set_index('RNAseq Gene')['MS Gene'].to_dict()

# Extra aliases added to the shared mapper before translating the reference network.
hgnc_mapper.update(KPCA='PRKCA', KSYK='SYK', VGFR2='KDR', KPCT='PRKCQ')

# Translate both columns: use the name directly when hgnc_mapper knows it,
# otherwise go through rna_to_MS first and map that result.
for col in (0, 1):
    kyla_network[col] = kyla_network[col].apply(
        lambda name: hgnc_mapper[name if name in hgnc_mapper.keys() else rna_to_MS[name]]
    )

kyla_network

Unnamed: 0,0,1
0,PAK1,ERBB2
1,MST1R,YES1
2,TYRO3,YES1
3,LYN,KIT
4,PRKCA,KIT
5,SRC,KIT
6,MATK,KIT
7,TEC,KIT
8,FGR,SRC
9,SYK,FGR


In [341]:
len(set(kyla_network[0].unique()) | set(kyla_network[1].unique()))

473

In [342]:
np.sum(kyla_network[0] == kyla_network[1])

0

In [343]:
# Sort each row so every reference edge is stored in one canonical orientation.
# Row-wise np.sort replaces the per-row `.loc` assignment loop and keeps the
# original index and column labels.
kyla_network = pd.DataFrame(
    np.sort(kyla_network.values, axis=1),
    index=kyla_network.index,
    columns=kyla_network.columns,
)

In [344]:
# Both edge lists as plain (node, node) tuples, in row order.
kyla_tuples = list(kyla_network[[0, 1]].itertuples(index=False, name=None))
new_tuples = list(network[['Node1', 'Node2']].itertuples(index=False, name=None))

### This used to be > 2700

### Best: 2805

In [351]:
# Reference edges that the new network does not contain.
missing = set(kyla_tuples) - set(new_tuples)

# Split the edges into their first and second endpoints. Built with comprehensions
# rather than `zip(*missing)` so an empty `missing` gives two empty tuples instead
# of a ValueError on unpacking.
l1 = tuple(edge[0] for edge in missing)
l2 = tuple(edge[1] for edge in missing)

# Every endpoint occurrence among the missing edges (all firsts, then all seconds).
problem_kin = list(l1) + list(l2)

In [432]:
# Problem kinases that never appear as a value of uni_mapper. The value set is
# built once up front instead of once per element of problem_kin.
known_symbols = set(uni_mapper.values())
[k for k in problem_kin if k not in known_symbols]

[]

In [428]:
'CAMK2B' in kinome_set

True

In [437]:
reactome_set = set(r1) | set(r2)

In [438]:
'EPHB4' in reactome_set

False

In [444]:
network

Unnamed: 0,Node1,Node2
520,RIPK1,RIPK1
521,RIPK3,RIPK3
522,RIPK1,RIPK3
524,MLKL,MLKL
581,MLKL,RIPK3
582,MLKL,RIPK1
766,RET,RET
909,PIK3CB,RET
924,PIK3CD,RET
945,PIK3CA,RET


In [442]:
# Reference edges that have EPHB4 as either endpoint.
[edge for edge in kyla_tuples if 'EPHB4' in (edge[0], edge[1])]

[('EGFR', 'EPHB4'),
 ('EPHB4', 'EPHB6'),
 ('EPHB4', 'PKM'),
 ('EPHA1', 'EPHB4'),
 ('EPHA2', 'EPHB4'),
 ('EPHA3', 'EPHB4'),
 ('EPHA8', 'EPHB4'),
 ('EPHB3', 'EPHB4'),
 ('EPHB4', 'KALRN'),
 ('EPHB4', 'ROCK2'),
 ('EPHB4', 'PAK3'),
 ('EPHB4', 'FYN'),
 ('EPHB4', 'YES1'),
 ('EPHB4', 'LYN'),
 ('EPHB4', 'SRC'),
 ('EPHB2', 'EPHB4'),
 ('EPHB4', 'LIMK1'),
 ('EPHB4', 'PTK2'),
 ('EPHB4', 'PAK1'),
 ('EPHB4', 'ROCK1'),
 ('EPHA10', 'EPHB4'),
 ('EPHA6', 'EPHB4'),
 ('EPHA4', 'EPHB4'),
 ('EPHA7', 'EPHB4')]

In [441]:
# New-network edges that have EPHB4 as either endpoint.
[edge for edge in new_tuples if 'EPHB4' in (edge[0], edge[1])]

[('EGFR', 'EPHB4'), ('EPHB4', 'EPHB6'), ('EPHB4', 'PKM')]

In [443]:
'EPHB3' in new_tot

True

In [None]:
# NOTE(review): bare identifier — evaluating this cell raises NameError unless a
# variable named P54760 exists. It looks like a UniProt-style accession left as a
# scratch note; presumably it was meant to be looked up (TODO confirm or delete).
P54760

In [None]:
# Write the assembled edge list as tab-separated text without a header row.
# (The original line was missing the comma between `sep` and `header`, a SyntaxError.)
# The row index is still written as the first column, which is to_csv's default.
network.to_csv('../renew_net.txt', sep='\t', header=False)

In [354]:
from collections import Counter

# Tally how often each kinase occurs among the endpoints of the missing edges
# and list the tallies from most to least frequent.
c = Counter()
c.update(problem_kin)
c.most_common()

[('EPHB4', 21),
 ('CAMK2B', 21),
 ('FGFR4', 21),
 ('FYN', 21),
 ('FGFR3', 21),
 ('CAMK2D', 20),
 ('CAMK2G', 20),
 ('EPHB3', 20),
 ('FGFR2', 19),
 ('ROCK1', 19),
 ('ROCK2', 19),
 ('BRAF', 18),
 ('TEK', 18),
 ('ERBB4', 17),
 ('FGFR1', 17),
 ('CAMK2A', 17),
 ('PDPK1', 17),
 ('KIT', 17),
 ('PTK2', 17),
 ('LYN', 16),
 ('EPHB2', 16),
 ('EPHA8', 15),
 ('EPHA10', 15),
 ('EPHA6', 14),
 ('EPHA2', 14),
 ('ERBB3', 14),
 ('PDGFRA', 14),
 ('EPHA1', 13),
 ('PDGFRB', 13),
 ('EPHA4', 12),
 ('ERBB2', 12),
 ('EPHA3', 12),
 ('YES1', 12),
 ('EPHA7', 11),
 ('MAPK10', 11),
 ('JAK1', 10),
 ('IRAK2', 9),
 ('RPS6KA6', 9),
 ('RIPK2', 9),
 ('NEK2', 8),
 ('RPS6KA3', 8),
 ('PLK1', 8),
 ('RAF1', 8),
 ('EGFR', 8),
 ('JAK2', 7),
 ('IRAK1', 7),
 ('CSNK1E', 7),
 ('MAPK8', 7),
 ('PLK4', 7),
 ('MAPK9', 7),
 ('PRKACA', 6),
 ('CSNK1D', 6),
 ('KALRN', 6),
 ('PAK3', 6),
 ('RPS6KA2', 6),
 ('MARK4', 6),
 ('RPS6KA1', 6),
 ('MAPK1', 6),
 ('SRC', 6),
 ('TTBK2', 6),
 ('TRIB3', 6),
 ('MAP3K1', 5),
 ('MAP2K3', 5),
 ('MAP2K6', 5),
 ('

In [345]:
len(set(kyla_tuples) & set(new_tuples))

3070

In [355]:
kyla_tot = set(kyla_network[0]) | set(kyla_network[1])

In [356]:
# Edges present in the reference network but absent from the new one.
set(kyla_tuples).difference(new_tuples)

{('ABL1', 'MAPK1'),
 ('ABL1', 'MAPK3'),
 ('ACVR1B', 'ACVR1C'),
 ('AKT1', 'FYN'),
 ('AKT1', 'LCK'),
 ('AKT1', 'LYN'),
 ('AKT1', 'YES1'),
 ('AKT2', 'FGFR1'),
 ('AKT2', 'FGFR2'),
 ('AKT2', 'FGFR3'),
 ('AKT2', 'FGFR4'),
 ('AKT2', 'PIK3R4'),
 ('ATM', 'CDK4'),
 ('ATM', 'WEE1'),
 ('AURKA', 'CSNK1D'),
 ('AURKA', 'CSNK1E'),
 ('AURKA', 'NEK2'),
 ('AURKA', 'PLK4'),
 ('AURKB', 'BUB1'),
 ('AURKB', 'TAOK1'),
 ('BRAF', 'CAMK2A'),
 ('BRAF', 'CAMK2B'),
 ('BRAF', 'CAMK2D'),
 ('BRAF', 'CAMK2G'),
 ('BRAF', 'ERBB2'),
 ('BRAF', 'ERBB3'),
 ('BRAF', 'ERBB4'),
 ('BRAF', 'FGFR1'),
 ('BRAF', 'FGFR3'),
 ('BRAF', 'FGFR4'),
 ('BRAF', 'FYN'),
 ('BRAF', 'JAK1'),
 ('BRAF', 'JAK2'),
 ('BRAF', 'KIT'),
 ('BRAF', 'PDGFRA'),
 ('BRAF', 'PDGFRB'),
 ('BRAF', 'PTK2'),
 ('BRAF', 'TEK'),
 ('BTK', 'MAP3K1'),
 ('CAMK2A', 'ERBB3'),
 ('CAMK2A', 'ERBB4'),
 ('CAMK2A', 'FGFR1'),
 ('CAMK2A', 'FGFR2'),
 ('CAMK2A', 'FGFR3'),
 ('CAMK2A', 'FGFR4'),
 ('CAMK2A', 'FYN'),
 ('CAMK2A', 'KIT'),
 ('CAMK2A', 'MAPK1'),
 ('CAMK2A', 'PDGFRA'),
 ('CAMK2

In [357]:
# Edges present in the new network but absent from the reference one.
set(new_tuples).difference(kyla_tuples)

{('PAK1', 'PAK1'),
 ('PIK3CA', 'PIK3CA'),
 ('PFKP', 'PIK3C2A'),
 ('FGFR2', 'RPS6KA3'),
 ('SIK3', 'STK11'),
 ('MAP2K3', 'STK11'),
 ('CSNK2A1', 'SLK'),
 ('PRKAA1', 'PRKCB'),
 ('ERBB2', 'ERBB2'),
 ('PKM', 'PKM'),
 ('ABL2', 'FLT3'),
 ('CDK5', 'PIK3C3'),
 ('ATM', 'ATM'),
 ('JAK3', 'JAK3'),
 ('NME2', 'NME2'),
 ('PHKG1', 'PRKACA'),
 ('MAP2K6', 'RAF1'),
 ('ICK', 'ICK'),
 ('CASK', 'CDK5'),
 ('AK2', 'CHEK2'),
 ('FRK', 'FRK'),
 ('WEE1', 'WEE1'),
 ('ERBB3', 'ERBB3'),
 ('PAN3', 'PAN3'),
 ('AK2', 'NTRK1'),
 ('BTK', 'BTK'),
 ('MAP2K5', 'MAP2K5'),
 ('ILK', 'RIPK4'),
 ('PFKP', 'PRKAA2'),
 ('TEK', 'TEK'),
 ('CHEK1', 'MET'),
 ('TNK1', 'TNK1'),
 ('BRAF', 'SGK1'),
 ('ARAF', 'CDK4'),
 ('ERBB2', 'RAF1'),
 ('DAPK1', 'SGK2'),
 ('LATS1', 'SRC'),
 ('GRK6', 'GRK6'),
 ('ARAF', 'CDK6'),
 ('MAPK8', 'PKM'),
 ('KSR1', 'YES1'),
 ('PRKACA', 'SYK'),
 ('CAMKV', 'CDK5'),
 ('GUCY2C', 'GUCY2C'),
 ('CDK17', 'TGFBR1'),
 ('CSNK2A2', 'RIOK2'),
 ('FGFR4', 'MAP3K5'),
 ('PRKCD', 'YES1'),
 ('CDK7', 'CDK7'),
 ('ITK', 'SYK'),
 ('MTOR'

In [358]:
new_tot = set(network['Node1']) | set(network['Node2'])

In [359]:
len(new_tot & kyla_tot)

473

In [None]:
print(len(new_tot))
len(set([hgnc_mapper[x] for x in new_tot]))

In [None]:
# Nodes that occur only in the new network, translated through hgnc_mapper.
p = new_tot - kyla_tot
[hgnc_mapper[node] for node in p]

In [None]:
# Load the tab-separated file with no header row; error_bad_lines=False asks pandas
# to drop malformed lines instead of raising.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in 2.0 in
# favour of on_bad_lines='skip'; switch once the notebook's pandas version allows it.
big_net = pd.read_csv('../data/bigNetwork.txt', error_bad_lines=False, sep='\t', header=None)
big_net.head()

In [None]:
# Remove exact duplicate rows first, then sort the values within each row so both
# orientations of an edge become the same row, and deduplicate again.
# Row-wise np.sort replaces the per-row `.loc` assignment loop; index and column
# labels are preserved and the frame is rebuilt rather than mutated in place.
big_net = big_net.drop_duplicates()

big_net = pd.DataFrame(
    np.sort(big_net.values, axis=1),
    index=big_net.index,
    columns=big_net.columns,
).drop_duplicates()

In [None]:
big_tot = set(big_net[0])|set(big_net[1])
len(big_tot)

In [None]:
# Upper-case every identifier in big_tot and count how many are known to hgnc_mapper.
# big_tot is a set, so the original `big_tot[0]` raised TypeError; iterate over the
# set directly. NOTE(review): presumably the whole node set was intended rather
# than a single column of big_net — confirm.
q = [x.upper() for x in big_tot]
print(len(q))
len([x for x in q if x in hgnc_mapper])