# Network Assembly Notebook

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import copy
from tools import networkHelpers as nh
from importlib import reload
%matplotlib inline

### Get HGNC Mapper

In [2]:
# Symbol lookup used throughout the notebook; later cells index it like a dict
# (`hgnc_mapper[x]`, `x in hgnc_mapper.keys()`).
# NOTE(review): presumably maps gene symbols/aliases to approved HGNC symbols and
# may need network access -- confirm in tools/networkHelpers.
hgnc_mapper = nh.fetch_hgnc_mapper()

### Get UniProt Mapper

In [3]:
# Keep only the 'Gene_Name' records of the UniProt id-mapping table and turn them
# into a {column 0 -> column 2} dict (UniProt accession -> gene name, judging by
# the sample entry displayed below). If an accession occurs on several
# 'Gene_Name' rows, the last row wins.
uni_mapper = (
    pd.read_csv('../data/networkInputs/HUMAN_9606_idmapping.dat', sep='\t', header=None)
    .loc[lambda rows: rows[1] == 'Gene_Name']
    .set_index(0)[2]
    .to_dict()
)

# peek at one entry as a sanity check
next(iter(uni_mapper.items()))

('P31946', 'YWHAB')

In [4]:
# Size check: invert the accession -> gene-name mapping and compare entry counts.
# Several accessions can share one gene name, so the inverse is smaller and keeps
# only the last accession seen for each name.
uni_rev = dict((gene, accession) for accession, gene in uni_mapper.items())
print(len(uni_mapper), len(uni_rev), sep='\n')

146363
26468


### Extra mappers from UniProt, e.g. ENSG

In [5]:
# One-off download of the UniProt "selected" id-mapping table. The commands are
# kept commented out so a normal run does not re-download; un-comment them to
# fetch the archive, move it into ../data/networkInputs/ and decompress it.
#!wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz
#!mv HUMAN_9606_idmapping_selected.tab.gz ../data/networkInputs/
#!gunzip ../data/networkInputs/HUMAN_9606_idmapping_selected.tab.gz

In [6]:
t = pd.read_csv(
    '../data/networkInputs/HUMAN_9606_idmapping_selected.tab',
    sep='\t',
    header=None,
    low_memory=False,
)
display(t.shape, t.head())

# Two lookups onto column 0 (the UniProt accession in the preview): one keyed by
# column 2 and one keyed by column 18 (Entrez gene id and Ensembl gene id in the
# preview).
# NOTE(review): the preview shows column 18 can hold several ids joined by '; '
# (e.g. 'ENSG00000108953; ENSG00000274474'); such a row becomes one composite
# key, so a lookup by a single ENSG id will miss it. Rows with a missing id are
# keyed by NaN. Confirm before relying on these mappers.
entrez_to_uniprot_number, ensg_to_uniprot_number = (
    t.set_index(id_column)[0].to_dict() for id_column in (2, 18)
)

(169389, 22)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,P31946,1433B_HUMAN,7529,NP_003395.1; NP_647539.1; XP_016883528.1,4507949; 377656702; 67464628; 1345590; 1034625...,2BQ0:A; 2BQ0:B; 2C23:A; 4DNK:A; 4DNK:B; 5N10:A...,GO:0005737; GO:0005829; GO:0070062; GO:0005925...,UniRef100_P31946,UniRef90_P31946,UniRef50_P31946,...,9606,601289,Hs.643544,8515476; 14702039; 11780052; 15489334; 2357255...,X57346; AK292717; AL008725; CH471077; CH471077...,CAA40621.1; BAF85406.1; -; EAW75893.1; EAW7589...,ENSG00000166913,ENST00000353703; ENST00000372839,ENSP00000300161; ENSP00000361930,11996670; 12364343; 12437930; 12468542; 124825...
1,P62258,1433E_HUMAN,7531,NP_006752.1,374074368; 62131678; 67464424; 194379794; 2210...,2BR9:A; 3UAL:A; 3UBW:A; 6EIH:A,GO:0090724; GO:0005737; GO:0005829; GO:0070062...,UniRef100_P62258,UniRef90_P62258,UniRef50_P62258,...,9606,605066,Hs.513851,7644510; 8858348; 8684458; 20417184; 14702039;...,U20972; U54778; U43399; U43430; U28936; AB0171...,AAC50175.1; AAC50710.1; AAC50625.1; AAD00026.1...,ENSG00000108953; ENSG00000274474,ENST00000264335; ENST00000571732; ENST00000616...,ENSP00000264335; ENSP00000461762; ENSP00000481...,15838597; 11782387; 12657644; 14966136; 153649...
2,Q04917,1433F_HUMAN,7533,NP_003396.1,4507951; 437363; 83754686; 83754699; 83754700;...,2C63:A; 2C63:B; 2C63:C; 2C63:D; 2C74:A; 2C74:B,GO:0005737; GO:0005829; GO:0070062; GO:0098978...,UniRef100_Q04917,UniRef90_Q04917,UniRef50_P62258,...,9606,113508,Hs.226755,8218406; 1578511; 8561965; 8812417; 15461802; ...,L20422; X80536; X78138; X57345; D78577; S80794...,AAA35483.1; CAA56676.1; CAA55017.1; CAA40620.1...,ENSG00000128245,ENST00000248975,ENSP00000248975,10206237; 11121172; 11996670; 12176995; 124801...
3,P61981,1433G_HUMAN,7532,NP_036611.2,6016838; 380764684; 635576381; 82407956; 82407...,2B05:A; 2B05:B; 2B05:C; 2B05:D; 2B05:E; 2B05:F...,GO:0005829; GO:0070062; GO:0005925; GO:0016020...,UniRef100_P61981,UniRef90_P61981,UniRef50_P61981,...,9606,605356; 617665,Hs.744840,10433554; 10486217; 12853948; 15489334; 235725...,AF142498; AB024334; CR541904; CR541925; AC0063...,AAD48408.1; BAA85184.1; CAG46702.1; CAG46723.1...,ENSG00000170027,ENST00000307630,ENSP00000306330,11824616; 11996670; 12364343; 12482592; 150572...
4,P31947,1433S_HUMAN,2810,NP_006133.1,436408756; 306991738; 969812714; 350610438; 35...,1YWT:A; 1YWT:B; 1YZ5:A; 1YZ5:B; 3IQJ:A; 3IQU:A...,GO:0005829; GO:0070062; GO:0005615; GO:0005739...,UniRef100_P31947,UniRef90_P31947,UniRef50_P61981,...,9606,601290,Hs.523718,1390337; 8515476; 9659898; 16710414; 15489334;...,M93010; X57348; AF029081; AF029082; CR541905; ...,AAA59546.1; CAA40623.1; AAC52029.1; AAC52030.1...,ENSG00000175793,ENST00000339276,ENSP00000340989,10969776; 12582028; 12730237; 12787309; 145172...


### Kinase Master List (to filter)

In [7]:
# Load the kinase master list. Its 'MS Gene', 'Uniprot Protein' and 'Symbol'
# columns are read by the following cells (kinome set and identifier counts).
kmast = pd.read_excel('../data/ref/KINASESmasterlist_w_Aliases.xlsx')
kmast.head()

Unnamed: 0,Uniprot Protein,MS Gene,RNAseq Gene,RNAseq Accession,Family,Mouse Uniprot Protein,Mouse RNAseq gene,Mouse RNAseq Accession,Kinome Render Tree Name,Aliases (Conservative),...,Aliases,description,other_designations,Entrez_Symbol,Old_Name,Entrez_Synonyms,Entrez_other_designations,Unnamed: 18,Gene Symbol,Gene Symbol and Synonyms
0,AAK1,AAK1,AAK1,22848.0,Other,Aak1,Aak1,269774,AAK1,"AAK1,",...,,AP2 associated kinase 1,adaptor-associated kinase 1,AAK1,AAK1,KIAA1048|MGC138170,adaptor-associated kinase 1,,AAK1,AAK1
1,AAPK1,PRKAA1,PRKAA1,5562.0,CAMK,Aapk1,Prkaa1,105787,AMPK[alpha]1,"PRKAA1, AMPKa1",...,"AMPK, AMPKa1","protein kinase, AMP-activated, alpha 1 catalyt...","5'-AMP-activated protein kinase, catalytic alp...",PRKAA1,AMPKa1,AMPK|AMPKa1|MGC33776|MGC57364,"5'-AMP-activated protein kinase, catalytic alp...",,PRKAA1,"AMPK, PRKAA1, AMPKa1"
2,AAPK2,PRKAA2,PRKAA2,5563.0,CAMK,Aapk2,Prkaa2,108079,AMPK[alpha]2,"PRKAA2, AMPK2, AMPKa2, PRKAA",...,"AMPK, AMPK2, AMPKa2, PRKAA","protein kinase, AMP-activated, alpha 2 catalyt...","5'-AMP-activated protein kinase, catalytic alp...",PRKAA2,AMPKa2,AMPK|AMPK2|PRKAA,"5'-AMP-activated protein kinase, catalytic alp...",,PRKAA2,"AMPK2, AMPK, PRKAA, AMPKa2, PRKAA2"
3,ABL1,ABL1,ABL1,25.0,TK,Abl1,Abl1,11350,Abl,"ABL1,ABL, JTK7, bcr/abl, c-ABL, c-ABL1, v-abl",...,"ABL, JTK7, bcr/abl, c-ABL, c-ABL1, p150, v-abl","ABL proto-oncogene 1, non-receptor tyrosine ki...",Abelson tyrosine-protein kinase 1|bcr/c-abl on...,ABL1,ABL,ABL|JTK7|c-ABL|p150|v-abl,Abelson murine leukemia viral (v-abl) oncogene...,,ABL1,"c-ABL1, p150, ABL, JTK7, v-abl, bcr/abl, c-ABL..."
4,ABL2,ABL2,ABL2,27.0,TK,Abl2,Abl2,11352,Arg,"ABL2,ABLL, ARG",...,"ABLL, ARG","ABL proto-oncogene 2, non-receptor tyrosine ki...","abelson-related gene protein|c-abl oncogene 2,...",ABL2,ARG,ABLL|ARG,Abelson murine leukemia viral (v-abl) oncogene...,,ABL2,"ABL2, ABLL, ARG"


In [8]:
# Target kinome: the hgnc_mapper value of every 'MS Gene' entry in the master
# list, skipping 'GYK' and 'SGK494'.
# NOTE(review): those two are presumably skipped because hgnc_mapper has no entry
# for them (the lookup raises KeyError on unknown symbols) -- confirm.
kinome_set = {
    hgnc_mapper[gene]
    for gene in kmast['MS Gene']
    if gene not in ('GYK', 'SGK494')
}

In [9]:
# Distinct 'Uniprot Protein' names, distinct 'Symbol' values, and the size of
# their union (one number per line).
print(
    len(set(kmast['Uniprot Protein'])),
    len(set(kmast['Symbol'])),
    len(set(kmast['Uniprot Protein']).union(kmast['Symbol'])),
    sep='\n',
)

570
499
744


### Start importing sources

In [10]:
# Registries filled by the source-loading cells below:
#   data_sources[name] -> raw DataFrame of one interaction source
#   data_preproc[name] -> {'columns': [left, right], 'maps': [left_fn, right_fn]}
data_sources = {}
data_preproc = {}


def id_dict(x):
    """Resolve an identifier to its hgnc_mapper value, or None if unresolved.

    Resolution order:
      1. `x` is a key of `hgnc_mapper`  -> `hgnc_mapper[x]`
      2. `x` is the accession 'P27361'  -> 'MAPK3' (hard-coded)
      3. `x` is a key of `uni_mapper` and the mapped gene name is a key of
         `hgnc_mapper`                  -> `hgnc_mapper[uni_mapper[x]]`
      4. otherwise                      -> None
    """
    if x in hgnc_mapper:
        return hgnc_mapper[x]

    # hardcode MAPK3 due to MAPKAPK3 issues
    if x == 'P27361':
        return 'MAPK3'

    if x in uni_mapper:
        gene_name = uni_mapper[x]
        if gene_name in hgnc_mapper:
            return hgnc_mapper[gene_name]

    return None
                                        

In [11]:
hippie = pd.read_csv('../data/networkInputs/hippie_current.txt', sep='\t', header=None)

# Columns 0 and 2 carry the two interactors as '<NAME>_HUMAN' strings (see the
# preview); only the part before the first '_' is passed to id_dict.
data_sources['hippie'] = hippie
data_preproc['hippie'] = dict(
    columns=[0, 2],
    maps=[lambda entry_name: id_dict(entry_name.split('_')[0])] * 2,
)

print(hippie.columns)
hippie.head()

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')


Unnamed: 0,0,1,2,3,4,5
0,AL1A1_HUMAN,216,AL1A1_HUMAN,216,0.76,"experiments:in vivo,Two-hybrid;pmids:12081471,..."
1,ITA7_HUMAN,3679,ACHA_HUMAN,1134,0.73,"experiments:in vivo,Affinity Capture-Western,a..."
2,NEB1_HUMAN,55607,ACTG_HUMAN,71,0.65,"experiments:in vitro,in vivo;pmids:9362513,120..."
3,SRGN_HUMAN,5552,CD44_HUMAN,960,0.63,"experiments:in vivo;pmids:9334256,16189514,167..."
4,GRB7_HUMAN,2886,ERBB2_HUMAN,2064,0.9,"experiments:in vitro,in vivo,Reconstituted Com..."


In [12]:
phosphosite_substrate = pd.read_csv(
    '../data/networkInputs/Kinase_Substrate_Dataset.txt',
    sep='\t',
    skiprows=2,
)

# Kinase gene ('GENE') and substrate gene ('SUB_GENE') are upper-cased before the
# id_dict lookup, so e.g. the rat symbol 'Pak2' is looked up as 'PAK2'.
# NOTE(review): rows are not filtered by KIN_ORGANISM / SUB_ORGANISM here --
# confirm that folding non-human symbols onto human genes is intended.
data_sources['phosphosite_substrate'] = phosphosite_substrate
data_preproc['phosphosite_substrate'] = dict(
    columns=['GENE', 'SUB_GENE'],
    maps=[lambda symbol: id_dict(symbol.upper())] * 2,
)

print(phosphosite_substrate.columns)
display(phosphosite_substrate.head())

Index(['GENE', 'KINASE', 'KIN_ACC_ID', 'KIN_ORGANISM', 'SUBSTRATE',
       'SUB_GENE_ID', 'SUB_ACC_ID', 'SUB_GENE', 'SUB_ORGANISM', 'SUB_MOD_RSD',
       'SITE_GRP_ID', 'SITE_+/-7_AA', 'DOMAIN', 'IN_VIVO_RXN', 'IN_VITRO_RXN',
       'CST_CAT#'],
      dtype='object')


Unnamed: 0,GENE,KINASE,KIN_ACC_ID,KIN_ORGANISM,SUBSTRATE,SUB_GENE_ID,SUB_ACC_ID,SUB_GENE,SUB_ORGANISM,SUB_MOD_RSD,SITE_GRP_ID,SITE_+/-7_AA,DOMAIN,IN_VIVO_RXN,IN_VITRO_RXN,CST_CAT#
0,Pak2,PAK2,Q64303,rat,MEK1,170851.0,Q01986,Map2k1,rat,S298,448284,RtPGRPLsSYGMDSR,Pkinase,,X,9128; 98195
1,Pak2,PAK2,Q64303,rat,PRKD1,85421.0,Q9WTQ1,Prkd1,rat,S203,449896,GVRRRRLsNVsLTGL,,X,,
2,Pak2,PAK2,Q64303,rat,prolactin,5617.0,P01236,PRL,human,S207,451732,LHCLRRDsHKIDNYL,Hormone_1,,X,
3,Pak2,PAK2,Q64303,rat,prolactin,24683.0,P01237,Prl,rat,S206,451732,IRCLRRDsHKVDNYL,Hormone_1,,X,
4,EIF2AK1,HRI,Q9BQI3,human,eIF2-alpha,54318.0,P68101,Eif2s1,rat,S52,447635,MILLSELsRRRIRSI,S1,,X,3597; 9721; 3398; 5199


In [13]:
hprd = pd.read_csv(
    '../data/networkInputs/HPRD_ALL_BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt',
    sep='\t',
    header=None,
)

# Columns 0 and 3 hold the gene symbols of the two interactors (see the preview);
# they are upper-cased before the id_dict lookup.
data_sources['hprd'] = hprd
data_preproc['hprd'] = dict(
    columns=[0, 3],
    maps=[lambda symbol: id_dict(symbol.upper())] * 2,
)

print(hprd.columns)
hprd.head()

Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')


Unnamed: 0,0,1,2,3,4,5,6,7
0,ALDH1A1,1,NP_000680.2,ALDH1A1,1,NP_000680.2,in vivo;yeast 2-hybrid,1208147116189510
1,ITGA7,2761,NP_001138468.1,CHRNA1,7,NP_001034612.1,in vivo,10910772
2,PPP1R9A,16000,NP_060120.2,ACTG1,17,NP_001605.1,in vitro;in vivo,936251312052877
3,SRGN,1513,NP_002718.2,CD44,115,NP_000601.3,in vivo,9334256
4,GRB7,3311,NP_005301.2,ERBB2,1281,NP_004439.2,in vitro;in vivo,9079677


In [14]:
reactome = pd.read_csv(
    '../data/networkInputs/reactome.homo_sapiens.interactions.tab-delimited.txt',
    sep='\t',
)

# Interactor ids look like 'uniprotkb:Q9Y287' (see the preview); the token after
# the first ':' is what gets passed to id_dict.
data_sources['reactome'] = reactome
data_preproc['reactome'] = dict(
    columns=['# Interactor 1 uniprot id', 'Interactor 2 uniprot id'],
    maps=[lambda prefixed_id: id_dict(prefixed_id.split(':')[1])] * 2,
)

print(reactome.columns)
reactome.head()

Index(['# Interactor 1 uniprot id', 'Interactor 1 Ensembl gene id',
       'Interactor 1 Entrez Gene id', 'Interactor 2 uniprot id',
       'Interactor 2 Ensembl gene id', 'Interactor 2 Entrez Gene id',
       'Interaction type', 'Interaction context', 'Pubmed references'],
      dtype='object')


Unnamed: 0,# Interactor 1 uniprot id,Interactor 1 Ensembl gene id,Interactor 1 Entrez Gene id,Interactor 2 uniprot id,Interactor 2 Ensembl gene id,Interactor 2 Entrez Gene id,Interaction type,Interaction context,Pubmed references
0,uniprotkb:Q9Y287,ENSEMBL:ENSG00000136156,entrezgene/locuslink:9445,uniprotkb:Q9Y287,ENSEMBL:ENSG00000136156,entrezgene/locuslink:9445,physical association,reactome:R-HSA-976871,14690516|10391242
1,uniprotkb:P37840,ENSEMBL:ENSG00000145335,entrezgene/locuslink:6622,uniprotkb:P37840,ENSEMBL:ENSG00000145335,entrezgene/locuslink:6622,physical association,reactome:R-HSA-1247852,24243840
2,uniprotkb:P0DJI8,ENSEMBL:ENSG00000173432,entrezgene/locuslink:6288,uniprotkb:P0DJI8,ENSEMBL:ENSG00000173432,entrezgene/locuslink:6288,physical association,reactome:R-HSA-976898,19393650|103558
3,uniprotkb:P06727,ENSEMBL:ENSG00000110244,entrezgene/locuslink:337,uniprotkb:P06727,ENSEMBL:ENSG00000110244,entrezgene/locuslink:337,physical association,reactome:R-HSA-976889,15146166
4,uniprotkb:P01160,ENSEMBL:ENSG00000175206,entrezgene/locuslink:4878,uniprotkb:P01160,ENSEMBL:ENSG00000175206,entrezgene/locuslink:4878,physical association,reactome:R-HSA-976987,2142465|2945573


In [15]:
# One-off download of the BioGRID 3.5.169 per-organism archive, kept commented
# out so a normal run does not re-download. Un-commented, the commands unzip the
# archive, move the Homo sapiens table into ../data/networkInputs/ and then
# remove every remaining file in the working directory whose name contains
# 'BIOGRID-ORGANISM'.
#!wget https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.169/BIOGRID-ORGANISM-3.5.169.tab2.zip
#!unzip -q BIOGRID-ORGANISM-3.5.169.tab2.zip
#!mv BIOGRID-ORGANISM-Homo_sapiens-3.5.169.tab2.txt ../data/networkInputs/
#!rm `ls | grep 'BIOGRID-ORGANISM'`

In [16]:
biogrid = pd.read_csv(
    '../data/networkInputs/BIOGRID-ORGANISM-Homo_sapiens-3.5.169.tab2.txt',
    sep='\t',
    low_memory=False,
)

# The official-symbol columns are passed to id_dict unchanged. The lambda wrapper
# is kept so that id_dict is looked up at call time.
data_sources['biogrid'] = biogrid
data_preproc['biogrid'] = dict(
    columns=['Official Symbol Interactor A', 'Official Symbol Interactor B'],
    maps=[lambda symbol: id_dict(symbol)] * 2,
)

print(biogrid.columns)
biogrid.head()

Index(['#BioGRID Interaction ID', 'Entrez Gene Interactor A',
       'Entrez Gene Interactor B', 'BioGRID ID Interactor A',
       'BioGRID ID Interactor B', 'Systematic Name Interactor A',
       'Systematic Name Interactor B', 'Official Symbol Interactor A',
       'Official Symbol Interactor B', 'Synonyms Interactor A',
       'Synonyms Interactor B', 'Experimental System',
       'Experimental System Type', 'Author', 'Pubmed ID',
       'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score',
       'Modification', 'Phenotypes', 'Qualifications', 'Tags',
       'Source Database'],
      dtype='object')


Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,Pubmed ID,Organism Interactor A,Organism Interactor B,Throughput,Score,Modification,Phenotypes,Qualifications,Tags,Source Database
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,9006895,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
1,117,84665,88,124185,106603,-,-,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,...,11309420,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
2,183,90,2339,106605,108625,-,-,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,...,8599089,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
3,278,2624,5371,108894,111384,-,-,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,...,10938104,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID
4,418,6118,6774,112038,112651,RP4-547C9.3,-,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,...,10875894,9606,9606,Low Throughput,-,-,-,-,-,BIOGRID


In [17]:
i2d = pd.read_csv('../data/networkInputs/i2d.2_9.txt', sep='\t')

# 'SwissProt1' / 'SwissProt2' are passed to id_dict unchanged. The lambda wrapper
# is kept so that id_dict is looked up at call time.
data_sources['i2d'] = i2d
data_preproc['i2d'] = dict(
    columns=['SwissProt1', 'SwissProt2'],
    maps=[lambda accession: id_dict(accession)] * 2,
)

print(i2d.columns)
i2d.head()

Index(['Dataset', 'SwissProt1', 'SwissProt2'], dtype='object')


Unnamed: 0,Dataset,SwissProt1,SwissProt2
0,SOURAV_MAPK_LOW,P63000,A0AUZ9
1,IntAct,Q96CV9,A0AUZ9
2,BioGrid,P0CG48,A0AV96
3,IntAct_Mouse,P62258,A0AV96
4,IntAct_Mouse,P63104,A0AV96


In [18]:
# Load MINT and drop rows whose interactor id (column 0, then column 1) has no
# 'db:identifier' form, i.e. contains no ':'. The two filters run one after the
# other, so column 1 is only inspected on rows that passed the column-0 check.
mint = (
    pd.read_csv('../data/networkInputs/species_human.txt', sep='\t', header=None)
    .loc[lambda rows: rows[0].apply(lambda ident: ':' in ident)]
    .loc[lambda rows: rows[1].apply(lambda ident: ':' in ident)]
)

# ids look like 'uniprotkb:Q13015'; the token after the first ':' goes to id_dict
data_sources['mint'] = mint
data_preproc['mint'] = dict(
    columns=[0, 1],
    maps=[lambda prefixed_id: id_dict(prefixed_id.split(':')[1])] * 2,
)

print(mint.columns)
mint.head()

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,uniprotkb:Q72547,uniprotkb:Q72547,intact:EBI-7484755|intact:MINT-8208544,intact:EBI-7484755|intact:MINT-8208544,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,"psi-mi:""MI:0030""(cross-linking study)",Nishitsuji et al. (2011),pubmed:22004763|imex:IM-16791|mint:MINT-820698...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-7484777|mint:MINT-8206988|imex:IM-1...,intact-miscore:0.40
1,uniprotkb:Q72547,uniprotkb:Q72547,intact:EBI-7484755|intact:MINT-8208544,intact:EBI-7484755|intact:MINT-8208544,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,"psi-mi:""MI:0030""(cross-linking study)",Nishitsuji et al. (2011),pubmed:22004763|imex:IM-16791|mint:MINT-820698...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-7484916|mint:MINT-8207003|imex:IM-1...,intact-miscore:0.40
2,uniprotkb:Q72547,uniprotkb:Q72547,intact:EBI-7484755|intact:MINT-8208544,intact:EBI-7484755|intact:MINT-8208544,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,psi-mi:q72547_9hiv1(display_long)|uniprotkb:po...,"psi-mi:""MI:0030""(cross-linking study)",Nishitsuji et al. (2011),pubmed:22004763|imex:IM-16791|mint:MINT-820698...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-7484965|mint:MINT-8207016|imex:IM-1...,intact-miscore:0.40
3,uniprotkb:Q90VU7,uniprotkb:Q900A7,intact:EBI-7460704|intact:MINT-5281653,intact:EBI-7460739|intact:MINT-8208185,psi-mi:q90vu7_9hiv1(display_long)|uniprotkb:ne...,psi-mi:q900a7_9hiv1(display_long)|uniprotkb:ta...,"psi-mi:""MI:0416""(fluorescence microscopy)",Sugiyama et al. (2011),pubmed:21970979|imex:IM-16787|mint:MINT-820620...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,taxid:11676(9hiv1)|taxid:11676(Human immunodef...,"psi-mi:""MI:0403""(colocalization)","psi-mi:""MI:0471""(MINT)",intact:EBI-7460732|mint:MINT-8206271|imex:IM-1...,intact-miscore:0.27
4,uniprotkb:Q2Q067,uniprotkb:Q13015,intact:EBI-9675545,intact:EBI-6269719,psi-mi:q2q067_9dela(display_long)|uniprotkb:HB...,psi-mi:af1q_human(display_long)|uniprotkb:MLLT...,"psi-mi:""MI:0397""(two hybrid array)",Simonis et al. (2012),imex:IM-22977|pubmed:22458338,taxid:11908(humt-)|taxid:11908(Human T-lymphot...,taxid:9606(human)|taxid:9606(Homo sapiens),"psi-mi:""MI:0915""(physical association)","psi-mi:""MI:0471""(MINT)",intact:EBI-9675551|imex:IM-22977-4,intact-miscore:0.49


In [19]:
# One-off download of the STRING v11.0 human "protein.links.full" table, kept
# commented out so a normal run does not re-download; un-comment to fetch the
# archive, move it into ../data/networkInputs/ and decompress it.
#!wget https://stringdb-static.org/download/protein.links.full.v11.0/9606.protein.links.full.v11.0.txt.gz
#!mv 9606.protein.links.full.v11.0.txt.gz ../data/networkInputs/
#!gunzip ../data/networkInputs/9606.protein.links.full.v11.0.txt.gz

In [20]:
string_database_score_filter = 900

# keep only the links whose 'database' score reaches the threshold
string = (
    pd.read_csv('../data/networkInputs/9606.protein.links.full.v11.0.txt', sep=' ')
    .loc[lambda links: links['database'] >= string_database_score_filter]
)
string_alias = pd.read_csv(
    '../data/networkInputs/human.uniprot_2_string.2018.tsv',
    sep='\t',
    header=None,
)

# STRING protein id (alias column 2) -> token extracted from alias column 1.
# NOTE(review): column 1 looks like 'Q03135|CAV1_HUMAN'; split('|')[1] selects
# the entry-name half, so the stored value is 'CAV1' (the mnemonic), not the
# accession 'Q03135'. Mnemonics such as 'UBP6' or 'RN212' are not gene symbols --
# confirm hgnc_mapper resolves them, or switch to the accession.
string_to_uniprot = {
    string_id: alias.split('|')[1].split('_')[0]
    for string_id, alias in zip(string_alias[2], string_alias[1])
}

# proteins without an alias entry map to None and are dropped during assembly
data_sources['string'] = string
data_preproc['string'] = dict(
    columns=['protein1', 'protein2'],
    maps=[
        lambda protein: id_dict(string_to_uniprot[protein])
        if protein in string_to_uniprot
        else None
    ] * 2,
)

print(string.columns)
display(string.head())
print(string_alias.columns)
display(string_alias.head())

Index(['protein1', 'protein2', 'neighborhood', 'neighborhood_transferred',
       'fusion', 'cooccurence', 'homology', 'coexpression',
       'coexpression_transferred', 'experiments', 'experiments_transferred',
       'database', 'database_transferred', 'textmining',
       'textmining_transferred', 'combined_score'],
      dtype='object')


Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
15,9606.ENSP00000000233,9606.ENSP00000432568,0,0,0,0,0,0,62,0,63,900,0,0,94,909
16,9606.ENSP00000000233,9606.ENSP00000427900,0,0,0,0,0,0,0,0,131,900,0,0,56,910
40,9606.ENSP00000000233,9606.ENSP00000354878,0,0,0,0,0,0,0,0,131,900,0,0,56,910
52,9606.ENSP00000000233,9606.ENSP00000405926,0,0,0,0,0,0,63,0,0,900,0,64,58,906
78,9606.ENSP00000000233,9606.ENSP00000314615,0,0,0,0,0,61,63,0,262,900,0,473,317,971


Int64Index([0, 1, 2, 3, 4], dtype='int64')


Unnamed: 0,0,1,2,3,4
0,9606,Q03135|CAV1_HUMAN,9606.ENSP00000339191,100.0,370.0
1,9606,P35125|UBP6_HUMAN,9606.ENSP00000460380,100.0,2941.0
2,9606,P41161|ETV5_HUMAN,9606.ENSP00000306894,100.0,1045.0
3,9606,Q6KF10|GDF6_HUMAN,9606.ENSP00000287020,100.0,928.0
4,9606,Q495C1|RN212_HUMAN,9606.ENSP00000389709,100.0,610.0


## Network Assembly

In [21]:
# Preview of the (empty) two-column edge-list layout. The assembly cell that
# follows creates `network` again from scratch, so this cell is illustrative only.
network = pd.DataFrame(columns=['Node1', 'Node2'])
network.head()

Unnamed: 0,Node1,Node2


In [22]:
# Assemble the kinome interaction network from every registered source.
#
# For each source, the two interactor columns named in data_preproc are mapped
# through that source's id-mapping functions; pairs where either side is missing
# or fails to map (None) are dropped. The mapped edge lists are concatenated once
# (rather than growing a frame inside the loop), restricted to edges with both
# endpoints in `kinome_set`, and reduced to one row per undirected pair.
edge_columns = ['Node1', 'Node2']
mapped_edges = []

for source_name, source_df in data_sources.items():

    print('Adding', source_name, 'to network')

    preproc = data_preproc[source_name]
    left_col, right_col = preproc['columns']
    left_map, right_map = preproc['maps']

    # drop rows with a missing raw identifier before mapping (the maps call
    # string methods and would fail on NaN)
    raw_pairs = source_df[[left_col, right_col]].dropna(axis=0)

    mapped = pd.DataFrame({
        'Node1': raw_pairs[left_col].apply(left_map),
        'Node2': raw_pairs[right_col].apply(right_map),
    }).dropna(axis=0)  # unmapped identifiers come back as None

    mapped_edges.append(mapped)

if mapped_edges:
    network = pd.concat(mapped_edges, ignore_index=True)
else:
    # no sources registered: keep the empty two-column layout
    network = pd.DataFrame(columns=edge_columns)

# drop obvious duplicates
network = network.drop_duplicates()

# drop items not in the target kinome set
in_kinome = network['Node1'].isin(kinome_set) & network['Node2'].isin(kinome_set)
network = network[in_kinome]

# drop a -> b / b -> a redundancies: add the reversed copy of every edge, then
# keep only the orientation with Node1 < Node2 (this also removes self-loops).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
reversed_edges = network.rename(columns={'Node1': 'Node2', 'Node2': 'Node1'})[edge_columns]
network = pd.concat([network, reversed_edges], ignore_index=True).drop_duplicates()
network = network[network['Node1'] < network['Node2']]

display(network.shape)
network.head()

Adding hippie to network
Adding phosphosite_substrate to network
Adding hprd to network
Adding reactome to network
Adding biogrid to network
Adding i2d to network
Adding mint to network
Adding string to network


(5066, 2)

Unnamed: 0,Node1,Node2
2,MST1R,YES1
3,TYRO3,YES1
11,FGR,SRC
16,ATR,FLT1
19,LYN,PRKCD


In [24]:
# Write the edge list as tab-separated text (despite the .csv extension), with no
# header row and no index column.
network.to_csv('../data/kin/kin_unweighted.csv', sep='\t', header=None, index=False)