# Reference PPI network Topology Generation

Uses the pypoteinsExt.psicquic to manipulate mitab data and produce a 
mitabToplogy object serialized for further use

API mitabToplogy object:




In [1]:
import sys, re
#%load_ext autoreload

# Development library
sys.path.append("/Users/guillaumelaunay/work/DVL/python3/pyproteinsExt/src")
sys.path.append("/Users/guillaumelaunay/work/DVL/python3/pyproteins/src")

import pyproteinsExt.psicquic as psq




Acknowledged 11221 entries (/Users/guillaumelaunay/work/data/pfam)


### Filter out mitab interactions w/out homologs in target proteome

The file `uniprot_R6_homology.json` stores _"one-to-many"_ homology relationship the following way

```json
{
    "PSICQUIC_INTERACTOR" : {
        "R6_UNIPROT_ID" : ["HSP_INFORMATIONS", ...],
    ...
}
```

Where,
  * `PSICQUIC_INTERACTOR` is the uniprot identifier of an interactor present in the mitab collection
  * `R6_UNIPROT_ID` is the uniprot identifier of a R6 protein
  * `HSP_INFORMATIONS` is a tuple storing characteristics of a BLAST hit between  `PSICQUIC_INTERACTOR`. There are currently only one HSP reported per BLAST hit ( ie: the list `["HSP_INFORMATIONS", ...]` length is always one). 
  
  The `HSP_INFORMATIONS` tuple has the following content:
  
```json
    [ R6_sequence Length,
      R6_hsp start position,
      R6_hsp stop position,
      PSQ_sequence Length,
      PSQ_hsp start position,
      PSQ_hsp stop position,
      HSP positive match number,
      HSP identical match number,
      HSP eValue
      ]
```
### Create a subset

1. Load all mitab
2. Take n-fisrt with both interactors having homologs in R6

In [8]:
#%autoreload 2
#mitabFile = "/Users/guillaumelaunay/tmp/buildNetwork2/merged_uniprot_safe.H50.mitab"
mitabFile="/Users/guillaumelaunay/tmp/buildNetwork2/merged_uniprot_safe.mitab"
homologyFile = "/Users/guillaumelaunay/tmp/buildNetwork2/uniprot_R6_homology.json"


In [3]:
import json


allInR6 = {}
with open(homologyFile) as fp:
    allInR6 = json.load(fp)
    
def f(psqData):
    
    a = psqData.interactors[0][0][1]
    b = psqData.interactors[1][0][1]
    
    if not a in allInR6:
        return False
    if not b in allInR6:
        return False

    for n in (a, b):
        t = 0
        for k in allInR6[n]:
            t += len(allInR6[n][k])
        if t == 0:
            return False
    
    return True

In [4]:
allInR6

{'P97760': {'P66709': [['311',
    '5',
    '235',
    '275',
    '5',
    '272',
    '95',
    '45',
    '4.79675e-27']]},
 'P98084': {'Q59947': [['1963',
    '262',
    '811',
    '750',
    '24',
    '572',
    '176',
    '95',
    '2.1715e-07']],
  'Q8DQN5': [['1876',
    '191',
    '395',
    '750',
    '52',
    '281',
    '75',
    '47',
    '1.90067e-06']]},
 'P94542': {'Q8CZ65': [['103',
    '8',
    '85',
    '85',
    '7',
    '84',
    '35',
    '17',
    '2.60153e-13']]},
 'P97839': {'Q59947': [['1963',
    '392',
    '690',
    '992',
    '515',
    '808',
    '82',
    '30',
    '2.66999e-08']]},
 'P91875': {'Q8DNF1': [['1225',
    '2',
    '899',
    '1642',
    '7',
    '1064',
    '366',
    '197',
    '5.76447e-91'],
   ['1225',
    '961',
    '1179',
    '1642',
    '1459',
    '1634',
    '69',
    '31',
    '3.22885e-18']]},
 'P96593': {'Q8DQ95': [['650',
    '293',
    '646',
    '425',
    '91',
    '425',
    '117',
    '59',
    '7.58989']],
  'Q8DRF3': [['420

In [9]:
psqObj = psq.PSICQUIC(offLine=True)
psqObj.read(mitabFile)
psqAllInR6 = psqObj.filter(predicate=f)

In [5]:
print(len(psqObj))
print(len(psqAllInR6))

637100
120928


In [10]:
M = psq.MitabTopology(psqAllInR6)

True


In [11]:
len(M)
#M.tmpAdj

54993

In [12]:
M['P53692']['O13710']

[uniprotkb:O13710	uniprotkb:P53692	biogrid:278084|entrez gene/locuslink:smc5|entrez gene/locuslink:SPAC14C4.02c|entrez gene/locuslink:2541587	biogrid:275593|entrez gene/locuslink:smc6|entrez gene/locuslink:SPCC5E4.06|entrez gene/locuslink:2539020	entrez gene/locuslink:spr18(gene name synonym)	entrez gene/locuslink:rad18(gene name synonym)	psi-mi:"MI:0004"(affinity chromatography technology)	"Pebernard S (2004)"	pubmed:15331764	taxid:284812	taxid:284812	psi-mi:"MI:0915"(physical association)	psi-mi:"MI:0463"(biogrid)	biogrid:250203	-,
 uniprotkb:P53692	uniprotkb:O13710	intact:EBI-603745	intact:EBI-603756	psi-mi:smc6_schpo(display_long)|uniprotkb:smc6(gene name)|psi-mi:smc6(display_short)|uniprotkb:SPCC5E4.06(orf name)|uniprotkb:DNA repair protein rad18(gene name synonym)|uniprotkb:rad18(gene name synonym)	psi-mi:smc5_schpo(display_long)|uniprotkb:smc5(gene name)|psi-mi:smc5(display_short)|uniprotkb:SMC partner of rad18(gene name synonym)|uniprotkb:DNA repair protein spr18(gene name syno

In [9]:
#M['O60260']
for x in M:
    print(x)

('P20459', 'P32501', [uniprotkb:P32501	uniprotkb:P20459	biogrid:32265|entrez gene/locuslink:GCD6|entrez gene/locuslink:YDR211W|entrez gene/locuslink:851797	biogrid:33763|entrez gene/locuslink:SUI2|entrez gene/locuslink:YJR007W|entrez gene/locuslink:853463	entrez gene/locuslink:L000000674(gene name synonym)	entrez gene/locuslink:L000002178(gene name synonym)|entrez gene/locuslink:translation initiation factor eIF2 subunit alpha(gene name synonym)	psi-mi:"MI:0004"(affinity chromatography technology)	"Gavin AC (2002)"	pubmed:11805826	taxid:559292	taxid:559292	psi-mi:"MI:0915"(physical association)	psi-mi:"MI:0463"(biogrid)	biogrid:97449	-])
('P30750', 'P24178', [uniprotkb:P30750	uniprotkb:P24178	intact:EBI-541886|uniprotkb:P77517|uniprotkb:Q47615	intact:EBI-554242	psi-mi:metn_ecoli(display_long)|uniprotkb:metN(gene name)|psi-mi:metN(display_short)|uniprotkb:abc(gene name synonym)|uniprotkb:b0199(locus name)|uniprotkb:JW0195(locus name)	psi-mi:yffb_ecoli(display_long)|uniprotkb:yffB(gene n

In [10]:
len(M.tmpAdj)

54993

In [9]:
print(psqObj[0].interactors[0][0][1])
print(psqObj[0].interactors[1][0][1])
print(psqObj[0].interactors)

P51587
Q8NEM0
([('uniprotkb:', 'P51587'), ('biogrid:', '107142'), ('entrez gene/locuslink:', 'BRCA2'), ('entrez gene/locuslink:', 'RP11-298P3.4'), ('entrez gene/locuslink:', '675')], [('uniprotkb:', 'Q8NEM0'), ('biogrid:', '122776'), ('entrez gene/locuslink:', 'MCPH1'), ('entrez gene/locuslink:', '79648')])


In [14]:
import pickle
#with open('/Users/guillaumelaunay/tmp/buildNetwork2/uniprot_safe.H50.mitabToplogy.pickle', 'wb') as fP:
with open('/Users/guillaumelaunay/tmp/buildNetwork2/uniprot_safe.mitabToplogy.pickle', 'wb') as fP:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(M, fP, pickle.HIGHEST_PROTOCOL)


In [9]:
## DEPRECATED
## extract allIn R6 subtree
import json

subTree = {}
for e in psqAllInR6.getBiomolecules():
    if e not in allInR6:
        raise ValueError('WW')
    
    subTree[e] = allInR6[e]

with open('/Users/guillaumelaunay/tmp/buildNetwork2/allInR6.head.json', 'w') as fOut:
    json.dump(subTree, fOut)

In [3]:
import pyproteins.container.Core as c
dummyData = [
    ("A", "B", 10),
    ("A", "B", 20),
    ("A", "C", 30),
    ("B", "E", 40),
    ("A", "B", 50),
    ("G", "B", 60),
    ("C", "G", 70)    
]

t = c.dnTree()
for d in dummyData:
    t.add(*d)
    