# A dedicated method to identify target molecules and extract the corresponding graph of interactions in BioPAX

In [20]:
import importlib
import json
import matplotlib.pyplot as plt
import os
import pandas
import rdflib
import rdflib.namespace
import sparqldataframe
from SPARQLWrapper import SPARQLWrapper, JSON
import sys

In [21]:
reactomeVersion = 81
endpointURL = "http://localhost:3030/REACTOME/query"
rdfFormat = "turtle"
prefixes = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>

PREFIX chebi: <http://purl.obolibrary.org/obo/chebi/>
PREFIX chebidb: <http://purl.obolibrary.org/obo/CHEBI_>
PREFIX chebirel: <http://purl.obolibrary.org/obo/CHEBI#>

PREFIX bp3: <http://www.biopax.org/release/biopax-level3.owl#>
PREFIX reactome: <http://www.reactome.org/biopax/81/48887#>
"""
pandas.set_option("max_colwidth", 80)

## Proteins and ProteinReferences

##### Summary

- Protein : 31755
- ProteinReference : 11685
    - ProteinReference that has a xref uniprot : 11319 (97%)
    - There are no ProteinReferences that has the same UniProt ID
- Each protein point to an unique ProteinReference
- More than one protein can point to a ProteinReference (up to 1328)
- There are two ways to get the UniProt ID of a ProteinReference :
    - follow a `bp3:xref` edge to a `bp3:UnificationXref` and then follow the `bp3:db` edge with the value "UniProt"
        - this represents 10947 IDs
    - follow a `bp3:name` edge to a value like "UniProt: *%ID_UniProt% %ID_gene%*"
        - this represents 11228 IDs
- 281 ProteinReference have an UniProt ID by their name field but not by their xref field
    - these 281 are isoforms
    - there are other ProteinReference for their main UniProt ID
    - it seems that there is no other isoform 

In [22]:
## Number of Protein
query = """
SELECT (COUNT (DISTINCT ?prot) AS ?nb_prot)
WHERE {
  ?prot rdf:type bp3:Protein .
}
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,nb_prot
0,31755


In [23]:
## Number of ProteinReference
query = """
SELECT (COUNT (DISTINCT ?protRef) AS ?nb_protref)
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
}
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,nb_protref
0,11685


There are two ways to get a UniProt ID :

1. by following the `bp3:xref` property to a `bp3:UnificationXref` whose `bp3:db` property is equal to "UniProt"
2. by following the `bp3:name` property which is like "UniProt: ?ID_UniProt ?ID_gene"


In [24]:
query = """
# Number of ProteinReference that has an Xref UniProt

SELECT (COUNT( DISTINCT(?protRef)) AS ?nbProtRefUniProt)
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:xref ?xref .
  ?xref bp3:db "UniProt" .
}  
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,nbProtRefUniProt
0,11319


In [25]:
## Can a ProteinReference point to more than one UniProt Xref? NO
query = """
# Number of ProteinReference that has more than one Xref UniProt

SELECT ?protRef (COUNT(DISTINCT (?xref)) as ?nb_xref ) #DISTINCT ?protRef
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:xref ?xref .
  ?xref bp3:db "UniProt" .
}  
GROUP BY ?protRef
HAVING (?nb_xref > 1)"""

df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,protRef,nb_xref


In [26]:
## Can several Xref have the same UniProt ID? No
query = """
# Number of different UniProt IDs (i.e. : Are there UnificationXref's that have the same UniProt ID?)

SELECT (COUNT (DISTINCT ?id) AS ?distinct_uniprot)
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:xref ?xref .
  ?xref bp3:db "UniProt" .
  ?xref bp3:id ?id .
}  
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,distinct_uniprot
0,11319


In [27]:
# How many Protein nodes per UniProt ID ?
query = """
# Number of Protein per Uniprot ID
SELECT ?id (COUNT (DISTINCT ?protein) AS ?distinct_prot)
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:xref ?xref .
  ?xref bp3:db "UniProt" .
  ?xref bp3:id ?id .
  ?protein bp3:entityReference ?protRef .
}
GROUP BY ?id
ORDER BY DESC (?distinct_prot)
"""

df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,id,distinct_prot
0,P04637,1328
1,P06400,393
2,P51587,357
3,P38398,166
4,P0CG48,145
...,...,...
11314,Q9Y6X2,1
11315,Q9Y6X3,1
11316,Q9Y6X4,1
11317,Q9Y6X9,1


Problems related with UniProt ID in Reactome :

In [28]:
## ProteinReference qui ont un identifiant UniProt dans un champs bp3:name
query = """
SELECT (COUNT (DISTINCT ?protRef) AS ?distinct_uniprot)
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:name ?name .
  FILTER(CONTAINS(?name, "UniProt")).
}  """

df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,distinct_uniprot
0,11604


In [29]:
11604-11319

285

In [30]:
## ProteinReferences that have a UniProt ID in bp3:name, but don't have a UnificationXref UniProt
## -> Only ISOFORMS
query = """
SELECT * #(COUNT (DISTINCT ?protRef) AS ?distinct_uniprot)
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:name ?name .
  FILTER(CONTAINS(?name, "UniProt")).
  FILTER NOT EXISTS {
  ?protRef bp3:xref ?xref .
  ?xref bp3:db "UniProt" .
  }
}  """

df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,protRef,name
0,http://www.reactome.org/biopax/81/48887#ProteinReference72,UniProt:O75521-2 ECI2
1,http://www.reactome.org/biopax/81/48887#ProteinReference95,UniProt:O95822-2 MLYCD
2,http://www.reactome.org/biopax/81/48887#ProteinReference107,UniProt:P50542-1 PEX5
3,http://www.reactome.org/biopax/81/48887#ProteinReference108,UniProt:P50542-2 PEX5
4,http://www.reactome.org/biopax/81/48887#ProteinReference134,UniProt:P51648-2 ALDH3A2
...,...,...
280,http://www.reactome.org/biopax/81/48887#ProteinReference11575,UniProt:Q9UPP1-1 PHF8
281,http://www.reactome.org/biopax/81/48887#ProteinReference11576,UniProt:Q9UPP1-3 PHF8
282,http://www.reactome.org/biopax/81/48887#ProteinReference11590,UniProt:P42167-1 TMPO
283,http://www.reactome.org/biopax/81/48887#ProteinReference11593,UniProt:Q86Y07-2 VRK2


In [31]:
## Is the main form of the protein referenced by another ProteinReference ? NO
query = """
SELECT * 
WHERE {
  VALUES ?uniprot {"Q14028" "O14775" "Q14155" "P04150" "O75521" "O95822" } # Quelques exemples
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:xref ?xref .
  ?xref bp3:db "UniProt" .
  ?xref bp3:id ?uniprot .
} 
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,uniprot,protRef,xref
0,Q14028,http://www.reactome.org/biopax/81/48887#ProteinReference11011,http://www.reactome.org/biopax/81/48887#UnificationXref156361
1,O14775,http://www.reactome.org/biopax/81/48887#ProteinReference1730,http://www.reactome.org/biopax/81/48887#UnificationXref29191
2,Q14155,http://www.reactome.org/biopax/81/48887#ProteinReference8563,http://www.reactome.org/biopax/81/48887#UnificationXref116390
3,P04150,http://www.reactome.org/biopax/81/48887#ProteinReference2316,http://www.reactome.org/biopax/81/48887#UnificationXref35390


In [32]:
## Are there other isoforms? NO (en se basant sur la présence d'un "-" en 15ème position)
query = """
SELECT * #(COUNT (DISTINCT ?protRef) AS ?distinct_uniprot)
WHERE {
  ?protRef rdf:type bp3:ProteinReference .
  ?protRef bp3:name ?name .
  FILTER(CONTAINS(?name, "UniProt")).
  bind(substr(?name, 1, 15 ) as ?uniprot )
  FILTER(CONTAINS(?uniprot, "-")).
}  """

df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,protRef,name,uniprot
0,http://www.reactome.org/biopax/81/48887#ProteinReference72,UniProt:O75521-2 ECI2,UniProt:O75521-
1,http://www.reactome.org/biopax/81/48887#ProteinReference95,UniProt:O95822-2 MLYCD,UniProt:O95822-
2,http://www.reactome.org/biopax/81/48887#ProteinReference107,UniProt:P50542-1 PEX5,UniProt:P50542-
3,http://www.reactome.org/biopax/81/48887#ProteinReference108,UniProt:P50542-2 PEX5,UniProt:P50542-
4,http://www.reactome.org/biopax/81/48887#ProteinReference134,UniProt:P51648-2 ALDH3A2,UniProt:P51648-
...,...,...,...
280,http://www.reactome.org/biopax/81/48887#ProteinReference11575,UniProt:Q9UPP1-1 PHF8,UniProt:Q9UPP1-
281,http://www.reactome.org/biopax/81/48887#ProteinReference11576,UniProt:Q9UPP1-3 PHF8,UniProt:Q9UPP1-
282,http://www.reactome.org/biopax/81/48887#ProteinReference11590,UniProt:P42167-1 TMPO,UniProt:P42167-
283,http://www.reactome.org/biopax/81/48887#ProteinReference11593,UniProt:Q86Y07-2 VRK2,UniProt:Q86Y07-


## SmallMolecules and SmallMoleculeReference

##### Summary
- Entity that has a ChEBI ID :

| bp3 type | count |
|------|-------|
|bp3:SmallMoleculeReference| 1929|
| bp3:ProteinReference | 1|
| bp3:Protein | 5|
| bp3:SmallMolecule | 337|
| bp3:Rna | 115|
| bp3:SequenceModificationVocabulary | 5|
| bp3:PhysicalEntity | 1027|
|Total(1929+1+5+337+115+5+1027)| 3419|

- XrefUnification that has a bp3:db = "ChEBI" : 3565
    - &rarr; So there are UnificationXref having a ChEBI that are not used?
- There are 4866 triples like ?x bp3:xref bp3:UnificationXref 
    - &rarr; So there are several entites that have the same ChEBI ID?
- Only one bp3:db properties per UnificationXref

In [33]:
# Type of entites that have a ChEBI ID
query="""
# 
SELECT ?type (COUNT(DISTINCT(?x)) AS ?element)
WHERE {

  ?x bp3:xref/bp3:db "ChEBI" .
  ?x rdf:type ?type 
  }
GROUP BY ?type
"""

df = sparqldataframe.query(endpointURL, prefixes+query)
pandas.set_option("max_colwidth", 800)
df

Unnamed: 0,type,element
0,http://www.biopax.org/release/biopax-level3.owl#SmallMoleculeReference,1929
1,http://www.biopax.org/release/biopax-level3.owl#ProteinReference,1
2,http://www.biopax.org/release/biopax-level3.owl#Protein,5
3,http://www.biopax.org/release/biopax-level3.owl#SmallMolecule,337
4,http://www.biopax.org/release/biopax-level3.owl#Rna,115
5,http://www.biopax.org/release/biopax-level3.owl#SequenceModificationVocabulary,5
6,http://www.biopax.org/release/biopax-level3.owl#PhysicalEntity,1027


In [34]:
query="""
# 
SELECT *
WHERE {
  ?xref bp3:db "ChEBI" .
  ?x bp3:xref ?xref .
  }
"""

df = sparqldataframe.query(endpointURL, prefixes+query)
pandas.set_option("max_colwidth", 800)
df

Unnamed: 0,xref,x
0,http://www.reactome.org/biopax/81/48887#UnificationXref345,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference1
1,http://www.reactome.org/biopax/81/48887#UnificationXref435,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference2
2,http://www.reactome.org/biopax/81/48887#UnificationXref438,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference3
3,http://www.reactome.org/biopax/81/48887#UnificationXref441,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference4
4,http://www.reactome.org/biopax/81/48887#UnificationXref1400,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference5
...,...,...
5002,http://www.reactome.org/biopax/81/48887#RelationshipXref15486,http://www.reactome.org/biopax/81/48887#SmallMolecule5042
5003,http://www.reactome.org/biopax/81/48887#RelationshipXref15487,http://www.reactome.org/biopax/81/48887#SmallMolecule5043
5004,http://www.reactome.org/biopax/81/48887#RelationshipXref15488,http://www.reactome.org/biopax/81/48887#SmallMolecule5044
5005,http://www.reactome.org/biopax/81/48887#RelationshipXref15489,http://www.reactome.org/biopax/81/48887#SmallMolecule5045


In [35]:
query="""
# 
SELECT ?val (COUNT(?val) AS ?count)
WHERE {
  ?xref bp3:db "ChEBI" .
  ?xref rdf:type ?val .
  }
  GROUP BY ?val
"""

df = sparqldataframe.query(endpointURL, prefixes+query)
pandas.set_option("max_colwidth", 800)
df

Unnamed: 0,val,count
0,http://www.biopax.org/release/biopax-level3.owl#RelationshipXref,1766
1,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,1935


In [50]:
# SmallMoleculeReference that has a ChEBI ID
query="""
# 
SELECT *
WHERE {
  ?smRef rdf:type bp3:SmallMoleculeReference .
  ?smRef bp3:xref ?xref .
  ?xref rdf:type bp3:UnificationXref .
  ?xref bp3:db "ChEBI" .
  ?xref rdf:type ?val .
  ?xref bp3:id ?id .
  }
#  GROUP BY ?val
"""

df = sparqldataframe.query(endpointURL, prefixes+query)
pandas.set_option("max_colwidth", 800)
df

Unnamed: 0,smRef,xref,val,id
0,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference1,http://www.reactome.org/biopax/81/48887#UnificationXref345,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:15377
1,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference2,http://www.reactome.org/biopax/81/48887#UnificationXref435,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:30616
2,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference3,http://www.reactome.org/biopax/81/48887#UnificationXref438,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:43474
3,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference4,http://www.reactome.org/biopax/81/48887#UnificationXref441,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:456216
4,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference5,http://www.reactome.org/biopax/81/48887#UnificationXref1400,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:58189
...,...,...,...,...
1924,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference2811,http://www.reactome.org/biopax/81/48887#UnificationXref160794,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:23736
1925,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference2812,http://www.reactome.org/biopax/81/48887#UnificationXref160797,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:7789
1926,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference2813,http://www.reactome.org/biopax/81/48887#UnificationXref160804,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:44975
1927,http://www.reactome.org/biopax/81/48887#SmallMoleculeReference2832,http://www.reactome.org/biopax/81/48887#UnificationXref163618,http://www.biopax.org/release/biopax-level3.owl#UnificationXref,CHEBI:15986


In [45]:
# How many SmallMolecule nodes per ChEBI ID ?

query = """

SELECT ?id (COUNT (DISTINCT ?smallmolecule) AS ?distinct_sm)
WHERE {
  ?smRef rdf:type bp3:SmallMoleculeReference .
  ?smRef bp3:xref ?xref .
  ?xref rdf:type bp3:UnificationXref .
  ?xref bp3:db "ChEBI" .
  ?xref bp3:id ?id .
  
  ?smallmolecule bp3:entityReference ?smRef .
}
GROUP BY ?id
ORDER BY DESC (?distinct_sm)
"""

df = sparqldataframe.query(endpointURL, prefixes+query)
df

Unnamed: 0,id,distinct_sm
0,CHEBI:15378,28
1,CHEBI:29105,21
2,CHEBI:16113,20
3,CHEBI:29108,20
4,CHEBI:15377,18
...,...,...
1924,CHEBI:9162,1
1925,CHEBI:9171,1
1926,CHEBI:9407,1
1927,CHEBI:9463,1


## Retrieving Proteins by HGNC genes IDs in the BioPAX export of Reactome

Federated query 

In [None]:
query = """

SELECT *

WHERE {

  SERVICE <https://sparql.uniprot.org/sparql> {
    VALUES ?hgncName { "RPL14" "NR3C1" }
    ?hgnc rdf:type up:Resource .
    ?hgnc up:database udb:HGNC .
    ?hgnc rdfs:comment ?hgncName .

    ?uniprotID rdfs:seeAlso ?hgnc .
    ?uniprotID rdf:type up:Protein .
    ?uniprotID up:reviewed "true"^^xsd:boolean .
  }
  
  BIND (REPLACE(STR(?uniprotID), "http://purl.uniprot.org/uniprot/", "") AS ?localUniprotID)
  
  OPTIONAL 
  { #get isoforms ; /!\ mais on récupère aussi les autres, comment gagner du temps de parcours ? /!\
  	?protref rdf:type bp3:ProteinReference .
  	?protref bp3:name ?name . 
   	FILTER(CONTAINS(?name, ?localUniprotID)).
  }

  OPTIONAL 
  {
	?xref bp3:id ?localUniprotID .
  	?xref bp3:db "UniProt" .
  	?protref bp3:xref ?xref .
   
  } 
  ?prot bp3:entityReference ?protref .  
}
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df

## Number of Interactions a ChEBI or a Uniprot ID is involed 

In [None]:
query = """
SELECT DISTINCT ?id (COUNT (?interaction) AS ?nb_int)
WHERE {
  ?protein rdf:type bp3:Protein .
  ?protein bp3:entityReference ?protRef .
  ?protRef bp3:xref ?xref .
  ?xref rdf:type bp3:UnificationXref .
  ?xref bp3:db "UniProt" .
  ?xref bp3:id ?id .
    
  ?rel rdfs:subPropertyOf* bp3:participant .
  
    { 
    	?interaction ?rel ?protein .
    } 
    UNION 
    {
        ?complex rdf:type bp3:Complex .
        ?complex bp3:component+ ?protein .
        ?interaction ?rel ?complex .
    }
   ?interaction rdf:type/rdfs:subClassOf* bp3:Interaction .

  }
GROUP BY ?id #?interaction
ORDER BY DESC(?nb_int) #?nb_chebi_per_int)
"""

df = sparqldataframe.query(endpointURL, prefixes+query)
df

In [None]:
query = """
SELECT DISTINCT ?id (COUNT (?interaction) AS ?nb_int)#?interaction (COUNT (?id) AS ?nb_chebi_per_int)
WHERE {
  ?smallMolecule rdf:type bp3:SmallMolecule .
  ?smallMolecule bp3:entityReference ?SMRef .
  ?SMRef bp3:xref ?xref .
  ?xref rdf:type bp3:UnificationXref .
  ?xref bp3:db "ChEBI" .
  ?xref bp3:id ?id .
    
  ?rel rdfs:subPropertyOf* bp3:participant .
  
    { 
    	?interaction ?rel ?smallMolecule .
    } 
    UNION 
    {
        ?complex rdf:type bp3:Complex .
        ?complex bp3:component+ ?smallMolecule .
        ?interaction ?rel ?complex .
    }
   ?interaction rdf:type/rdfs:subClassOf* bp3:Interaction .

  }
GROUP BY ?id #?interaction
ORDER BY DESC(?nb_int) """
df = sparqldataframe.query(endpointURL, prefixes+query)
df


## Number of ChEBI or UniProt ID per interaction

In [None]:
query = """ 
SELECT DISTINCT ?interaction (COUNT (?id) AS ?nb_chebi_per_int)
WHERE {
  ?smallMolecule rdf:type bp3:SmallMolecule .
  ?smallMolecule bp3:entityReference ?SMRef .
  ?SMRef bp3:xref ?xref .
  ?xref rdf:type bp3:UnificationXref .
  ?xref bp3:db "ChEBI" .
  ?xref bp3:id ?id .
    
  ?rel rdfs:subPropertyOf* bp3:participant .
  
    { 
    	?interaction ?rel ?smallMolecule .
    } 
    UNION 
    {
        ?complex rdf:type bp3:Complex .
        ?complex bp3:component+ ?smallMolecule .
        ?interaction ?rel ?complex .
    }
   ?interaction rdf:type/rdfs:subClassOf* bp3:Interaction .

  }
GROUP BY ?interaction
ORDER BY DESC(?nb_chebi_per_int)
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df


In [None]:
query = """ 
SELECT DISTINCT ?interaction (COUNT (?id) AS ?nb_uniprot_per_int)
WHERE {
  ?Protein rdf:type bp3:Protein .
  ?Protein bp3:entityReference ?ProteinRef .
  ?ProteinRef bp3:xref ?xref .
  ?xref rdf:type bp3:UnificationXref .
  ?xref bp3:db "UniProt" .
  ?xref bp3:id ?id .
    
  ?rel rdfs:subPropertyOf* bp3:participant .
  
    { 
    	?interaction ?rel ?Protein .
    } 
    UNION 
    {
        ?complex rdf:type bp3:Complex .
        ?complex bp3:component+ ?Protein .
        ?interaction ?rel ?complex .
    }
   ?interaction rdf:type/rdfs:subClassOf* bp3:Interaction .

  }
GROUP BY ?interaction
ORDER BY DESC(?nb_uniprot_per_int)
"""
df = sparqldataframe.query(endpointURL, prefixes+query)
df