# Querying BEL with CX RDF

This notebook outlines converting BEL graphs to CX, then serializing in RDF to make queryable.

In [1]:
import sys
import os
import time

import cx_rdf
import pandas as pd
from pybel.utils import get_version as get_pybel_version
from pybel.examples import sialic_acid_graph as sialic_acid_bel
import pybel_cx
from pybel_tools.visualization import to_jupyter

INFO:rdflib:RDFLib Version: 4.2.1


print(sys.version)

In [2]:
print(time.asctime())

Wed Jul 25 11:04:14 2018


In [3]:
print(f'PyBEL Version: {get_pybel_version()}')
print(f'PyBEL-CX Version: {pybel_cx.get_version()}')
print(f'CX-RDF Version: {cx_rdf.get_version()}')

PyBEL Version: 0.11.11-dev
PyBEL-CX Version: 0.1.2-dev
CX-RDF Version: 0.0.1-dev


## Getting BEL

This notebook uses an example BEL graph from the `PyBEL` package describing the sialic acid-CD33-TREM2 pathway, which has been implicated in Alzheimer's disease.

In [4]:
to_jupyter(sialic_acid_bel, height=400)

<IPython.core.display.Javascript object>

The example BEL graph is converted to CX using the `pybel_cx.to_cx` function in the [PyBEL-CX](https://github.com/pybel/pybel-cx) package.

In [5]:
cx = pybel_cx.to_cx(sialic_acid_bel)

CX is serialized to RDF using the `cx_rdf.cx_to_rdf_graph` function in the [CX-RDF](https://github.com/cthoyt/cx-rdf) package.

In [6]:
rdf = cx_rdf.cx_to_rdf_graph(cx)

print(f'Serialization resulted in {len(rdf)} triples')

Serialization resulted in 684 triples


## Querying with SPARQL

This dictionary is passed to the `RDFLib` query function to make prettier queries without writing the prefixes directly in the SPARQL.

In [7]:
init_ns = {
    'cx': cx_rdf.CX,
}

### Peek at the attributes and values used for nodes

In [8]:
query = """
SELECT ?node ?label ?attribute_name ?attribute_value
WHERE {
    ?node a cx:node .
    ?node rdfs:label ?label .
    ?node cx:node_has_attribute ?name_attribute .
    ?name_attribute cx:attribute_has_name ?attribute_name.
    ?name_attribute cx:attribute_has_value ?attribute_value .
}
LIMIT 13
"""

result = rdf.query(query, initNs=init_ns)
pd.DataFrame(list(result), columns=['Node', 'Label', 'Attribute', 'Value'])

Unnamed: 0,Node,Label,Attribute,Value
0,Nc16535fbcf924e60a12520a25ba6e42b,HGNC:PTPN11,cname,PTPN11
1,Nc16535fbcf924e60a12520a25ba6e42b,HGNC:PTPN11,function,Protein
2,Nc16535fbcf924e60a12520a25ba6e42b,HGNC:PTPN11,label,PTPN11
3,Nc16535fbcf924e60a12520a25ba6e42b,HGNC:PTPN11,namespace,HGNC
4,Nc16535fbcf924e60a12520a25ba6e42b,HGNC:PTPN11,identifier,9644
5,Nb4ad6ae74da74d0e8f5ace8d228f97d8,HGNC:PTPN6,cname,PTPN6
6,Nb4ad6ae74da74d0e8f5ace8d228f97d8,HGNC:PTPN6,namespace,HGNC
7,Nb4ad6ae74da74d0e8f5ace8d228f97d8,HGNC:PTPN6,function,Protein
8,Nb4ad6ae74da74d0e8f5ace8d228f97d8,HGNC:PTPN6,identifier,9658
9,Nb4ad6ae74da74d0e8f5ace8d228f97d8,HGNC:PTPN6,label,PTPN6


### Get all protein nodes

In [9]:
query = """
SELECT ?node ?label
WHERE {
    ?node a cx:node .
    ?node rdfs:label ?label .
    ?node cx:node_has_attribute ?name_attribute .
    ?name_attribute cx:attribute_has_name "function".
    ?name_attribute cx:attribute_has_value "Protein" .
}
"""

result = rdf.query(query, initNs=init_ns)
pd.DataFrame.from_records(list(result), columns=['Node', 'Label'])

Unnamed: 0,Node,Label
0,Na3477f5ad4ca40878eb4dbe449936a76,HGNC:TYROBP
1,Nc5b20eee3d18498f9ecf99a2f9dab951,HGNC:CD33
2,Ncdc4a4459463457c8501e9d45d032519,HGNC:TREM2
3,Nc16535fbcf924e60a12520a25ba6e42b,HGNC:PTPN11
4,Nae8bb9f552884d35ae254be3ea1b8e61,"p(HGNC:CD33, pmod(Ph))"
5,Nf5871b5fc7144f1c85c62784f8bb2261,HGNC:SYK
6,Nb4ad6ae74da74d0e8f5ace8d228f97d8,HGNC:PTPN6


### Get all modified protein nodes

In [10]:
query = """
SELECT DISTINCT ?s ?source_label 
WHERE {
    ?s a cx:node .
    ?s rdfs:label ?source_label .
    ?s cx:node_has_attribute ?source_name_attribute .
    ?source_name_attribute cx:attribute_has_name ?attribute_name .
    FILTER STRSTARTS(?attribute_name, "variants")
}
"""

result = rdf.query(query, initNs=init_ns)
pd.DataFrame.from_records(list(result), columns=['Node', 'Label'])

Unnamed: 0,Node,Label
0,Nae8bb9f552884d35ae254be3ea1b8e61,"p(HGNC:CD33, pmod(Ph))"


### Get all Protein-Protein Relations

In [11]:
query = """
SELECT ?source_label ?interaction_label ?target_label
WHERE {
    ?s a cx:node .
    ?s rdfs:label ?source_label .
    ?s cx:node_has_attribute ?source_name_attribute .
    ?source_name_attribute cx:attribute_has_name "function".
    ?source_name_attribute cx:attribute_has_value "Protein" .
    
    ?p cx:edge_has_interaction ?interaction_label .
    
    ?o a cx:node .
    ?o rdfs:label ?target_label .
    ?o cx:node_has_attribute ?target_name_attribute .
    ?target_name_attribute cx:attribute_has_name "function".
    ?target_name_attribute cx:attribute_has_value "Protein" .
    
    ?s ?p ?o .
}
"""

result = rdf.query(query, initNs=init_ns)
columns = ['Source', 'Target', 'Interaction']
pd.DataFrame.from_records(list(result), columns=columns)

Unnamed: 0,Source,Target,Interaction
0,"p(HGNC:CD33, pmod(Ph))",directlyIncreases,HGNC:PTPN11
1,"p(HGNC:CD33, pmod(Ph))",directlyIncreases,HGNC:PTPN6
2,HGNC:SYK,increases,HGNC:TREM2
3,HGNC:SYK,increases,HGNC:TYROBP
4,HGNC:PTPN11,directlyDecreases,HGNC:SYK
5,HGNC:PTPN6,directlyDecreases,HGNC:SYK
6,HGNC:CD33,increases,"p(HGNC:CD33, pmod(Ph))"
7,HGNC:CD33,hasVariant,"p(HGNC:CD33, pmod(Ph))"
