# Notebook Aim
To investigate .ttl files and parse them

In [9]:
# https://github.com/RDFLib/rdflib
#!pip install rdflib
from rdflib import Graph

## Ontology File
The RDf data dump comes with an ontology file that can use to know which RDF types to expect in the `.ttl` files.

In [10]:
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
# import networkx as nx
# import matplotlib.pyplot as plt

url = 'https://www.w3.org/TeamSubmission/turtle/tests/test-30.ttl'

onto_graph = Graph()
result = onto_graph.parse("./wpOntology.ttl", format='turtle')

# G = rdflib_to_networkx_multidigraph(result)

# # Plot Networkx instance of RDF Graph
# pos = nx.spring_layout(G, scale=2)
# plt.figure(figsize=(50,50))
# edge_labels = nx.get_edge_attributes(G, 'r')
# nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
# nx.draw(G, with_labels=True)

# #if not in interactive mode for 

# plt.savefig("ontology_graph.png")

## Test WP File - WP111

In [11]:
g = Graph()

# Parse an RDF file
wp111g = g.parse("./WP111.ttl")

In [12]:
len(wp111g)  # Number of subj/pred/obj

2353

2353

In [13]:
# Getting labels of GeneProducts
q = """
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?label
    WHERE {
        ?p rdf:type wp:GeneProduct .

        ?p rdfs:label ?label .
    }
"""
for r in wp111g.query(q):
    print(r["label"])
    break

NDUFS3
NDUFS3


## Parse PubRefs

In [14]:
pubref_query = """
SELECT ?id ?data_source ?data_source_id ?link
    WHERE {
        ?id rdf:type wp:PublicationReference ;
            rdf:type ?type .
            ?id foaf:page ?link .
            ?id dc:source ?data_source .
            ?id dcterms:identifier ?data_source_id .
    }
"""

for match in wp111g.query(pubref_query):
    result_dict = {key: str(val) for key, val in match.asdict().items()}
    print(result_dict)

{'id': 'https://identifiers.org/pubmed/23746447', 'data_source': 'PubMed', 'data_source_id': '23746447', 'link': 'http://www.ncbi.nlm.nih.gov/pubmed/23746447'}
{'id': 'https://identifiers.org/pubmed/22902835', 'data_source': 'PubMed', 'data_source_id': '22902835', 'link': 'http://www.ncbi.nlm.nih.gov/pubmed/22902835'}
{'id': 'https://identifiers.org/pubmed/30030361', 'data_source': 'PubMed', 'data_source_id': '30030361', 'link': 'http://www.ncbi.nlm.nih.gov/pubmed/30030361'}
{'id': 'https://identifiers.org/pubmed/23746447', 'data_source': 'PubMed', 'data_source_id': '23746447', 'link': 'http://www.ncbi.nlm.nih.gov/pubmed/23746447'}
{'id': 'https://identifiers.org/pubmed/22902835', 'data_source': 'PubMed', 'data_source_id': '22902835', 'link': 'http://www.ncbi.nlm.nih.gov/pubmed/22902835'}
{'id': 'https://identifiers.org/pubmed/30030361', 'data_source': 'PubMed', 'data_source_id': '30030361', 'link': 'http://www.ncbi.nlm.nih.gov/pubmed/30030361'}


### Parse Pathways

In [15]:
pathway_query = """
        SELECT ?id ?data_source ?data_source_id ?description ?organism
            WHERE {
                ?id rdf:type wp:Pathway .
                ?id rdf:type ?type .
                ?id dc:source ?data_source .
                ?id dcterms:identifier ?data_source_id .
                OPTIONAL {
                    ?id dcterms:description ?description .
                    ?id wp:organismName ?organism . 
                }
            FILTER (STR(?type) = "http://vocabularies.wikipathways.org/wp#Pathway") .
            }
        """

for match in wp111g.query(pathway_query):
    result_dict = {key: str(val) for key, val in match.asdict().items()}
    print(result_dict)

{'id': 'https://identifiers.org/wikipathways/WP111_r117097', 'data_source': 'WikiPathways', 'data_source_id': 'WP111', 'description': 'An electron transport chain(ETC) couples a chemical reaction between an electron donor (such as NADH) and an electron acceptor (such as O2) to the transfer of H+ ions across a membrane, through a set of mediating biochemical reactions. These H+ ions are used to produce adenosine triphosphate (ATP), the main energy intermediate in living organisms, as they move back across the membrane. \nIn mitochondria, it is the conversion of oxygen to water, NADH to NAD+ and succinate to fumarate that drives the transfer of H+ ions.\nSource: Wikipedia ([[wikipedia:Electron_transport_chain]])\n\nProteins on this pathway have targeted assays available via the [https://assays.cancer.gov/available_assays?wp_id=WP111 CPTAC Assay Portal]', 'organism': 'Homo sapiens'}
{'id': 'https://identifiers.org/wikipathways/WP4324', 'data_source': 'WikiPathways', 'data_source_id': 'WP4

### Parse Nodes

In [38]:
def parse_nodes(wikipathway, query: str) -> dict:
    """Parse nodes in the wikipathway graph."""
    nodes = dict()
    query_results = wikipathway.query(query)
    for match in query_results:
        result_dict = {key: str(val) for key, val in match.asdict().items()}
        node_types = list({int_type.split("#")[1] for int_type in result_dict.pop("types").split(",")})

        # AOP nodes only have "DataNode" type
        if "data_source" in result_dict and result_dict["data_source"].startswith("AOP-Wiki"):
            node_types.add("AOP-Wiki")

        # don't need "DataNode" since it's a duplicate
        node_types.pop('DataNode')

        # Defensive code
        if len(node_types) != 1:  # Should only be one node type
            raise ValueError(f"Too many node types: {node_types}")

        # Parse participants of wp:Complex nodes if present, Reactome complexes sometimes don't have participants
        if "Complex" in node_types and "participants" in result_dict:
            result_dict["participants"] = set(result_dict["participants"].split(","))

        result_dict["type"] = node_types[0]

        wp_id = result_dict["id"]
        nodes[wp_id] = result_dict

    return nodes

noncomplex_node_query = """
    SELECT ?id (GROUP_CONCAT(?type;SEPARATOR=",") AS ?types) ?label ?data_source ?data_source_id
    WHERE {
        ?id rdf:type wp:DataNode ;
            rdf:type ?type .
            ?id rdfs:label ?label .
            ?id dc:source ?data_source .
            ?id dcterms:identifier ?data_source_id .
    }
    GROUP BY ?id
"""

complex_node_query = """
    SELECT ?id ?types (GROUP_CONCAT(?ps ;SEPARATOR=",") AS ?participants)
    WHERE {
        ?id rdf:type wp:Complex ;
            rdf:type ?types ;
            wp:participants ?ps .
    FILTER (STR(?types) = "http://vocabularies.wikipathways.org/wp#Complex") .
    }
    GROUP BY ?id
"""

# noncomplex_nodes = parse_nodes(wikipathway=wp111g, query=noncomplex_node_query)
complex_nodes = parse_nodes(wikipathway=wp111g, query=complex_node_query)
print(len(complex_nodes))
nodes = {**noncomplex_nodes, **complex_nodes}

TypeError: 'str' object cannot be interpreted as an integer

TypeError: 'str' object cannot be interpreted as an integer

### Parse Interactions

In [None]:
# Get and parse edges
# ";" can be used when same subj/pred/obj used - https://www.stardog.com/tutorials/sparql#ordering-results
int_q = """
    SELECT ?wp_id (GROUP_CONCAT(?type;SEPARATOR=",") AS ?types) ?source ?target ?identifier (GROUP_CONCAT(?ps ;SEPARATOR=",") AS ?participants)
    WHERE {
        ?wp_id rdf:type wp:Interaction ;
            wp:participants ?ps ;
            rdf:type ?type .
        OPTIONAL {
            ?wp_id wp:source ?source .
            ?wp_id wp:target ?target .
        }
    }
    GROUP BY ?wp_id
"""

interactions = dict()
for match in wp111g.query(int_q):
    result_dict = {key: str(val) for key, val in match.asdict().items()}
    interaction_types = {int_type.split("#")[1] for int_type in result_dict["types"].split(",")}
    result_dict["participants"] = set(result_dict["participants"].split(","))
    
    # don't need "Interaction"
    interaction_types.remove("Interaction")
    
    # Binding always with ComplexBinding so keep only ComplexBinding
    if "ComplexBinding" in interaction_types:
        result_dict["types"] = {"ComplexBinding"}
        
    else:result_dict["types"]  = interaction_types
    
    wp_id = result_dict.pop("wp_id")
    interactions[wp_id] = result_dict

result_dict

## Check Nodes in Parsed Interactions

In [None]:
for iid, metadata in interactions.items():
    for node_id in metadata["participants"]:
        if node_id not in nodes:
            print(node_id)