# Notebook Aim
To investigate .ttl files and parse them

In [1]:
# https://github.com/RDFLib/rdflib
#!pip install rdflib
from rdflib import Graph

## Ontology File
The RDF data dump comes with an ontology file that can be used to know which RDF types to expect in the `.ttl` files.

In [2]:
# Load the ontology from the local Turtle file into its own graph.
# The converter imported below is only referenced by the commented-out
# NetworkX visualisation further down in this cell.
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
# import networkx as nx
# import matplotlib.pyplot as plt

# NOTE(review): `url` is not used by any cell visible here; the ontology is
# read from the local file below — confirm whether it is still needed.
url = 'https://www.w3.org/TeamSubmission/turtle/tests/test-30.ttl'

onto_graph = Graph()
# `result` keeps the return value of `parse`; within the visible cells it is
# only referenced by the commented-out plotting code below.
result = onto_graph.parse("./wpOntology.ttl", format='turtle')

# G = rdflib_to_networkx_multidigraph(result)

# # Plot Networkx instance of RDF Graph
# pos = nx.spring_layout(G, scale=2)
# plt.figure(figsize=(50,50))
# edge_labels = nx.get_edge_attributes(G, 'r')
# nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
# nx.draw(G, with_labels=True)

# # if not in interactive mode, save the figure to a file instead

# plt.savefig("ontology_graph.png")

## Test WP File - WP111

In [3]:
# Load the WP111 file into its own graph, separate from the ontology graph.
g = Graph()

# Parse the RDF file. The serialisation format is stated explicitly, the same
# way the ontology file is parsed, rather than left for rdflib to infer.
wp111g = g.parse("./WP111.ttl", format="turtle")

In [4]:
len(wp111g)  # Number of triples (subject/predicate/object statements) in the graph

2353

In [5]:
# Look up GeneProduct labels and print the first one as a quick sanity check.
# NOTE(review): the `wp:` prefix is not declared in the query text; presumably
# it resolves from the prefixes bound on the parsed graph — confirm.
q = """
    PREFIX dc: <http://purl.org/dc/elements/1.1/>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?label
    WHERE {
        ?p rdf:type wp:GeneProduct .

        ?p rdfs:label ?label .
    }
"""
# Only the first row is wanted, so pull a single item instead of looping.
first_row = next(iter(wp111g.query(q)), None)
if first_row is not None:
    print(first_row["label"])

NDUFS3


### Parse Nodes

In [20]:
# Get and parse nodes
# ";" chains further predicate/object pairs onto the same subject - https://www.stardog.com/tutorials/sparql#ordering-results
# NOTE(review): no PREFIX declarations are given here; presumably the prefixes
# resolve from the namespaces bound on the parsed graph — confirm.
# NOTE(review): ?label, ?id_source and ?identifier are projected without being
# grouped or aggregated; this appears to rely on the query engine being lenient
# about that — confirm before running against another SPARQL engine.
q = """
    SELECT ?ttl_id (GROUP_CONCAT(?type;SEPARATOR=",") AS ?types) ?label ?id_source ?identifier
    WHERE {
        ?ttl_id rdf:type wp:DataNode ;
            rdfs:label ?label ;
            dc:source ?id_source ;
            dcterms:identifier ?identifier ;
            rdf:type ?type .
    }
    GROUP BY ?ttl_id
"""

# Maps each node URI to a dict with "label", "id_source", "identifier" and "type".
nodes = dict()
for match in wp111g.query(q):
    result_dict = {key: str(val) for key, val in match.asdict().items()}

    # The type name is the part after "#". A set is used because a node with
    # several labels/sources/identifiers multiplies the joined rows, which makes
    # GROUP_CONCAT repeat the same type more than once.
    node_types = {
        type_uri.split("#")[-1]
        for type_uri in result_dict.pop("types").split(",")
    }

    # don't need "DataNode" since every matched node carries it
    node_types.discard("DataNode")

    # Defensive code: exactly one specific node type should remain
    if not node_types:
        raise ValueError(f"No node type other than DataNode for {result_dict['ttl_id']}")
    if len(node_types) > 1:
        raise ValueError(f"Too many node types: {sorted(node_types)}")

    (result_dict["type"],) = node_types

    ttl_id = result_dict.pop("ttl_id")
    nodes[ttl_id] = result_dict

# Show the last parsed node as a sample
result_dict

{'label': 'COX8A',
 'id_source': 'Entrez Gene',
 'identifier': '1351',
 'type': 'GeneProduct'}

### Parse Interactions Using Node Data

In [23]:
# Get and parse edges
# ";" chains further predicate/object pairs onto the same subject - https://www.stardog.com/tutorials/sparql#ordering-results
# NOTE(review): no PREFIX declarations are given here; presumably the prefixes
# resolve from the namespaces bound on the parsed graph — confirm.
# ?identifier used to be listed in the SELECT but no pattern in the WHERE clause
# binds it, so it never showed up in the results; it is no longer projected.
int_q = """
    SELECT ?ttl_id (GROUP_CONCAT(?type;SEPARATOR=",") AS ?types) ?source ?target (GROUP_CONCAT(?ps ;SEPARATOR=",") AS ?participants)
    WHERE {
        ?ttl_id rdf:type wp:Interaction ;
            wp:participants ?ps ;
            wp:source ?source ;
            wp:target ?target ;
            rdf:type ?type .
    }
    GROUP BY ?ttl_id
"""

# Maps each interaction URI to a dict with "types", "source", "target" and "participants".
interactions = dict()
for match in wp111g.query(int_q):
    result_dict = {key: str(val) for key, val in match.asdict().items()}

    # The type name is the part after "#". Sets absorb the repeats that the
    # types x participants join produces inside each GROUP_CONCAT.
    # "Interaction" is always part of the set because the query requires it.
    result_dict["types"] = {
        type_uri.split("#")[-1] for type_uri in result_dict["types"].split(",")
    }
    result_dict["participants"] = set(result_dict["participants"].split(","))

    # TODO (original note): Binding always comes with ComplexBinding, so Binding
    # could be skipped — not implemented yet.

    ttl_id = result_dict.pop("ttl_id")
    interactions[ttl_id] = result_dict

# Print the last path segment of every interaction URI
for interaction_uri in interactions:
    print(interaction_uri.split("/")[-1])

ad726
d057a
a4aef
b92fd
c4d38
b2b00
eb56f
b3d7d
b17ae
id86f308da
f52f4
f0372
c243a
fee15
f713d
fa8bc
b0e4a
cfb2a
cdee2
feb3f
