Skip to content

Commit

Permalink
Added method to extract a subgraph for isomorphic comparison
Browse files Browse the repository at this point in the history
  • Loading branch information
datadavev committed Dec 13, 2019
1 parent 2df0f66 commit 01c6d8a
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 3 deletions.
40 changes: 37 additions & 3 deletions docsource/source/dataset_identifiers.rst
Expand Up @@ -44,11 +44,45 @@ of a component (SeriesID or SID). That context is used in this document.
id and identifier
-----------------

The ``@id`` property in JSON-LD [#id]_ identifies a node in the RDF graph, and should be an IRI [#IRI]_.
The ``@id`` property in JSON-LD [#id]_ identifies a node in the RDF graph, and must be an IRI [#IRI]_.
The ``SO:identifier`` is an optional property of a node that may or may not be a URI, and may or may
not be the same as the ``@id`` for the node. Ideally, the ``@id`` and the ``SO:identifier`` would
not be the same as the ``@id`` for the node.



Ideally, the ``@id`` and the ``SO:identifier`` would
have the same value, though this is often not the case for datasets.

Identifier Conflation
---------------------

The string "978-1-5387-1847-6" is an identifier, in this case an ISBN. A number
of services are available to provide more information about the subject of the
identifier. For example, `ISBN Search`_ is a lookup service that provides an HTML
view of the results. Google provides a `Books API`_ that returns structured data,
though it requires authentication to use, for example::

curl "https://www.googleapis.com/books/v1/volumes?key=${GAPIKEY}&q=isbn:9781538718476"

{
"kind": "books#volumes",
"totalItems": 1,
"items": [
{
"kind": "books#volume",
"id": "SyqzDwAAQBAJ",
"etag": "q7NUsBTwiu8",
"selfLink": "https://www.googleapis.com/books/v1/volumes/SyqzDwAAQBAJ",
...

Note that the canonical form of the identifier is "``9781538718476``", the commonly
used human readable form is "``978-1-5387-1847-6``", and the resolvable form
varies with the resolving service, such as the aforementioned Google Books API.


.. _ISBN Search: https://isbnsearch.org/isbn/9781538718469
.. _Books API: https://developers.google.com/books
Persistence
-----------

Expand All @@ -66,4 +100,4 @@ Foototes
.. [#identifier] http://schema.org/docs/datamodel.html#identifierBg
.. include:: includes/binder_activate.rst
.. include:: includes/binder_activate.rst
49 changes: 49 additions & 0 deletions docsource/source/examples/code/eg_getsubgraph_01.py
@@ -0,0 +1,49 @@
import rdflib
import rdflib.compare
import sotools

# Example: extract a subgraph from a larger graph and compare it with the
# graph that is expected, first by direct equality and then isomorphically.

# The JSON-LD document describing only the node "./sub".
expected_json = """{
"@context": {
"@vocab":"https://example.net/"
},
"@id":"./sub",
"property_0": "literal_0",
"property_1": ["literal_1-0", "literal_1-1"],
"property_2": {
"property_3":"Anonymous subgraph"
}
}
"""

# The full document: a parent node whose "sub" property embeds expected_json.
test_json = """{
"@context": {
"@vocab":"https://example.net/"
},
"@id":"./parent",
"sub":""" + expected_json + """
}
"""


def _as_text(serialized):
    """Return serializer output as ``str`` whether it arrives as bytes or str.

    The original script called ``.decode()`` unconditionally, which only works
    when ``Graph.serialize`` returns bytes.
    """
    if isinstance(serialized, bytes):
        return serialized.decode()
    return serialized


# Load the full graph, setting the base to "https://example.net/"
g_full = rdflib.Graph()
g_full.parse(data=test_json, format="json-ld", publicID="https://example.net/")
print("===\nFull:")
print(_as_text(g_full.serialize(format="nt")))

# Load the graph that the extracted subgraph is expected to match
g_expected = rdflib.ConjunctiveGraph()
g_expected.parse(data=expected_json, format="json-ld", publicID="https://example.net/")
print("===\nExpected:")
print(_as_text(g_expected.serialize(format="nt")))

# Extract the subgraph rooted at the subject "https://example.net/sub"
g_sub = sotools.getSubgraph(g_full, rdflib.URIRef("https://example.net/sub"))
print("===\nExtracted:")
print(_as_text(g_sub.serialize(format="nt")))

# Direct comparison of the graphs, will fail if there are BNodes
# NOTE(review): rdflib's Graph equality may compare graph identifiers rather
# than graph content -- confirm before relying on this result.
print(f"Extracted subgraph is equal to the expected graph: {g_sub == g_expected}")

# Use isomorphic comparison. This operation can be very expensive if either of
# the graphs is poorly structured with lots of BNodes
print(
    f"Extracted subgraph is isomorphic with the expected: "
    f"{rdflib.compare.isomorphic(g_sub, g_expected)}"
)
55 changes: 55 additions & 0 deletions sotools/common.py
Expand Up @@ -3,6 +3,7 @@
"""

import io
from rdflib.term import Identifier
from rdflib import ConjunctiveGraph, Namespace, URIRef
from rdflib.namespace import NamespaceManager
from rdflib.tools import rdf2dot
Expand Down Expand Up @@ -192,6 +193,60 @@ def loadSOGraphFromUrl(url):
return loadSOGraphFromHtml(response.text, response.url)


def inflateSubgraph(g, sg, ts, depth=0, max_depth=100):
    """
    Inflate the subgraph sg to contain all children of sg appearing in g.

    For every triple in ``ts`` the object is used as a subject pattern against
    ``g``; each matching triple not already in ``sg`` is added to ``sg``. The
    newly added triples then seed the next level of recursion.

    One level is always expanded before the depth limit is checked, so
    ``max_depth`` bounds the number of further recursive calls rather than
    preventing the first expansion.

    Args:
        g (Graph): The master graph from which the subgraph is extracted
        sg (Graph): The subgraph, modified in place
        ts (iterable of triples): list of triples, the objects of which
            identify subjects to copy from g
        depth (integer): tracks depth of recursion
        max_depth (integer): maximum recursion depth for retrieving terms

    Returns:
        None
    """
    new_trips = []
    # Snapshot ts: callers may pass sg itself, and sg is mutated in this loop.
    for t in list(ts):
        # Objects that are not rdflib Identifier instances are skipped.
        if isinstance(t[2], Identifier):
            for trip in g.triples((t[2], None, None)):
                # Only triples not yet copied are added and followed, which
                # also stops cycles in g from recursing forever.
                if trip not in sg:
                    sg.add(trip)
                    new_trips.append(trip)
    if not new_trips:
        return
    depth += 1
    if depth > max_depth:
        return
    # Forward max_depth explicitly; previously the recursive call fell back to
    # the default of 100 and ignored the caller's limit after the first level.
    inflateSubgraph(g, sg, new_trips, depth=depth, max_depth=max_depth)


def getSubgraph(g, subject, max_depth=100):
    """
    Retrieve the subgraph of g with subject.

    The triples of ``g`` that have ``subject`` as their subject are copied into
    a new ConjunctiveGraph, which is then inflated with ``inflateSubgraph`` so
    that triples reachable through the objects of those triples are included.

    Args:
        g (Graph): Source graph
        subject (URIRef): Subject of the root of the subgraph to retrieve
        max_depth (integer): Maximum recursion depth

    Returns:
        (Graph) The subgraph of g with subject.

    Example:

    .. jupyter-execute:: examples/code/eg_getsubgraph_01.py

    """
    sg = ConjunctiveGraph()
    # Namespace manager built on the source graph (presumably so that prefix
    # bindings carry over to the subgraph -- confirm).
    sg.namespace_manager = NamespaceManager(g)
    # Materialise the root triples once so they can both populate sg and seed
    # the inflation, instead of iterating sg while it is being added to.
    root_triples = list(g.triples((subject, None, None)))
    sg += root_triples
    inflateSubgraph(g, sg, root_triples, max_depth=max_depth)
    return sg


def renderGraph(g):
"""
For rendering an rdflib graph in Jupyter notebooks
Expand Down

0 comments on commit 01c6d8a

Please sign in to comment.