Added method to extract a subgraph for isomorphic comparison

datadavev · Dec 13, 2019 · 01c6d8a · 01c6d8a
1 parent 2df0f66
commit 01c6d8a
Show file tree

Hide file tree

Showing 3 changed files with 141 additions and 3 deletions.
diff --git a/docsource/source/dataset_identifiers.rst b/docsource/source/dataset_identifiers.rst
@@ -44,11 +44,45 @@ of a component (SeriesID or SID). That context is used in this document.
 id and identifier
 -----------------
 
-The ``@id`` property in JSON-LD [#id]_ identifies a node in the RDF graph, and should be an IRI [#IRI]_.
+The ``@id`` property in JSON-LD [#id]_ identifies a node in the RDF graph, and must be an IRI [#IRI]_.
 The ``SO:identifier`` is an optional property of a node that may or may not be a URI, and may or may
-not be the same as the ``@id`` for the node. Ideally, the ``@id`` and the ``SO:identifier`` would
+not be the same as the ``@id`` for the node.
+
+
+
+Ideally, the ``@id`` and the ``SO:identifier`` would
 have the same value, though this is often not the case for datasets.
 
+Identifier Conflation
+---------------------
+
+The string "978-1-5387-1847-6" is an identifier, in this case an ISBN. A number 
+of services are available to provide more information about the subject of the 
+identifier. For example, `ISBN Search`_ is a lookup service that provides an HTML
+view of the results. Google provides a `Books API`_ that returns structured data,
+though it requires authentication to use, for example::
+
+  curl "https://www.googleapis.com/books/v1/volumes?key=${GAPIKEY}&q=isbn:9781538718476"
+
+  {
+    "kind": "books#volumes",
+    "totalItems": 1,
+    "items": [
+     {
+      "kind": "books#volume",
+      "id": "SyqzDwAAQBAJ",
+      "etag": "q7NUsBTwiu8",
+      "selfLink": "https://www.googleapis.com/books/v1/volumes/SyqzDwAAQBAJ",
+  ...
+
+Note that the canonical form of the identifier is "``9781538718476``", the commonly
+used human readable form is "``978-1-5387-1847-6``", and the resolvable form
+varies with the resolving service, such as the aforementioned Google Books API.
+
+
+.. _ISBN Search: https://isbnsearch.org/isbn/9781538718469
+.. _Books API: https://developers.google.com/books/
+
 Persistence
 -----------
 
@@ -66,4 +100,4 @@ Foototes
 .. [#identifier] http://schema.org/docs/datamodel.html#identifierBg
 
 
-.. include:: includes/binder_activate.rst
+.. include:: includes/binder_activate.rst
diff --git a/docsource/source/examples/code/eg_getsubgraph_01.py b/docsource/source/examples/code/eg_getsubgraph_01.py
@@ -0,0 +1,49 @@
+import rdflib
+import rdflib.compare
+import sotools
+
+expected_json = """{
+    "@context": {
+        "@vocab":"https://example.net/"
+    },
+    "@id":"./sub",
+    "property_0": "literal_0",
+    "property_1": ["literal_1-0", "literal_1-1"],
+    "property_2": {
+        "property_3":"Anonymous subgraph"
+    }
+}
+"""
+
+test_json = """{
+    "@context": {
+        "@vocab":"https://example.net/"
+    },
+    "@id":"./parent",
+    "sub":""" + expected_json + """    
+}
+"""
+
+# Load the full graph, setting the base to "https://example.net/"
+g_full = rdflib.Graph()
+g_full.parse(data=test_json, format="json-ld", publicID="https://example.net/")
+print("===\nFull:")
+print(g_full.serialize(format="nt").decode())
+
+g_expected = rdflib.ConjunctiveGraph()
+g_expected.parse(data=expected_json, format="json-ld", publicID="https://example.net/")
+print("===\nExpected:")
+print(g_expected.serialize(format="nt").decode())
+
+# Extract the subgraph rooted at the subject "https://example.net/sub"
+g_sub = sotools.getSubgraph(g_full, rdflib.URIRef("https://example.net/sub"))
+print("===\nExtracted:")
+print(g_sub.serialize(format="nt").decode())
+
+# Direct comparison of the graphs; this will fail if there are BNodes
+print(f"Extracted subgraph is equal to the expected graph: {g_sub == g_expected}")
+
+# Use isomorphic comparison. This operation can be very expensive if either of
+# the graphs is poorly structured with lots of BNodes
+print((f"Extracted subgraph is isomorphic with the expected: "
+      f"{rdflib.compare.isomorphic(g_sub, g_expected)}"))
diff --git a/sotools/common.py b/sotools/common.py
@@ -3,6 +3,7 @@
 """
 
 import io
+from rdflib.term import Identifier
 from rdflib import ConjunctiveGraph, Namespace, URIRef
 from rdflib.namespace import NamespaceManager
 from rdflib.tools import rdf2dot
@@ -192,6 +193,60 @@ def loadSOGraphFromUrl(url):
     return loadSOGraphFromHtml(response.text, response.url)
 
 
+def inflateSubgraph(g, sg, ts, depth=0, max_depth=100):
+    """
+    Inflate the subgraph sg to contain all children of sg appearing in g.
+
+    Args:
+        g (Graph): The master graph from which the subgraph is extracted
+        sg (Graph): The subgraph, modified in place
+        ts (iterable of triples): list of triples, the objects of which identify subjects to copy frmm g
+        depth (integer): tracks depth of recursion
+        max_depth (integer): maximum recursion depth for retrieving terms
+
+    Returns:
+        None
+    """
+    new_trips = []
+    for t in ts:
+        if isinstance(t[2], Identifier):
+            trips = g.triples((t[2], None, None))
+            for trip in trips:
+                if not trip in sg:
+                    sg.add(trip)
+                    new_trips.append(trip)
+    if len(new_trips) > 0:
+        depth += 1
+        if depth > max_depth:
+            return
+        inflateSubgraph(g, sg, new_trips, depth=depth)
+    return
+
+
+def getSubgraph(g, subject, max_depth=100):
+    """
+    Retrieve the subgraph of g with subject.
+
+    Args:
+        g (Graph): Source graph
+        subject (URIRef): Subject of the root of the subgraph to retrieve
+        max_depth (integer): Maximum recursion depth
+
+    Returns:
+        (Graph) The subgraph of g with subject.
+
+    Example:
+
+    .. jupyter-execute:: examples/code/eg_getsubgraph_01.py
+
+    """
+    sg = ConjunctiveGraph()
+    sg.namespace_manager = NamespaceManager(g)
+    sg += g.triples( (subject, None, None) )
+    inflateSubgraph(g, sg, sg, max_depth=max_depth)
+    return sg
+
+
 def renderGraph(g):
     """
     For rendering an rdflib graph in Jupyter notebooks