# Test a `schema.org/Dataset` for DataONE compatibility

This script uses rdflib and pySHACL to evaluate whether a json-ld structure can be harvested by DataONE.

**NOTE:** This is a work in progress and does not necessarily correspond to the actual solution and requirements.

References:

* SHACL : https://www.w3.org/TR/shacl/
* Turtle: https://www.w3.org/TeamSubmission/turtle/
* pySHACL : https://github.com/RDFLib/pySHACL




In [32]:
import io
import rdflib
import rdflib.tools.rdf2dot
import graphviz
import pyshacl
from pprint import pprint

def renderGraph(g):
    fp = io.StringIO()
    rdflib.tools.rdf2dot.rdf2dot(g, fp)
    return graphviz.Source(fp.getvalue())    


def getDatasets(data_graph):
    """
    Return True if the provided data_graph is an instance of schema.org/Dataset
    """
    q = """PREFIX rdf:      <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX schema:   <http://schema.org/>

    SELECT ?x 
    WHERE { 
        ?x rdf:type schema:Dataset .
    }
    """
    qres = data_graph.query(q)
    print(qres)
    #return qres
    return len(qres) >= 1


def evaluateShape(data_graph, shape_graph_file="dataset_shape.ttl"):
    shape_0 = rdflib.Graph()
    shape_0.parse(shape_graph_file, format="turtle")
    conforms, results_graph, results_text = pyshacl.validate(
        data_graph, 
        shacl_graph=shape_0, 
        inference="rdfs", 
        meta_shacl=False, 
        abort_on_error=False, 
        debug=False
    )
    return conforms, results_graph, results_text

    

Define a basic Dataset structure in json-ld:

In [33]:
data00 = b"""{
    "@context":{ 
        "@vocab": "http://schema.org/",
        "datacite": "http://purl.org/spar/datacite/"
    },
    "@id":"ex:my_data",
    "@type": "Dataset",
    
    "identifier": {
        "@type": ["PropertyValue", "datacite:ResourceIdentifier"],
        "datacite:usesIdentifierScheme": { 
            "@id": "datacite:doi" 
        },
        "propertyId":"DOI",
        "url": "https://doi.org/10.1234/abcd",
        "value": "10.1234/abcd"
    },
    "encoding":{
        "@type": "MediaObject",
        "contentUrl":"https://example.org/link/to/iso.xml",
        "encodingFormat":"http://www.isotc211.org/2005/gmd",
        "description":"ISO TC211 XML rendering of metadata.",
        "dateModified":"2019-06-12T14:44:15Z"
    },
    "distribution": [
        {
            "@type":"DataDownload",
            "contentUrl": "https://example.org/link/to/data",
            "encodingFormat":"data format",
            "identifier": {
                "@type": ["PropertyValue", "datacite:ResourceIdentifier"],
                "datacite:usesIdentifierScheme": { 
                    "@id": "datacite:doi" 
                },
                "propertyId":"DOI",
                "url": "https://doi.org/10.1234/blh",
                "value": "10.1234/blah"
            },
            "encoding": {
                "@type": "MediaObject",
                "contentUrl":"https://example.org/link/to/data.csv",
                "encodingFormat":"text/csv",
                "description":"Comma separated data",
                "dateModified":"2019-06-12T14:44:15Z"            
            }
        }
    ]
}
"""
data00_graph = rdflib.Graph()
data00_graph.parse(io.BytesIO(data00), format="json-ld")
#gv = renderGraph(data00_graph)
#gv

<Graph identifier=Ncbc230a3a29040618e8105ca3c3687b2 (<class 'rdflib.graph.Graph'>)>

Verify that the data graph contains a Dataset instance:

In [35]:
data_graphs = getDatasets(data00_graph)
print(f"The provided graph contains this many Dataset instances: {data_graphs}")

<rdflib.plugins.sparql.processor.SPARQLResult object at 0x10860b438>
The provided graph contains this many Dataset instances: True


Evaluate the loaded graph against a Dataset SHACL shape.

In [37]:
conforms, results_graph, results_text = evaluateShape(data00_graph)

print(f"The data graph is of the required shape: {conforms}")
if not conforms:
    print(results_text)


The data graph is of the required shape: False
Validation Report
Conforms: False
Results (4):
Constraint Violation in OrConstraintComponent (http://www.w3.org/ns/shacl#OrConstraintComponent):
	Severity: sh:Violation
	Source Shape: d1:IdentifierShape
	Focus Node: [ ]
	Value Node: [ ]
	Message: An identifier should be an xsd:string or of type schema:PropertyValue and datacite:ResourceIdentifier.
Constraint Violation in NodeConstraintComponent (http://www.w3.org/ns/shacl#NodeConstraintComponent):
	Severity: sh:Violation
	Source Shape: [ sh:minCount Literal("1", datatype=xsd:integer) ; sh:node d1:IdentifierShape ; sh:path schema:identifier ]
	Focus Node: [ ]
	Value Node: [ ]
	Result Path: schema:identifier
Constraint Violation in NodeConstraintComponent (http://www.w3.org/ns/shacl#NodeConstraintComponent):
	Severity: sh:Violation
	Source Shape: [ sh:node d1:DistributionShape ; sh:path schema:distribution ]
	Focus Node: <ex:my_data>
	Value Node: [ ]
	Result Path: schema:distribution
Constra

Evaluate a data graph that does not comply because it does not have an identifier:

In [9]:
data01 = b"""{
    "@context":{ 
        "@vocab": "http://schema.org/",
        "datacite": "http://purl.org/spar/datacite/"
    },
    "@id":"ex:my_data",
    "@type": "Dataset",
    
    "encoding":{
        "@type": "MediaObject",
        "contentUrl":"https://example.org/link/to/iso.xml",
        "encodingFormat":"http://www.isotc211.org/2005/gmd",
        "description":"ISO TC211 XML rendering of metadata.",
        "dateModified":"2019-06-12T14:44:15Z"
    },
    "distribution": [
        {
            "@type":"DataDownload",
            "contentUrl": "https://example.org/link/to/data",
            "encodingFormat":"data format"
        }
    ]
}
"""
data01_graph = rdflib.Graph()
data01_graph.parse(io.BytesIO(data01), format="json-ld")
print(f"The data graph contains a Dataset: {isDataset(data01_graph)}")
conforms, results_graph, results_text = evaluateShape(data01_graph)
print(f"The data graph is of the required shape: {conforms}")
if not conforms:
    print()
    print(results_text)


The data graph contains a Dataset: True
The data graph is of the required shape: False

Validation Report
Conforms: False
Results (3):
Constraint Violation in MinCountConstraintComponent (http://www.w3.org/ns/shacl#MinCountConstraintComponent):
	Severity: sh:Violation
	Source Shape: [ sh:minCount Literal("1", datatype=xsd:integer) ; sh:path schema:identifier ]
	Focus Node: [ ]
	Result Path: schema:identifier
Constraint Violation in NodeConstraintComponent (http://www.w3.org/ns/shacl#NodeConstraintComponent):
	Severity: sh:Violation
	Source Shape: [ sh:node d1:DistributionShape ; sh:path schema:distribution ]
	Focus Node: <ex:my_data>
	Value Node: [ ]
	Result Path: schema:distribution
Constraint Violation in MinCountConstraintComponent (http://www.w3.org/ns/shacl#MinCountConstraintComponent):
	Severity: sh:Violation
	Source Shape: [ sh:message Literal("A dataset must have an identifier.", lang=en) ; sh:minCount Literal("1", datatype=xsd:integer) ; sh:path schema:identifier ]
	Focus Node

In [10]:
q = """PREFIX rdf:      <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX schema:   <http://schema.org/>

SELECT ?x 
WHERE { 
    ?x rdf:type schema:Dataset .
}
"""
qres = data00_graph.query(q)
pprint(qres)

<rdflib.plugins.sparql.processor.SPARQLResult object at 0x10402c0f0>
