In [3]:
import imp
import logging
import json

# DEBUG level makes the HTTP requests issued while resolving remote contexts
# show up in the cell output.
logging.basicConfig(level=logging.DEBUG)

# Sample schema.org Dataset descriptions keyed by a short identifier. Each one
# declares the schema.org namespace in a different way.
data = {
    # Remote context referenced with the http scheme.
    "d1": {
        "@context": "http://schema.org/",
        "@type": "Dataset",
        "name": "http remote @context",
    },
    # Remote context referenced with the https scheme.
    "d2": {
        "@context": "https://schema.org/",
        "@type": "Dataset",
        "name": "https remote @context",
    },
    # Inline context that only sets @vocab.
    "d3": {
        "@context": {"@vocab": "http://schema.org/"},
        "@type": "Dataset",
        "name": "http @vocab only",
    },
    # Inline context that binds the namespace to the prefix "SO".
    "d4": {
        "@context": {"SO": "http://schema.org/"},
        "@type": "SO:Dataset",
        "SO:name": "http with namespace prefix = SO",
    },
    # One-node @graph with the prefix declared at the top level.
    "d5": {
        "@context": {"SO": "http://schema.org/"},
        "@graph": [
            {
                "@type": "SO:Dataset",
                "SO:name": "Single dataset graph with global http namespace prefix = SO",
            },
        ],
    },
    # One-node @graph with the prefix declared inside the node itself.
    "d6": {
        "@graph": [
            {
                "@context": {"SO": "http://schema.org/"},
                "@type": "SO:Dataset",
                "SO:name": "Single dataset graph with dataset local @context http with namespace prefix = SO",
            },
        ],
    },
}
logging.info(f"Loaded {len(data)} instances.")

INFO:root:Loaded 6 instances.


In [4]:
from pprint import pprint
from pyld import jsonld

# Expand every sample document with pyld and print the result as indented
# JSON under a heading that names the sample.
for sample_key, sample_doc in data.items():
    expanded = jsonld.expand(sample_doc)
    print(f"dataset {sample_key}:", json.dumps(expanded, indent=2), sep="\n")

DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): schema.org:80
DEBUG:urllib3.connectionpool:http://schema.org:80 "GET / HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET / HTTP/1.1" 200 5100
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): schema.org:80
DEBUG:urllib3.connectionpool:http://schema.org:80 "GET /docs/jsonldcontext.jsonld HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET /docs/jsonldcontext.jsonld HTTP/1.1" 200 156949
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET / HTTP/1.1" 200 5100
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET /docs/jsonldcontext.jsonld HTTP/1.1" 200 

dataset d1:
[
  {
    "@type": [
      "http://schema.org/Dataset"
    ],
    "http://schema.org/name": [
      {
        "@value": "http remote @context"
      }
    ]
  }
]
dataset d2:
[
  {
    "@type": [
      "http://schema.org/Dataset"
    ],
    "http://schema.org/name": [
      {
        "@value": "https remote @context"
      }
    ]
  }
]
dataset d3:
[
  {
    "@type": [
      "http://schema.org/Dataset"
    ],
    "http://schema.org/name": [
      {
        "@value": "http @vocab only"
      }
    ]
  }
]
dataset d4:
[
  {
    "@type": [
      "http://schema.org/Dataset"
    ],
    "http://schema.org/name": [
      {
        "@value": "http with namespace prefix = SO"
      }
    ]
  }
]
dataset d5:
[
  {
    "@type": [
      "http://schema.org/Dataset"
    ],
    "http://schema.org/name": [
      {
        "@value": "Single dataset graph with global http namespace prefix = SO"
      }
    ]
  }
]
dataset d6:
[
  {
    "@type": [
      "http://schema.org/Dataset"
    ],
    

In [5]:
import copy 

# Compaction context whose @vocab is the http form of the schema.org namespace.
SO_HTTP_CONTEXT = {"@context": {"@vocab": "http://schema.org/"}}

# Compaction context whose @vocab is the https form of the schema.org namespace.
SO_HTTPS_CONTEXT = {"@context": {"@vocab": "https://schema.org/"}}

def normalizeSchemaOrg( o ):
    """Return ``o`` compacted against the https schema.org ``@vocab`` context.

    The document is expanded, compacted with ``SO_HTTP_CONTEXT``, has the
    ``@vocab`` entry of the resulting ``@context`` overwritten with
    ``https://schema.org/``, and is then compacted a second time with
    ``SO_HTTPS_CONTEXT``. Both compactions are requested with the ``graph``
    option set to ``True``.

    Args:
        o: JSON-LD document handed to ``jsonld.expand``.

    Returns:
        The value returned by the second ``jsonld.compact`` call.
    """
    # pyld modifies the context document passed to compact(), although it
    # should not, so every call is given a deep copy instead of the shared
    # module-level constant.
    http_form = jsonld.compact(
        jsonld.expand(o),
        copy.deepcopy(SO_HTTP_CONTEXT),
        {"graph": True},
    )

    # Re-point the vocabulary at the https namespace before compacting again.
    http_form["@context"]["@vocab"] = "https://schema.org/"

    return jsonld.compact(
        http_form,
        copy.deepcopy(SO_HTTPS_CONTEXT),
        {"graph": True},
    )


def normalizeAll( sources ):
    """Normalize every document in ``sources`` with ``normalizeSchemaOrg``.

    Args:
        sources: Mapping of identifier to JSON-LD document.

    Returns:
        A new dict holding, for each key of ``sources`` (in iteration order),
        the result of ``normalizeSchemaOrg`` applied to the matching value.
        An empty mapping yields an empty dict.
    """
    # Walk the argument itself rather than the notebook-level ``data`` dict,
    # so the function gives the right answer for any mapping it is handed.
    return {key: normalizeSchemaOrg(doc) for key, doc in sources.items()}


# Add a sample whose @graph holds two Dataset nodes that share one top-level
# prefix declaration.
data["d7"] = {
    "@context": {"SO": "http://schema.org/"},
    "@graph": [
        {
            "@id": "./d7a",
            "@type": "SO:Dataset",
            "SO:name": "Double dataset graph with global http namespace prefix = SO",
        },
        {
            "@id": "./d7b",
            "@type": "SO:Dataset",
            "SO:name": "Double dataset graph with global http namespace prefix = SO",
        },
    ],
}

# Normalize every sample, then print each result as indented JSON.
normalized = normalizeAll(data)
for sample_key, normalized_doc in normalized.items():
    print(f"Dataset {sample_key}:", json.dumps(normalized_doc, indent=2), sep="\n")


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): schema.org:80
DEBUG:urllib3.connectionpool:http://schema.org:80 "GET / HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET / HTTP/1.1" 200 5100
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): schema.org:80
DEBUG:urllib3.connectionpool:http://schema.org:80 "GET /docs/jsonldcontext.jsonld HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET /docs/jsonldcontext.jsonld HTTP/1.1" 200 156949
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET / HTTP/1.1" 200 5100
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): schema.org:443
DEBUG:urllib3.connectionpool:https://schema.org:443 "GET /docs/jsonldcontext.jsonld HTTP/1.1" 200 

Dataset d1:
{
  "@context": {
    "@vocab": "https://schema.org/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      "name": "http remote @context"
    }
  ]
}
Dataset d2:
{
  "@context": {
    "@vocab": "https://schema.org/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      "name": "https remote @context"
    }
  ]
}
Dataset d3:
{
  "@context": {
    "@vocab": "https://schema.org/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      "name": "http @vocab only"
    }
  ]
}
Dataset d4:
{
  "@context": {
    "@vocab": "https://schema.org/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      "name": "http with namespace prefix = SO"
    }
  ]
}
Dataset d5:
{
  "@context": {
    "@vocab": "https://schema.org/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      "name": "Single dataset graph with global http namespace prefix = SO"
    }
  ]
}
Dataset d6:
{
  "@context": {
    "@vocab": "https://schema.org/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      

In [6]:
def printDatasetName(ds):
    """Print one line per entry of ``ds["@graph"]`` giving its index and name.

    Args:
        ds: Mapping with an ``"@graph"`` sequence whose entries each have a
            ``"name"`` key.
    """
    for position, node in enumerate(ds["@graph"]):
        node_name = node["name"]
        print(f"Dataset {position} has name = {node_name}")

# For each normalized sample, print a heading, its dataset names, and a
# blank separator line.
for sample_key in normalized:
    print(f"Normalized {sample_key}:")
    printDatasetName(normalized[sample_key])
    print()

Normalized d1:
Dataset 0 has name = http remote @context

Normalized d2:
Dataset 0 has name = https remote @context

Normalized d3:
Dataset 0 has name = http @vocab only

Normalized d4:
Dataset 0 has name = http with namespace prefix = SO

Normalized d5:
Dataset 0 has name = Single dataset graph with global http namespace prefix = SO

Normalized d6:
Dataset 0 has name = Single dataset graph with dataset local @context http with namespace prefix = SO

Normalized d7:
Dataset 0 has name = Double dataset graph with global http namespace prefix = SO
Dataset 1 has name = Double dataset graph with global http namespace prefix = SO



In [7]:
# Minimal Dataset whose @vocab is the http form of the schema.org namespace.
http_data = {
    "@context": {"@vocab": "http://schema.org/"},
    "@type": "Dataset",
    "name": "test http, vocab only",
}

# The same shape of document, using the https form of the namespace.
https_data = {
    "@context": {"@vocab": "https://schema.org/"},
    "@type": "Dataset",
    "name": "test https, vocab only",
}

# Normalize each variant and read its name back with the same accessor.
print("Dataset using http://schema.org/")
http_normalized = normalizeSchemaOrg(http_data)
printDatasetName(http_normalized)

print("")
print("Dataset using https://schema.org/")
https_normalized = normalizeSchemaOrg(https_data)
printDatasetName(https_normalized)

Dataset using http://schema.org/
Dataset 0 has name = test http, vocab only

Dataset using https://schema.org/
Dataset 0 has name = test https, vocab only


In [8]:
# Dataset that combines schema.org terms (through @vocab) with an extra type
# taken from a second namespace bound to the prefix "dbpedia".
dataset = {
  "@context": {
    "@vocab": "http://schema.org/",
    "dbpedia": "http://dbpedia.org/resource/"
  },
  "@type": "Dataset",
  "name": "Removal of organic carbon by natural bacterioplankton communities as a function of pCO2 from laboratory experiments between 2012 and 2016",
  "spatialCoverage": {
    "@type": "Place",
    "geo": {
      "@type": "GeoShape",
      "line": "39.3280,120.1633 40.445,123.7878"
    },
    "additionalProperty": {
      "@type": ["PropertyValue", "dbpedia:Spatial_reference_system"],
      "@id": "http://www.opengis.net/def/crs/OGC/1.3/CRS84"
    }
  }
}

# With the default compaction context only schema.org terms are shortened;
# the dbpedia type is emitted as a full IRI.
dataset_normalized = normalizeSchemaOrg(dataset)
print(json.dumps(dataset_normalized, indent=2))

# Modify the context for compaction. normalizeSchemaOrg reads the
# module-level SO_HTTPS_CONTEXT, so the prefix has to be added there; keep a
# copy of the current context so the change can be undone afterwards.
saved_https_context = copy.deepcopy(SO_HTTPS_CONTEXT["@context"])
SO_HTTPS_CONTEXT["@context"]["dbpedia"] = "http://dbpedia.org/resource/"
print("\nAdjusted context:")

try:
    # Redo normalization with the adjusted compaction context
    dataset_normalized = normalizeSchemaOrg(dataset)
finally:
    # Put the shared context back so that re-running this cell, or any other
    # call to normalizeSchemaOrg, starts from the unmodified context again.
    SO_HTTPS_CONTEXT["@context"] = saved_https_context
print(json.dumps(dataset_normalized, indent=2))

{
  "@context": {
    "@vocab": "https://schema.org/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      "name": "Removal of organic carbon by natural bacterioplankton communities as a function of pCO2 from laboratory experiments between 2012 and 2016",
      "spatialCoverage": {
        "@type": "Place",
        "additionalProperty": {
          "@id": "http://www.opengis.net/def/crs/OGC/1.3/CRS84",
          "@type": [
            "PropertyValue",
            "http://dbpedia.org/resource/Spatial_reference_system"
          ]
        },
        "geo": {
          "@type": "GeoShape",
          "line": "39.3280,120.1633 40.445,123.7878"
        }
      }
    }
  ]
}

Adjusted context:
{
  "@context": {
    "@vocab": "https://schema.org/",
    "dbpedia": "http://dbpedia.org/resource/"
  },
  "@graph": [
    {
      "@type": "Dataset",
      "name": "Removal of organic carbon by natural bacterioplankton communities as a function of pCO2 from laboratory experiments between 2012 and 

## Running this notebook

The following process was used to install the necessary dependencies and run this notebook. It assumes an Anaconda Python distribution is being used and that a conda environment named `shacl-pad` already exists.

Note that `requirements.txt` currently lists specific, non-release versions of `rdflib`, `pySHACL`, and `pyld`. See the comments in `requirements.txt` for details on why.

Installation:

```
cd path/to/shacl-pad
conda activate shacl-pad
pip install -r requirements.txt
conda deactivate
```

Running:
```
cd path/to/shacl-pad
conda activate shacl-pad
jupyter lab
```

When done:
```
conda deactivate
```
