## Schema checker for DDE-compatibility

The Data Discovery Engine's Schema Playground is very strict when ingesting and displaying schemas, but more flexible when creating them. As a result, it is possible to use the Schema Playground to create a schema that would not be viewable in the playground's schema viewer.

This script uses GitHub Actions to automatically fix a known source of "internal error" in schemas so that they can be viewed with the schema viewer. This error is caused by the creation of properties dependent on non-schema.org classes. The script creates a dummy class for each such reference so that the DDE Schema Playground viewer can bypass this issue.

This script does NOT address other issues with JSON schema validation.

In [1]:
import json
import pathlib
import os
from jsonschema import validate
from biothings_schema import Schema

# Base directory that is expected to contain the drafts/ and edited/ folders.
# NOTE(review): the __file__-based lookup below is disabled, presumably because
# __file__ is not defined when this code runs as a notebook - confirm.
#script_path = pathlib.Path(__file__).parent.absolute()
# With an empty string, os.path.join(script_path, ...) yields paths relative to
# the current working directory.
script_path = ''

In [2]:
def check_files(script_path):
    """Locate draft schemas that have not been processed yet.

    Args:
        script_path: Base directory containing the ``drafts/`` and
            ``edited/`` folders. An empty string resolves both folders
            relative to the current working directory.

    Returns:
        A ``(draft_folder, edited_folder, newfiles)`` tuple. ``newfiles``
        holds the entries of ``drafts/`` whose names do not appear in
        ``edited/``, in the order reported by ``os.listdir``.

    Raises:
        FileNotFoundError: If the ``drafts/`` folder does not exist.
    """
    draft_folder = os.path.join(script_path, 'drafts/')
    edited_folder = os.path.join(script_path, 'edited/')
    # List the drafts first so a wrong script_path fails before anything is
    # created on disk.
    draftlist = os.listdir(draft_folder)
    # Git does not track empty directories, so edited/ can be absent on a
    # fresh checkout; create it instead of crashing, since callers write
    # their output there.
    os.makedirs(edited_folder, exist_ok=True)
    # A set keeps the membership test O(1) per draft.
    already_edited = set(os.listdir(edited_folder))
    newfiles = [x for x in draftlist if x not in already_edited]
    return (draft_folder, edited_folder, newfiles)

In [3]:
def create_dummy_class(eachclass):
    """Build a placeholder ``rdfs:Class`` node for a referenced class.

    Args:
        eachclass: The ``@id`` of the class to stub out, normally a CURIE
            of the form ``"prefix:ClassName"``.

    Returns:
        A dict describing a class with the given ``@id`` that is declared
        as a subclass of ``schema:CreativeWork``. Its ``rdfs:label`` is the
        part after the prefix, or the whole id when there is no prefix.
    """
    id_parts = eachclass.split(":")
    # An id without a "prefix:" part has nothing at index 1; fall back to the
    # whole id as the label rather than raising IndexError.
    classname = id_parts[1] if len(id_parts) > 1 else eachclass
    dummy_dict = {
        "@id": eachclass,
        "@type": "rdfs:Class",
        "rdfs:comment": "A dummy class to enable avoid referencing issues",
        "rdfs:label": classname,
        "rdfs:subClassOf": {
            "@id": "schema:CreativeWork"
        },
    }
    return dummy_dict

In [3]:
def check_draft_schema(script_path):
    """Add placeholder classes to every unprocessed draft schema.

    For each file that ``check_files`` reports as new, the draft is loaded
    and its properties' ``schema:rangeIncludes`` entries are scanned. Every
    referenced class that is neither a ``schema:`` class nor defined in the
    draft itself gets a dummy class (from ``create_dummy_class``) appended
    to ``@graph``. The result is written to the edited folder under the same
    file name.

    Args:
        script_path: Base directory containing the ``drafts/`` and
            ``edited/`` folders.

    Returns:
        None. The output is the set of files written to the edited folder.

    Raises:
        KeyError: If a draft has no ``@context`` or ``@graph`` entry.
        json.JSONDecodeError: If a draft is not valid JSON.
    """
    draft_folder, edited_folder, newfiles = check_files(script_path)
    for eachfile in newfiles:
        with open(os.path.join(draft_folder, eachfile), 'r', encoding='utf-8') as tempinfile:
            tmpjson = json.load(tempinfile)
        cleanjson = {'@context': tmpjson['@context']}
        graphlist = list(tmpjson['@graph'])
        # Classes the draft already defines must not get a second, dummy
        # definition with the same @id.
        defined_ids = {
            node.get('@id') for node in graphlist
            if isinstance(node.get('@id'), str)
        }
        rangelist = []
        for node in graphlist:
            if node.get('@type') != 'rdf:Property':
                continue
            ranges = node.get('schema:rangeIncludes', [])
            # A single range is commonly written as one object instead of a
            # one-element list; normalise so both shapes are handled.
            if isinstance(ranges, dict):
                ranges = [ranges]
            for eachhit in ranges:
                if not isinstance(eachhit, dict):
                    continue
                class_id = eachhit.get('@id')
                if not isinstance(class_id, str):
                    continue
                # Prefix test, not substring test: an id such as
                # "myschema:Thing" is not a schema.org class.
                if class_id.startswith('schema:'):
                    continue
                if class_id in defined_ids or class_id in rangelist:
                    continue
                rangelist.append(class_id)
        graphlist.extend(create_dummy_class(eachclass) for eachclass in rangelist)
        cleanjson['@graph'] = graphlist
        with open(os.path.join(edited_folder, eachfile), 'w', encoding='utf-8') as tmpoutfile:
            tmpoutfile.write(json.dumps(cleanjson))


In [4]:
## Check for json schema validation errors
# Loads one schema file and builds a biothings_schema Schema object from it.
# The file name is relative to the current working directory.
course_draft = "NDE_schema.json"
# os.path.join with a single argument returns that argument unchanged, so this
# is simply the bare file name.
draft_to_check = os.path.join(course_draft)
with open(draft_to_check,'r') as infile:
    chemdict = json.load(infile)

# Disabled experiment: validating sample data against the schema with
# jsonschema.validate.
#testdata = {"name": "Biology"}
#testdata2 = {}
#validate(testdata2, schema = chemdict)

sc = Schema(chemdict)
# Bare expression: in a notebook its value is shown as the cell output.
# NOTE(review): presumably this surfaces the schema's validation information -
# confirm against the biothings_schema documentation.
sc.validation

SchemaValidationError: field "anatomicalSystem" in "$validation" is not defined in this class or any of its parent classes

In [None]:

#### Main

# check_draft_schema() returns None and keeps its `cleanjson` variable local,
# so printing `cleanjson` here would raise NameError. To display the cleaned
# schemas, read them back from the edited folder instead.
# The pending file names are captured BEFORE processing, because once a draft
# has been written to the edited folder check_files() no longer reports it.
main_edited_folder, main_pending = check_files(script_path)[1:]
check_draft_schema(script_path)
for main_fname in main_pending:
    with open(os.path.join(main_edited_folder, main_fname), 'r') as main_infile:
        print(json.load(main_infile))

In [21]:
# Inline (notebook-level) version of the draft-processing loop. It leaves
# `tmpjson`, `cleanjson`, `graphlist` and `rangelist` of the last processed
# file available as notebook variables for inspection.
draft_folder, edited_folder, newfiles = check_files(script_path)
for eachfile in newfiles:
    with open(os.path.join(draft_folder, eachfile), 'r', encoding='utf-8') as tempinfile:
        tmpjson = json.load(tempinfile)
    cleanjson = {}
    cleanjson['@context'] = tmpjson['@context']
    graphlist = list(tmpjson['@graph'])
    # Ids the draft already defines; these must not receive a duplicate dummy
    # class with the same @id.
    defined_ids = {
        x.get('@id') for x in graphlist if isinstance(x.get('@id'), str)
    }
    rangelist = []
    for x in graphlist:
        # .get() avoids KeyError on nodes without "@type".
        if x.get("@type") != "rdf:Property":
            continue
        tmprangelist = x.get("schema:rangeIncludes", [])
        # A single range may be given as one object rather than a list.
        if isinstance(tmprangelist, dict):
            tmprangelist = [tmprangelist]
        for eachhit in tmprangelist:
            if not isinstance(eachhit, dict):
                continue
            class_id = eachhit.get('@id')
            if not isinstance(class_id, str):
                continue
            # Prefix test rather than substring test, so ids like
            # "myschema:Thing" are not mistaken for schema.org classes.
            if class_id.startswith("schema:"):
                continue
            if class_id in defined_ids or class_id in rangelist:
                continue
            rangelist.append(class_id)
    for eachclass in rangelist:
        dummy_dict = create_dummy_class(eachclass)
        graphlist.append(dummy_dict)
    cleanjson['@graph'] = graphlist
    with open(os.path.join(edited_folder, eachfile), 'w', encoding='utf-8') as tmpoutfile:
        tmpoutfile.write(json.dumps(cleanjson))
        

In [6]:
# Loads NDE_Sample2.json from script_path and builds a biothings_schema Schema
# object from it.
inputfile = os.path.join(script_path,'NDE_Sample2.json')
# The file is opened in binary mode; json.load accepts binary file objects.
with open(inputfile,'rb') as infile:
    definedterm = json.load(infile)

#print(definedterm)
# Rebinds the notebook-level name `sc`.
sc = Schema(definedterm)
# Bare expression: in a notebook its value is shown as the cell output.
# NOTE(review): presumably this surfaces the schema's validation information -
# confirm against the biothings_schema documentation.
sc.validation

SchemaValidationError: field "sampleAvailability" in "$validation" is not defined in this class or any of its parent classes