In [1]:
import os
import json
import pandas
import pathlib
import yaml

In [133]:
#script_path = pathlib.Path(__file__).parent.absolute()
tmp_dir = os.getcwd()
parent_dir = os.path.dirname(tmp_dir)
available = os.listdir(parent_dir)
outputdirectory = os.path.join(parent_dir,'specifications')
inputdirectory = os.path.join(parent_dir,'Bioschemas-Validator')
input_profiles = os.path.join(inputdirectory,'profile_json')
input_marginality = os.path.join(inputdirectory,'profile_marginality')
input_yml = os.path.join(inputdirectory,'profile_yml')

specifications = os.listdir(input_profiles)
datapath = 'results/'

In [11]:
def create_new_dict():
    newdict = {}
    newdict['@context'] = {
        "schema": "http://schema.org/",
        "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
        "bioschemas": "http://discovery.biothings.io/view/bioschemas/"
      } 
    return(newdict)

def get_yml_dict(theymlfile):
    with open(theymlfile,'r') as ymlin:
        tmpyml = yaml.load_all(ymlin, Loader=yaml.FullLoader)
        for eachyml in tmpyml:
            if '<!DOCTYPE HTML>' in eachyml:
                break
            ymldict = eachyml
    return(ymldict)

def grab_class_info(specifications,eachversion,ymldict):
    parentclass = ymldict['hierarchy'][-2]
    if parentclass in specifications:
        parenttype = "bioschemas:"
    else:
        parenttype = "schema:"
    classinfo = {
      "@id": "bioschemas:"+ymldict['name'],
      "@type": "rdfs:Class",
      "rdfs:comment": ymldict['spec_info']['subtitle']+" "+ymldict['spec_info']['description']+" Version: "+ymldict['spec_info']['version'],
      "schema:schemaVersion": ["https://bioschemas.org/"+ymldict['spec_type']+"/"+ymldict['name']+"/"+eachversion.replace(".json","")],  
      "rdfs:label": ymldict['name'],
      "rdfs:subClassOf": {
        "@id": parenttype+parentclass
      }
    }
    return(classinfo)

In [3]:
expected_type_dict, reusable_definitions = load_dictionaries()
for eachspec in specifications:
    tmpinputprofilepath = os.path.join(input_profiles,eachspec)
    tmpinputyml = os.path.join(input_yml,eachspec)
    spec_profs = os.listdir(tmpinputprofilepath)
    for eachversion in spec_profs:
        graphlist = []
        tmpdict = create_new_dict()
        theymlfile = os.path.join(tmpinputyml,eachversion.replace('.json','.html'))
        ymldict = get_yml_dict(theymlfile)
        classinfo = grab_class_info(eachversion,theymlfile)
        

NameError: name 'load_dictionaries' is not defined

In [134]:
#### Generate validation content

def generate_validation(ymldict):
    expected_type_dict, reusable_definitions = load_dictionaries()
    propertylist = ymldict['mapping']
    validationdict = {
      "$schema": "http://json-schema.org/draft-07/schema#",
      "type": "object",
      "properties":{},
      "required":[],
      "recommended":[],
      "optional":[],
      "definitions":{}
    }
    for eachproperty in propertylist:
        rangelist = get_rangelist(eachproperty)
        definitiondict,referencelist = import_reusable_objects(reusable_definitions,rangelist)
        valpropdict, referencelist, definitiondict = cardinality_check(expected_type_dict, referencelist, definitiondict, eachproperty)
        validationdict["properties"][eachproperty['property']]=valpropdict
        validationdict['definitions'].update(definitiondict)
        try:
            marginality = eachproperty["marginality"]
        except:
            marginality = None
        if marginality == "Required":
            validationdict['required'].append(eachproperty["property"])
        elif marginality == "Recommended":
            validationdict['recommended'].append(eachproperty["property"])
        elif marginality == "Optional":
            validationdict['optional'].append(eachproperty["property"])
    return(validationdict)
        
def load_dictionaries():
    from dde_reusable_objects import expected_type_dict
    from dde_reusable_objects import reusable_definitions
    return(expected_type_dict, reusable_definitions)


def generate_type(expected_type_dict, propertytype):
    if propertytype in expected_type_dict.keys():
        matched_type = expected_type_dict[propertytype]
    else:
        matched_type = False
    return(matched_type)


def generate_base_dict(expectedtype):
    base_dict = {
        "@type": expectedtype,
        "type": "object",
        "properties": {
          "name": {
            "type": "string"  
          }  
        },
        "required": []
    }
    return(base_dict)


def import_reusable_objects(reusable_definitions,rangelist):
    expected_types = [x["@id"].replace("bioschemas:","") for x in rangelist]
    all_expected_types = [x.replace("schema:","") for x in expected_types]
    definitionslist = [x for x in all_expected_types if x.lower() in reusable_definitions.keys()]
    definitiondict = {}
    referencelist = {}
    for eachdefinition in definitionslist:
        definitiondict[eachdefinition.lower()] = reusable_definitions[eachdefinition.lower()]
        referencelist[eachdefinition.lower()] = {"$ref":"#/definitions/"+eachdefinition.lower()}
    return(definitiondict,referencelist)


def check_type(expected_type_dict, referencelist, definitiondict, propertytype_info):
    referencename = propertytype_info.replace("bioschemas:","").replace("schema:","").lower()
    matched_type = generate_type(expected_type_dict, propertytype_info)
    if matched_type != False:
        actualtype = matched_type
    else:
        if referencename in referencelist.keys():
            actualtype = referencelist[referencename]
        else:
            ### create reference and property
            actualtype = {"$ref":"#/definitions/"+referencename}
            referencelist[referencename] = actualtype
            definitiondict[referencename] = generate_base_dict(propertytype_info)    
    return(actualtype, referencelist, definitiondict)


def cardinality_check(expected_type_dict, referencelist, definitiondict, eachproperty):
    rangelist = get_rangelist(eachproperty)
    ## Generate the base object
    if "bsc_description" in eachproperty.keys():
        valpropdict = {"description":eachproperty['bsc_description']+" "+eachproperty['description']}
    else:
        valpropdict = {"description":eachproperty['description']}
    ## Check cardinality    
    if eachproperty['cardinality'] == "ONE": ## There can only be one expected value
        ## Check number of expected types
        if len(rangelist) == 1: ## There can only be one expected type
            propertytype = rangelist[0]
            actualtype, referencelist, definitiondict = check_type(expected_type_dict, referencelist, definitiondict, propertytype['@id'])
            valpropdict["type"] = actualtype
        else: ## There are more than one expected type
            propertyelements = []
            for propertytype in rangelist:
                actualtype, referencelist, definitiondict = check_type(expected_type_dict, referencelist, definitiondict, propertytype['@id'])
                if actualtype not in propertyelements:
                    propertyelements.append(actualtype)
            valpropdict["type"] = {
                "oneOf": propertyelements
            }
                
    else: ## each property can have many elements, ie- Cardinality == Many
        ## Check number of expected types
        if len(rangelist) == 1: ## If only one type expected, but many of it are allowed
            propertytype = rangelist[0]
            actualtype, referencelist, definitiondict = check_type(expected_type_dict, referencelist, definitiondict, propertytype['@id'])
            valpropdict["type"] = {
              "oneOf": [
                  actualtype,
                  {
                    "type": "array",
                    "items": actualtype
                  }
              ]  
            }
        else: ## Many types are allowed, and many values are expected
            propertyelements = []
            for propertytype in rangelist:
                actualtype, referencelist, definitiondict = check_type(expected_type_dict, referencelist, definitiondict, propertytype['@id'])
                if actualtype not in propertyelements:
                    propertyelements.append(actualtype)
                manyactualtype = {"type":"array", "items":actualtype}
                if manyactualtype not in propertyelements:
                    propertyelements.append(manyactualtype)
            valpropdict["type"] = {
                "anyOf": propertyelements
            }            
    return(valpropdict, referencelist, definitiondict)


In [135]:
def parse_spec_version(tmpinputyml,eachversion,specifications):
    theymlfile = os.path.join(tmpinputyml,eachversion.replace('.json','.html'))
    graphlist = []
    tmpdict = create_new_dict()
    ymldict = get_yml_dict(theymlfile)
    classinfo = grab_class_info(specifications,eachversion,ymldict)
    expected_type_dict, reusable_definitions = load_dictionaries()
    propertylist = ymldict['mapping']
    validationdict = generate_validation(ymldict)
    classinfo['$validation']=validationdict
    graphlist.append(classinfo)
    for eachproperty in propertylist:
        bioschemaprop = create_bioschema_property(eachspec, eachproperty)
        if bioschemaprop != False:
            graphlist.append(bioschemaprop)
    tmpdict['@graph']=graphlist
    return(tmpdict)

def parse_for_dde(specifications,datapath,test=False):
    if test==True:
        eachspec = specifications[14] ##TaxonRank (-3), LabProtocol (14), Course (5)
        tmpinputprofilepath = os.path.join(input_profiles,eachspec)
        tmpinputyml = os.path.join(input_yml,eachspec)
        spec_profs = os.listdir(tmpinputprofilepath)
        eachversion = spec_profs[-1]
        tmpdict = parse_spec_version(tmpinputyml,eachversion,specifications)
        with open(os.path.join(datapath,str(eachspec)+'_v'+str(eachversion)),"w+") as outfile:
            outfile.write(json.dumps(tmpdict))
    else:
        for eachspec in specifications:
            tmpinputprofilepath = os.path.join(input_profiles,eachspec)
            tmpinputyml = os.path.join(input_yml,eachspec)
            spec_profs = os.listdir(tmpinputprofilepath)
            for eachversion in spec_profs:
                tmpdict = parse_spec_version(tmpinputyml,eachversion,specifications)
                with open(os.path.join(datapath,str(eachspec)+'_v'+str(eachversion)),"w+") as outfile:
                    outfile.write(json.dumps(tmpdict))


parse_for_dde(specifications,datapath,test=True)

In [120]:
#### Logic
## If a property comes from bioschemas, it will need to be defined in the @graph
## If it comes from schemas it may or may not be need to be defined 
## This depends on whether or not the property exists in the hierarchy
## If it does not exist in the hierarchy, it normally does not apply to this class,
## If this is the case, it should be created with the "sameAs" property

## If a property exists in the hierarchy and can be inherited, then it should not need to be defined
## Instead, it should be defined in the $validation

def check_property_source(eachproperty): 
    if eachproperty['type']=="bioschemas":
        namespace = "bioschemas"
    else:
        namespace = "schema"
    return(namespace)

def get_rangelist(eachproperty):
    typelist = eachproperty['expected_types']
    expected_bioschemas = ["bioschemas:"+x for x in typelist if x in specifications]
    expected_schemas = ["schema:"+x for x in typelist if x not in expected_bioschemas]
    rangelist = [{"@id":x} for x in expected_bioschemas]
    for x in expected_schemas:
        rangelist.append({"@id":x})
    return(rangelist)

def get_domain(eachspec):
    domain = {"@id":"bioschemas:"+eachspec}
    return(domain)

def create_bioschema_property(eachspec, eachproperty):
    domain = get_domain(eachspec)
    rangelist = get_rangelist(eachproperty)
    namespace = check_property_source(eachproperty)
    if namespace=="bioschemas":
        ### Create property
        try:
            description = eachproperty["description"]+" "+eachproperty['bsc_description']
        except:
            description = eachproperty["description"]
        property_dict = {
            "@id": "bioschemas:"+eachproperty['property'],
            "rdfs:comment": description,
            "@type": "rdf:Property",
            "rdfs:label": eachproperty['property'],
            "schema:rangeIncludes": domain,
            "schema:rangeIncludes": rangelist
        }
    else:
        property_dict = False
    return(property_dict)





In [None]:
#print(specifications)
eachspec = specifications[14] ##TaxonRank (-3), LabProtocol (14), Course (5)
tmpinputprofilepath = os.path.join(input_profiles,eachspec)
tmpinputmarginpath = os.path.join(input_marginality,eachspec)
tmpinputyml = os.path.join(input_yml,eachspec)
spec_profs = os.listdir(tmpinputprofilepath)
eachversion = spec_profs[-1]
thejsonfile = os.path.join(tmpinputprofilepath,eachversion)
themarginfile = os.path.join(tmpinputmarginpath,eachversion)
theymlfile = os.path.join(tmpinputyml,eachversion.replace('.json','.html'))
print(os.listdir(tmpinputyml))
print(theymlfile)

with open(thejsonfile,'r') as infile:
    tmpjson = json.load(infile)
print(tmpjson.keys())

tmpdict = create_new_dict()
classinfo = grab_yaml_info(theymlfile)

print(tmpjson['$defs'])
print("-------------")
#print(tmpjson['properties'].keys())
#print("-------------")


In [None]:
def generate_bioschemas_objects (specifications):
    biospecs = {}
    for eachspec in specifications:
        tmpinputyml = os.path.join(input_yml,eachspec)
        spec_profs = os.listdir(tmpinputprofilepath)
        latestspec = spec_profs[-1]
            theymlfile = os.path.join(tmpinputyml,latestspec.replace('.json','.html'))
            ymldict = get_yml_dict(theymlfile)
        object_name = eachspec.lower()+"_object"
        tmprequirelist = []
        propdefs = {}
        for eachprop in ymldict['mapping']:
            if eachprop['marginality']=="Required":
                tmprequirelist.append(eachprop)
            rangelist = get_rangelist(eachproperty)
            
        biospecs[object_name] = {"@type": "bioschemas:"+eachspec,
                                 "type": "object",
                                 "properties":propdefs,
                                 "required":tmprequirelist}        
            