**mandatoryPropertyMissing.ipynb**

EDAM concept is missing a mandatory annotation propery

**Documentation:** https://github.com/edamontology/edamverify/blob/master/docs/mandatoryPropertyMissing.md

**NB.1: - Running the notebook**
The directory containing the ``EDAM_dev.owl`` file must be defined by ``EDAM_PATH`` environment variable.

The script requires the test to be run from a subdirectory of ``EDAM_PATH`` (hence ``'../EDAM_dev.owl'`` below)

In [2]:
import os
from rdflib import ConjunctiveGraph, Namespace
import json

# Constants for script error reporting as per https://github.com/edamontology/edamverify.
NOERR = "NOERR"
INFO  = "INFO"
WARN  = "WARN"
ERROR = "ERROR"

#Load EDAM_dev.owl from GitHub into an RDF graph.
print("Loading graph ...", end="")
g = ConjunctiveGraph()
# g.load(os.environ.get('EDAM_PATH', '../EDAM_dev.owl'), format='xml')
g.load('https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl', format='xml')
#g.load('EDAM_dev.owl', format='xml')
g.bind('edam', Namespace('http://edamontology.org#'))
print("done!")



Loading graph ...done!


Define SPARQL query to retrieve ID, term, and all other properties that are mandatory for Operation, Data, Identifier, Format or Topic concepts. Run the query.

**NB.1:** BASE is used to define the define the default namespace (for various elements below).

**NB.2:** Concepts are identified by one of ``operation``, ``data``, ``format`` or ``topic`` occurring in the concept ID.

In [3]:
# Compile SPARQL query
query_term = """
BASE <http://edamontology.org/>
SELECT ?id ?term ?subsetpath ?definition ?created_in WHERE
{
?id rdfs:label ?term .
?id oboInOwl:inSubset ?subsetpath .
OPTIONAL {?id oboInOwl:hasDefinition ?definition .}
OPTIONAL {?id :created_in ?created_in .}
}
"""

# Declare hash tables for results
# ids is for concept IDs
# edamSubset is used to record 'edam' subset assignations
# branchSubset is used to record 'topics', 'data', 'formats' or 'operations' subset assignations
edamSubset = {}
branchSubset = {}
ids = {}
terms = {}

# Run SPARQL query and collate results
err = NOERR
errfound = False
report = list()
results = g.query(query_term)

Analyse results of query.

In [4]:
# Error message building function
def report_append(err_new, msg):
    """
    Writes a report of an error and sets the error level appropriately.
    err: the error level to set
    msg: the message to write
    """
    global err
    global errfound
    global report
    if err_new > err:
        err = err_new
    errfound = True
    report.append(msg)

for r in results :
#    print(str(r['id']), str(r['term']), str(r['ext']), str(r['exact_syn']))
    id   = str(r['id'])
    term = str(r['term'])
    subsetpath  = str(r['subsetpath'])
    definition = str(r['definition'])
    created_in = str(r['created_in'])


    # Skip obsolete concepts
    if subsetpath != 'None' and "obsolete" in subsetpath:
        continue

    # Operate on EDAM Operation, Data, Format or Topic concepts only
    if "operation_" in id \
        or "data_" in id \
        or ("format_" in id and "is_format_of" not in id)\
        or ("topic_" in id and "is_topic_of" not in id):

        # Update the IDs hash table
        ids[id] = id
        terms[id] = term

        # Update the subset assignation hash tables
        # Values are set to True but later on, just the key is used
        # '#' is needed for edam because of how subsets are defined: http://purl.obolibrary.org/obo/edam#edam
        if "#edam" in subsetpath:
            edamSubset[id] = True
        if "topics" in subsetpath \
            or "operations" in subsetpath \
            or "data" in subsetpath \
            or "identifiers" in subsetpath \
            or "formats" in subsetpath:
            branchSubset[id] = True

        # Check for missing 'definition' and 'created_in' properties
        if definition == "None":
            report_append(ERROR, "ERROR Missing property (definition) on ::: " + id +  ' (' + term + ')')
        if created_in == "None":
            report_append(ERROR, "ERROR Missing property (created_in) on ::: " + id +  ' (' + term + ')')

# Check for missing subset properties
for next_id in ids:
    if next_id not in edamSubset:
        report_append(ERROR, "ERROR subset ('edam') assignation missing on ::: " + next_id +  ' (' + terms[next_id] + ')')
    if next_id not in branchSubset:
        report_append(ERROR, "ERROR subset assignation ('topics', 'operations', 'data', 'identifier' or 'formats') missing on ::: " + next_id +  ' (' + terms[next_id] + ')')

Write report and return approriate value.

In [5]:
report_obj = {}
report_obj['test_name'] = 'mandatoryPropertyMissing'
report_obj['comment'] = 'Mandatory properties missing from one or more concepts.'

if errfound:
    report_obj['status'] = err
    report_obj['reason'] = report
else:
    report_obj['status'] = NOERR

report_json = json.dumps(report_obj, indent=4)
print(report_json)

{
    "test_name": "mandatoryPropertyMissing",
    "comment": "Mandatory properties missing from one or more concepts.",
    "status": "NOERR",
    "reason": [
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.org/data_0006 (Data)",
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.org/data_0582 (Ontology)",
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.org/data_0842 (Identifier)",
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.org/data_0844 (Molecular mass)",
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.org/data_0845 (Molecular charge)",
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.org/data_0846 (Chemical formula)",
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.org/data_0847 (QSAR descriptor)",
        "ERROR subset ('edam') assignation missing on ::: http://edamontology.