**subsetMisuse.ipynb**

Misuse of subset annotation property.

**Documentation:** https://github.com/edamontology/edamverify/blob/master/docs/subsetMisuse.md    

Set constants for script return values. Load EDAM_dev.owl from GitHub into an RDF graph.

In [1]:
import os
from rdflib import ConjunctiveGraph, Namespace
import json
from collections import OrderedDict

# Status values used in the JSON report, as per https://github.com/edamontology/edamverify.
NOERR = "NOERR"
INFO  = "INFO"
WARN  = "WARN"
ERROR = "ERROR"

# Load the EDAM OWL file into an RDF graph.
# The location is taken from the EDAM_PATH environment variable and falls back to
# '../EDAM_dev.owl'. A remote copy is available at
# https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
print("Loading graph ...", end="")
g = ConjunctiveGraph()
# Graph.load() was a thin wrapper around parse() and no longer exists in rdflib >= 6;
# parse() behaves the same way and is available in every rdflib release.
g.parse(os.environ.get('EDAM_PATH', '../EDAM_dev.owl'), format='xml')
g.bind('edam', Namespace('http://edamontology.org#'))
print("done!")

Loading graph ...done!


https://neuroml.org/neuromlv2|https://neuroml-db.org/ does not look like a valid URI, trying to serialize this will break.


Define SPARQL query to extract ID, term, subset and (if available) deprecated field of all concepts. Run the query.

In [2]:
# SPARQL query: one row per (concept, subset) pair, with the label and, where
# present, the owl:deprecated value of the concept.
query_term = """
SELECT ?id ?term ?subset ?deprecated WHERE
{
?id rdfs:label ?term .
?id oboInOwl:inSubset ?subset . 
OPTIONAL {?id owl:deprecated ?deprecated .}
}
"""

# Look-up tables keyed by concept id, filled while the query results are analysed:
#   ids, terms, subsets      -- id, label and subset of each concept
#   ids_subset_count_data    -- number of assignations to 'data' / 'identifiers'
#   ids_subset_count_other   -- number of assignations to 'operations' / 'topics' / 'formats'
ids, terms, subsets = {}, {}, {}
ids_subset_count_data, ids_subset_count_other = {}, {}

# Error flag and report lines, then run the query against the loaded graph.
report = []
errfound = False
results = g.query(query_term)

Analyse results of query.

**NB:**
Subsets in EDAM are (currently) defined in one of two ways:
  
``<oboInOwl:inSubset rdf:resource="http://purl.obolibrary.org/obo/edam#data" />``

``<oboInOwl:inSubset>concept_properties</oboInOwl:inSubset>``

This complicates the logic slightly: the subset name is either the fragment after `#` in a resource IRI or the literal text itself.




In [3]:
# Every subset name accepted by this check; anything else is reported as invalid.
VALID_SUBSETS = {
    "topics", "operations", "data", "identifiers", "formats",
    "edam", "events", "obsolete", "relations", "concept_properties",
}
# Subsets that do not correspond to one of the main sub-ontologies.
AUXILIARY_SUBSETS = {"edam", "events", "relations", "concept_properties"}
DATA_SUBSETS = {"data", "identifiers"}
OTHER_SUBSETS = {"topics", "operations", "formats"}
# (substring of the concept IRI, subsets allowed for a concept whose IRI contains it)
EXPECTED_SUBSETS = (
    ("topic", {"topics"}),
    ("operation", {"operations"}),
    ("data", DATA_SUBSETS),
    ("format", {"formats"}),
)

# (Re)initialise all collated state here so that re-running this cell on its own
# neither duplicates report lines nor double-counts subset assignations.
errfound = False
report = ["Suspected error in subset assignation for these concepts:"]
ids = {}
ids_subset_count_data = {}
ids_subset_count_other = {}
terms = {}
subsets = {}               # concept IRI -> set of subsets assigned to it
seen_assignations = set()  # (concept IRI, subset) pairs already processed

for r in results:

    concept_id = str(r['id'])
    term       = str(r['term'])
    subsetpath = str(r['subset'])
    # Only the exact string "true" is treated as deprecated below; presumably this is
    # "None" when the OPTIONAL owl:deprecated variable is unbound -- TODO confirm.
    deprecated = str(r['deprecated'])

    # Extract subset without path, e.g. "data"; a value without '#' is used as is.
    subset = subsetpath.rsplit("#", 1)[-1]

    # The query yields one row per (concept, label, subset) combination, so the same
    # assignation can occur more than once; process each assignation only once so that
    # the counts below reflect distinct subsets.
    if (concept_id, subset) in seen_assignations:
        continue
    seen_assignations.add((concept_id, subset))

    # Check for invalid subset
    if subset not in VALID_SUBSETS:
        errfound = True
        report.append("Invalid subset ::: " + concept_id + ' (' + term + ')' + " in subset:" + subset + "   ")
        continue

    # Check deprecated concepts are in right subset
    if deprecated == "true" and subset != "obsolete":
        errfound = True
        report.append("Invalid subset for deprecated concept ::: " + concept_id + ' (' + term + ')' + " in subset:" + subset)

    # Check concept in "obsolete" subset is actually deprecated
    if subset == "obsolete" and deprecated != "true":
        errfound = True
        report.append("Concept in deprecated subset lacks <owl:deprecated>true</>" + " ::: " + concept_id + ' (' + term + ')' + " in subset:" + subset)

    # Skip subsets not corresponding to main sub-ontologies
    if subset in AUXILIARY_SUBSETS:
        continue

    # Record the assignation and count it.
    # There must be only one of 'topics', 'operations', 'formats' and 'data' subset assignations.
    # Identifier concepts must have both 'data' and 'identifiers' subset assignations.
    ids[concept_id] = concept_id
    terms[concept_id] = term
    subsets.setdefault(concept_id, set()).add(subset)
    if subset in DATA_SUBSETS:
        ids_subset_count_data[concept_id] = ids_subset_count_data.get(concept_id, 0) + 1
    elif subset in OTHER_SUBSETS:
        ids_subset_count_other[concept_id] = ids_subset_count_other.get(concept_id, 0) + 1

    # We're also done processing rows for the "obsolete" subset
    if subset == "obsolete":
        continue

    # Check for mismatch between subset and sub-ontology: the sub-ontology is recognised
    # by a substring of the concept IRI (e.g. ".../topic_3077" must be in 'topics').
    if any(key in concept_id and subset not in allowed
           for key, allowed in EXPECTED_SUBSETS):
        errfound = True
        report.append("Wrong subset ::: " + concept_id + ' (' + term + ')' + " in subset:" + subset)


# Check for multiple subset assignations
# There must be only one of 'topics', 'operations' etc. subset assignations per concept!
for next_id in ids:
    assigned = subsets[next_id]

    # Identifier concepts must have both 'data' and 'identifiers' subset assignations.
    # Testing the full set of assigned subsets keeps this independent of row order.
    if "identifiers" in assigned and "data" not in assigned:
        errfound = True
        report.append("Identifier concept should have both 'data' and 'identifiers' subset assignations ::: " + next_id + ' (' + terms[next_id] + ')')

    # Check that there is only one of 'topics', 'operations', 'formats' and 'data' subset assignations
    if next_id in ids_subset_count_other:
        if ids_subset_count_other[next_id] > 1 or next_id in ids_subset_count_data:
            errfound = True
            report.append("Concept with multiple erroneous subset assignations ::: " + next_id + ' (' + terms[next_id] + ')')




Write the report as JSON; its `status` field carries the appropriate result value.

In [4]:
# Write report
# The report is ERROR with the collected messages when any check failed, otherwise
# NOERR with a placeholder reason.
if errfound:
    status, reason = ERROR, report
else:
    status, reason = NOERR, '-'

# Build an actual OrderedDict instance (not a reference to the class) so that the
# properties are listed in a consistent order in the JSON output.
report_dict = OrderedDict([
    ('test_name', 'subsetMisuse'),
    ('status', status),
    ('reason', reason),
])

report_json = json.dumps(report_dict, indent=4)
print(report_json)





{
    "test_name": "subsetMisuse",
    "status": "ERROR",
    "reason": [
        "Suspected error in subset assignation for these concepts:",
        "Invalid subset ::: http://edamontology.org/format_3876 (TNG) in subset:TNG   ",
        "Wrong subset ::: http://edamontology.org/topic_3077 (Data acquisition) in subset:data",
        "Wrong subset ::: http://edamontology.org/operation_3456 (Rigid body refinement) in subset:data",
        "Wrong subset ::: http://edamontology.org/operation_3931 (Chemometrics) in subset:topics",
        "Wrong subset ::: http://edamontology.org/operation_3482 (Antimicrobial resistance prediction) in subset:data",
        "Wrong subset ::: http://edamontology.org/operation_3454 (Phasing) in subset:data",
        "Wrong subset ::: http://edamontology.org/operation_3939 (Metabolic engineering) in subset:topics",
        "Wrong subset ::: http://edamontology.org/operation_3461 (Virulence prediction) in subset:data",
        "Wrong subset ::: http://edamonto