**subsetMisuse.ipynb**

Misuse of subset annotation property.

**Documentation:** https://github.com/edamontology/edamverify/blob/master/docs/subsetMisuse.md    

Set constants for script return values. Load EDAM_dev.owl from GitHub into an RDF graph.

In [None]:
import sys
from rdflib import ConjunctiveGraph, Namespace

# Script return values, as per https://github.com/edamontology/edamverify.
# Listed in increasing order of severity: no error, info, warning, error.
NOERR, INFO, WARN, ERROR = 0, 1, 2, 3

# Load EDAM_dev.owl from GitHub into an RDF graph.
# parse() is used rather than load(): load() was only a thin wrapper around
# parse() and is not available in recent rdflib releases.
# flush=True makes the progress message appear before the (slow) download
# starts, since the line is not terminated by a newline.
print("Loading graph ...", end="", flush=True)
g = ConjunctiveGraph()
g.parse('https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl', format='xml')
# To check a local copy instead, use:
# g.parse('EDAM_dev.owl', format='xml')
g.bind('edam', Namespace('http://edamontology.org#'))
print("done!")

Define SPARQL query to extract ID, term, subset and (if available) deprecated field of all concepts. Run the query.

In [None]:
# SPARQL query selecting the ID, label, subset and - where present - the
# owl:deprecated value of every concept that has both a label and a subset.
# The rdfs:, oboInOwl: and owl: prefixes are not declared in the query itself.
query_term = """
SELECT ?id ?term ?subset ?deprecated WHERE
{
?id rdfs:label ?term .
?id oboInOwl:inSubset ?subset . 
OPTIONAL {?id owl:deprecated ?deprecated .}
}
"""

# State filled in by the analysis step: the report lines collected so far and
# whether any problem has been found.
report = []
errfound = False

# Run the query against the loaded graph.
results = g.query(query_term)

Analyse results of query.

**NB:**
Subsets in EDAM are (currently) defined in one of two ways:
  
``<oboInOwl:inSubset rdf:resource="http://purl.obolibrary.org/obo/edam#data" />``

``<oboInOwl:inSubset>concept_properties</oboInOwl:inSubset>``

This complicates the logic slightly: the subset name is either the part of a URI after the `#`, or the whole literal value.




In [None]:
# Subsets that a concept may legitimately be assigned to.
VALID_SUBSETS = frozenset((
    "topics", "operations", "data", "identifiers", "formats",
    "edam", "events", "obsolete", "relations", "concept_properties",
))

# Valid subsets that do not correspond to one of the main sub-ontologies;
# concepts in these are not checked against their ID.
NON_SUBONTOLOGY_SUBSETS = frozenset((
    "edam", "events", "relations", "concept_properties",
))

# Pairs of (fragment looked for in the concept ID, subsets allowed for an ID
# containing that fragment). IDs containing "data" may be in either the
# "data" or the "identifiers" subset.
SUBONTOLOGY_SUBSETS = (
    ("topic", ("topics",)),
    ("operation", ("operations",)),
    ("data", ("data", "identifiers")),
    ("format", ("formats",)),
)

# Each result row is checked on its own; every problem found appends one line
# to 'report' and sets 'errfound'.
for r in results:

    # Named 'concept_id' rather than 'id' so the builtin id() is not shadowed.
    concept_id = str(r['id'])
    term = str(r['term'])
    subsetpath = str(r['subset'])
    # Stringified so it can be compared with "true"; any other value counts as
    # not deprecated (presumably this also covers a missing owl:deprecated,
    # since the field is OPTIONAL in the query - confirm against rdflib).
    deprecated = str(r['deprecated'])

    # Extract subset without path, e.g. ".../edam#data" -> "data". A subset
    # path that does not contain '#' is used unchanged.
    subset = subsetpath.rpartition("#")[2]

    # Text shared by every report line produced for this row.
    where = concept_id + ' (' + term + ')' + " in subset:" + subset

    # Check for invalid subset; no further checks are made on such a row.
    if subset not in VALID_SUBSETS:
        errfound = True
        report.append("Invalid subset ::: " + where + "   " + subsetpath)
        continue

    # Check deprecated concepts are in the right subset.
    if deprecated == "true" and subset != "obsolete":
        errfound = True
        report.append("Invalid subset for deprecated concept ::: " + where)

    # Check a concept in the "obsolete" subset is actually deprecated.
    if subset == "obsolete" and deprecated != "true":
        errfound = True
        report.append("Concept in deprecated subset lacks <owl:deprecated>true</> ::: " + where)

    # Obsolete concepts, and subsets not corresponding to the main
    # sub-ontologies, need no further processing.
    if subset == "obsolete" or subset in NON_SUBONTOLOGY_SUBSETS:
        continue

    # Check for mismatch between the subset and the sub-ontology suggested by
    # the concept ID.
    if any(fragment in concept_id and subset not in allowed
           for fragment, allowed in SUBONTOLOGY_SUBSETS):
        errfound = True
        report.append("Wrong subset ::: " + where)

    

Write report and return appropriate value.

In [None]:
# Print the outcome and hand the matching return value to sys.exit(), which
# raises SystemExit.
if not errfound:
    print("No issues found.")
    sys.exit(NOERR)
else:
    print("Suspected error in subset assignation for these concepts:")
    # One report entry per line.
    print(*report, sep="\n")
    sys.exit(ERROR)


