Set constants for script return values. Load EDAM_dev.owl into an RDF graph (the code reads a local copy of the file; the GitHub URL is kept in a comment as an alternative source).

In [151]:
import io
import sys
from rdflib import ConjunctiveGraph, Namespace

# Constants for script return value as per https://github.com/edamontology/edamverify.
NOERR = 0
INFO  = 1
WARN  = 2
ERROR = 3

# Load EDAM_dev.owl into an RDF graph.
# The file is read from the current working directory. To check the latest
# development version straight from GitHub, pass this URL to g.parse() instead:
#   https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
print("Loading graph ...", end="")
g = ConjunctiveGraph()
# Graph.parse() is used rather than Graph.load(): load() was only a deprecated
# alias (defaulting to format='xml') and is no longer available in rdflib >= 6.
g.parse('EDAM_dev.owl', format='xml')
# NOTE(review): this 'edam' prefix is not used by the SPARQL query in this
# notebook; confirm the namespace IRI ('#' vs '/') against the ontology.
g.bind('edam', Namespace('http://edamontology.org#'))
print("done!")


Loading graph ...done!


Define a SPARQL query to extract the ID and term of all concepts in the "edam" subset. Run the query, write the report and return the appropriate value.

IMPORTANT
1. The numerical component of a concept ID is taken to be everything after the last occurrence of the underscore ('_') character (the code uses `rfind`).
2. Concepts which are not defined to be in the "edam" subset are ignored (not checked). 

In [152]:
# SPARQL query: ID, label and subset of every concept in the "edam" subset.
# The rdfs: and oboInOwl: prefixes are not declared here, so they must already
# be bound on the graph.
# STR() is applied to ?subset so the filter matches whether oboInOwl:inSubset
# holds an IRI or a plain literal; comparing an IRI directly to a string
# literal never matches, which would make this check pass without testing
# any concept.
query_term = """
SELECT ?id ?term ?subset WHERE
{
?id rdfs:label ?term .
?id oboInOwl:inSubset ?subset . 
FILTER(STR(?subset) = "http://purl.obolibrary.org/obo/edam#edam")
}
"""

# Hash tables keyed by the numerical component of the concept ID:
#   numerical_ids -> True once the component has been seen
#   ids           -> full ID of the first concept seen with that component
#   terms         -> label of that first concept
numerical_ids = {}
ids = {}
terms = {}

# Run SPARQL query and collate results
errfound = False
report = list()
results = g.query(query_term)
for r in results:
    concept_id = str(r['id'])
    term = str(r['term'])

    # Numerical component = everything after the last underscore.
    # If the ID has no underscore, rfind() returns -1 and the whole ID is used.
    numerical_id = concept_id[concept_id.rfind("_") + 1:]

    if numerical_id not in numerical_ids:
        # First concept seen with this numerical component: remember it.
        numerical_ids[numerical_id] = True
        ids[numerical_id] = concept_id
        terms[numerical_id] = term
    elif ids[numerical_id] != concept_id:
        # A *different* concept shares the numerical component. The same
        # concept can be returned in several rows (e.g. more than one
        # rdfs:label) and must not be reported as a duplicate of itself.
        errfound = True
        report.append(concept_id + ' (' + term + ')' + " :: " +
                      ids[numerical_id] + ' (' + terms[numerical_id] + ')')


# Return exit code (sys.exit raises SystemExit, which is what surfaces the
# return value when the notebook is run as a script).
if errfound:
    print("Duplication of the numerical component of the concept ID for these concepts:")
    print("\n".join(report))
    sys.exit(WARN)
else:
    print("No issues found.")
    sys.exit(NOERR)


No issues found.


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
