## Loading EDAM into an RDFlib graph

In [100]:
from rdflib import ConjunctiveGraph, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, OWL
from collections import Counter

In [107]:
# a single function to load EDAM and get the graph object as a result
def load_EDAM(source='http://edamontology.org/EDAM.owl'):
    """Parse the EDAM ontology into a fresh rdflib ConjunctiveGraph.

    Parameters
    ----------
    source : str, optional
        Location of the RDF/XML serialisation of EDAM. Defaults to the
        published OWL file; pass another URL or path to load a different
        copy (it is handed unchanged to ``Graph.parse``).

    Returns
    -------
    ConjunctiveGraph
        The populated graph, with the ``edam`` and ``oboInOwl`` prefixes bound.
    """
    g = ConjunctiveGraph()
    # Graph.load() was deprecated and then removed in rdflib 6; parse() takes
    # the same source/format arguments and works on old and new releases alike.
    g.parse(source, format='xml')
    # NOTE(review): the class URIs used later in this notebook look like
    # http://edamontology.org/data_3593 (slash, no '#'), so this '#'-terminated
    # prefix may never match them -- confirm the intended namespace.
    g.bind('edam', Namespace('http://edamontology.org#'))
    g.bind('oboInOwl', Namespace('http://www.geneontology.org/formats/oboInOwl#'))
    return g

# Load the ontology once; every later cell works on this `kg` graph.
kg = load_EDAM()
# len() of an rdflib graph is its number of triples -- a quick check that the load worked.
print(len(kg))

36884


## Adding duplicate IDs


In [108]:
# Declare two extra OWL classes on purpose: their numeric suffixes (3593, 1630)
# are the ones the duplicate check further down is expected to report.
for injected_uri in (
    "http://edamontology.org/data_3593",
    "http://edamontology.org/topic_1630",
):
    kg.add((URIRef(injected_uri), RDF.type, OWL.Class))

## Indexing the numeric IDs of class URIs

In [109]:
# The part of an identifier after the underscore must be unique across prefixes:
# we don't want both data_123 and topic_123 to exist.

In [113]:
# Collect the ID suffix of every OWL class URI, e.g. ".../data_3593" -> "3593".
# Subjects whose string form has no underscore are skipped.
# rsplit on the LAST underscore: split("_")[1] would return the wrong segment
# for any URI that contains an earlier underscore.
index_of_ids = [
    str(class_uri).rsplit("_", 1)[-1]
    for class_uri, _, _ in kg.triples((None, RDF.type, OWL.Class))
    if "_" in str(class_uri)
]

## Counting duplicates

In [116]:
# Tally how many class URIs share each ID suffix.
id_counter = Counter()
id_counter.update(index_of_ids)
# Show the five most frequent suffixes; any count above 1 is a collision.
print(id_counter.most_common(n=5))

[('1630', 2), ('3593', 2), ('3487', 1), ('3694', 1), ('1679', 1)]


## Filtering duplicates

In [115]:
# Report every ID suffix that is used by more than one class.
for ident, occurrences in id_counter.items():
    if occurrences > 1:
        print(f"Error with duplicate ID {ident}")

Error with duplicate ID 1630
Error with duplicate ID 3593
