## Loading EDAM into an RDFlib graph

In [1]:
from rdflib import ConjunctiveGraph

Here we initialize the graph. 

In [2]:
# Notebook-level graph that the cells below query.
kg = ConjunctiveGraph()


def print_size():
    """Print how many triples the notebook-level graph ``kg`` currently holds."""
    triple_count = len(kg)
    print(f"The knowledge graph has {triple_count} triples")

Here we load the EDAM ontology into the graph. 

In [3]:
# Source of the ontology: the development version of EDAM on GitHub.
# (An earlier variant of this cell parsed 'http://edamontology.org/EDAM.owl' instead.)
edam_version = 'https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl'

# Start from a fresh graph so re-running this cell does not parse into an
# already populated one.
kg = ConjunctiveGraph()
kg.parse(edam_version, format='xml')

print(f"{len(kg)} triples in the EDAM triple store")


38279 triples in the EDAM triple store


In [4]:
# Write the loaded graph to disk in two serializations: JSON-LD and Turtle.
kg.serialize(destination="edam.json", format="json-ld")
kg.serialize(destination="edam.ttl", format="turtle")

<Graph identifier=Nbc057fd0bd0a4472965d003fbc879e46 (<class 'rdflib.graph.ConjunctiveGraph'>)>

In [5]:
# a single function to load EDAM and get the graph object as a result
def load_EDAM(source=None, rdf_format='xml'):
    """Parse an EDAM ontology file into a new graph and return it.

    Args:
        source: URL or path of the ontology to parse. When omitted, the
            notebook-level ``edam_version`` URL is used.
        rdf_format: name of the rdflib parser to use for ``source``
            (default ``'xml'``).

    Returns:
        A new ``ConjunctiveGraph`` containing the parsed triples.
    """
    g = ConjunctiveGraph()
    # Resolve the default at call time so a later change of `edam_version`
    # is picked up without redefining the function.
    g.parse(edam_version if source is None else source, format=rdf_format)
    return g


G = load_EDAM()
print(len(G))

38279


## Listing the first 100 triples

In [6]:
# Print only a sample: stop once 100 triples from the graph iterator
# have been shown.
for count, (subject, predicate, obj) in enumerate(kg, start=1):
    print(f'({subject}, {predicate}, {obj})')
    if count > 99:
        break
    

(http://edamontology.org/data_2646, http://edamontology.org/created_in, beta12orEarlier)
(http://edamontology.org/format_3159, http://www.geneontology.org/formats/oboInOwl#inSubset, http://edamontology.org/bio)
(http://edamontology.org/topic_0659, http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym, Functional RNA)
(http://edamontology.org/format_3015, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://www.w3.org/2002/07/owl#Class)
(http://edamontology.org/data_1193, http://edamontology.org/created_in, beta12orEarlier)
(http://edamontology.org/format_2920, http://www.geneontology.org/formats/oboInOwl#hasDefinition, Data format for molecular sequence alignment information that can hold sequence alignment(s) of only 2 sequences.)
(http://edamontology.org/operation_2472, http://edamontology.org/created_in, beta12orEarlier)
(http://edamontology.org/operation_0287, http://www.geneontology.org/formats/oboInOwl#inSubset, http://edamontology.org/bio)
(http://edamontology.org/for

In [7]:
from rdflib.namespace import RDF, RDFS, OWL 

# Print the rdfs:label of OWL classes, stopping after MAX_LABELS labels.
#
# `kg.subjects(RDF.type, OWL.Class)` yields the class nodes themselves.
# Iterating `kg.triples(...)` would yield whole (s, p, o) tuples, and such a
# tuple cannot serve as the subject of the label lookup.
MAX_LABELS = 100

class_labels = (
    label
    for owl_class in kg.subjects(RDF.type, OWL.Class)
    for label in kg.objects(owl_class, RDFS.label)
)

# One flat loop, so a single `break` ends the whole listing.
for printed, label in enumerate(class_labels, start=1):
    print(label)
    if printed >= MAX_LABELS:
        break



## Evaluating SPARQL queries for dashboard
Aim: find the topics that have no Wikipedia URL among their `rdfs:seeAlso` properties.

In [8]:
# Count the labelled (transitive) subclasses of edam:topic_0003 that have no
# rdfs:seeAlso value matching "wikipedia.org".
# The rdfs prefix is declared explicitly so the query does not rely on the
# namespace bindings of the graph it is run against.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX edam: <http://edamontology.org/>

SELECT (count(?term) as ?nb_no_wikipedia) WHERE {
    ?c rdfs:subClassOf+ edam:topic_0003 ;
       rdfs:label ?term .
    FILTER NOT EXISTS {
        ?c rdfs:seeAlso ?seealso .
        FILTER (regex(str(?seealso), "wikipedia.org", "i"))
    }
}
"""

results = kg.query(query)

for r in results:
    # The query only excludes Wikipedia links; a counted topic may still have
    # other seeAlso URLs, so the message names Wikipedia explicitly.
    print(f"There are {r['nb_no_wikipedia']} topics without a Wikipedia URL (seeAlso property).")

There are 14 topics without a URL (seeAlso property).


In [9]:
# List the labels of the (transitive) subclasses of edam:topic_0003 that have
# no rdfs:seeAlso value matching "wikipedia.org".
# The rdfs prefix is declared explicitly so the query does not rely on the
# namespace bindings of the graph it is run against.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX edam: <http://edamontology.org/>

SELECT ?term WHERE {
    ?c rdfs:subClassOf+ edam:topic_0003 ;
       rdfs:label ?term .
    FILTER NOT EXISTS {
        ?c rdfs:seeAlso ?seealso .
        FILTER (regex(str(?seealso), "wikipedia.org", "i"))
    }
}
"""

results = kg.query(query)

for r in results:
    # A listed topic may still carry non-Wikipedia seeAlso links, so the
    # message does not claim that the property is absent altogether.
    print(f"Topic '{r['term']}' has no Wikipedia 'seeAlso' URL.")

Topic 'Literature and language' has no 'seeAlso' property.
Topic 'Data submission, annotation, and curation' has no 'seeAlso' property.
Topic 'Data identity and mapping' has no 'seeAlso' property.
Topic 'Genome resequencing' has no 'seeAlso' property.
Topic 'Simulation experiment' has no 'seeAlso' property.
Topic 'Nucleic acid sites, features and motifs' has no 'seeAlso' property.
Topic 'Protein properties' has no 'seeAlso' property.
Topic 'Protein sites, features and motifs' has no 'seeAlso' property.
Topic 'Sequence composition, complexity and repeats' has no 'seeAlso' property.
Topic 'Probes and primers' has no 'seeAlso' property.
Topic 'Sequence sites, features and motifs' has no 'seeAlso' property.
Topic 'Biomolecular simulation' has no 'seeAlso' property.
Topic 'Biotherapeutics' has no 'seeAlso' property.
Topic 'Quality affairs' has no 'seeAlso' property.


In [10]:
# Dump every (property, value) pair attached to edam:topic_0622.
query = """
PREFIX edam: <http://edamontology.org/>

SELECT * WHERE {
    ?x ?property ?value .
    VALUES ?x {edam:topic_0622}
}
"""

results = kg.query(query)

for row in results:
    prop, value = row['property'], row['value']
    print(f"prop '{prop}' has value: '{value}'.")

prop 'http://www.w3.org/2000/01/rdf-schema#seeAlso' has value: 'https://en.wikipedia.org/wiki/Genomics'.
prop 'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym' has value: 'Whole genomes'.
prop 'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym' has value: 'Genome annotation'.
prop 'http://www.w3.org/2000/01/rdf-schema#subClassOf' has value: 'http://edamontology.org/topic_3391'.
prop 'http://www.geneontology.org/formats/oboInOwl#inSubset' has value: 'http://edamontology.org/topics'.
prop 'http://edamontology.org/created_in' has value: 'beta12orEarlier'.
prop 'http://www.w3.org/2000/01/rdf-schema#seeAlso' has value: 'http://purl.bioontology.org/ontology/MSH/D023281'.
prop 'http://www.geneontology.org/formats/oboInOwl#inSubset' has value: 'http://edamontology.org/bio'.
prop 'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym' has value: 'Exomes'.
prop 'http://www.geneontology.org/formats/oboInOwl#hasDefinition' has value: 'Whole genomes of one or more 

In [11]:
# List the (transitive) subclasses of edam:topic_0003 that have no
# oboInOwl:hasDefinition value.
# No OPTIONAL on hasDefinition here: the FILTER NOT EXISTS below only keeps
# topics without a definition, so such a variable could never be bound.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX edam: <http://edamontology.org/>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

SELECT ?term WHERE {
    ?c rdfs:subClassOf+ edam:topic_0003 ;
       rdfs:label ?term .

    FILTER NOT EXISTS {
        ?c oboInOwl:hasDefinition [] .
    }
}
"""

results = kg.query(query)

for r in results:
    print(f"Topic '{r['term']}' has no definition.")

In [12]:
# List the (transitive) subclasses of edam:topic_0003 that have no
# oboInOwl:hasExactSynonym value.
# No OPTIONAL on hasExactSynonym here: the FILTER NOT EXISTS below only keeps
# topics without an exact synonym, so such a variable could never be bound.
query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX edam: <http://edamontology.org/>
PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>

SELECT ?term WHERE {
    ?c rdfs:subClassOf+ edam:topic_0003 ;
       rdfs:label ?term .

    FILTER NOT EXISTS {
        ?c oboInOwl:hasExactSynonym [] .
    }
}
"""

results = kg.query(query)

for r in results:
    print(f"Topic '{r['term']}' has  no exactSynonym.")

Topic 'Ontology and terminology' has  no exactSynonym.
Topic 'Bioinformatics' has  no exactSynonym.
Topic 'Laboratory information management' has  no exactSynonym.
Topic 'Chemometrics' has  no exactSynonym.
Topic 'Database management' has  no exactSynonym.
Topic 'Data management' has  no exactSynonym.
Topic 'Data submission, annotation, and curation' has  no exactSynonym.
Topic 'Data identity and mapping' has  no exactSynonym.
Topic 'Data architecture, analysis and design' has  no exactSynonym.
Topic 'Data integration and warehousing' has  no exactSynonym.
Topic 'Data governance' has  no exactSynonym.
Topic 'Data quality management' has  no exactSynonym.
Topic 'Data rescue' has  no exactSynonym.
Topic 'Chemistry' has  no exactSynonym.
Topic 'Microfluidics' has  no exactSynonym.
Topic 'Computational chemistry' has  no exactSynonym.
Topic 'Drug discovery' has  no exactSynonym.
Topic 'Compound libraries and screening' has  no exactSynonym.
Topic 'Analytical chemistry' has  no exactSynonym