## Loading EDAM into an RDFlib graph

In [1]:
from rdflib import ConjunctiveGraph, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, OWL

In [2]:
# a single function to load EDAM and get the graph object as a result
def load_EDAM():
    g = ConjunctiveGraph()
    g.load('http://edamontology.org/EDAM.owl', format='xml')
    g.bind('edam', Namespace('http://edamontology.org#'))
    g.bind('oboInOwl', Namespace('http://www.geneontology.org/formats/oboInOwl#'))
    return g

G = load_EDAM()
print(len(G))

36884


## Indexing URIs with labels and synonyms

In [3]:
from collections import OrderedDict

class LimitedSizeDict(OrderedDict):
    def __init__(self, *args, **kwds):
        self.size_limit = kwds.pop("size_limit", None)
        OrderedDict.__init__(self, *args, **kwds)
        self._check_size_limit()

    def __setitem__(self, key, value):
        OrderedDict.__setitem__(self, key, value)
        self._check_size_limit()

    def _check_size_limit(self):
        if self.size_limit is not None:
            while len(self) > self.size_limit:
                self.popitem(last=False)

In [5]:
#index_uri_to_label = {}
index_label_to_uri = {}
for subject,predicate,obj in G.triples((None, RDFS.label, None)):
    #index_uri_to_label[str(subject)] = str(obj)
    index_label_to_uri[str(obj)] = str(subject)
    
for subject,predicate,obj in G.triples((None, URIRef("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym"), None)):
    #index_uri_to_label[str(subject)] = str(obj)
    #print(str(obj), str(subject))
    index_label_to_uri[str(obj)] = str(subject)


In [6]:
ten_uris = {k: index_uri_to_label[k] for k in list(index_uri_to_label)[:10]}
print(ten_uris)
print()
ten_labels = {k: index_label_to_uri[k] for k in list(index_label_to_uri)[:10]}
print(ten_labels)

NameError: name 'index_uri_to_label' is not defined

In [7]:
import jellyfish

def get_edam_top_10_jaro(l):
    max_sim = 0
    min_dist = 100
    top_10 = LimitedSizeDict(size_limit= 10)        

    for label in index_label_to_uri.keys() : 
        s =  jellyfish.jaro_winkler(l, label)
        if s > max_sim:
            max_sim = s
            top_10[label] = s
            top_10.move_to_end(label, last=False)

    for k in top_10.keys():
        print(k, f"({top_10[k]}) : ", index_label_to_uri[k])
    return top_10

def get_edam_top_10_jaro_ci(l):
    max_sim = 0
    min_dist = 100
    top_10 = LimitedSizeDict(size_limit= 10)        

    for label in index_label_to_uri.keys() : 
        s =  jellyfish.jaro_winkler(l.lower(), label.lower())
        if s > max_sim:
            max_sim = s
            top_10[label] = s
            top_10.move_to_end(label, last=False)

    for k in top_10.keys():
        print(k, f"({top_10[k]}) : ", index_label_to_uri[k])
    return top_10

In [8]:
top_10 = get_edam_top_10_jaro("Nucleosome")

Nucleosome exclusion sequences (0.8666666666666666) :  http://edamontology.org/data_1306
Nucleotide code (0.8416666666666667) :  http://edamontology.org/data_1007
Nucleotide base annotation (0.827008547008547) :  http://edamontology.org/data_0911
refactor_comment (0.6027777777777777) :  http://edamontology.org/refactor_comment
Documentation (0.5760683760683761) :  http://edamontology.org/documentation
deprecation_comment (0.5274853801169591) :  http://edamontology.org/deprecation_comment
Created in (0.4666666666666666) :  http://edamontology.org/created_in
Citation (0.4083333333333334) :  http://edamontology.org/data_0970


In [9]:
get_edam_top_10_jaro("mhine leaning")

Machine learning (0.7799145299145299) :  http://edamontology.org/topic_3474
DNA linear map rendering (0.6965811965811967) :  http://edamontology.org/operation_0577
Phylogenetic inference (0.6911421911421911) :  http://edamontology.org/operation_0323
Genetic mapping (0.69002849002849) :  http://edamontology.org/operation_0282
File name extension (0.687154950312845) :  http://edamontology.org/data_1059
Gene classification (0.6849602639076323) :  http://edamontology.org/data_0917
Peptide annotation (0.6566951566951568) :  http://edamontology.org/data_0895
File extension (0.6456043956043956) :  http://edamontology.org/file_extension
Created in (0.5760683760683761) :  http://edamontology.org/created_in
Citation (0.5192307692307693) :  http://edamontology.org/data_0970


LimitedSizeDict([('Machine learning', 0.7799145299145299),
                 ('DNA linear map rendering', 0.6965811965811967),
                 ('Phylogenetic inference', 0.6911421911421911),
                 ('Genetic mapping', 0.69002849002849),
                 ('File name extension', 0.687154950312845),
                 ('Gene classification', 0.6849602639076323),
                 ('Peptide annotation', 0.6566951566951568),
                 ('File extension', 0.6456043956043956),
                 ('Created in', 0.5760683760683761),
                 ('Citation', 0.5192307692307693)])

In [10]:
get_edam_top_10_jaro("chromosome")

Chromosomes (0.9060606060606061) :  http://edamontology.org/topic_0624
chrominfo (0.8533333333333333) :  http://edamontology.org/format_3587
Chromosome name (0.8333333333333334) :  http://edamontology.org/data_0987
Chromosome report (0.8098039215686276) :  http://edamontology.org/data_0919
refactor_comment (0.6172619047619047) :  http://edamontology.org/refactor_comment
Ontology used (0.5692307692307693) :  http://edamontology.org/ontology_used
notRecommendedForAnnotation (0.5578483245149912) :  http://edamontology.org/notRecommendedForAnnotation
deprecation_comment (0.5274853801169591) :  http://edamontology.org/deprecation_comment
Created in (0.4666666666666666) :  http://edamontology.org/created_in
Citation (0.4083333333333334) :  http://edamontology.org/data_0970


LimitedSizeDict([('Chromosomes', 0.9060606060606061),
                 ('chrominfo', 0.8533333333333333),
                 ('Chromosome name', 0.8333333333333334),
                 ('Chromosome report', 0.8098039215686276),
                 ('refactor_comment', 0.6172619047619047),
                 ('Ontology used', 0.5692307692307693),
                 ('notRecommendedForAnnotation', 0.5578483245149912),
                 ('deprecation_comment', 0.5274853801169591),
                 ('Created in', 0.4666666666666666),
                 ('Citation', 0.4083333333333334)])

In [11]:
get_edam_top_10_jaro("immuno")

im (0.8222222222222222) :  http://edamontology.org/format_3593
Peptide immunogenicity data (0.7407407407407408) :  http://edamontology.org/data_1534
is function of (0.6507936507936508) :  http://edamontology.org/is_function_of
has function (0.5833333333333334) :  http://edamontology.org/has_function
Documentation (0.5747863247863249) :  http://edamontology.org/documentation
Citation (0.5138888888888888) :  http://edamontology.org/data_0970


LimitedSizeDict([('im', 0.8222222222222222),
                 ('Peptide immunogenicity data', 0.7407407407407408),
                 ('is function of', 0.6507936507936508),
                 ('has function', 0.5833333333333334),
                 ('Documentation', 0.5747863247863249),
                 ('Citation', 0.5138888888888888)])

In [12]:
get_edam_top_10_jaro_ci("immuno")

Immunology (0.92) :  http://edamontology.org/topic_0804
Immunogen design (0.875) :  http://edamontology.org/operation_0332
im (0.8222222222222222) :  http://edamontology.org/format_3593
Peptide immunogenicity data (0.7407407407407408) :  http://edamontology.org/data_1534
is function of (0.6507936507936508) :  http://edamontology.org/is_function_of
has function (0.5833333333333334) :  http://edamontology.org/has_function
Documentation (0.5747863247863249) :  http://edamontology.org/documentation
Citation (0.5138888888888888) :  http://edamontology.org/data_0970


LimitedSizeDict([('Immunology', 0.92),
                 ('Immunogen design', 0.875),
                 ('im', 0.8222222222222222),
                 ('Peptide immunogenicity data', 0.7407407407407408),
                 ('is function of', 0.6507936507936508),
                 ('has function', 0.5833333333333334),
                 ('Documentation', 0.5747863247863249),
                 ('Citation', 0.5138888888888888)])

In [13]:
get_edam_top_10_jaro_ci("call peaks")

Peak calling (0.7351851851851853) :  http://edamontology.org/operation_3222
Logical operator (0.725) :  http://edamontology.org/data_2133
Molecular mass (0.7071428571428572) :  http://edamontology.org/data_0844
Media type (0.6) :  http://edamontology.org/media_type
Example (0.5738095238095239) :  http://edamontology.org/example
Created in (0.5166666666666667) :  http://edamontology.org/created_in
Citation (0.48333333333333334) :  http://edamontology.org/data_0970


LimitedSizeDict([('Peak calling', 0.7351851851851853),
                 ('Logical operator', 0.725),
                 ('Molecular mass', 0.7071428571428572),
                 ('Media type', 0.6),
                 ('Example', 0.5738095238095239),
                 ('Created in', 0.5166666666666667),
                 ('Citation', 0.48333333333333334)])

In [14]:
get_edam_top_10_jaro_ci("peak detect")

Peak detection (0.9571428571428571) :  http://edamontology.org/operation_3215
PED (0.8060606060606059) :  http://edamontology.org/format_3286
Peptide property (0.7606060606060606) :  http://edamontology.org/data_2979
InterPro detailed match image (0.684639498432602) :  http://edamontology.org/data_1291
Peptide identifier (0.6822390572390572) :  http://edamontology.org/data_0988
Peptide identification (0.6683501683501684) :  http://edamontology.org/operation_3631
deprecation_comment (0.6528442317916002) :  http://edamontology.org/deprecation_comment
Created in (0.604040404040404) :  http://edamontology.org/created_in
Citation (0.4772727272727273) :  http://edamontology.org/data_0970


LimitedSizeDict([('Peak detection', 0.9571428571428571),
                 ('PED', 0.8060606060606059),
                 ('Peptide property', 0.7606060606060606),
                 ('InterPro detailed match image', 0.684639498432602),
                 ('Peptide identifier', 0.6822390572390572),
                 ('Peptide identification', 0.6683501683501684),
                 ('deprecation_comment', 0.6528442317916002),
                 ('Created in', 0.604040404040404),
                 ('Citation', 0.4772727272727273)])

In [15]:
get_edam_top_10_jaro_ci("detect peaks")

EC (0.7222222222222223) :  http://edamontology.org/data_1011
Sequence feature key (0.6472222222222223) :  http://edamontology.org/data_1020
Sequence trace (0.6468253968253969) :  http://edamontology.org/data_0924
Sequence signature data (0.6298309178743962) :  http://edamontology.org/data_0860
Database entry (0.621031746031746) :  http://edamontology.org/data_0843
Directory metadata (0.6203703703703703) :  http://edamontology.org/data_0583
Media type (0.6182539682539682) :  http://edamontology.org/media_type
deprecation_comment (0.6125730994152047) :  http://edamontology.org/deprecation_comment
Created in (0.5055555555555555) :  http://edamontology.org/created_in
Citation (0.4305555555555555) :  http://edamontology.org/data_0970


LimitedSizeDict([('EC', 0.7222222222222223),
                 ('Sequence feature key', 0.6472222222222223),
                 ('Sequence trace', 0.6468253968253969),
                 ('Sequence signature data', 0.6298309178743962),
                 ('Database entry', 0.621031746031746),
                 ('Directory metadata', 0.6203703703703703),
                 ('Media type', 0.6182539682539682),
                 ('deprecation_comment', 0.6125730994152047),
                 ('Created in', 0.5055555555555555),
                 ('Citation', 0.4305555555555555)])

# Whoosh text search engine 
conda install -c conda-forge whoosh

In [16]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import StemmingAnalyzer

schema = Schema(uri=ID(stored=True),
                label=TEXT(stored=True),
                definition=TEXT(analyzer=StemmingAnalyzer()),
                synonyms=KEYWORD(stored=True))

In [27]:
import os.path
from whoosh.index import create_in

if not os.path.exists("index"):
    os.mkdir("index")
ix = create_in("index", schema)

In [28]:
from whoosh.index import open_dir

ix = open_dir("index")

In [32]:
writer = ix.writer()

for c, p1, o1 in G.triples((None, RDF.type, URIRef("http://www.w3.org/2002/07/owl#Class"))):
    for x, y, label in G.triples((c, RDFS.label, None)):
        #print(f"{str(c)} {str(label)}")
        writer.add_document(uri=str(c), label=str(label))
    
    for x, y, d in G.triples((c, URIRef("http://www.geneontology.org/formats/oboInOwl#hasDefinition"), None)):
        writer.update_document(uri=str(c), definition=str(d))
        #print(d)
    
    for x, y, s in G.triples((c, URIRef("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym"), None)):
        writer.update_document(uri=str(c), synonyms=str(s))
    
writer.commit()

In [37]:
from whoosh.qparser import MultifieldParser, QueryParser
from whoosh.query import Every

mparser = MultifieldParser(["definition"], schema=schema)
query = mparser.parse("nucle*some")
#query = mparser.parse("detect* AND peak")
#query = mparser.parse("peak AND format AND ENCODE")
##query = QueryParser("definition", schema).parse("sequence")

#results = ix.searcher().search(Every('label'))
#for r in results:
#    print(r, r.)

with ix.searcher() as searcher:
    results = searcher.search(query)
    for r in results:
            print(r['uri'], r)

http://edamontology.org/operation_0432 <Hit {'uri': 'http://edamontology.org/operation_0432'}>
http://edamontology.org/operation_0432 <Hit {'uri': 'http://edamontology.org/operation_0432'}>
http://edamontology.org/operation_0432 <Hit {'uri': 'http://edamontology.org/operation_0432'}>
http://edamontology.org/data_1306 <Hit {'uri': 'http://edamontology.org/data_1306'}>
http://edamontology.org/operation_0453 <Hit {'uri': 'http://edamontology.org/operation_0453'}>
http://edamontology.org/data_1306 <Hit {'uri': 'http://edamontology.org/data_1306'}>
http://edamontology.org/operation_0453 <Hit {'uri': 'http://edamontology.org/operation_0453'}>
http://edamontology.org/data_1306 <Hit {'uri': 'http://edamontology.org/data_1306'}>
http://edamontology.org/operation_0453 <Hit {'uri': 'http://edamontology.org/operation_0453'}>
http://edamontology.org/topic_3176 <Hit {'uri': 'http://edamontology.org/topic_3176'}>
