In [1]:
import collections
import gzip
import json
import pronto
import pymongo
import rdflib
import urllib.request

from mosmo.knowledge import kb
from mosmo.model import DS, KbEntry, DbXref

KB = kb.configure_kb()

# EC (from https://ftp.expasy.org/databases/enzyme/)

In [2]:
%%time
# Get the data fresh from the download site
request = urllib.request.urlopen("https://ftp.expasy.org/databases/enzyme/enzyme.rdf")
ec_rdf = rdflib.Graph().parse(request, format="application/rdf+xml")

# Set up the namespaces we need to navigate the structure
ec = rdflib.namespace.Namespace('http://purl.uniprot.org/enzyme/')
ec_rdf.bind('ec', ec)

namespaces = {k: v for k, v in ec_rdf.namespaces()}
core = rdflib.namespace.Namespace(namespaces[''])
rdf = rdflib.namespace.Namespace(namespaces['rdf'])
rdfs = rdflib.namespace.Namespace(namespaces['rdfs'])
skos = rdflib.namespace.Namespace(namespaces['skos'])

CPU times: user 12.1 s, sys: 117 ms, total: 12.2 s
Wall time: 14.6 s


In [3]:
entries = {}
for ecnum in ec_rdf.subjects(rdf.type, core.Enzyme):
    if ec_rdf.value(ecnum, core.obsolete):
        continue

    id = ecnum.removeprefix(ec)
    entries[id] = KbEntry(
        id = id,
        db = DS.EC,
        name = ec_rdf.value(ecnum, skos.prefLabel).toPython(),
        aka = [altname.toPython() for altname in ec_rdf.objects(ecnum, skos.altLabel)] or None,
        description = '\n'.join(comment.toPython() for comment in ec_rdf.objects(ecnum, rdfs.comment)) or None,
    )

print(f"{len(entries)} entries extracted")

7251 entries extracted


### Uncomment this cell to write to the db

In [4]:
# %%time
# collection = KB.client[KB.EC.client_db][KB.EC.collection]
# collection.drop()
# for entry in entries.values():
#     KB.put(KB.EC, entry, bypass_cache=True)
# collection.create_index('name', name='name', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# collection.create_index('aka', name='aka', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# print("success")

# GO (from https://purl.obolibrary.org/obo/go.obo)

In [5]:
%%time
# pronto.Ontology has built-in functionality that takes care of all the details for us
go = pronto.Ontology.from_obo_library('go.obo')
print(f'This version of GO has {len(go.terms())} terms.')

This version of GO has 47904 terms.
CPU times: user 1min 42s, sys: 12.4 s, total: 1min 54s
Wall time: 19.8 s


In [6]:
%%time
source_name = {
    'Reactome': 'REACT',
    'Wikipedia': 'WIKI',
    'KEGG_REACTION': 'KEGG',
    'KEGG_PATHWAY': 'KEGG',
}
source_count = collections.defaultdict(int)

def extract_xref(xref):
    parts = xref.id.split(':')
    if len(parts) == 2:
        db = DS.get(source_name.get(parts[0], parts[0].upper()))
        id = parts[1]
        source_count[db] += 1
        return DbXref(db=db, id=id)
    else:
        return DbXref(db=None, id=xref.id)

go_entries = {}
not_parsed = []
for term in go.terms():
    if term.obsolete:
        continue

    id_parts = term.id.split(':')
    if len(id_parts) == 2 and id_parts[0] == 'GO':
        go_entries[id_parts[1]] = KbEntry(
            id = id_parts[1],
            db = DS.GO,
            name = term.name,
            description = f'[{term.namespace}] {term.definition}',
            aka = [synonym.description for synonym in term.synonyms] or None,
            xrefs = {extract_xref(xref) for xref in term.xrefs} or None
        )
    else:
        not_parsed.append(term)

print("XRef usage")
for source in sorted(source_count.keys()):
    print(f'{source.id:>12}: {source_count[source]}')


XRef usage
      BIOCYC: 1
       CORUM: 12
          EC: 4845
        FBBT: 1
         FMA: 5
        HTTP: 1
       HTTPS: 2
      INTACT: 3
    INTERPRO: 1
        KEGG: 1745
 KEGG.MODULE: 1
     METACYC: 5149
          MP: 1
NIF_SUBCELLULAR: 223
        PMID: 1
          PO: 1
       REACT: 6167
       RESID: 139
        RHEA: 4485
    SABIO-RK: 2
          SO: 3
          TC: 52
UM-BBD_ENZYMEID: 61
UM-BBD_PATHWAYID: 123
UM-BBD_REACTIONID: 273
  UNIPATHWAY: 2
UNIPROTKB-KW: 1
          VZ: 139
        WBBT: 2
        WIKI: 758
   WIKIPEDIA: 2
CPU times: user 512 ms, sys: 0 ns, total: 512 ms
Wall time: 510 ms


In [7]:
go_entries['0000050'].__dict__

{'id': '0000050',
 'db': Datasource(id='GO', name='Gene Ontology', home='http://geneontology.org/', urlpat={<class 'mosmo.model.base.KbEntry'>: 'http://amigo.geneontology.org/amigo/term/GO:{id}'}),
 'name': 'urea cycle',
 'shorthand': None,
 'description': '[biological_process] The sequence of reactions by which arginine is synthesized from ornithine, then cleaved to yield urea and regenerate ornithine. The overall reaction equation is NH3 + CO2 + aspartate + 3 ATP + 2 H2O = urea + fumarate + 2 ADP + 2 phosphate + AMP + diphosphate.',
 'aka': ['urea biosynthesis', 'ornithine cycle', 'urea biosynthetic process'],
 'xrefs': {WIKI:Urea_cycle}}

### Uncomment this cell to write GO entries to the db

In [8]:
# %%time
# collection = KB.client[KB.GO.client_db][KB.GO.collection]
# collection.drop()
# for go_entry in go_entries.values():
#     KB.put(KB.GO, go_entry, bypass_cache=True)
# collection.create_index('name', name='name', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# collection.create_index('aka', name='aka', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# collection.create_index([('xrefs.id', pymongo.ASCENDING), ('xrefs.db', pymongo.ASCENDING)],
#                         name='xrefs',
#                         collation=pymongo.collation.Collation(locale='en_US', strength=1))
# print("success")