In [1]:
import collections
import gzip
import json
import pronto
import pymongo
import urllib.request
import xml.etree.ElementTree as ET

from mosmo.knowledge import kb
from mosmo.model import DS, KbEntry, DbXref

KB = kb.configure_kb()

# EC (from https://www.enzyme-database.org/)

Hardly pretty, but this seems to be the definitive source. Relatively minimal load: ID, name, aka.

In [2]:
%%time
gzdata = urllib.request.urlopen('https://www.enzyme-database.org/downloads/enzyme-data.xml.gz')
data = gzip.GzipFile(fileobj=gzdata)
tree = ET.parse(data)

CPU times: user 2.72 s, sys: 144 ms, total: 2.87 s
Wall time: 3.99 s


In [3]:
%%time
entry_rows = tree.getroot().findall('./database/table_data[@name="entry"]/row')
entries = []
for row in entry_rows:
    entry = {}
    for field in row.findall('field'):
        if field.attrib['name'] in {'ec_num', 'accepted_name', 'sys_name', 'other_names', 'comments'}:
            entry[field.attrib['name']] = field.text

    if 'ec_num' in entry and 'accepted_name' in entry:
        kb_entry = KbEntry(id = entry['ec_num'], db=DS.EC, name = entry['accepted_name'], aka=[])
        if entry['sys_name']:
            kb_entry.aka.append(entry['sys_name'])
        if entry['other_names']:
            kb_entry.aka.extend(entry['other_names'].split('; '))
        if entry['comments']:
            kb_entry.description = entry['comments']

        entries.append(kb_entry)
print(f"{len(entries)} entries extracted from {len(entry_rows)} table rows")

8201 entries extracted from 8201 table rows
CPU times: user 61.6 ms, sys: 4.1 ms, total: 65.7 ms
Wall time: 64.5 ms


In [4]:
entries[2].__dict__

{'id': '1.1.1.3',
 'db': Datasource(id='EC', name='Enzyme Commission', home='https://enzyme.expasy.org/', urlpat={<class 'mosmo.model.base.KbEntry'>: 'https://enzyme.expasy.org/EC/{id}'}),
 'name': 'homoserine dehydrogenase',
 'shorthand': None,
 'description': 'The yeast enzyme acts most rapidly with NAD+; the Neurospora enzyme with NADP+. The enzyme from Escherichia coli is a multi-functional protein, which also catalyses the reaction of EC 2.7.2.4 (aspartate kinase).',
 'aka': ['L-homoserine:NAD(P)+ oxidoreductase', 'HSDH', 'HSD'],
 'xrefs': None}

### Uncomment this cell to write to the db

In [6]:
# %%time
# collection = KB.client[KB.EC.client_db][KB.EC.collection]
# collection.drop()
# for entry in entries:
#     KB.put(KB.EC, entry, bypass_cache=True)
# collection.create_index('name', name='name', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# collection.create_index('aka', name='aka', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# print("success")

success
CPU times: user 1.05 s, sys: 70.4 ms, total: 1.12 s
Wall time: 1.87 s


# GO (from https://purl.obolibrary.org/obo/go.obo)

In [7]:
%%time
# pronto.Ontology has built-in functionality that takes care of all the details for us
go = pronto.Ontology.from_obo_library('go.obo')
print(f'This version of GO has {len(go.terms())} terms.')

This version of GO has 47904 terms.
CPU times: user 5min 22s, sys: 40.2 s, total: 6min 3s
Wall time: 50.4 s


In [8]:
%%time
source_name = {
    'Reactome': 'REACT',
    'Wikipedia': 'WIKI',
    'KEGG_REACTION': 'KEGG',
    'KEGG_PATHWAY': 'KEGG',
}
source_count = collections.defaultdict(int)

def extract_xref(xref):
    parts = xref.id.split(':')
    if len(parts) == 2:
        db = DS.get(source_name.get(parts[0], parts[0].upper()))
        id = parts[1]
        source_count[db] += 1
        return DbXref(db=db, id=id)
    else:
        return DbXref(db=None, id=xref.id)

go_entries = {}
not_parsed = []
for term in go.terms():
    if term.obsolete:
        continue

    id_parts = term.id.split(':')
    if len(id_parts) == 2 and id_parts[0] == 'GO':
        go_entries[id_parts[1]] = KbEntry(
            id = id_parts[1],
            db = DS.GO,
            name = term.name,
            description = f'[{term.namespace}] {term.definition}',
            aka = [synonym.description for synonym in term.synonyms] or None,
            xrefs = {extract_xref(xref) for xref in term.xrefs} or None
        )
    else:
        not_parsed.append(term)

print("XRef usage")
for source in sorted(source_count.keys()):
    print(f'{source.id:>12}: {source_count[source]}')


XRef usage
      BIOCYC: 1
       CORUM: 12
          EC: 4845
        FBBT: 1
         FMA: 5
        HTTP: 1
       HTTPS: 2
      INTACT: 3
    INTERPRO: 1
        KEGG: 1745
 KEGG.MODULE: 1
     METACYC: 5149
          MP: 1
NIF_SUBCELLULAR: 223
        PMID: 1
          PO: 1
       REACT: 6167
       RESID: 139
        RHEA: 4485
    SABIO-RK: 2
          SO: 3
          TC: 52
UM-BBD_ENZYMEID: 61
UM-BBD_PATHWAYID: 123
UM-BBD_REACTIONID: 273
  UNIPATHWAY: 2
UNIPROTKB-KW: 1
          VZ: 139
        WBBT: 2
        WIKI: 758
   WIKIPEDIA: 2
CPU times: user 460 ms, sys: 7.96 ms, total: 468 ms
Wall time: 468 ms


In [9]:
go_entries['0000050'].__dict__

{'id': '0000050',
 'db': Datasource(id='GO', name='Gene Ontology', home='http://geneontology.org/', urlpat={<class 'mosmo.model.base.KbEntry'>: 'http://amigo.geneontology.org/amigo/term/GO:{id}'}),
 'name': 'urea cycle',
 'shorthand': None,
 'description': '[biological_process] The sequence of reactions by which arginine is synthesized from ornithine, then cleaved to yield urea and regenerate ornithine. The overall reaction equation is NH3 + CO2 + aspartate + 3 ATP + 2 H2O = urea + fumarate + 2 ADP + 2 phosphate + AMP + diphosphate.',
 'aka': ['urea biosynthetic process', 'urea biosynthesis', 'ornithine cycle'],
 'xrefs': {WIKI:Urea_cycle}}

### Uncomment this cell to write GO entries to the db

In [10]:
# %%time
# collection = KB.client[KB.GO.client_db][KB.GO.collection]
# collection.drop()
# for go_entry in go_entries.values():
#     KB.put(KB.GO, go_entry, bypass_cache=True)
# collection.create_index('name', name='name', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# collection.create_index('aka', name='aka', collation=pymongo.collation.Collation(locale='en_US', strength=1))
# collection.create_index([('xrefs.id', pymongo.ASCENDING), ('xrefs.db', pymongo.ASCENDING)],
#                         name='xrefs',
#                         collation=pymongo.collation.Collation(locale='en_US', strength=1))
# print("success")

success
CPU times: user 5.39 s, sys: 509 ms, total: 5.89 s
Wall time: 9.68 s
