In [1]:
import projectpath

import collections
import json
import os
import pronto
import xmltodict

from kb import kb
from scheme import KbEntry, DbXref

# Knowledge-base handle; the load cells below write entries through KB.put(...).
# NOTE(review): what configure_kb() connects to is decided in the kb module, which is not shown here.
KB = kb.configure_kb()

# EC (from https://www.enzyme-database.org/downloads.php)

Hardly pretty, but this seems to be the definitive source. Relatively minimal load: ID, name, aka.

In [2]:
# Parse the whole mysqldump export in one pass rather than using xmltodict's
# streaming item callback.
#
# The file is opened in binary mode so that decoding is handled by the XML
# parser instead of the platform's default text encoding.
#
# force_list makes xmltodict always return a list for the repeated elements,
# even when only one is present (a dump with a single table, a table with a
# single row, a row with a single field). Without it a lone element comes back
# as a dict and the loops below would iterate over its keys.
with open(os.path.join('/home/fdrusso/work/data', 'enzyme-data.xml'), 'rb') as f:
    xml_dict = xmltodict.parse(f, force_list=('table_data', 'row', 'field'))

# Extract just the table data from the mysqldump. Convert all <field name='foo'>bar</field> to 'foo': 'bar'.
# This simplifies the structure substantially. Fields without text content map to None.
table_data = {}
for table in xml_dict['mysqldump']['database']['table_data']:
    # A table with no rows has no 'row' key at all; treat it as empty.
    rows = table.get('row') or []
    table_data[table['@name']] = [
        {field['@name']: field.get('#text') for field in row['field']}
        for row in rows
    ]

# Summarize how many rows were read from each table.
for k, v in table_data.items():
    print(f'{k:>6}: {len(v):5d} rows')

  cite: 19654 rows
 class:   397 rows
 entry:  7890 rows
  hist:  7890 rows
  html:  7890 rows
  refs: 15472 rows


## `entry` table contains only the fully specified leaf nodes

TODO: Do we also want the hierarchical nodes, from the `class` table?

In [3]:
# Display the second row (index 1) of the `entry` table to see which columns are available.
table_data['entry'][1]

{'ec_num': '1.1.1.2',
 'accepted_name': 'alcohol dehydrogenase (NADP+)',
 'reaction': 'an alcohol + NADP+ = an aldehyde + NADPH + H+',
 'other_names': 'aldehyde reductase (NADPH2); NADP-alcohol dehydrogenase; NADP+-aldehyde reductase; NADP+-dependent aldehyde reductase; NADPH-aldehyde reductase; NADPH-dependent aldehyde reductase; nonspecific succinic semialdehyde reductase; ALR 1; low-Km aldehyde reductase; high-Km aldehyde reductase; alcohol dehydrogenase (NADP)',
 'sys_name': 'alcohol:NADP+ oxidoreductase',
 'comments': 'A zinc protein. Some members of this group oxidize only primary alcohols; others act also on secondary alcohols. May be identical with EC 1.1.1.19 (L-glucuronate reductase), EC 1.1.1.33 [mevaldate reductase (NADPH)] and EC 1.1.1.55 [lactaldehyde reductase (NADPH)]. Re-specific with respect to NADPH.',
 'links': 'BRENDA, EXPASY, IUBMB, KEGG, PDB',
 'class': '1',
 'subclass': '1',
 'subsubclass': '1',
 'serial': '2',
 'status': None,
 'diagram': None,
 'cas_num': '902

In [4]:
%%time
# Load EC entries into the KB: ec_num as the ID, accepted_name as the name,
# systematic and other names as aliases, comments as the description.
#
# Row dicts built from the dump keep keys whose value is None (for example
# 'status' in the sample row), so testing `'accepted_name' in entry` does not
# filter out rows with an empty name. Test the value itself instead.
# NOTE(review): a non-empty accepted_name is assumed to be the indicator of a
# valid entry -- confirm against the `hist`/`status` data.
for entry in table_data['entry']:
    accepted_name = entry.get('accepted_name')
    if not accepted_name:
        continue

    kb_entry = KbEntry(_id = entry['ec_num'], name = accepted_name, aka=[])
    if entry.get('sys_name'):
        kb_entry.aka.append(entry['sys_name'])
    if entry.get('other_names'):
        # other_names is a single '; '-delimited string in the dump.
        kb_entry.aka.extend(entry['other_names'].split('; '))
    if entry.get('comments'):
        kb_entry.description = entry['comments']

    KB.put(KB.EC, kb_entry, bypass_cache=True)

CPU times: user 1.45 s, sys: 88.7 ms, total: 1.54 s
Wall time: 2.05 s


# GO (from http://geneontology.org/docs/download-ontology/)

- Using go.obo, presumably the full ontology?

In [5]:
# Read the GO ontology from the local OBO file, then report how many terms it holds.
go_obo_path = os.path.join('/home/fdrusso/work/data', 'go.obo')
go = pronto.Ontology(go_obo_path)

print(f'This version of GO has {len(go.terms())} terms.')

This version of GO has 47235 terms.


In [6]:
%%time
# Overrides for xref prefixes whose datasource label is not simply the
# upper-cased prefix (see extract_xref, which falls back to .upper()).
xref_source = dict([
    ('Reactome', 'REACT'),
    ('Wikipedia', 'WIKI'),
    ('KEGG_REACTION', 'KEGG'),
    ('KEGG_PATHWAY', 'KEGG'),
])

# Running count of how often each raw xref prefix is seen; filled in by extract_xref.
xref_sources = collections.defaultdict(int)

def extract_xref(xref):
    """Convert an ontology xref into a DbXref.

    The xref's `id` is split on ':'. When that yields exactly two parts, the
    first part is counted in `xref_sources` and translated through
    `xref_source` (falling back to the upper-cased prefix), and the second
    part becomes the DbXref's ID. Any other shape (no colon, or more than one)
    is returned as a DbXref with no datasource and the full, unsplit id.
    """
    prefix, *remainder = xref.id.split(':')
    if len(remainder) != 1:
        return DbXref(None, xref.id)

    xref_sources[prefix] += 1
    db = xref_source.get(prefix, prefix.upper())
    return DbXref(db, remainder[0])

# Store every non-obsolete GO term in the KB, keyed by the accession part of
# its 'GO:<accession>' id. Terms whose id does not have that shape are
# collected in not_parsed instead of being stored.
not_parsed = []
for term in go.terms():
    if term.obsolete:
        continue

    id_parts = term.id.split(':')
    is_go_id = len(id_parts) == 2 and id_parts[0] == 'GO'
    if not is_go_id:
        not_parsed.append(term)
        continue

    synonym_names = [synonym.description for synonym in term.synonyms]
    db_xrefs = {extract_xref(xref) for xref in term.xrefs}

    go_entry = KbEntry(
        _id = id_parts[1],
        name = term.name,
        # NOTE(review): if a term can lack a definition, this renders the
        # literal text 'None' -- confirm whether that occurs in go.obo.
        description = f'[{term.namespace}] {term.definition}',
        # Empty collections are stored as None rather than as [] / set().
        aka = synonym_names or None,
        xrefs = db_xrefs or None
    )
    KB.put(KB.GO, go_entry, bypass_cache=True)



CPU times: user 12.7 s, sys: 826 ms, total: 13.6 s
Wall time: 20 s
