In [1]:
import collections
import copy
from ipydatagrid import DataGrid
import ipywidgets as widgets
import pandas as pd
from IPython.display import display

from mosmo.knowledge import kb
from mosmo.model import DS, DbXref, KbEntry, Molecule, Reaction, Pathway

# Data files are resources in the `data` package
from importlib.resources import files
import data

def datafile(filename):
    """Return the location of `filename` inside the `data` resource package."""
    package_root = files(data)
    return package_root.joinpath(filename)

# Shared KnowledgeBase handle; every cell below reads from and writes to the KB through it.
KB = kb.configure_kb()

## Semi-ambitious - big enough to be worth doing, not too big to be manageable

__Notes__
- We assume that most of the sugars already exist in the KnowledgeBase as a result of the notebook `Sugar Forms and Variations`, as well as the core nucleotides and deoxynucleotides, via `Nucleotides in the KB`.
- This notebook builds up the reactions of central carbon metabolism, as well as the remaining metabolites not covered by the systematic approaches above.
- We use ChEBI as the origin for most metabolite definitions
  - No access or licensing restrictions
  - The precision and granularity we need, e.g. in terms of protonation states, isomeric forms, etc.
- Likewise, RHEA is the origin for reaction definitions.
- However, those sources themselves do not serve our overall goals. So we construct and refine canonical KB versions of all the relevant items. The path is
> RHEA reaction -> ChEBI compound -> KB compound -> KB reaction
- We use EcoCyc as a source to list the core reactions for glycolysis, pentose phosphate, and the TCA cycle, but only as a convenient way to identify the relevant RHEA reactions.

In [2]:
# EcoCyc reaction listings for each core pathway, stored as tab-separated tables.
glycolysis_df = pd.read_csv(datafile('glycolysis.tsv'), sep='\t')
ppp_df = pd.read_csv(datafile('pentose_phosphate.tsv'), sep='\t')
tca_df = pd.read_csv(datafile('tca.tsv'), sep='\t')

# Reactions that are not listed in the three pathway tables but link them together.
extra_ecocyc_ids = (
    'PGLUCONDEHYDRAT-RXN',  # Entner-Doudoroff
    'KDPGALDOL-RXN',  # Entner-Doudoroff
    'PYRUVDEH-RXN',  # connect glycolysis -> TCA
    'ISOCIT-CLEAV-RXN',  # glyoxylate shunt
    'MALSYN-RXN',  # glyoxylate shunt
)

# De-duplicated EcoCyc reaction IDs across all sources.
ecocyc_ids = {
    *glycolysis_df['Reaction id'],
    *ppp_df['Reaction id'],
    *tca_df['Reaction id'],
    *extra_ecocyc_ids,
}

# For each EcoCyc reaction ID, the RHEA reactions the KB cross-references to it.
ecocyc_to_rhea = {}
for ecocyc_id in ecocyc_ids:
    ecocyc_to_rhea[ecocyc_id] = KB.xref(KB.RHEA, ecocyc_id)

# Numbered summary, in EcoCyc ID order, of the RHEA reaction names found for each ID.
for position, ecocyc_id in enumerate(sorted(ecocyc_to_rhea), start=1):
    rhea_names = [r.name for r in ecocyc_to_rhea[ecocyc_id]]
    print(position, ecocyc_id, rhea_names)

1 1TRANSKETO-RXN ['transketolase']
2 2OXOGLUTARATEDEH-RXN ['2-oxoglutarate dehydrogenase system']
3 2PGADEHYDRAT-RXN ['phosphopyruvate hydratase']
4 2TRANSKETO-RXN ['transketolase']
5 3PGAREARR-RXN ['* phosphoglycerate mutase (2,3-diphosphoglycerate-independent)']
6 6PFRUCTPHOS-RXN ['6-phosphofructokinase']
7 6PGLUCONOLACT-RXN ['6-phosphogluconolactonase']
8 ACONITATEDEHYDR-RXN ['citrate = cis-aconitate + H2O']
9 ACONITATEHYDR-RXN ['cis-aconitate + H2O = D-threo-isocitrate']
10 CITSYN-RXN ['* citrate (Re)-synthase']
11 F16ALDOLASE-RXN ['fructose-bisphosphate aldolase']
12 F16BDEPHOS-RXN ['fructose-bisphosphatase']
13 FUMHYDR-RXN ['fumarate hydratase']
14 GAPOXNPHOSPHN-RXN ['* glyceraldehyde-3-phosphate dehydrogenase (NAD(P)+) (phosphorylating)']
15 GLU6PDEHYDROG-RXN ['* glucose-6-phosphate dehydrogenase [NAD(P)+]']
16 ISOCIT-CLEAV-RXN ['isocitrate lyase']
17 ISOCITDEH-RXN ['isocitrate dehydrogenase (NADP+)']
18 KDPGALDOL-RXN ['* 2-dehydro-3-deoxy-phosphogluconate aldolase']
19 MALATE-D

- RXN-15513 is GpmA, 2,3-bisphosphoglycerate-dependent phosphoglycerate mutase. 3PGAREARR-RXN/GpmM is the same net reaction, so this isn't blocking. But at some point we do want it correct in the KB (with separate EC numbers).
- [RESOLVED] SUCCINATE-DEHYDROGENASE-UBIQUINONE-RXN is RHEA:13714 (master: RHEA:13713; EC:1.3.5.1). Present on rhea-db.org, but not in `ref`. Need to investigate why it did not load
    - ubiquinone !

## Get all the metabolites where we want them to be
- In the KB
- Canonical
    - Intuitive names, e.g. ATP not ATP(2-), D-Glucose 6-phosphate not β-D-glucopyranose 6-phosphate(4-)
        - But with technically correct default forms at pH 7.3
    - β-D-fructofuranose 6-phosphate(2-) and keto-D-fructose 6-phosphate(2-) are both just D-Fructose 6P for the purpose of connecting PGI and PFK

### Collect, load, canonicalize

In [3]:
# Flatten the per-EcoCyc lists of RHEA reactions into one dict keyed by reaction ID.
rhea_rxns = {}
for rxn_list in ecocyc_to_rhea.values():
    for rxn in rxn_list:
        rhea_rxns[rxn.id] = rxn

# Collect and load the metabolites referred to in these reactions, by ChEBI ID.
# Each ID is fetched from the KB's ChEBI dataset only once.
by_chebi_id = {}
for rxn in rhea_rxns.values():
    for met in rxn.stoichiometry:
        if met.id in by_chebi_id:
            continue
        by_chebi_id[met.id] = KB.get(KB.CHEBI, met.id)

# Look for a KB compound cross-referenced to each ChEBI ID and follow it to its canonical form.
# When one is found it replaces the ChEBI molecule in `by_chebi_id`, so that several ChEBI IDs
# resolving to the same KB compound can be detected below.
kb_mets = {}
for chebi_id in list(by_chebi_id):
    hits = KB.xref(KB.compounds, DbXref(db=DS.CHEBI, id=chebi_id))
    if not hits:
        continue
    if len(hits) > 1:
        print(f'{len(hits)} hits to CHEBI:{chebi_id}')

    # Only the first hit is used, even when several were reported above.
    mol = hits[0]
    if mol.canonical_form is not None:
        mol = KB.get(KB.compounds, mol.canonical_form.parent_id)

    kb_mets[mol.id] = mol
    by_chebi_id[chebi_id] = mol

unique_mets = set(by_chebi_id.values())
print(f'{len(by_chebi_id)} metabolites, {len(unique_mets)} unique.')

# Invert the mapping to report which ChEBI IDs collapsed onto the same metabolite.
by_met = collections.defaultdict(list)
for chebi_id, met in by_chebi_id.items():
    by_met[met].append(chebi_id)
for met, chebi_ids in by_met.items():
    if len(chebi_ids) <= 1:
        continue
    print(f'RHEA metabolites {chebi_ids} ==> {met.id}')

47 metabolites, 45 unique.
RHEA metabolites ['57634', '57579'] ==> Fru.D.6P
RHEA metabolites ['61548', '57584'] ==> Glc.D.6P


### Clean up everything not yet in the KB

In [4]:
# One editable row per metabolite that has no KB compound yet, ordered by name.
# `kb_id` starts blank; the next cell only processes rows where it has been filled in.
todo = [
    {
        'chebi_id': met.id,
        'kb_id': '',
        'name': met.name,
        'shorthand': met.shorthand or '',
        'formula': met.formula,
        'mass': met.mass or '',
        'charge': met.charge,
    }
    for met in sorted(by_chebi_id.values(), key=lambda m: m.name)
    if met.id not in kb_mets
]

if not todo:
    met_grid = None
    print('All referenced molecules are in the KB.')
else:
    todo_df = pd.DataFrame(todo).set_index('chebi_id')
    met_grid = DataGrid(todo_df, editable=True)
    display(met_grid)

All referenced molecules are in the KB.


In [5]:
# Build a KB Molecule for every grid row that was given a KB ID.
# Rows whose `kb_id` is still blank are skipped.
kb_update = {}
if met_grid is not None:
    for chebi_id, row in met_grid.data.iterrows():
        if not row.kb_id:
            continue
        chebi_mol = by_chebi_id[chebi_id]

        # Preserve the old name as an aka, and remove the new one if it was an aka.
        # Work on a copy so the source molecule's own aka list is never modified.
        aka = list(chebi_mol.aka or [])
        if row['name'] != chebi_mol.name:
            aka.insert(0, chebi_mol.name)
            if row['name'] in aka:
                aka.remove(row['name'])

        # Keep an xref to the ChEBI mol. The grid index is used as the ChEBI ID because
        # `by_chebi_id[chebi_id]` is replaced by the KB molecule below, so on a re-run of
        # this cell `chebi_mol.id` would be the KB ID rather than the ChEBI ID.
        xrefs = {DbXref(db=DS.CHEBI, id=chebi_id)}
        if chebi_mol.xrefs:
            xrefs.update(chebi_mol.xrefs)

        # A blank cell ('' when the source molecule had no mass) or NaN means "no mass".
        mass_missing = pd.isna(row.mass) or row.mass == ''

        kb_mol = Molecule(
            id=row.kb_id,
            name=row['name'],
            shorthand=row.shorthand or None,
            description=chebi_mol.description or None,
            aka=aka or None,
            xrefs=xrefs,
            formula=row.formula or None,
            mass=None if mass_missing else row.mass,
            charge=row.charge,
            inchi=chebi_mol.inchi or None,
        )
        kb_update[row.kb_id] = kb_mol
        kb_mets[row.kb_id] = kb_mol
        by_chebi_id[chebi_id] = kb_mol
print(f'{len(kb_update)} molecules pending update in the KB')

0 molecules pending update in the KB


In [6]:
# Put each pending molecule into the KB's compounds dataset.
for pending_mol in kb_update.values():
    KB.put(KB.compounds, pending_mol)

## Build KB versions of all reactions

In [7]:
# Find the KB reaction (if any) cross-referenced to each RHEA reaction.
kb_rxns = {}
for rhea_id in rhea_rxns.keys():
    rxns = KB.xref(KB.reactions, DbXref(db=DS.RHEA, id=rhea_id))
    if rxns:
        if len(rxns) > 1:
            print(f'{len(rxns)} hits to RHEA:{rhea_id}')
        # Only the first hit is used, even when several were reported above.
        kb_rxns[rhea_id] = rxns[0]
print(f'{len(rhea_rxns)} reactions cross-referenced into {len(kb_rxns)} KB reactions')

# One editable row per RHEA reaction that has no KB reaction yet, ordered by name.
# `kb_id` starts blank; the next cell only processes rows where it has been filled in.
todo = []
for rxn in sorted(rhea_rxns.values(), key=lambda r: r.name):
    if rxn.id not in kb_rxns:
        todo.append({
            'rhea_id': rxn.id,
            'kb_id': '',
            'name': rxn.name,
            'shorthand': rxn.shorthand or '',
            'catalyst': rxn.catalyst or '',
            'rev': rxn.reversible,
            # Participants are shown by the label of their (KB, where available) molecule.
            'reactants': ', '.join(by_chebi_id[met.id].label for met in rxn.stoichiometry),
            # A reaction may carry no xrefs at all; show that as an empty cell instead of failing.
            'xrefs': ', '.join(sorted(str(xref) for xref in (rxn.xrefs or ()))),
        })

rxn_grid = None
if todo:
    rxn_grid = DataGrid(pd.DataFrame(todo).set_index('rhea_id'), editable=True)
    display(rxn_grid)
else:
    print('All referenced reactions are in the KB.')

34 reactions cross-referenced into 34 KB reactions
All referenced reactions are in the KB.


In [8]:
# Build a KB Reaction for every grid row that was given a KB ID.
# Rows whose `kb_id` is still blank are skipped.
kb_rxns_update = {}
if rxn_grid is not None:
    for rhea_id, row in rxn_grid.data.iterrows():
        if not row.kb_id:
            continue
        rhea_rxn = rhea_rxns[rhea_id]

        # Keep an xref to the Rhea reaction
        xrefs = {DbXref(db=DS.RHEA, id=rhea_id)}
        if rhea_rxn.xrefs:
            xrefs.update(rhea_rxn.xrefs)

        # Map metabolites to KB
        stoichiometry = {by_chebi_id[met.id]: count for met, count in rhea_rxn.stoichiometry.items()}

        # NOTE(review): the catalyst Molecule is given the reaction's name as its own name;
        # presumably a placeholder -- confirm this is intended.
        catalyst = Molecule(id=row.catalyst, name=row['name']) if row.catalyst else None

        kb_rxn = Reaction(
            id=row.kb_id,
            name=row['name'],
            shorthand=row.shorthand or None,
            description=rhea_rxn.description or None,
            aka=rhea_rxn.aka or None,
            xrefs=xrefs,
            catalyst=catalyst,
            stoichiometry=stoichiometry,
            reversible=row.rev,
        )
        kb_rxns_update[row.kb_id] = kb_rxn
        kb_rxns[rhea_id] = kb_rxn
# Report the count even when there was no grid, matching the metabolite update cell.
print(f'{len(kb_rxns_update)} reactions to update in the KB')

In [9]:
# Put each pending reaction into the KB's reactions dataset.
for pending_rxn in kb_rxns_update.values():
    KB.put(KB.reactions, pending_rxn)