In [1]:
import projectpath

import collections
import copy
import os

import ipywidgets as widgets
import ipysheet
import pandas as pd

from mosmo.knowledge import kb
from mosmo.model import DS, DbXref, KbEntry, Molecule, Reaction, Pathway

# Knowledge-base handle; the cells below use it for lookups (KB.get, KB.xref) and writes (KB.put).
KB = kb.configure_kb()

### Semi-ambitious - big enough to be worth doing, not too big to be manageable

- Use EcoCyc as a source for the pathways (glycolysis, pentose phosphate, TCA cycle)
- Cross-reference into refdb.RHEA to pull out reference reactions
- Expect to need some generalization / specialization magic to navigate different forms of compounds referenced throughout
    - sugars: D/L, ring/chain, phospho-, deoxy-?, protonation
    - ATP etc: protonation
    - redox carriers: redox state
    - cofactors ...?

Ultimate outcome:
- catalog of all reactions
    - and catalysts? Feels like all the information _should_ be there but not necessarily in a way that avoids a lot of manual work
- catalog of metabolites, in roles
    - carbon input / output
        - though for central metabolism, isn't pretty much every intermediate also an output?
    - 'true' intermediates, though see above
    - energy input / output
    - redox input / output
    - recycled carriers
    - other? TBD

In [2]:
# Directory holding the EcoCyc pathway exports. The default is the original
# location; set MOSMO_DATA_DIR to run the notebook against a different copy.
data_dir = os.environ.get('MOSMO_DATA_DIR', '/home/fdrusso/work/data')


def read_ecocyc_pathway(filename):
    """Read one tab-separated EcoCyc pathway export from data_dir, skipping its first two lines."""
    return pd.read_csv(os.path.join(data_dir, filename), skiprows=2, sep='\t')


glycolysis_df = read_ecocyc_pathway('ecocyc_glycolysis.txt')
ppp_df = read_ecocyc_pathway('ecocyc_pentose_phosphate.txt')
tca_df = read_ecocyc_pathway('ecocyc_tca.txt')

# Map every EcoCyc reaction id (from the three exports, plus a few added by hand)
# to the list of RHEA reactions the KB cross-references it to.
ecocyc_to_rhea = {ecocyc_id: KB.xref(KB.RHEA, ecocyc_id)
                  for ecocyc_id in {*glycolysis_df['Reaction id'],
                                    *ppp_df['Reaction id'],
                                    *tca_df['Reaction id'],
                                    'PGLUCONDEHYDRAT-RXN',  # Entner-Doudoroff
                                    'KDPGALDOL-RXN',  # Entner-Doudoroff
                                    'PYRUVDEH-RXN',  # connect glycolysis -> TCA
                                    'ISOCIT-CLEAV-RXN',  # glyoxylate shunt
                                    'MALSYN-RXN',  # glyoxylate shunt
                                   }}

# Numbered listing (sorted by EcoCyc id) of the RHEA reaction names found for each id.
for i, ecocyc_id in enumerate(sorted(ecocyc_to_rhea), start=1):
    print(i, ecocyc_id, [r.name for r in ecocyc_to_rhea[ecocyc_id]])

1 1TRANSKETO-RXN ['transketolase']
2 2OXOGLUTARATEDEH-RXN ['2-oxoglutarate dehydrogenase system']
3 2PGADEHYDRAT-RXN ['phosphopyruvate hydratase']
4 2TRANSKETO-RXN ['transketolase']
5 3PGAREARR-RXN ['* phosphoglycerate mutase (2,3-diphosphoglycerate-independent)']
6 6PFRUCTPHOS-RXN ['6-phosphofructokinase']
7 6PGLUCONOLACT-RXN ['6-phosphogluconolactonase']
8 ACONITATEDEHYDR-RXN ['citrate = cis-aconitate + H2O']
9 ACONITATEHYDR-RXN ['cis-aconitate + H2O = D-threo-isocitrate']
10 CITSYN-RXN ['* citrate (Re)-synthase']
11 F16ALDOLASE-RXN ['fructose-bisphosphate aldolase']
12 F16BDEPHOS-RXN ['fructose-bisphosphatase']
13 FUMHYDR-RXN ['fumarate hydratase']
14 GAPOXNPHOSPHN-RXN ['* glyceraldehyde-3-phosphate dehydrogenase (NAD(P)+) (phosphorylating)']
15 GLU6PDEHYDROG-RXN ['* glucose-6-phosphate dehydrogenase [NAD(P)+]']
16 ISOCIT-CLEAV-RXN ['isocitrate lyase']
17 ISOCITDEH-RXN ['isocitrate dehydrogenase (NADP+)']
18 KDPGALDOL-RXN ['* 2-dehydro-3-deoxy-phosphogluconate aldolase']
19 MALATE-D

- RXN-15513 is GpmA, 2,3-bisphosphoglycerate-dependent phosphoglycerate mutase. 3PGAREARR-RXN/GpmM is the same net reaction, so this isn't blocking. But at some point we do want it correct in the KB (with separate EC numbers).
- [RESOLVED] SUCCINATE-DEHYDROGENASE-UBIQUINONE-RXN is RHEA:13714 (master: RHEA:13713; EC:1.3.5.1). Present on rhea-db.org, but not in REFDB. Need to investigate why it did not load
    - ubiquinone !

## Get all the metabolites where we want them to be
- In the KB
- Canonical
    - Intuitive names, e.g. ATP not ATP(2-), D-Glucose 6-phosphate not β-D-glucopyranose 6-phosphate(4-)
        - But with technically correct default forms at pH 7.3
    - β-D-fructofuranose 6-phosphate(2-) and keto-D-fructose 6-phosphate(2-) are both just D-Fructose 6P for the purpose of connecting PGI and PFK

### Collect, load, canonicalize

In [3]:
# Flatten the per-EcoCyc lists into a single dict of RHEA reactions keyed by reaction id.
rhea_rxns = {}
for rxn_group in ecocyc_to_rhea.values():
    for rxn in rxn_group:
        rhea_rxns[rxn.id] = rxn

# Load each metabolite referred to by these reactions once, keyed by its ChEBI ID.
rhea_mets = {}
for rxn in rhea_rxns.values():
    for participant in rxn.stoichiometry:
        if participant.id in rhea_mets:
            continue
        rhea_mets[participant.id] = KB.get(KB.CHEBI, participant.id)

# Where the KB has a compound cross-referenced to the ChEBI entry, swap it into
# rhea_mets, following the link to its canonical form when it has one.
kb_mets = {}
for chebi_id in list(rhea_mets):
    mols = KB.xref(KB.compounds, 'CHEBI:' + chebi_id)
    if not mols:
        continue
    if len(mols) > 1:
        print(f'{len(mols)} hits to CHEBI:{chebi_id}')
    resolved = mols[0]
    kb_mets[resolved.id] = resolved

    if resolved.canonical_form is not None:
        canonical_id = resolved.canonical_form.parent_id
        # Load the canonical compound only once; later hits share the same instance.
        if canonical_id not in kb_mets:
            kb_mets[canonical_id] = KB.get(KB.compounds, canonical_id)
        resolved = kb_mets[canonical_id]

    rhea_mets[chebi_id] = resolved

# Distinct metabolites after canonicalization, ordered by id.
mets = list(set(rhea_mets.values()))
mets.sort(key=lambda m: str(m.id))
print(f'{len(rhea_mets)} metabolites, {len(mets)} unique.')

# Report every case where several ChEBI ids ended up on the same metabolite.
by_met = collections.defaultdict(list)
for chebi_id, met in rhea_mets.items():
    by_met[met].append(chebi_id)
for met, chebi_ids in by_met.items():
    if len(chebi_ids) <= 1:
        continue
    print(f'RHEA metabolites {chebi_ids} ==> {met.id}')



47 metabolites, 45 unique.
RHEA metabolites ['57584', '61548'] ==> Glc.D.6P
RHEA metabolites ['57579', '57634'] ==> Fru.D.6P


### Clean up everything not yet in the KB

In [4]:
# One worksheet row per metabolite whose id is not among the KB compounds found
# so far; kb_id starts out empty.
todo = []
for met in mets:
    if met.id in kb_mets:
        continue
    todo.append({
        'chebi_id': met.id,
        'kb_id': '',
        'name': met.name,
        'shorthand': met.shorthand,
        'formula': met.formula,
        'mass': met.mass,
        'charge': met.charge,
    })

# Show an editable sheet when there is something to do, otherwise a plain message.
if not todo:
    worksheet = 'All referenced molecules are in the KB.'
else:
    todo_df = pd.DataFrame(todo).set_index('chebi_id').sort_values('name')
    worksheet = ipysheet.from_dataframe(todo_df)
worksheet

'All referenced molecules are in the KB.'

In [5]:
# Build a KB Molecule for every worksheet row that was given a kb_id.
# Nothing is written to the KB here; the results are only collected in kb_update.
kb_update = {}
if todo:
    for i, row in ipysheet.to_dataframe(worksheet).iterrows():
        if not row.kb_id:
            continue
        # The sheet is indexed by chebi_id, so `i` is the ChEBI id of this row.
        chebi_mol = KB.get(KB.CHEBI, i)

        # Preserve the old name as an aka, and remove the new one if it was an aka.
        # Work on a copy: inserting into chebi_mol.aka directly would also modify
        # the ChEBI entry object that KB.get returned.
        aka = list(chebi_mol.aka or [])
        if row['name'] != chebi_mol.name:
            aka.insert(0, chebi_mol.name)
            if row['name'] in aka:
                aka.remove(row['name'])

        # Keep an xref to the ChEBI mol, alongside any xrefs it already carries.
        xrefs = {DbXref('CHEBI', chebi_mol.id)}
        if chebi_mol.xrefs:
            xrefs.update(chebi_mol.xrefs)

        kb_mol = Molecule(
            id=row.kb_id,
            name=row['name'],  # row.name would be the index label, not the 'name' column
            shorthand=row.shorthand or None,
            description=chebi_mol.description or None,
            aka=aka or None,
            xrefs=xrefs,
            formula=row.formula or None,
            mass=row.mass if not pd.isna(row.mass) else None,
            charge=row.charge,
            inchi=chebi_mol.inchi or None,
        )
        kb_update[row.kb_id] = kb_mol
        kb_mets[row.kb_id] = kb_mol
        # From here on, reactions referring to this ChEBI id resolve to the new KB molecule.
        rhea_mets[chebi_mol.id] = kb_mol
    print(f'{len(kb_update)} molecules to update in the KB')

In [6]:
# Write every molecule collected in kb_update to KB.compounds.
for kb_mol in kb_update.values():
    KB.put(KB.compounds, kb_mol)

## Fix cross-references for canonical compounds

In [7]:
# One editable row per KB metabolite, in id order. The xref columns start empty
# and 'save' starts unticked.
met_data = {}
for met in sorted(kb_mets.values(), key=lambda m: m.id):
    met_data[met] = {
        'id': met.id,
        'name': met.name,
        'shorthand': met.shorthand,
        'CHEBI': None,
        'KEGG': None,
        'METACYC': None,
        'save': False,
    }

# Fill each xref column with the first xref found for that database, and report
# any metabolite that carries more than one xref to the same database.
tracked_dbs = [DS.CHEBI, DS.KEGG, DS.METACYC]
for met, fields in met_data.items():
    for xref in (met.xrefs or []):
        if xref.db not in tracked_dbs:
            continue
        if fields[xref.db.id]:
            print(f'{met.id} multiple xrefs to {xref.db.id}')
        else:
            fields[xref.db.id] = xref.id

met_df = pd.DataFrame(met_data.values()).set_index('id').fillna('')
worksheet = ipysheet.from_dataframe(met_df)
worksheet

Sheet(cells=(Cell(column_end=0, column_start=0, numeric_format=None, row_end=49, row_start=0, squeeze_row=Fals…

In [8]:
# Apply the edited worksheet: report every name/shorthand/xref change per compound,
# and write the changes to the KB only for rows whose 'save' box is ticked.
for i, row in ipysheet.to_dataframe(worksheet).iterrows():
    met = KB.get(KB.compounds, i)
    # Edit a deep copy of the attributes so an unsaved row leaves `met` untouched.
    data = copy.deepcopy(met.__dict__)

    for k in ['name', 'shorthand']:
        # The worksheet was built with fillna(''), so a missing value comes back as ''.
        # Map blank cells back to None; otherwise an unset shorthand is reported as a
        # change and would be saved as an empty string.
        new_value = row[k] or None
        if data[k] != new_value:
            print(f'{met.id} {k}: {data[k]} --> {new_value}')
            data[k] = new_value

    # New xref -> other KB compounds that currently hold that same xref.
    xrefs = {}
    for db in ['CHEBI', 'KEGG', 'METACYC']:
        if row[db]:
            if not data['xrefs']:
                data['xrefs'] = set()
            xref = DbXref(db=DS.get(db), id=row[db])
            if xref not in data['xrefs']:
                data['xrefs'].add(xref)
                print(f'{met.id}: add xref {xref}')
                xrefs[xref] = [other for other in KB.xref(KB.compounds, xref) if other != met]
                if xrefs[xref]:
                    print(f'    - remove from existing {xrefs[xref]}')

    if row.save:
        # Move each new xref: strip it from the compounds that held it, then save them.
        for xref, others in xrefs.items():
            for other in others:
                other.xrefs.remove(xref)
                KB.put(KB.compounds, other)
                print(f'Saved {other}')

        met.__dict__.update(data)
        KB.put(KB.compounds, met)
        print(f'Saved {met}')


Ery.D.4P: add xref KEGG:C00279
    - remove from existing [[Ery.D.4P.full] D-erythrose 4-phosphate (fully protonated)]
Ery.D.4P: add xref METACYC:ERYTHROSE-4P
Saved [Ery.D.4P.full] D-erythrose 4-phosphate (fully protonated)
Saved [Ery.D.4P] D-erythrose 4-phosphate
Fru.D.6P: add xref KEGG:C00085
    - remove from existing [[Fru.D.6P.open.full] keto-D-fructose 6-phosphate (fully protonated), [Fru.D.6P.r5.full] D-fructofuranose 6-phosphate (fully protonated)]
Saved [Fru.D.6P.open.full] keto-D-fructose 6-phosphate (fully protonated)
Saved [Fru.D.6P.r5.full] D-fructofuranose 6-phosphate (fully protonated)
Saved [Fru.D.6P] D-fructose 6-phosphate
Fru.D.6P.r5.β: add xref KEGG:C05345
    - remove from existing [[Fru.D.6P.r5.β.full] beta-D-fructofuranose 6-phosphate (fully protonated)]
Saved [Fru.D.6P.r5.β.full] beta-D-fructofuranose 6-phosphate (fully protonated)
Saved [Fru.D.6P.r5.β] beta-D-fructofuranose 6-phosphate
Fru.D.bis16: add xref CHEBI:78682
    - remove from existing [[Fru.D.bis16.fu

## Build KB versions of all reactions

In [49]:
# Find the KB reaction cross-referenced from each RHEA reaction.
kb_xref = {}  # RHEA id -> KB reaction
kb_rxns = {}  # KB reaction id -> KB reaction
for rhea_id in rhea_rxns.keys():
    rxns = KB.xref(KB.reactions, 'RHEA:' + rhea_id)
    if rxns:
        if len(rxns) > 1:
            print(f'{len(rxns)} hits to RHEA:{rhea_id}')
        kb_rxn = rxns[0]  # when several KB reactions match, the first hit is used
        kb_xref[rhea_id] = kb_rxn
        kb_rxns[kb_rxn.id] = kb_rxn
print(f'{len(kb_xref)} reactions cross-referenced into {len(kb_rxns)} KB reactions')

# One worksheet row per RHEA reaction with no KB counterpart; kb_id starts out empty.
todo_rxns = [{
    'rhea_id': rxn.id,
    'kb_id': '',
    'name': rxn.name,
    'shorthand': rxn.shorthand,
    'catalyst': rxn.catalyst,
    'rev': rxn.reversible,
    # Participants are shown through their mapped (KB / canonical) molecules.
    'reactants': ', '.join(rhea_mets[met.id].label for met in rxn.stoichiometry),
    # A reaction may have no xrefs set; show an empty cell instead of raising.
    'xrefs': ', '.join(sorted(str(xref) for xref in (rxn.xrefs or ()))),
} for rxn in rhea_rxns.values() if rxn.id not in kb_xref]

if todo_rxns:
    worksheet_rxns = ipysheet.from_dataframe(pd.DataFrame(todo_rxns).set_index('rhea_id').sort_values('name'))
else:
    worksheet_rxns = 'All referenced reactions are in the KB.'
worksheet_rxns

34 reactions cross-referenced into 34 KB reactions


'All referenced reactions are in the KB.'

In [50]:
# Build a KB Reaction for every worksheet row that was given a kb_id.
# Nothing is written to the KB here; the results are only collected in kb_rxns_update.
kb_rxns_update = {}
if todo_rxns:
    for i, row in ipysheet.to_dataframe(worksheet_rxns).iterrows():
        if not row.kb_id:
            continue
        rhea_rxn = KB.get(KB.RHEA, int(i))

        # Start from an xref back to the Rhea reaction and add whatever xrefs it has.
        xrefs = {DbXref('RHEA', rhea_rxn.id), *(rhea_rxn.xrefs or ())}

        # Re-key the stoichiometry on the mapped KB metabolites.
        stoichiometry = {}
        for met, count in rhea_rxn.stoichiometry.items():
            stoichiometry[rhea_mets[met.id]] = count

        kb_rxn = Reaction(
            id=row.kb_id,
            name=row['name'],
            shorthand=row.shorthand or None,
            description=rhea_rxn.description or None,
            aka=rhea_rxn.aka or None,
            xrefs=xrefs,
            catalyst=Molecule(id=row.catalyst, name=row['name']),
            stoichiometry=stoichiometry,
            reversible=row.rev,
        )

        # Register the new reaction in all three lookups.
        kb_rxns_update[row.kb_id] = kb_rxn
        kb_rxns[row.kb_id] = kb_rxn
        kb_xref[rhea_rxn.id] = kb_rxn
    print(f'{len(kb_rxns_update)} reactions to update in the KB')

In [51]:
# Write every reaction collected in kb_rxns_update to KB.reactions.
for kb_rxn in kb_rxns_update.values():
    KB.put(KB.reactions, kb_rxn)

### Manually create Glc PTS net reaction, not in RHEA

In [52]:
# Create the glucose PTS net reaction (and its external-glucose compound) only if
# the KB does not already return something for 'pts.glc'.
kb_rxns['pts.glc'] = KB('pts.glc')
if not kb_rxns['pts.glc']:
    glc_e = Molecule(
        id='Glc.D.ext',
        name='D-glucose (external)',
        shorthand='Glc.D.ext',
        formula='C6H12O6',
        mass=180.15588,
        charge=0,
    )
    KB.put(KB.compounds, glc_e)

    # External glucose and PEP are consumed; glucose 6-phosphate and pyruvate are produced.
    pts_stoichiometry = {
        glc_e: -1,
        kb_mets['pep']: -1,
        kb_mets['Glc.D.6P']: 1,
        kb_mets['pyr']: 1,
    }
    glc_pts = Reaction(
        id='pts.glc',
        name='glucose PTS net reaction',
        description=(
            'Net reaction of glucose import via phosphotranferase system. Transfers a phosphate from PEP to glucose, via'
            ' phospho- intermediates of cross-functional EIa and HPr, and glucose-specfic EIIbc.'
        ),
        shorthand='GlcPTS',
        xrefs={DbXref(db='BIGG', id='GLCpts')},
        stoichiometry=pts_stoichiometry,
        catalyst=Molecule('GlcPTS'),
        reversible=False,
    )
    KB.put(KB.reactions, glc_pts)

# Re-read both entries from the KB so the working dicts hold what the KB returns.
kb_rxns['pts.glc'] = KB('pts.glc')
kb_mets['Glc.D.ext'] = KB('Glc.D.ext')


## Prepare data for Escher

_Disabled while escher and jupyter do not play nice_

In [53]:
# import escher
# import json

# def escher_model(model_name, rxn_ids):
#     def escher_reaction(kb_rxn):
#         stoich = {}
#         for met, count in kb_rxn.stoichiometry.items():
#             met = kb_mets[met.id]
#             met_id = met.shorthand or met.id
#             stoich[met_id] = count

#         return {
#           'id': kb_rxn.shorthand or kb_rxn.id,
#           'name': kb_rxn.name,
#           'metabolites': stoich,
#           'lower_bound': -1000.0 if kb_rxn.reversible else 0.0,
#           'upper_bound': 1000.0,
#           'gene_reaction_rule': kb_rxn.catalyst.shorthand or kb_rxn.catalyst.id,
#         }

#     def escher_metabolite(kb_met):
#         return {
#           'id': kb_met.shorthand or kb_met.id,
#           'name': kb_met.name,
#           'compartment': 'any',
#           'charge': kb_met.charge,
#           'formula': kb_met.formula,
#         }

#     def escher_gene(catalyst):
#         return {
#           'id': catalyst.shorthand or catalyst.id,
#           'name': catalyst.shorthand or catalyst.id,
#         }

#     reactions = {}
#     metabolites = {}
#     genes = {}
#     for rxn_id in rxn_ids:
#         kb_rxn = kb_rxns[rxn_id]
#         reactions[rxn_id] = escher_reaction(kb_rxn)
#         for met in kb_rxn.stoichiometry:
#             metabolites[met.id] = escher_metabolite(kb_mets[met.id])
#         if kb_rxn.catalyst:
#             genes[kb_rxn.catalyst.id] = escher_gene(kb_rxn.catalyst)

#     return {
#         'id': model_name,
#         'version': '1',
#         'metabolites': list(metabolites.values()),
#         'reactions': list(reactions.values()),
#         'genes': list(genes.values()),
#         'compartments': {'any': 'anywhere'},
#     }

# escher.Builder(model_json=json.dumps(escher_model('central_carbon', kb_rxns.keys())))

### Saved pathway diagrams:
- files/pw/glycolysis_ppp_ed.json
- files/pw/central_carbon.json


### Potentially still TBD:
- malate - pyr - PEP flows
- mixed acid metabolism / lactate / acetate
- PTS glucose import

## Persistent Pathway Groups

- It cannot be a requirement that a reaction belong to exactly one pathway.
- But for this set of reactions, this is a reasonable starting point

In [54]:
# One row per KB reaction, with an empty 'pathway' column to fill in by hand.
pw_rows = []
for kb_rxn in kb_rxns.values():
    pw_rows.append({
        'rxn_id': kb_rxn.id,
        'shorthand': kb_rxn.shorthand,
        'name': kb_rxn.name,
        'pathway': '',
    })
pw_df = pd.DataFrame(pw_rows)
pw_widget = ipysheet.from_dataframe(pw_df)
pw_widget

Sheet(cells=(Cell(column_end=0, column_start=0, numeric_format=None, row_end=34, row_start=0, squeeze_row=Fals…

In [55]:
# Read the edited sheet back into pw_df, then display it ordered by pathway and reaction id.
pw_df = ipysheet.to_dataframe(pw_widget)
pw_df.sort_values(by=['pathway', 'rxn_id'])

Unnamed: 0,rxn_id,shorthand,name,pathway
11,acn1,ACN1,Citrate Hydrolyase,
2,acn2,ACN2,Isocitrate hydrolyase,
8,akgdh,AKGDH,2-oxoglutarate Dehydrogenase,
5,citsyn,CS,Citrate Synthase,
30,eno,ENO,Enolase,
24,fba,FBA,Fructose-bisphosphate Aldolase,
7,fbp,FBP,Fuctose-bisphosphatase,
15,fumhyd,FH,Fumarase,
0,g6pdh,G6PDH,Glucose-6-phosphate Dehydrogenase,
9,gapdh,GAPDH,Glyceraldehyde-3-phosphate Dehydrogenase,


In [15]:
# kb_pathways = {name: Pathway(idNone, name=name) for name in set(pw_df.pathway)}
# for i, row in pw_df.iterrows():
#     kb_pathway = kb_pathways[row.pathway]
#     kb_rxn = kb_rxns[row.rxn_id]
#     kb_pathway.steps.add(kb_rxn)
#     kb_pathway.metabolites.update(kb_mets[met.id] for met in kb_rxn.stoichiometry.keys())
#     if kb_rxn.catalyst:
#         kb_pathway.enzymes.add(kb_rxn.catalyst)

# for kb_pathway in kb_pathways.values():
#     KB.put(KB.pathways, kb_pathway)