In [1]:
import setpypath

import collections
import os
from typing import Iterable

import pandas as pd

from kb import kb
from scheme import Molecule, Reaction, DbCrossRef

chebi_dir = "/home/fdrusso/work/data/chebi"


# Base ChEBI data

In [2]:
compounds = {}
for row in pd.read_csv(os.path.join(chebi_dir, "compounds.tsv"), sep="\t").itertuples():
    # Policy decision: cross-ref to primary ID only; let the source worry about obsolete IDs.
    if row.STATUS == "C" and pd.isnull(row.PARENT_ID):
        compounds[row.ID] = Molecule(_id=row.ID, name=row.NAME, crossref=[DbCrossRef("ChEBI", row.ID)])
print(f"{len(compounds)} valid compounds")

59583 valid compounds


In [3]:
compound_names = collections.defaultdict(set)
for row in pd.read_csv(os.path.join(chebi_dir, "names.tsv"), sep="\t").itertuples():
    if row.COMPOUND_ID in compounds:
        compound_names[row.COMPOUND_ID].add(row.NAME)
for compound_id, names in compound_names.items():
    compound = compounds[compound_id]
    compound.aka = list(names - {compound.name})

In [4]:
for row in pd.read_csv(os.path.join(chebi_dir, "chemical_data.tsv"), sep="\t").itertuples():
    if row.COMPOUND_ID in compounds:
        if row.TYPE == "MASS":
            compounds[row.COMPOUND_ID].mass = float(row.CHEMICAL_DATA)
        elif row.TYPE == "CHARGE":
            compounds[row.COMPOUND_ID].charge = int(row.CHEMICAL_DATA)
        elif row.TYPE == "FORMULA":
            compounds[row.COMPOUND_ID].formula = row.CHEMICAL_DATA

In [5]:
# This cell is NOT idempotent

db_mapping = {
#       1 FAO/WHO standards accession
#       2 PPR
#       3 CiteXplore citation
#      30 ChemIDplus accession
#      36 LIPID MAPS class accession
#      90 COMe accession
#      92 BPDB accession
#      93 RESID accession
#      99 ECMDB accession
#     102 YMDB accession
#     111 WebElements accession
#     128 PDB accession
#     129 Chinese Abstracts citation
#     131 PubMed Central citation
#     150 SMID accession
#     161 Pubchem accession
#     250 VSDB accession
#     294 MolBase accession
#     472 FooDB accession
#     585 Pesticides accession
#     620 UM-BBD compID
    "KEGG GLYCAN accession": "KEGG",
#    1090 Agricola citation
#    1104 PPDB accession
#    3205 DrugBank accession
#    3355 PDBeChem accession
#    3457 Gmelin Registry Number
#    3775 Drug Central accession
    "KEGG DRUG accession": "KEGG",
#    4804 Patent accession
#    4815 KNApSAcK accession
    "Wikipedia accession": "Wikipedia",
    "MetaCyc accession": "MetaCyc",
#    7561 LIPID MAPS instance accession
#    9194 Beilstein Registry Number
#   10518 GlyGen accession
#   10569 GlyTouCan accession
#   15470 HMDB accession
#   16681 Chemspider accession
#   17543 Reaxys Registry Number
    "KEGG COMPOUND accession": "KEGG",
    "CAS Registry Number": "CAS",
    "LINCS accession": "LINCS",
#   98442 PubMed citation  
}

compound_xrefs = collections.defaultdict(set)
for row in pd.read_csv(os.path.join(chebi_dir, "database_accession.tsv"), sep="\t").itertuples():
    if row.COMPOUND_ID in compounds and row.TYPE in db_mapping:
        compound_xrefs[row.COMPOUND_ID].add(
            DbCrossRef(db_mapping[row.TYPE], row.ACCESSION_NUMBER))
for compound_id, xrefs in compound_xrefs.items():
        compounds[compound_id].crossref.extend(xrefs)


In [6]:
for row in pd.read_csv(os.path.join(chebi_dir, "chebiId_inchi.tsv"), sep="\t").itertuples():
    if row.CHEBI_ID in compounds:
        compounds[row.CHEBI_ID].inchi = row.InChI

In [7]:
compounds[17634]

Molecule(_id=17634, name='D-glucose', shorthand=None, aka=['D-(+)-glucose', 'grape sugar', 'D-gluco-hexose', 'Traubenzucker', 'dextrose', 'D(+)-glucose'], crossref=[ChEBI:17634, CAS:50-99-7], formula='C6H12O6', mass=180.15588, charge=0, inchi=None)

In [8]:
compounds[42758]

Molecule(_id=42758, name='aldehydo-D-glucose', shorthand=None, aka=['aldehydo-D-gluco-hexose', 'Dextrose', '(2R,3S,4R,5R)-2,3,4,5,6-pentahydroxyhexanal', 'Glucose', 'D-glucose', 'WURCS=2.0/1,1,0/[o2122h]/1/', 'D(+)-Glucose', 'D-GLUCOSE IN LINEAR FORM'], crossref=[ChEBI:42758, CAS:50-99-7, Wikipedia:Glucose], formula='C6H12O6', mass=180.15588, charge=0, inchi='InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h1,3-6,8-12H,2H2/t3-,4+,5+,6+/m0/s1')

# Put it in Mongo

In [9]:
kb.CODECS[Molecule].encode(compounds[17634])

{'_id': 17634,
 'name': 'D-glucose',
 'aka': ['D-(+)-glucose',
  'grape sugar',
  'D-gluco-hexose',
  'Traubenzucker',
  'dextrose',
  'D(+)-glucose'],
 'crossref': [{'db': 'ChEBI', 'id': 17634}, {'db': 'CAS', 'id': '50-99-7'}],
 'formula': 'C6H12O6',
 'mass': 180.15588,
 'charge': 0}

## Big write -- wipe and replace

In [10]:
%%time
# kb.KB.compounds.drop()
# for compound_id, compound in compounds.items():
#     doc = kb.CODECS[Molecule].encode(compound)
#     kb.KB.compounds.insert_one(doc)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.53 µs


In [11]:
list(kb.KB.compounds.find({"aka": "ribose"}).collation({"locale": 'en', "strength": 1}))

[{'_id': 27476,
  'name': 'beta-D-ribopyranose',
  'aka': ['Ribose',
   'RIBOSE(PYRANOSE FORM)',
   'WURCS=2.0/1,1,0/[a222h-1b_1-5]/1/',
   'beta-D-Ribopyranose'],
  'crossref': [{'db': 'ChEBI', 'id': 27476},
   {'db': 'KEGG', 'id': 'C08353'},
   {'db': 'CAS', 'id': '7296-60-8'}],
  'formula': 'C5H10O5',
  'mass': 150.1299,
  'charge': 0,
  'inchi': 'InChI=1S/C5H10O5/c6-2-1-10-5(9)4(8)3(2)7/h2-9H,1H2/t2-,3-,4-,5-/m1/s1'},
 {'_id': 45506,
  'name': 'alpha-D-ribose',
  'aka': ['RIBOSE',
   'alpha-D-Rib',
   'alpha-D-ribofuranose',
   'WURCS=2.0/1,1,0/[a222h-1a_1-4]/1/'],
  'crossref': [{'db': 'ChEBI', 'id': 45506}],
  'formula': 'C5H10O5',
  'mass': 150.1299,
  'charge': 0,
  'inchi': 'InChI=1S/C5H10O5/c6-1-2-3(7)4(8)5(9)10-2/h2-9H,1H2/t2-,3-,4-,5+/m1/s1'},
 {'_id': 47013,
  'name': 'D-ribofuranose',
  'aka': ['D-Ribose',
   'D-ribose',
   'WURCS=2.0/1,1,0/[a222h-1x_1-4]/1/',
   'ribose',
   '(3R,4S,5R)-5-(hydroxymethyl)tetrahydrofuran-2,3,4-triol'],
  'crossref': [{'db': 'ChEBI', 'id': 

# Explore is_a relationships

In [12]:
parents = collections.defaultdict(list)
children = collections.defaultdict(list)
for row in pd.read_csv(os.path.join(chebi_dir, "relation.tsv"), sep="\t").itertuples():
    if row.TYPE == "is_a" and row.INIT_ID in compounds and row.FINAL_ID in compounds:
        parents[row.FINAL_ID].append(row.INIT_ID)
        children[row.INIT_ID].append(row.FINAL_ID)

print(f"{len(parents)} entries have parents; {len(children)} entries have children")

59031 entries have parents; 11141 entries have children


In [13]:
parent_dist = collections.Counter(len(v) for v in parents.values())
sorted(parent_dist.items())

[(1, 33533),
 (2, 13757),
 (3, 5938),
 (4, 2913),
 (5, 1500),
 (6, 794),
 (7, 326),
 (8, 169),
 (9, 71),
 (10, 25),
 (11, 4),
 (14, 1)]

In [14]:
children_dist = collections.Counter(len(v) for v in children.values())
sorted(children_dist.items())

[(1, 3171),
 (2, 3111),
 (3, 1053),
 (4, 664),
 (5, 451),
 (6, 304),
 (7, 228),
 (8, 202),
 (9, 153),
 (10, 143),
 (11, 117),
 (12, 102),
 (13, 80),
 (14, 81),
 (15, 72),
 (16, 64),
 (17, 46),
 (18, 56),
 (19, 45),
 (20, 47),
 (21, 33),
 (22, 36),
 (23, 28),
 (24, 33),
 (25, 24),
 (26, 22),
 (27, 27),
 (28, 27),
 (29, 29),
 (30, 29),
 (31, 17),
 (32, 17),
 (33, 21),
 (34, 15),
 (35, 17),
 (36, 16),
 (37, 15),
 (38, 15),
 (39, 17),
 (40, 13),
 (41, 15),
 (42, 13),
 (43, 16),
 (44, 11),
 (45, 8),
 (46, 11),
 (47, 11),
 (48, 8),
 (49, 12),
 (50, 12),
 (51, 6),
 (52, 9),
 (53, 10),
 (54, 10),
 (55, 5),
 (56, 9),
 (57, 9),
 (58, 6),
 (59, 6),
 (60, 9),
 (61, 4),
 (62, 6),
 (63, 3),
 (64, 2),
 (65, 5),
 (66, 9),
 (67, 6),
 (68, 1),
 (69, 5),
 (70, 8),
 (71, 5),
 (72, 6),
 (73, 4),
 (74, 7),
 (75, 3),
 (76, 3),
 (77, 5),
 (78, 2),
 (79, 2),
 (80, 5),
 (81, 6),
 (82, 1),
 (83, 6),
 (84, 4),
 (85, 1),
 (86, 2),
 (87, 3),
 (88, 3),
 (89, 1),
 (90, 3),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 4),
 (95

## Start from D-glucose [17634], and collect all descendents

- We could start from (DL-) glucose [17234], but this parent itself is stereochemically undefined and therefore not biologically relevant. It also includes children such as (DL-) glucopyranose, which is of likewise dubious value for biological modeling.

In [15]:
glucose_ids = set()
pending = [17634]
while pending:
    id = pending.pop()
    if id not in glucose_ids:
        glucose_ids.add(id)
        pending.extend(children[id])

glucoses = [
    kb.CODECS[Molecule].decode(kb.KB.compounds.find_one(id)) for id in glucose_ids
]

glucoses_df = pd.DataFrame([
    {"id": c.id, "name": c.name, "formula": c.formula, "inchi": c.inchi}
    for c in glucoses
]).set_index("id")
glucoses_df

Unnamed: 0_level_0,name,formula,inchi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
145664,D-glucofuranose,C6H12O6,InChI=1S/C6H12O6/c7-1-2(8)5-3(9)4(10)6(11)12-5...
134625,D-glucose-(13)C6,[13C]6H12O6,InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h...
17634,D-glucose,C6H12O6,
17925,alpha-D-glucose,C6H12O6,InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2...
145606,beta-D-glucofuranose,C6H12O6,InChI=1S/C6H12O6/c7-1-2(8)5-3(9)4(10)6(11)12-5...
42758,aldehydo-D-glucose,C6H12O6,InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h...
4167,D-glucopyranose,C6H12O6,InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2...
88300,"alpha-D-glucose-1,2-((13)C2)",C4[13C]2H12O6,InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2...
148749,alpha-D-glucofuranose,C6H12O6,InChI=1S/C6H12O6/c7-1-2(8)5-3(9)4(10)6(11)12-5...
82808,agrocinopine D,C12H23O14P,InChI=1S/C12H23O14P/c13-1-3-5(15)7(17)9(11(19)...


In [16]:
glucoses_df.to_csv(os.path.join(chebi_dir, "glucoses.tsv"), sep="\t")

### Quick conclusions

- This does seem to find more or less all the forms of glucose we want for modeling
- Their relationships are intuitively clear
- These relationships are also there in the inchi strings, but not in a way that is trivial to infer in code.
- So. Just dump them all (meaning all simple sugars, however we define that) to a spreadsheet, assuming the total is manageable. Then manually annotate them according to a rational isomer addressing scheme.

In [17]:
# Take all descendents of hexose [18133]

hexose_ids = set()
pending = [18133]
while pending:
    id = pending.pop()
    if id not in hexose_ids:
        hexose_ids.add(id)
        pending.extend(children[id])
print(f"{len(hexose_ids)} hexose descendents")

hexoses = [
    kb.CODECS[Molecule].decode(kb.KB.compounds.find_one(id)) for id in hexose_ids
]

hexoses_df = pd.DataFrame([
    {"id": c.id, "name": c.name, "formula": c.formula, "inchi": c.inchi}
    for c in hexoses
]).set_index("id")
print(hexoses_df.shape)

# hexoses_df.to_csv(os.path.join(chebi_dir, "hexoses.tsv"), sep="\t")

313 hexose descendents
(313, 3)


## Form-addressing scheme

In [18]:
class Variation:
    def __init__(self, name: str, forms: Iterable[str]):
        self.name = name
        self.forms = list(forms)
        
rings = Variation("ring-chain tautomerism", ["open", "r6α", "r6β", "r5α", "r5β"])

### Forms

- In the case of sugars, the form designation has a systematic application to both structure and naming.
  - Structure: defined transformation of inchi string? I'll have to figure this out at some point.
  - Naming: straightforward in this case. The trick will be to make this a general framework for other types of variations.

In [19]:
class FormSpec:
    def __init__(self, form: str, name_format: str):
        self.form = form
        self.name_format = name_format
        
    def form_name(self, root: str):
        return self.name_format.format(root)

    
class Variation:
    def __init__(self, name: str, base: FormSpec, forms: Iterable[FormSpec]):
        self.base = base
        self.forms = {form.form: form for form in forms}
        
    def base_name(self, root):
        return self.base.form_name(root)
    
    def form_name(self, form, root):
        return self.forms[form].form_name(root)

rings = Variation("ring-chain tautomerism",
                  FormSpec(None, "{0}ose"),
                  [
                      FormSpec("open", "aldehydo-{0}ose"),
                      FormSpec("r6α", "α-{0}opyranose"),
                      FormSpec("r6β", "β-{0}opyranose"),
                      FormSpec("r5α", "α-{0}ofuranose"),
                      FormSpec("r5β", "β-{0}ofuranose"),
                  ])

for root in ("D-gluc", "D-fruct", "D-mann"):
    print(rings.base_name(root))
    for form in rings.forms:
        print(rings.form_name(form, root))
    print()

D-glucose
aldehydo-D-glucose
α-D-glucopyranose
β-D-glucopyranose
α-D-glucofuranose
β-D-glucofuranose

D-fructose
aldehydo-D-fructose
α-D-fructopyranose
β-D-fructopyranose
α-D-fructofuranose
β-D-fructofuranose

D-mannose
aldehydo-D-mannose
α-D-mannopyranose
β-D-mannopyranose
α-D-mannofuranose
β-D-mannofuranose



In [20]:
dl = Variation("optical isomers", FormSpec(None, "{0}"), [FormSpec("D", "D-{0}"), FormSpec("L", "L-{0}")])
for root in ("gluc", "fruct", "mann"):
    print(rings.base_name(dl.base_name(root)))
    for dl_form in dl.forms:
        print(rings.base_name(dl.form_name(dl_form, root)))
        for ring_form in rings.forms:
            print(rings.form_name(ring_form, dl.form_name(dl_form, root)))
    print()

glucose
D-glucose
aldehydo-D-glucose
α-D-glucopyranose
β-D-glucopyranose
α-D-glucofuranose
β-D-glucofuranose
L-glucose
aldehydo-L-glucose
α-L-glucopyranose
β-L-glucopyranose
α-L-glucofuranose
β-L-glucofuranose

fructose
D-fructose
aldehydo-D-fructose
α-D-fructopyranose
β-D-fructopyranose
α-D-fructofuranose
β-D-fructofuranose
L-fructose
aldehydo-L-fructose
α-L-fructopyranose
β-L-fructopyranose
α-L-fructofuranose
β-L-fructofuranose

mannose
D-mannose
aldehydo-D-mannose
α-D-mannopyranose
β-D-mannopyranose
α-D-mannofuranose
β-D-mannofuranose
L-mannose
aldehydo-L-mannose
α-L-mannopyranose
β-L-mannopyranose
α-L-mannofuranose
β-L-mannofuranose



One wrinkle: open-chain fructose is keto- not aldehydo-.
- Different versions of ring-chain tautomerism for aldo vs keto sugars?
- Type-neutral names instead, i.e. simply "linear D-fructose"?
- Use some property of the base molecule?

OTOH the goal of this is not necessarily to systematically generate all of ChEBI, but rather to _augment_ ChEBI to make it easier to navigate these relationships. The name generation scheme above _might_ assist in setting up those augmented relationships, but manual curation is always an option (if a bit tedious).

What we really need to explore is how to _use_ those augmented relationships in modeling. Prime example: RHEA defines
- [RHEA:11816] PGI = aldehydo-D-glucose-6P <=> keto-D-fructose-6P
- [RHEA:16109] PFK = ATP + β-D-fructose-6P <=> ADP + β-D-fructose-1,6-bisphosphate + H+

Because they apply to different _forms_ of fructose-6P, it is nontrivial to connect them in a pathway. We want to coerce these reaction definitions to use a canonical molecule (fructose-6P, or D-fructose-6P), with the form designations `open` or `r5β` as secondary detail.

In [21]:
name_gen = {
    "D": ["D-{0}"],
    "L": ["L-{0}"],
    
#     "root": ["{0}se"],
    "open": ["aldehydo-{0}", "keto-{0}"],
    "r6α": ["alpha-{0}opyran", "alpha-{0}"],
    "r6β": ["beta-{0}opyran", "beta-{0}"],
    "r5α": ["alpha-{0}ofuran"],
    "r5β": ["beta-{0}ofuran"],
    
    "sugar": ["{0}ose"],
    
    "1P": ["{0} 1-phosphate", "{0}-1-phosphate", "{0} 1P", "{0}-1P"],
    "6P": ["{0} 6-phosphate", "{0}-6-phosphate", "{0} 6P", "{0}-6P"],
    
    "1-": ["{0}(1-)"],
    "2-": ["{0}(2-)"],
}

# Build all (valid) combinations of forms to be mapped to ChEBI, on multiple dimensions of variation.
forms = []
for form1 in [None, "D", "L"]:
    for form2 in [None, "open", "r6α", "r6β", "r5α", "r5β"]:
        for form3 in [None, "1P", "6P"]:
            # Look at protonation states only for phoshpo- forms
            if form3:
                for form4 in [None, "1-", "2-"]:
                    forms.append((form1, form2, "sugar", form3, form4))
            else:
                forms.append((form1, form2, "sugar", form3, None))

name_root = "mann"
shorthand_root = "Man"
possible_names = {}
for form in forms:
    shorthand = ".".join([shorthand_root] + [f for f in form if f is not None and f != "sugar"])
    names = [name_root]
    # Form parts are in the order they need to be used to generate possible names.
    for f in form:
        if f:
            extended_names = []
            for pat in name_gen[f]:
                for name in names:
                    extended_names.append(pat.format(name))
            names = extended_names
    possible_names[shorthand] = names


## Payoff: map these names to compounds queried from the KB

In [22]:
reverse_map = {name: shorthand for shorthand, names in possible_names.items() for name in names}
mapped_compounds = {}
for doc in kb.KB.compounds.find({"name": {"$in": list(reverse_map.keys())}}).collation({"locale": 'en', "strength": 1}):
    compound = kb.CODECS[Molecule].decode(doc)
    mapped_compounds[reverse_map[compound.name]] = compound

print(f"{shorthand_root}: {len(mapped_compounds)} out of {len(forms)} found")
print()
for shorthand, compound in sorted(mapped_compounds.items()):
    print(f"{shorthand:16s} {compound._id:7d} {compound.name}")


Man: 22 out of 126 found

Man                37684 mannose
Man.D              16024 D-mannose
Man.D.1P           35374 D-mannose 1-phosphate
Man.D.6P           17369 D-mannose 6-phosphate
Man.D.open         37675 aldehydo-D-mannose
Man.D.open.6P      48042 aldehydo-D-mannose 6-phosphate
Man.D.r5α         153460 alpha-D-mannofuranose
Man.D.r5β         152296 beta-D-mannofuranose
Man.D.r6α          28729 alpha-D-mannose
Man.D.r6α.1P       18205 alpha-D-mannose 1-phosphate
Man.D.r6α.1P.2-    58409 alpha-D-mannose 1-phosphate(2-)
Man.D.r6α.6P       43896 alpha-D-mannose 6-phosphate
Man.D.r6α.6P.2-    60332 alpha-D-mannose 6-phosphate(2-)
Man.D.r6β          28563 beta-D-mannose
Man.D.r6β.6P       49728 beta-D-mannose 6-phosphate
Man.L              37676 L-mannose
Man.L.open         37681 aldehydo-L-mannose
Man.L.r5α         148580 alpha-L-mannofuranose
Man.L.r5β         146876 beta-L-mannofuranose
Man.L.r6α          37680 alpha-L-mannose
Man.L.r6β          37679 beta-L-mannose
Man.open     

In [23]:
def map_forms(name_root, shorthand_root, forms, name_gen):
    possible_names = {}
    for form in forms:
        shorthand = ".".join([shorthand_root] + [f for f in form if f is not None and f != "sugar"])
        names = [name_root]
        # Form parts are in the order they need to be used to generate possible names.
        for f in form:
            if f:
                extended_names = []
                for pat in name_gen[f]:
                    for name in names:
                        extended_names.append(pat.format(name))
                names = extended_names
        possible_names[shorthand] = names

    reverse_map = {name: shorthand for shorthand, names in possible_names.items() for name in names}
    mapped_compounds = {}
    for doc in kb.KB.compounds.find({"name": {"$in": list(reverse_map.keys())}}).collation({"locale": 'en', "strength": 1}):
        compound = kb.CODECS[Molecule].decode(doc)
        mapped_compounds[reverse_map[compound.name]] = compound
    return mapped_compounds

mapped_compounds = map_forms("fruct", "Fru", forms, name_gen)
print(f"{len(mapped_compounds)} out of {len(forms)} found")
print()
for shorthand, compound in sorted(mapped_compounds.items()):
    print(f"{shorthand:16s} {compound._id:7d} {compound.name}")

26 out of 126 found

Fru                28757 fructose
Fru.1P             78737 fructose 1-phosphate
Fru.6P             88003 fructose 6-phosphate
Fru.D              15824 D-fructose
Fru.D.1P           78736 D-fructose 1-phosphate
Fru.D.6P           78697 D-fructose 6-phosphate
Fru.D.6P.2-        57579 D-fructose 6-phosphate(2-)
Fru.D.open         48095 keto-D-fructose
Fru.D.open.1P      18105 keto-D-fructose 1-phosphate
Fru.D.open.6P      15946 keto-D-fructose 6-phosphate
Fru.D.r5α          37720 alpha-D-fructofuranose
Fru.D.r5β          28645 beta-D-fructofuranose
Fru.D.r5β.1P      139419 beta-D-fructofuranose 1-phosphate
Fru.D.r5β.1P.2-   138881 beta-D-fructofuranose 1-phosphate(2-)
Fru.D.r5β.6P       16084 beta-D-fructofuranose 6-phosphate
Fru.D.r5β.6P.2-    57634 beta-D-fructofuranose 6-phosphate(2-)
Fru.D.r6α          37719 alpha-D-fructopyranose
Fru.D.r6β          41005 beta-D-fructopyranose
Fru.D.r6β.1P       42320 beta-D-fructopyranose 1-phosphate
Fru.L              28120 L-fr

In [24]:
mapped_compounds = map_forms("gluc", "Glc", forms, name_gen)
print(f"{len(mapped_compounds)} out of {len(forms)} found")
print()
for shorthand, compound in sorted(mapped_compounds.items()):
    print(f"{shorthand:16s} {compound._id:7d} {compound.name}")

25 out of 126 found

Glc                17234 glucose
Glc.D              17634 D-glucose
Glc.D.6P           14314 D-glucose 6-phosphate
Glc.D.open         42758 aldehydo-D-glucose
Glc.D.open.6P      15954 aldehydo-D-glucose 6-phosphate
Glc.D.open.6P.2-   57584 aldehydo-D-glucose 6-phosphate(2-)
Glc.D.r5α         148749 alpha-D-glucofuranose
Glc.D.r5β         145606 beta-D-glucofuranose
Glc.D.r6α          17925 alpha-D-glucose
Glc.D.r6α.1P       29042 alpha-D-glucose 1-phosphate
Glc.D.r6α.1P.2-    58601 alpha-D-glucose 1-phosphate(2-)
Glc.D.r6α.6P       17665 alpha-D-glucose 6-phosphate
Glc.D.r6α.6P.2-    58225 alpha-D-glucose 6-phosphate(2-)
Glc.D.r6β          15903 beta-D-glucose
Glc.D.r6β.1P       16218 beta-D-glucose 1-phosphate
Glc.D.r6β.1P.2-    57684 beta-D-glucose 1-phosphate(2-)
Glc.D.r6β.6P       17719 beta-D-glucose 6-phosphate
Glc.D.r6β.6P.2-    58247 beta-D-glucose 6-phosphate(2-)
Glc.L              37624 L-glucose
Glc.L.open         37626 aldehydo-L-glucose
Glc.L.r5α      

In [25]:
mapped_compounds = map_forms("altr", "Alt", forms, name_gen)
print(f"{len(mapped_compounds)} out of {len(forms)} found")
print()
for shorthand, compound in sorted(mapped_compounds.items()):
    print(f"{shorthand:16s} {compound._id:7d} {compound.name}")

12 out of 126 found

Alt                37708 altrose
Alt.D              28385 D-altrose
Alt.D.r5α         152069 alpha-D-altrofuranose
Alt.D.r5β         153974 beta-D-altrofuranose
Alt.D.r6α         155672 alpha-D-altropyranose
Alt.D.r6β         148773 beta-D-altropyranose
Alt.L             149543 L-altrose
Alt.L.open        147973 aldehydo-L-altrose
Alt.L.r5α         150666 alpha-L-altrofuranose
Alt.L.r5β         150524 beta-L-altrofuranose
Alt.L.r6α         154259 alpha-L-altropyranose
Alt.L.r6β          63421 beta-L-altrose


In [26]:
%%time
mapped_compounds = {}
for root, shorthand in [
    ("all", "All"),
    ("altr", "Alt"),
    ("gluc", "Glc"),
    ("mann", "Man"),
    ("gul", "Gul"),
    ("id", "Ido"),
    ("galact", "Gal"),
    ("tal", "Tal"),
    
    ("psic", "Gul"),
    ("fruct", "Fru"),
    ("sorb", "Sor"),
    ("tagat", "Tag"),
]:
    mapped_compounds.update(map_forms(root, shorthand, forms, name_gen))

for shorthand, compound in mapped_compounds.items():
    compound.shorthand = shorthand

print(f"{len(mapped_compounds)} total hexoses and hexose-phosphates organized")

204 total hexoses and hexose-phosphates organized
CPU times: user 38.9 ms, sys: 7.86 ms, total: 46.7 ms
Wall time: 74.6 ms


In [27]:
mapped_compounds["Glc.D.6P"]

Molecule(_id=14314, name='D-glucose 6-phosphate', shorthand='Glc.D.6P', aka=['D-glucose 6-(dihydrogen phosphate)', '6-O-phosphono-D-glucose'], crossref=[ChEBI:14314], formula='C6H13O9P', mass=260.13578, charge=0, inchi=None)

In [28]:
mapped_compounds["Glc.D.r6β.6P.2-"]

Molecule(_id=58247, name='beta-D-glucose 6-phosphate(2-)', shorthand='Glc.D.r6β.6P.2-', aka=['6-O-phosphonato-beta-D-glucopyranose', 'beta-D-glucopyranose 6- phosphate', 'beta-D-glucose 6-phosphate', 'beta-D-glucose 6-phosphate dianion'], crossref=[ChEBI:58247, MetaCyc:GLC-6-P], formula='C6H11O9P', mass=258.1199, charge=-2, inchi='InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8/h2-10H,1H2,(H2,11,12,13)/p-2/t2-,3-,4+,5-,6-/m1/s1')

- Overall this looks really good, at least modestly scalable
- Looming challenges:
    - Which form combos are present or absent. ChEBI is highly curated and _mostly_ systematic, but there are definite holes
    - Form combos should be unordered, but shorthand strings have a set order. Glc.D.r6β.6P.2- maybe makes more sense as Glc.D.6P.r6β.2- or Glc.D.6P.2-.r6β.
        - Less important for more canonical forms, e.g. Glc.D.6P
        
**Next:** read in some RHEA reactions and map to canonical forms

# Reading RHEA

In [29]:
rhea_dir = "/home/fdrusso/work/data/rhea"
pd.read_csv(os.path.join(rhea_dir, "rhea-directions.tsv"), sep="\t").head()

Unnamed: 0,RHEA_ID_MASTER,RHEA_ID_LR,RHEA_ID_RL,RHEA_ID_BI
0,10000,10001,10002,10003
1,10004,10005,10006,10007
2,10008,10009,10010,10011
3,10012,10013,10014,10015
4,10016,10017,10018,10019


RHEA is organized around 'quartets'
- Master - indeterminate or unspecified direction
- irreversible left -> right
- irreversible right -> left
- explicitly reversible

Not clear what is gained by this representation vs say a reversibility attribute. One possibility is it's all about the cross-references to other reaction DBs. Need to explore a bit more.

## Main RHEA reaction definitions are in RDF

In [30]:
import rdflib

In [31]:
%%time
rhea_rdf = rdflib.Graph().parse(os.path.join(rhea_dir, "rhea.rdf"))

CPU times: user 2min 42s, sys: 925 ms, total: 2min 42s
Wall time: 2min 43s


<Graph identifier=Nc21ba3041a0e4225b6ead250235005a3 (<class 'rdflib.graph.Graph'>)>

In [32]:
from rdflib.namespace import RDFS
RH = rdflib.namespace.Namespace("http://rdf.rhea-db.org/")

rhea_rdf.bind("rh", RH)
rhea_rdf.bind("rdfs", RDFS)
rhea_rdf.bind("ch", rdflib.namespace.Namespace("http://purl.obolibrary.org/obo/"))
rhea_rdf.bind("ch2", rdflib.namespace.Namespace("http://purl.obolibrary.org/obo/chebi#"))
rhea_rdf.bind("ch3", rdflib.namespace.Namespace("http://purl.obolibrary.org/obo/chebi/"))
rhea_rdf.bind("up", rdflib.namespace.Namespace("http://purl.uniprot.org/core/"))
rhea_rdf.bind("ec", rdflib.namespace.Namespace("http://purl.uniprot.org/enzyme/"))
rhea_rdf.bind("pubmed", rdflib.namespace.Namespace("http://rdf.ncbi.nlm.nih.gov/pubmed/"))

rhea_rdf.bind("ECOCYC", rdflib.namespace.Namespace("http://identifiers.org/biocyc/ECOCYC:"))
rhea_rdf.bind("METACYC", rdflib.namespace.Namespace("http://identifiers.org/biocyc/METACYC:"))
rhea_rdf.bind("KEGG", rdflib.namespace.Namespace("http://identifiers.org/kegg.reaction/"))
rhea_rdf.bind("REACTOME", rdflib.namespace.Namespace("http://identifiers.org/reactome/"))


In [33]:
for s, p, o in rhea_rdf.triples((RH["16109"], None, None)):
    print(s.n3(rhea_rdf.namespace_manager), p.n3(rhea_rdf.namespace_manager), o.n3(rhea_rdf.namespace_manager))


rh:16109 rdfs:subClassOf rh:Reaction
rh:16109 rh:id "16109"^^xsd:long
rh:16109 rh:accession "RHEA:16109"
rh:16109 rdfs:label "ATP + beta-D-fructose 6-phosphate = ADP + beta-D-fructose 1,6-bisphosphate + H(+)"
rh:16109 rh:equation "ATP + beta-D-fructose 6-phosphate = ADP + beta-D-fructose 1,6-bisphosphate + H(+)"
rh:16109 rh:htmlEquation "ATP + &#946;-<small>D</small>-fructose 6-phosphate = ADP + &#946;-<small>D</small>-fructose 1,6-bisphosphate + H<small><sup>+</sup></small>"
rh:16109 rh:directionalReaction rh:16110
rh:16109 rh:directionalReaction rh:16111
rh:16109 rh:bidirectionalReaction rh:16112
rh:16109 rh:status rh:Approved
rh:16109 rh:isChemicallyBalanced "true"^^xsd:boolean
rh:16109 rh:isTransport "false"^^xsd:boolean
rh:16109 rh:ec ec:2.7.1.11
rh:16109 rdfs:seeAlso ch:GO_0003872
rh:16109 rh:citation pubmed:12981037
rh:16109 rh:citation pubmed:23729568
rh:16109 rh:citation pubmed:26205495
rh:16109 rh:citation pubmed:4224472
rh:16109 rh:citation pubmed:4237772
rh:16109 rh:citatio

## Pull it into a more workable structure

In [42]:
scalars = {
    RH.id,
    RDFS.label,
    RH.equation,
    RH.status,
    RH.isTransport,
    
    RH.curatedOrder,
    
    RH.accession,
    RH.name,
    RH.formula,
    RH.charge,
    
    RH.location,
    RH.position,
}
lists = {
    RH.ec,
    RDFS.seeAlso,
}
objects = {
    RH.bidirectionalReaction,
    RH.directionalReaction,

    RH.side,

    RH.contains1,
    RH.contains2,
    RH.contains3,
    RH.contains4,
    RH.contains5,
    RH.contains6,
    RH.contains7,
    RH.contains8,
    RH.contains9,
    RH.contains10,
    RH.contains11,
    RH.contains12,
    RH.contains13,
    RH.contains14,
    RH.contains15,
    RH.contains16,
    RH.contains17,
    RH.contains18,
    RH.contains19,
    RH.contains20,

    RH.compound,
    RH.reactivePart,
}
drop = {
    RDFS.comment,
    RDFS.subClassOf,
    RH.citation,
    RH.chebi,
    RH.contains,
    RH.htmlEquation,
    RH.htmlName,
    RH.isChemicallyBalanced,
    RH.products,
    RH.substrates,
    RH.substratesOrProducts,
    RH.transformableTo,
}

def extract_value(g, o):
    if type(o) == rdflib.Literal:
        return o.toPython()
    else:
        return o.n3(g.namespace_manager)
    
def extract_object(g, s):
    result = {}
    for p, o in g[s]:
        if p in scalars:
            p = extract_value(g, p)
            o = extract_value(g, o)
            result[p] = o
        elif p in lists:
            p = extract_value(g, p)
            o = extract_value(g, o)
            if p in result:
                result[p].append(o)
            else:
                result[p] = [o]
        elif p in objects:
            p = extract_value(g, p)
            o = extract_object(g, o)
            if p in result:
                result[p].append(o)
            else:
                result[p] = [o]
        elif p not in drop:
            print(f"Ignoring {extract_value(g, s)} {extract_value(g, p)}")
    return result

extract_object(rhea_rdf, RH["16109"])
    

{'rh:id': 16109,
 'rh:accession': 'RHEA:16109',
 'rdfs:label': 'ATP + beta-D-fructose 6-phosphate = ADP + beta-D-fructose 1,6-bisphosphate + H(+)',
 'rh:equation': 'ATP + beta-D-fructose 6-phosphate = ADP + beta-D-fructose 1,6-bisphosphate + H(+)',
 'rh:directionalReaction': [{'rh:id': 16110,
   'rh:accession': 'RHEA:16110',
   'rdfs:label': 'ATP + beta-D-fructose 6-phosphate => ADP + beta-D-fructose 1,6-bisphosphate + H(+)',
   'rh:equation': 'ATP + beta-D-fructose 6-phosphate => ADP + beta-D-fructose 1,6-bisphosphate + H(+)',
   'rh:status': 'rh:Approved',
   'rh:isTransport': False,
   'rdfs:seeAlso': ['ECOCYC:6PFRUCTPHOS-RXN',
    'METACYC:6PFRUCTPHOS-RXN',
    'REACTOME:R-HSA-70467.5']},
  {'rh:id': 16111,
   'rh:accession': 'RHEA:16111',
   'rdfs:label': 'ADP + beta-D-fructose 1,6-bisphosphate + H(+) => ATP + beta-D-fructose 6-phosphate',
   'rh:equation': 'ADP + beta-D-fructose 1,6-bisphosphate + H(+) => ATP + beta-D-fructose 6-phosphate',
   'rh:status': 'rh:Approved',
   'rh:i

## Coerce master reactions (only) into the KB Reaction structure

In [43]:
def to_dbxref(rhea_xref):
    # Special cases
    for prefix, db in [
        ("ch:GO_", "GO"),
    ]:
        if rhea_xref.startswith(prefix):
            return DbCrossRef(db, rhea_xref[len(prefix):])

    # Generally otherwise just split on a colon
    parts = rhea_xref.split(":")
    if len(parts) == 2:
        return DbCrossRef(parts[0].upper(), parts[1])
    else:
        return DbCrossRef("RHEA", rhea_xref)
    
    
def to_reaction(rhea_rxn):
    crossref = []
    if "rh:ec" in rhea_rxn:
        crossref.extend(to_dbxref(rhea_xref) for rhea_xref in rhea_rxn["rh:ec"])
    if "rdfs:seeAlso" in rhea_rxn:
        crossref.extend(to_dbxref(rhea_xref) for rhea_xref in rhea_rxn["rdfs:seeAlso"])
    # Lump in the rest of the quartet's xrefs -- not rigorous by RHEA standards, but reasonable for us.
    for key in ["rh:directionalReaction", "rh:bidirectionalReaction"]:
        for other_reaction in rhea_rxn.get(key, []):
            if "rh:ec" in other_reaction:
                crossref.extend(to_dbxref(rhea_xref) for rhea_xref in other_reaction["rh:ec"])
            if "rdfs:seeAlso" in other_reaction:
                crossref.extend(to_dbxref(rhea_xref) for rhea_xref in other_reaction["rdfs:seeAlso"])
    
    multipliers = [None, -1, +1]  # curatedOrder -> stoichiometry sign, 1-based
    stoichiometry = {}
    for side in rhea_rxn["rh:side"]:
        multiplier = multipliers[side["rh:curatedOrder"]]
        for count in range(1, 21):
            key = "rh:contains" + str(count)
            for rhea_compound in side.get(key, []):
                chebi_id = rhea_compound["rh:compound"][0]["rh:accession"]
                if chebi_id.startswith("CHEBI:"):
                    # We'll use the in-memory molecules for now, but this will utimately be a KB lookup
                    compound = compounds[int(chebi_id[6:])]
                    stoichiometry[compound] = multiplier * count
                else:
                    raise ValueError(f"Unrecognized compound ID {chebi_id}")
    
    reversible = False
    if "rh:bidirectionalReaction" in rhea_rxn and rhea_rxn["rh:bidirectionalReaction"][0].get("rdfs:seeAlso"):
        reversible = True
    
    return Reaction(
        _id = rhea_rxn["rh:id"],
        name = rhea_rxn["rdfs:label"],
        crossref = crossref or None,
        stoichiometry = stoichiometry,
        reversible = reversible,
    )

to_reaction(extract_object(rhea_rdf, RH["16109"]))

Reaction(_id=16109, name='ATP + beta-D-fructose 6-phosphate = ADP + beta-D-fructose 1,6-bisphosphate + H(+)', shorthand=None, aka=None, crossref=[EC:2.7.1.11, GO:0003872, ECOCYC:6PFRUCTPHOS-RXN, METACYC:6PFRUCTPHOS-RXN, REACTOME:R-HSA-70467.5, KEGG:R00756], stoichiometry={Molecule(_id=30616, name='ATP(4-)', shorthand=None, aka=['atp', "adenosine 5'-triphosphate(4-)", 'ATP'], crossref=[ChEBI:30616], formula='C10H12N5O13P3', mass=503.14946, charge=-4, inchi='InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(26-10)1-25-30(21,22)28-31(23,24)27-29(18,19)20/h2-4,6-7,10,16-17H,1H2,(H,21,22)(H,23,24)(H2,11,12,13)(H2,18,19,20)/p-4/t4-,6-,7-,10-/m1/s1'): -1, Molecule(_id=57634, name='beta-D-fructofuranose 6-phosphate(2-)', shorthand=None, aka=['beta-D-fructose 6-phosphate', 'beta-D-fructofuranose 6-phosphate', '6-O-phosphonato-beta-D-fructofuranose', 'beta-D-fructofuranose 6-phosphate dianion'], crossref=[ChEBI:57634], formula='C6H11O9P', mass=258.1199, charge=-2, inchi='InChI=

In [44]:
reactions = {}
skipped = {}
for s, _, _ in rhea_rdf.triples((None, RDFS.subClassOf, RH.Reaction)):
    rhea_rxn = extract_object(rhea_rdf, s)
    if rhea_rxn["rh:status"] == "rh:Approved":
        try:
            reaction = to_reaction(rhea_rxn)
            reactions[reaction._id] = reaction
        
        except Exception:
            skipped[s] = rhea_rxn

print()
print(f"{len(reactions)} reactions parsed succesfully, {len(skipped)} skipped")

Ignoring rh:Compound_12833 rh:polymerizationIndex
Ignoring rh:Compound_12833 rh:underlyingChebi
Ignoring rh:10068_L rh:containsN
Ignoring rh:Compound_14260 rh:polymerizationIndex
Ignoring rh:Compound_14260 rh:underlyingChebi
Ignoring rh:10068_R rh:containsN
Ignoring rh:10068_R rh:containsN
Ignoring rh:Compound_12983 rh:polymerizationIndex
Ignoring rh:Compound_12983 rh:underlyingChebi
Ignoring rh:10256_L rh:containsN
Ignoring rh:10256_L rh:containsN
Ignoring rh:Compound_14598 rh:polymerizationIndex
Ignoring rh:Compound_14598 rh:underlyingChebi
Ignoring rh:10256_R rh:containsN
Ignoring rh:10256_R rh:containsN
Ignoring rh:10256_R rh:contains2n
Ignoring rh:Compound_9593 rh:polymerizationIndex
Ignoring rh:Compound_9593 rh:underlyingChebi
Ignoring rh:10464_L rh:containsN
Ignoring rh:10464_R rh:containsN
Ignoring rh:Compound_9597 rh:polymerizationIndex
Ignoring rh:Compound_9597 rh:underlyingChebi
Ignoring rh:Compound_14738 rh:polymerizationIndex
Ignoring rh:Compound_14738 rh:underlyingChebi
I

Ignoring rh:14993_L rh:contains2n
Ignoring rh:14993_L rh:contains2n
Ignoring rh:14993_L rh:containsN
Ignoring rh:14993_R rh:containsN
Ignoring rh:14993_R rh:containsNplus1
Ignoring rh:14993_R rh:contains2n
Ignoring rh:Compound_9830 rh:polymerizationIndex
Ignoring rh:Compound_9830 rh:underlyingChebi
Ignoring rh:Compound_11585 rh:polymerizationIndex
Ignoring rh:Compound_11585 rh:underlyingChebi
Ignoring rh:Compound_12635 rh:polymerizationIndex
Ignoring rh:Compound_12635 rh:underlyingChebi
Ignoring rh:Compound_9517 rh:polymerizationIndex
Ignoring rh:Compound_9517 rh:underlyingChebi
Ignoring rh:Compound_9548 rh:polymerizationIndex
Ignoring rh:Compound_9548 rh:underlyingChebi
Ignoring rh:Compound_9549 rh:polymerizationIndex
Ignoring rh:Compound_9549 rh:underlyingChebi
Ignoring rh:Compound_9517 rh:polymerizationIndex
Ignoring rh:Compound_9517 rh:underlyingChebi
Ignoring rh:Compound_9526 rh:polymerizationIndex
Ignoring rh:Compound_9526 rh:underlyingChebi
Ignoring rh:Compound_9517 rh:polymeriz

Ignoring rh:Compound_12623 rh:polymerizationIndex
Ignoring rh:Compound_12623 rh:underlyingChebi
Ignoring rh:Compound_14295 rh:polymerizationIndex
Ignoring rh:Compound_14295 rh:underlyingChebi
Ignoring rh:Compound_12840 rh:polymerizationIndex
Ignoring rh:Compound_12840 rh:underlyingChebi
Ignoring rh:21012_L rh:containsN
Ignoring rh:Compound_14256 rh:polymerizationIndex
Ignoring rh:Compound_14256 rh:underlyingChebi
Ignoring rh:21012_R rh:containsN
Ignoring rh:21012_R rh:containsN
Ignoring rh:Compound_9829 rh:polymerizationIndex
Ignoring rh:Compound_9829 rh:underlyingChebi
Ignoring rh:Compound_9965 rh:polymerizationIndex
Ignoring rh:Compound_9965 rh:underlyingChebi
Ignoring rh:Compound_12939 rh:polymerizationIndex
Ignoring rh:Compound_12939 rh:underlyingChebi
Ignoring rh:Compound_14378 rh:polymerizationIndex
Ignoring rh:Compound_14378 rh:underlyingChebi
Ignoring rh:Compound_9517 rh:polymerizationIndex
Ignoring rh:Compound_9517 rh:underlyingChebi
Ignoring rh:Compound_9527 rh:polymerization

Ignoring rh:Compound_9537 rh:polymerizationIndex
Ignoring rh:Compound_9537 rh:underlyingChebi
Ignoring rh:Compound_9539 rh:polymerizationIndex
Ignoring rh:Compound_9539 rh:underlyingChebi
Ignoring rh:Compound_9565 rh:polymerizationIndex
Ignoring rh:Compound_9565 rh:underlyingChebi
Ignoring rh:Compound_9566 rh:polymerizationIndex
Ignoring rh:Compound_9566 rh:underlyingChebi
Ignoring rh:Compound_9537 rh:polymerizationIndex
Ignoring rh:Compound_9537 rh:underlyingChebi
Ignoring rh:Compound_9539 rh:polymerizationIndex
Ignoring rh:Compound_9539 rh:underlyingChebi
Ignoring rh:Compound_9565 rh:polymerizationIndex
Ignoring rh:Compound_9565 rh:underlyingChebi
Ignoring rh:Compound_9566 rh:polymerizationIndex
Ignoring rh:Compound_9566 rh:underlyingChebi
Ignoring rh:Compound_9565 rh:polymerizationIndex
Ignoring rh:Compound_9565 rh:underlyingChebi
Ignoring rh:Compound_9566 rh:polymerizationIndex
Ignoring rh:Compound_9566 rh:underlyingChebi
Ignoring rh:Compound_9537 rh:polymerizationIndex
Ignoring rh

Ignoring rh:Compound_14378 rh:polymerizationIndex
Ignoring rh:Compound_14378 rh:underlyingChebi
Ignoring rh:Compound_12939 rh:polymerizationIndex
Ignoring rh:Compound_12939 rh:underlyingChebi
Ignoring rh:Compound_9550 rh:polymerizationIndex
Ignoring rh:Compound_9550 rh:underlyingChebi
Ignoring rh:Compound_9551 rh:polymerizationIndex
Ignoring rh:Compound_9551 rh:underlyingChebi
Ignoring rh:Compound_14004 rh:polymerizationIndex
Ignoring rh:Compound_14004 rh:underlyingChebi
Ignoring rh:32087_L rh:containsN
Ignoring rh:Compound_13379 rh:polymerizationIndex
Ignoring rh:Compound_13379 rh:underlyingChebi
Ignoring rh:32087_R rh:containsN
Ignoring rh:Compound_9521 rh:polymerizationIndex
Ignoring rh:Compound_9521 rh:underlyingChebi
Ignoring rh:Compound_9525 rh:polymerizationIndex
Ignoring rh:Compound_9525 rh:underlyingChebi
Ignoring rh:34391_L rh:contains28
Ignoring rh:34391_R rh:contains28
Ignoring rh:34391_R rh:contains28
Ignoring rh:35591_L rh:containsN
Ignoring rh:35591_R rh:containsNminus1


Ignoring rh:Compound_12390 rh:polymerizationIndex
Ignoring rh:Compound_12390 rh:underlyingChebi
Ignoring rh:Compound_12391 rh:polymerizationIndex
Ignoring rh:Compound_12391 rh:underlyingChebi
Ignoring rh:Compound_12420 rh:polymerizationIndex
Ignoring rh:Compound_12420 rh:underlyingChebi
Ignoring rh:Compound_12421 rh:polymerizationIndex
Ignoring rh:Compound_12421 rh:underlyingChebi
Ignoring rh:Compound_12583 rh:polymerizationIndex
Ignoring rh:Compound_12583 rh:underlyingChebi
Ignoring rh:50240_R rh:containsN
Ignoring rh:Compound_12634 rh:polymerizationIndex
Ignoring rh:Compound_12634 rh:underlyingChebi
Ignoring rh:Compound_9529 rh:polymerizationIndex
Ignoring rh:Compound_9529 rh:underlyingChebi
Ignoring rh:Compound_12939 rh:polymerizationIndex
Ignoring rh:Compound_12939 rh:underlyingChebi
Ignoring rh:Compound_14378 rh:polymerizationIndex
Ignoring rh:Compound_14378 rh:underlyingChebi
Ignoring rh:Compound_12939 rh:polymerizationIndex
Ignoring rh:Compound_12939 rh:underlyingChebi
Ignoring 

Ignoring rh:55540_L rh:contains40
Ignoring rh:55540_L rh:contains40
Ignoring rh:55540_R rh:contains40
Ignoring rh:55540_R rh:contains40
Ignoring rh:55540_R rh:contains26
Ignoring rh:Compound_12840 rh:polymerizationIndex
Ignoring rh:Compound_12840 rh:underlyingChebi
Ignoring rh:55672_L rh:containsN
Ignoring rh:Compound_14257 rh:polymerizationIndex
Ignoring rh:Compound_14257 rh:underlyingChebi
Ignoring rh:55672_R rh:containsN
Ignoring rh:55672_R rh:containsN
Ignoring rh:Compound_14274 rh:polymerizationIndex
Ignoring rh:Compound_14274 rh:underlyingChebi
Ignoring rh:Compound_14276 rh:polymerizationIndex
Ignoring rh:Compound_14276 rh:underlyingChebi
Ignoring rh:Compound_12939 rh:polymerizationIndex
Ignoring rh:Compound_12939 rh:underlyingChebi
Ignoring rh:Compound_14378 rh:polymerizationIndex
Ignoring rh:Compound_14378 rh:underlyingChebi
Ignoring rh:Compound_14318 rh:polymerizationIndex
Ignoring rh:Compound_14318 rh:underlyingChebi
Ignoring rh:55848_L rh:containsN
Ignoring rh:Compound_14319

In [45]:
reactions[11816]

Reaction(_id=11816, name='aldehydo-D-glucose 6-phosphate = keto-D-fructose 6-phosphate', shorthand=None, aka=None, crossref=[EC:5.3.1.9, GO:0004347, KEGG:R00771, ECOCYC:PGLUCISOM-RXN, METACYC:PGLUCISOM-RXN], stoichiometry={Molecule(_id=57584, name='aldehydo-D-glucose 6-phosphate(2-)', shorthand=None, aka=['aldehydo-D-glucose 6-phosphate', '6-O-phosphonato-D-glucose', 'aldehydo-D-glucose 6-phosphate dianion'], crossref=[ChEBI:57584], formula='C6H11O9P', mass=258.1199, charge=-2, inchi='InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15-16(12,13)14/h1,3-6,8-11H,2H2,(H2,12,13,14)/p-2/t3-,4+,5+,6+/m0/s1'): -1, Molecule(_id=57579, name='D-fructose 6-phosphate(2-)', shorthand=None, aka=['D-fructose 6-phosphate dianion', '6-O-phosphonato-D-fructose', 'keto-D-fructose 6-phosphate', 'D-fructose 6-phosphate'], crossref=[ChEBI:57579, MetaCyc:CPD-15709], formula='C6H11O9P', mass=258.1199, charge=-2, inchi='InChI=1S/C6H13O9P/c7-1-3(8)5(10)6(11)4(9)2-15-16(12,13)14/h4-7,9-11H,1-2H2,(H2,12,13,14)/p-2/t4-,