# Variations Revisited

## Case Study: Glucose

In [11]:
import projectpath

import itertools
import ipysheet
import pandas as pd
from mosmo.knowledge import kb
from mosmo.knowledge.thermo import Thermodynamics
from mosmo.model import DbXref, Molecule, Variation, Specialization

# Notebook-wide handles: whatever kb.configure_kb() returns, plus a
# Thermodynamics helper pinned to one fixed set of conditions.
KB = kb.configure_kb()
thermo = Thermodynamics(
    p_h=7.3,
    p_mg=1.5,
    ionic_strength=0.25,
    temperature=298.15,  # presumably kelvin (i.e. 25 °C) — confirm against Thermodynamics
)

In [2]:
# Axes along which a glucose entry may be specialized. Each Variation is built
# from a name and the list of form names available on that axis.
variations = [
    Variation('DL', ['D', 'L']),
    Variation('phospho', ['1P', '2P', '3P', '4P', '5P', '6P', '7P', 'bis15', 'bis16', 'bis17', 'BIS']),
    Variation('ring-chain', ['open', 'r6', 'r5', 'RING']),
    Variation('ring-linkage', ['α', 'β']),
    Variation('protons', ['full', '2-', '4-']),
]

# Relying completely on systematic ID scheme: a compound ID is "Glc" followed by
# the selected form names, dot-separated, in the same order as `variations`.
# Every combination is probed against the KB; a None slot means "this axis is
# left unspecified" and contributes nothing to the ID.
glc_forms = {}
for form in itertools.product(*(([None] + v.form_names) for v in variations)):
    clean_form = tuple(f for f in form if f)
    # Named `mol_id` rather than `id` so the builtin id() is not shadowed for
    # the remainder of the notebook session.
    mol_id = ".".join(("Glc", *clean_form))
    mol = KB.get(KB.compounds, mol_id)
    if mol:
        # Keyed by the tuple of specified form names; () is the bare "Glc" entry.
        glc_forms[clean_form] = mol

print(f"{len(glc_forms)} forms of glucose and phosphoglucose in the KB")

43 forms of glucose and phosphoglucose in the KB


In [13]:
# Build one table row per glucose form found in the KB, and collect the set of
# datasources that appear in any cross-reference so that each xref column can
# later be given its own formatter.
data = []
sources = set()
for mol in glc_forms.values():
    canonical = mol.canonical_form
    default = mol.default_form
    row = {
        "id": mol.id,
        "name": mol.name,
        # Alternative display: ", ".join(mol.canonical_form.form)
        "canonical": canonical.parent_id if canonical is not None else "",
        "variations": len(mol.variations or []),
        # Alternative display: ", ".join(mol.default_form.form)
        "assumed": default.child_id if default is not None else "",
        "ΔG": thermo.formation_delta_g(mol),
    }

    # One column per datasource, named by the datasource ID.
    for xref in mol.xrefs or []:
        # Previously `sources` was never filled, so no xref column ever
        # received a formatter. Assumes the datasource object is hashable —
        # confirm against its definition.
        sources.add(xref.db)
        row[xref.db.id] = xref.id

    data.append(row)
df = pd.DataFrame(data)

def xref_format_fn(db):
    """Build a cell formatter for the cross-reference column of one datasource.

    Args:
        db: The datasource the column's IDs belong to; it is passed as the
            first argument to ``DbXref`` for every non-empty cell.

    Returns:
        A one-argument callable mapping a cell value to display text:
        ``""`` for an empty or NA value, an HTML link when
        ``DbXref(db, value).url(Molecule)`` yields a URL, and the value itself
        otherwise.
    """
    def format_xref(xref_id):
        # Forms without an xref to this datasource show up as empty/NA cells.
        if not xref_id or pd.isna(xref_id):
            return ""
        url = DbXref(db, xref_id).url(Molecule)
        if url:
            return f"<a href='{url}'>{xref_id}</a>"
        return xref_id

    # The factory previously fell off the end and returned None, so no
    # formatter was ever produced.
    return format_xref

# Column name -> formatter, as consumed by Styler.format. Each xref column is
# named by its datasource ID and gets a formatter built for that datasource.
formats = {source.id: xref_format_fn(source) for source in sources}
# Test for NA explicitly: NaN is truthy, so a plain truthiness check renders
# missing values as "nan" and hides a genuine 0.0.
formats["ΔG"] = lambda v: "" if pd.isna(v) else f"{v:.1f}"
# Left as the final expression so the notebook displays the styled table.
df.style.format(formats)

Unnamed: 0,id,name,canonical,variations,assumed,ΔG,CHEBI,WIKI,KEGG,CAS,METACYC
0,Glc,glucose,,0,Glc.D,,17234.0,Glucose,C00293,50-99-7,
1,Glc.D,D-glucose,,2,,-406.0,17634.0,,,50-99-7,
2,Glc.D.open,aldehydo-D-glucose,Glc.D,0,,-400.2,42758.0,Glucose,,50-99-7,
3,Glc.D.r6,D-glucopyranose,Glc.D,0,,-406.0,4167.0,Glucose,D00009,2280-44-6,D-Glucose
4,Glc.D.r6.α,alpha-D-glucose,Glc.D,0,,-404.3,17925.0,,C00267,492-62-6,
5,Glc.D.r6.β,beta-D-glucose,Glc.D,0,,-404.3,15903.0,,C00221,492-61-5,
6,Glc.D.r5,D-glucofuranose,Glc.D,0,,,145664.0,,,,
7,Glc.D.r5.α,alpha-D-glucofuranose,Glc.D,0,,,148749.0,,,36468-84-5,
8,Glc.D.r5.β,beta-D-glucofuranose,Glc.D,0,,,145606.0,,,,
9,Glc.D.1P,D-glucose 1-phosphate,,3,,,,,,,


- That's a lot of detail. Most of the time we don't need to be confronted with that.
- OTOH many of those forms are cross-referenced to unique ChEBI records. Unwanted detail or not, it's probably (mostly) scientifically accurate
- OTOH many of the cross-references into KEGG and METACYC (and WIKI?) are redundant, or ambiguous, or to/from a form that is probably not what is intended semantically.
  - part of the challenge here is that e.g. KEGG:C00103 is defined in KEGG as the fully protonated form of G1P, and so is cross-referenced to CHEBI:29042, but is used semantically as generalized G1P (which isn't even in ChEBI in the specific sense we need).
- Basically the world out there is a mess. We need to do the best we can.
    - Invest a lot of curation effort into the generalized forms (specifying DL and phospho only)
        - These are what we will use to build pathways and/or reaction networks
    - Favor cross-references of these forms to KEGG and EcoCyc/MetaCyc, maybe some others if they come up
    - Cross-reference as precisely as possible to ChEBI, but defer to a too-specific form if necessary
- Separately, we should validate and correct the ΔG values for specific forms. I assume that _internally_ eQuilibrator does exactly that, but the API helpfully hides a lot of that detail.