# Init

In [1]:
import collections
import itertools

from mosmo.knowledge import codecs, kb
from mosmo.model import Molecule, Reaction, DbXref, Variation, Specialization

# Configure the knowledge base once for the whole notebook, and keep a direct handle on the
# reference ChEBI collection (it is queried by compound name in map_compounds below).
KB = kb.configure_kb()
CHEBI = KB.client[KB.CHEBI.client_db][KB.CHEBI.collection]

# Identify all forms of Glucose and other sugars

### NOTES
- The intent is not for this notebook or any script derived from it to be able to generate the KB unsupervised across all cases, but to assist a thinking scientist in covering a lot of ground quickly.
- Power tools, not robots.
- Scientific judgement is the final word.

## Generate possible names and map to reference ChEBI compounds
- Monosaccharide code reference: https://www.genome.jp/kegg/catalog/codes2.html
- Name could hit preferred name or AKA
    - Preferred name takes priority
    - More general form takes priority
- Some naming conventions are contextual, e.g. 'β-D-glucose' is assumed to mean 'β-D-gluco**pyran**ose', while 'β-D-fructose' means 'β-D-fructo**furan**ose'. We handle this by including implied form names that get substituted to actual specified forms.

### On protons
- ChEBI standardizes on the fully protonated (balanced) form across the board, even though for many of the species we are interested in, specifically sugar-phosphates, the dominant form at pH=7.3 is ionic.
    - e.g. 'glucose 6-phosphate' refers to the fully protonated form, while the dominant form is 'glucose 6-phosphate(2-)'
- Our goal is intuitive behavior for the biological modeler, so we will standardize on the expected form under biological conditions.
    - 'glucose 6-phosphate' refers to the form that has lost two protons. The fully protonated form is understood, but treated as a variation on this otherwise expected form, i.e. 'glucose 6-phosphate (fully protonated)'

### Define the space of possible forms of any sugar

In [2]:
# Arguably this is the 'Knowledge' in Knowledge Base
# Each entry names one axis of variation and lists the form tags allowed along that axis.
# The same tags are the keys of `name_gen` and the elements of the form tuples produced by
# generate_all_forms().
VARIATIONS = {
    # 'deoxy': Variation('deoxy', ['deoxy']),
    'DL': Variation('DL', ['dl', 'D', 'L']), # 'dl' = stereoisomer left implied; only for sedoheptulose, apparently

    'ring-chain': Variation('ring-chain', ['open', 'ring', 'r6', 'r5']), # 'ring' = ring size left implied
    'ring-linkage': Variation('ring-linkage', ['α', 'β']),

    'phospho': Variation('phospho', ['1P', '2P', '3P', '4P', '5P', '6P', '7P',
                                     'bis', 'bis15', 'bis16', 'bis17']), # 'bis' = positions left implied
    'protons': Variation('protons', ['full', '2-', '4-']), # 1- and 3- forms are absent from ChEBI so don't bother
}

def generate_ringchain_tautomers():
    """Yields (ring-chain, ring-linkage) pairs of form names.

    The linkage slot is None when no linkage is specified. The generic 'ring' form
    is only produced together with a linkage, and 'open' never carries one.
    """
    linkages = VARIATIONS['ring-linkage'].form_names
    for ring_form in VARIATIONS['ring-chain'].form_names:
        # The ring size may be inferred, but only when a linkage is specified,
        # so a bare 'ring' without linkage is never emitted.
        if ring_form != 'ring':
            yield ring_form, None
        if ring_form == 'open':
            # Open-chain forms have no ring linkage.
            continue
        yield from ((ring_form, linkage) for linkage in linkages)
            
def generate_phospho_forms():
    """Yields (phospho, protons) pairs of form names.

    Every phosphorylation form is paired with every protonation state, because
    the protonation state is always specified for phosphorylated sugars.
    """
    yield from itertools.product(
        VARIATIONS['phospho'].form_names,
        VARIATIONS['protons'].form_names,
    )

def generate_all_forms():
    """Yields tuples for all potentially valid forms of a typical sugar.

    Each yielded tuple lists form tags in order of precedence: stereoisomer,
    phosphorylation, ring-chain, ring-linkage, protonation. Unspecified slots are
    simply left out of the tuple.
    """
    def compact(*tags):
        # Drop the slots that were left unspecified.
        return tuple(tag for tag in tags if tag is not None)

    # Root form, unspecified over all types of variation.
    yield ()

    # Every other form always specifies the DL stereoisomer.
    for stereo in VARIATIONS['DL'].form_names:
        yield (stereo,)

        # The unphosphorylated sugar exhibits ring-chain tautomerism.
        for ring, linkage in generate_ringchain_tautomers():
            yield compact(stereo, ring, linkage)

        # Phosphorylated sugars always carry a protonation state, and may or may not
        # also specify the tautomer.
        for phospho, protons in generate_phospho_forms():
            yield compact(stereo, phospho, protons)
            for ring, linkage in generate_ringchain_tautomers():
                yield compact(stereo, phospho, ring, linkage, protons)

### Define forms of actual sugars, with a broad range of possible 'standardized' names

In [3]:
# Table of the sugars to map, keyed by monosaccharide code. Fields of each entry:
#   code    - the code itself; becomes the first element of every compound key
#   stem    - name stem that build_form_names() decorates (e.g. 'gluc' -> 'glucose')
#   carbons - carbon count; selects the formula and mass in build_canonical()
#   ring    - concrete ring form substituted for the implied 'ring' tag
#   bis     - concrete bisphosphate form substituted for the implied 'bis' tag
#             (absent for tetroses, so their 'bis' forms are dropped)
#   dl      - stereoisomer substituted for the implied 'dl' tag (only set for Sed)
#   default - form tuple used as the default form of the otherwise unspecified sugar
sugars = {
    # Hexoses
    'All': {'code': 'All', 'stem': 'all', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('D',)},
    'Alt': {'code': 'Alt', 'stem': 'altr', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('L',)},
    'Glc': {'code': 'Glc', 'stem': 'gluc', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('D',)},
    'Man': {'code': 'Man', 'stem': 'mann', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('D',)},
    'Gul': {'code': 'Gul', 'stem': 'gul', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('D',)},
    'Ido': {'code': 'Ido', 'stem': 'id', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('L',)},
    'Gal': {'code': 'Gal', 'stem': 'galact', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('D',)},
    'Tal': {'code': 'Tal', 'stem': 'tal', 'carbons': 6, 'ring': 'r6', 'bis': 'bis16', 'default': ('D',)},

    'Psi': {'code': 'Psi', 'stem': 'psic', 'carbons': 6, 'ring': 'r5', 'bis': 'bis16', 'default': ('D',)},
    'Fru': {'code': 'Fru', 'stem': 'fruct', 'carbons': 6, 'ring': 'r5', 'bis': 'bis16', 'default': ('D',)},
    'Sor': {'code': 'Sor', 'stem': 'sorb', 'carbons': 6, 'ring': 'r5', 'bis': 'bis16', 'default': ('L',)},
    'Tag': {'code': 'Tag', 'stem': 'tagat', 'carbons': 6, 'ring': 'r5', 'bis': 'bis16', 'default': ('D',)},

    # Pentoses
    'Ara': {'code': 'Ara', 'stem': 'arabin', 'carbons': 5, 'ring': 'r6', 'bis': 'bis15', 'default': ('L',)},  # r6??? (open question: is pyranose really the implied ring here?)
    'Lyx': {'code': 'Lyx', 'stem': 'lyx', 'carbons': 5, 'ring': 'r5', 'bis': 'bis15', 'default': ('D',)},
    'Rib': {'code': 'Rib', 'stem': 'rib', 'carbons': 5, 'ring': 'r5', 'bis': 'bis15', 'default': ('D',)},
    'Xyl': {'code': 'Xyl', 'stem': 'xyl', 'carbons': 5, 'ring': 'r6', 'bis': 'bis15', 'default': ('D',)},  # r6??? (open question: is pyranose really the implied ring here?)

    'Rul': {'code': 'Rul', 'stem': 'ribul', 'carbons': 5, 'ring': 'r5', 'bis': 'bis15', 'default': ('D',)},
    'Xul': {'code': 'Xul', 'stem': 'xylul', 'carbons': 5, 'ring': 'r5', 'bis': 'bis15', 'default': ('D',)},

    # Tetroses
    'Ery': {'code': 'Ery', 'stem': 'erythr', 'carbons': 4, 'ring': 'r5', 'default': ('D',)},
    'Tho': {'code': 'Tho', 'stem': 'thre', 'carbons': 4, 'ring': 'r5', 'default': ('D',)},
    'Eul': {'code': 'Eul', 'stem': 'erythrul', 'carbons': 4, 'ring': 'r5', 'default': ('D',)},
    
    # Heptose
    'Sed': {'code': 'Sed', 'stem': 'sedoheptul', 'carbons': 7, 'ring': 'r6', 'bis': 'bis17', 'dl': 'D', 'default': ('D',)},
}

# Form-name generation patterns, in the order they should be applied.
# build_form_names() walks this dict in insertion order and wraps the name built so far in
# each pattern of every tag present in the form, so the ORDER OF THE KEYS IS SIGNIFICANT:
# stereo -> ring-chain -> linkage -> 'ose' suffix -> phosphate -> protonation.
# A tag with several patterns multiplies the number of candidate names.
name_gen = {
    # 'deoxy': ['deoxy-{}', 'deoxy{}'],
    'dl': ['{}'], # D or L implied, as opposed to explicitly DL-
    'D': ['D-{}'],
    'L': ['L-{}'],

    'open': ['aldehydo-{}', 'keto-{}'],
    'ring': ['{}'], # r5 or r6 is implied
    'r6': ['{}opyran'],
    'r5': ['{}ofuran'],

    'α': ['alpha-{}'],
    'β': ['beta-{}'],

    # Not a form tag: build_form_names() always adds it so every name gets the '-ose' suffix.
    'ose': ['{}ose'],

    '1P': ['{} 1-phosphate', '1-O-phosphono-{}'],
    '2P': ['{} 2-phosphate', '2-O-phosphono-{}'],
    '3P': ['{} 3-phosphate', '3-O-phosphono-{}'],
    '4P': ['{} 4-phosphate', '4-O-phosphono-{}'],
    '5P': ['{} 5-phosphate', '5-O-phosphono-{}'],
    '6P': ['{} 6-phosphate', '6-O-phosphono-{}'],
    '7P': ['{} 7-phosphate', '7-O-phosphono-{}'],
    'bis': ['{} bisphosphate', '{}-bisphosphate'],  # which bis is implied
    'bis15': ['{} 1,5-bisphosphate', '{}-1,5-bisphosphate'],
    'bis16': ['{} 1,6-bisphosphate', '{}-1,6-bisphosphate'],
    'bis17': ['{} 1,7-bisphosphate', '{}-1,7-bisphosphate'],

    'full': ['{}'],  # This is the ChEBI name, unspecified = fully protonated
    # '1-' and '3-' have patterns here but are not listed in VARIATIONS['protons'].
    '1-': ['{}(1-)', '{}[1-]'],
    '2-': ['{}(2-)', '{}[2-]'],
    '3-': ['{}(3-)', '{}[3-]'],
    '4-': ['{}(4-)', '{}[4-]'],
}

def build_form_names(sugar, form):
    """Builds possible names for the specified form of the given sugar.

    Args:
        sugar: entry of the sugar table; only its 'stem' is used here.
        form: iterable of form tags (keys of `name_gen`).

    Returns:
        List of candidate names. The first is built from the first pattern of every tag.
    """
    wanted = {'ose', *form}
    candidates = [sugar['stem']]

    # Patterns must be applied in the order the tags appear in name_gen, not in `form`.
    for tag in name_gen:
        if tag not in wanted:
            continue
        expanded = []
        for pattern in name_gen[tag]:
            for candidate in candidates:
                expanded.append(pattern.format(candidate))
        candidates = expanded
    return candidates

def infered_form(form, replacements):
    """Substitutes implied tags in `form` with the concrete tags they stand for.

    This is what lets us infer e.g. that 'β-D-glucose' is (D, **r6**, β).

    Args:
        form: tuple of form tags, possibly including implied ones.
        replacements: maps an implied tag to its concrete tag, or to None when the
            sugar does not use that form at all.

    Returns:
        The tuple of concrete tags, or None if any tag resolves to None.
    """
    resolved = tuple(replacements.get(tag, tag) for tag in form)
    if any(tag is None for tag in resolved):
        # This form is not used by the specified sugar.
        return None
    return resolved

def build_all_names(sugar):
    """Generate all possible names for each form of the designated sugar, whether or not they actually exist.

    Returns:
        A defaultdict(list) mapping each concrete form tuple to its candidate names.
        A generic form and the concrete form it resolves to share one entry.
    """
    # What each implied tag stands for in this sugar (None = not applicable).
    implied = {tag: sugar.get(tag) for tag in ('dl', 'ring', 'bis')}

    names_by_form = collections.defaultdict(list)
    for generic_form in generate_all_forms():
        concrete_form = infered_form(generic_form, implied)
        if concrete_form is None:
            continue
        # Names are built from the generic form but filed under the concrete one.
        names_by_form[concrete_form] += build_form_names(sugar, generic_form)

    return names_by_form

### Look up ChEBI compounds by all the _possible_ names to find the ones that actually exist

In [4]:
def map_compounds(sugar):
    """Looks up ChEBI compounds for every candidate name of the given sugar.

    Prints the sugar code, any name that maps to more than one form, and the number
    of forms found.

    Returns:
        Dict of (code, *form) key -> decoded ChEBI compound, for the forms whose
        candidate names matched a ChEBI entry name.
    """
    code = sugar['code']
    print(code)
    names_by_key = {(code,) + form: form_names
                    for form, form_names in build_all_names(sugar).items()}

    # Reverse the mapping to identify the form denoted by a given name.
    key_by_name = {}
    for key, form_names in names_by_key.items():
        for raw_name in form_names:
            lowered = raw_name.lower()
            # No ambiguity, please: report it, and let the later form win.
            if lowered in key_by_name:
                print(f'"{lowered}" remapped from {key_by_name[lowered]} to {key}')
            key_by_name[lowered] = key
    unmatched = set(key_by_name.keys())

    # All found compounds are found by name. No additional hits by AKA.
    query = {'name': {'$in': list(unmatched)}}
    matches = CHEBI.find(query).collation({'locale': 'en', 'strength': 1})
    found = {}
    for doc in matches:
        compound = KB.CHEBI.codec.decode(doc)
        lowered = compound.name.lower()
        found[key_by_name[lowered]] = compound
        unmatched.remove(lowered)
    print(f'  {len(found)} forms found by name')
    return found

## Do the actual mapping to ChEBI

In [5]:
%%time
# Map every sugar in the table against ChEBI, merging the per-sugar results into a single
# dict keyed by (code, *form). map_compounds() prints its own per-sugar summary.
chebi_compounds = {}
for sugar in sugars.values():
    chebi_compounds.update(map_compounds(sugar))
    print()
    
print(f'Grand total: {len(chebi_compounds)} sugars systematically mapped.')

All
  19 forms found by name

Alt
  17 forms found by name

Glc
  40 forms found by name

Man
  29 forms found by name

Gul
  17 forms found by name

Ido
  17 forms found by name

Gal
  34 forms found by name

Tal
  17 forms found by name

Psi
  5 forms found by name

Fru
  44 forms found by name

Sor
  21 forms found by name

Tag
  32 forms found by name

Ara
  27 forms found by name

Lyx
  17 forms found by name

Rib
  32 forms found by name

Xyl
  20 forms found by name

Rul
  12 forms found by name

Xul
  13 forms found by name

Ery
  5 forms found by name

Tho
  4 forms found by name

Eul
  11 forms found by name

Sed
"sedoheptulose" remapped from ('Sed',) to ('Sed', 'D')
  9 forms found by name

Grand total: 442 sugars systematically mapped.
CPU times: user 257 ms, sys: 4.75 ms, total: 261 ms
Wall time: 3.23 s


## Translate to KB compounds
- Anchor on deprotonated forms of sugar-phosphates
- Systematic ID based on sugar + form
- Xref to ChEBI ID

In [6]:
def kbid(key):
    """Returns the systematic KB ID for a compound key: its parts joined by '.'."""
    separator = '.'
    return separator.join(key)

def build_kb_compound(key, chebi_compound):
    """Translates a mapped ChEBI compound into a KB compound.

    Args:
        key: (code, *form) tuple identifying the sugar form.
        chebi_compound: the reference ChEBI compound found for that form.

    Returns:
        (key, compound). The returned key has a trailing '2-' or '4-' removed. The
        compound is the existing KB entry if it looks manually curated, otherwise a
        new Molecule built from the ChEBI data.
    """
    # For sugar-phosphates, set the key and name with the charge unspecified.
    # TODO: make this explicit with a canonical form with unspecified charge, whose default form is charged.
    if key[-1] in ('2-', '4-'):
        key = key[:-1]
    compound_id = kbid(key)

    # Assignment of a shorthand name indicates manual curation, so keep the existing KB form
    existing = KB.get(KB.compounds, compound_id)
    if existing and existing.label != existing.id:
        return key, existing

    code, form = key[0], key[1:]
    name = build_form_names(sugars[code], form)[0]
    if key[-1] == 'full':
        name = name + ' (fully protonated)'

    # The ChEBI name becomes an alias when it differs from ours; our own name never is one.
    aka = set(chebi_compound.aka or [])
    if name != chebi_compound.name:
        aka.add(chebi_compound.name)
    aka.discard(name)

    # Cross-reference the ChEBI entry itself plus everything it already points to.
    xrefs = {chebi_compound.ref()}
    if chebi_compound.xrefs:
        xrefs.update(chebi_compound.xrefs)

    molecule = Molecule(
        id=compound_id,
        name=name,
        shorthand=None,
        description=chebi_compound.description,
        aka=list(aka) or None,
        xrefs=xrefs,
        formula=chebi_compound.formula,
        mass=chebi_compound.mass,
        charge=chebi_compound.charge,
        inchi=chebi_compound.inchi,
    )
    return key, molecule

# Translate every mapped ChEBI compound into a KB compound. build_kb_compound() may return a
# shortened key (trailing '2-'/'4-' removed), so index by the key it returns rather than by
# the original one.
kb_compounds = {}
for key, chebi_compound in chebi_compounds.items():
    key, kb_compound = build_kb_compound(key, chebi_compound)
    kb_compounds[key] = kb_compound

print(f"{len(kb_compounds)} KB compounds found or constructed")

442 KB compounds found or constructed


## Build Specialization relationships. Fill in any missing canonical forms.
- D and L forms are canonical
- Phosphosugars with no further specified form are canonical
- Remaining variation (esp ring-chain tautomerism) is with respect to canonical forms

In [7]:
# Definition of the canonical forms. For each entry, every combination of form names over
# 'groups' is a candidate canonical form, and combinations over 'variations' are the
# subforms that get linked to it as specializations.
canon = [
    {'groups': ['DL'], 'variations': ['ring-chain', 'ring-linkage']},
    {'groups': ['DL', 'phospho'], 'variations': ['ring-chain', 'ring-linkage', 'protons']},
]

def build_canonical(sugar, form):
    """Builds a new Molecule for the specified canonical-form sugar.

    The formula, mass and charge are generic values for the sugar's carbon count and
    number of phosphate groups. Phosphorylated forms are described in the deprotonated
    state (two protons lost per phosphate); unphosphorylated forms are neutral.

    Args:
        sugar: entry of the sugar table; 'code' and 'carbons' are used here, and the
            entry is passed on to build_form_names().
        form: tuple of form tags. Only the last tag decides the phosphate count.

    Returns:
        A Molecule with id, name, formula, mass and charge set.

    Raises:
        KeyError: if no formula is known for the (carbons, phosphates) combination.
    """
    # Both tables are keyed by (carbon count, phosphate count).
    # The zero-phosphate rows are the neutral C_nH_2nO_n monosaccharides. They are needed
    # when a canonical form such as (code, 'D') has to be created.
    formula = {
        (4, 0): 'C4H8O4',
        (4, 1): 'C4H7O7P',
        (5, 0): 'C5H10O5',
        (5, 1): 'C5H9O8P',
        (5, 2): 'C5H8O11P2',
        (6, 0): 'C6H12O6',
        (6, 1): 'C6H11O9P',
        (6, 2): 'C6H10O12P2',
        (7, 0): 'C7H14O7',
        (7, 1): 'C7H13O10P',
        (7, 2): 'C7H12O13P2',
    }
    mass = {
        (4, 0): 120.1039,
        (4, 1): 198.0679,
        (5, 0): 150.1299,
        (5, 1): 228.0939,
        (5, 2): 306.0579,
        (6, 0): 180.1559,
        (6, 1): 258.1199,
        (6, 2): 336.0839,
        (7, 0): 210.1819,
        (7, 1): 288.1459,
        (7, 2): 366.1099,
    }
    carbons = sugar['carbons']

    # An empty form is the root (fully unspecified) sugar: no phosphate.
    last_tag = form[-1] if form else None
    if last_tag in ['1P', '2P', '3P', '4P', '5P', '6P', '7P']:
        phosphates = 1
    elif last_tag in ['bis', 'bis15', 'bis16', 'bis17']:
        # The implied 'bis' tag is still a bisphosphate.
        phosphates = 2
    else:
        phosphates = 0

    key = (sugar['code'],) + tuple(form)
    if (carbons, phosphates) not in formula:
        raise KeyError(
            f'No canonical formula known for {key}: '
            f'{carbons} carbons with {phosphates} phosphate group(s)')

    return Molecule(
        id=kbid(key),
        name=build_form_names(sugar, form)[0],
        formula=formula[carbons, phosphates],
        mass=mass[carbons, phosphates],
        charge=-2 * phosphates,
    )

# Reverse lookup from a form name to the Variation it belongs to.
# Assumption: form names are unique _among all variations referred to in this group_.
form_variant = {
    form_name: variation
    for variation in VARIATIONS.values()
    for form_name in variation.form_names
}

# Link every mapped compound to its canonical form, creating the canonical compound when the
# reference did not supply one. This mutates the compounds in kb_compounds in place
# (canonical_form, variations) and may add new entries to kb_compounds.
for group_def in canon:
    # Possible canonical forms from all combinations in the group
    for canonical_form in itertools.product(*(VARIATIONS[v].form_names for v in group_def['groups'])):
        for code in sugars:
            canonical_key = (code,) + canonical_form
            canonical_id = kbid(canonical_key)
            # variation name -> set of form names actually seen among this canonical form's subforms
            observed = collections.defaultdict(set)

            # Find all subforms from combinations of the remaining variations
            # (None stands for "this variation is left unspecified").
            for subform in itertools.product(*([None] + VARIATIONS[v].form_names 
                                               for v in group_def['variations'])):
                subform = tuple(f for f in subform if f is not None)
                if not subform:
                    # None for all remaining variations is just the canonical form itself; skip.
                    continue

                key = canonical_key + subform
                if key in kb_compounds:
                    compound = kb_compounds[key]
                    compound.canonical_form = Specialization(kbid(canonical_key), subform, compound.id)
                    for f in subform:
                        observed[form_variant[f].name].add(f)

            # A possible becomes an actual canonical form if any of its subforms was observed
            if observed:
                canonical = kb_compounds.get(canonical_key)
                # An actual canonical form must exist, even if overlooked in the reference
                if canonical is None:
                    # Same curation test as in build_kb_compound: a label that differs from
                    # the ID marks an entry worth keeping.
                    existing = KB.get(KB.compounds, canonical_id)
                    if existing and existing.label != canonical_id:
                        print(f'Reusing existing {canonical_id}')
                        canonical = existing
                    else:
                        print(f'Creating canonical form {canonical_key}.')
                        canonical = build_canonical(sugars[code], canonical_form)
                    kb_compounds[canonical_key] = canonical
                # NOTE(review): form names come out of a set, so their order within each
                # Variation is not guaranteed to follow VARIATIONS - confirm nothing relies on it.
                canonical.variations = [Variation(k, list(v)) for k, v in observed.items()]

# Finally, set the default form for each otherwise unspecified sugar
# Only sugars whose root form (code,) was mapped get a default.
# NOTE(review): the lookup of the default compound is unguarded and raises KeyError if the
# sugar's 'default' form was not mapped or created above - confirm that is intended.
for code in sugars:
    if (code,) in kb_compounds:
        default_form = sugars[code]['default']
        default = kb_compounds[(code,) + default_form]
        kb_compounds[code,].default_form = Specialization(code, default_form, default.id)
            

Reusing existing Glc.D.1P
Creating canonical form ('Man', 'D', '1P').
Creating canonical form ('Gal', 'D', '1P').
Creating canonical form ('Fru', 'D', '1P').
Creating canonical form ('Sor', 'D', '1P').
Creating canonical form ('Tag', 'D', '1P').
Creating canonical form ('Rib', 'D', '1P').
Creating canonical form ('Xyl', 'D', '1P').
Creating canonical form ('Xul', 'D', '1P').
Creating canonical form ('Sed', 'D', '1P').
Creating canonical form ('Fru', 'D', '2P').
Creating canonical form ('Glc', 'D', '3P').
Creating canonical form ('Tho', 'D', '4P').
Creating canonical form ('Ara', 'D', '5P').
Reusing existing Rib.D.5P
Creating canonical form ('Xyl', 'D', '5P').
Reusing existing Glc.D.6P
Creating canonical form ('Man', 'D', '6P').
Creating canonical form ('Gal', 'D', '6P').
Creating canonical form ('Tag', 'D', '6P').
Creating canonical form ('Man', 'D', 'bis16').
Reusing existing Fru.D.bis16
Creating canonical form ('Sor', 'D', 'bis16').
Creating canonical form ('Tag', 'D', 'bis16').
Crea

In [8]:
# Spot-check one well-known compound (beta-D-glucopyranose) to eyeball the generated record.
kb_compounds['Glc', 'D', 'r6', 'β'].data()

Glc.D.r6.β
name: beta-D-glucopyranose
shorthand: None
aka: 
    BETA-D-GLUCOSE
    D-gluco-Hexose
    WURCS=2.0/1,1,0/[a2122h-1b_1-5]/1/
    beta-D-glucose
    beta-D-Glucose
description: None
xrefs: 
    CAS:492-61-5
    CHEBI:15903
    KEGG:C00221
formula: C6H12O6
mass: 180.15588
charge: 0
inchi: 
    InChI=1S/C6H12O6/c7-1-2-3(8)4(9)5(10)6(11)12-2/h2-
    11H,1H2/t2-,3-,4+,5-,6-/m1/s1
canonical_form: Glc.D


## Put it officially in the KB

_uncomment to run_

In [9]:
# %%time
# with KB.unlock(KB.compounds):
#     for compound in kb_compounds.values():
#         KB.put(KB.compounds, compound, bypass_cache=True)

CPU times: user 202 ms, sys: 7.94 ms, total: 210 ms
Wall time: 329 ms
