# Init

In [1]:
import projectpath

import collections
import itertools
import os

import ipywidgets as widgets
import ipysheet
import pandas as pd

from kb import codecs, kb
from model.core import Molecule, Reaction, DbXref, Variation, Specialization

# Shared knowledge-base handle; later cells use it to query ChEBI reference data
# (KB.client, KB.get) and to store the generated compounds (KB.put).
KB = kb.configure_kb()

# Identify all forms of Glucose (and other sugars)

### NOTES
- The intent is not for this notebook or any script derived from it to be able to generate the KB unsupervised across all cases, but to assist a thinking scientist in covering a lot of ground quickly.
- Power tools, not robots.
- Scientific judgement is the final word.

## Generate possible names and map to reference ChEBI compounds
- Monosaccharide code reference: https://www.genome.jp/kegg/catalog/codes2.html
- Name could hit preferred name or AKA
    - Preferred name takes priority
    - More general form takes priority
- Some naming conventions are contextual, e.g. 'β-D-glucose' is assumed to mean 'β-D-glucopyranose', while 'β-D-fructose' means 'β-D-fructo_furan_ose'. We handle this by including implied form names that get substituted to actual specified forms.

### On protons
- ChEBI standardizes on the fully protonated (balanced) form across the board, even though for many of the species we are interested in, specifically sugar-phosphates, the dominant form at pH=7.3 is ionic.
    - e.g. 'glucose 6-phosphate' refers to the fully protonated form, while the dominant form is 'glucose 6-phosphate(2-)'
- Our goal is intuitive behavior for the biological modeler. So we will standardize on the expected form under biological conditions
    - 'glucose 6-phosphate' refers to the form that has lost two protons. The fully protonated form is understood, but treated as a variation on this otherwise expected form, i.e. 'glucose 6-phosphate (fully protonated)'

In [2]:
# Arguably this is the 'Knowledge' in Knowledge Base
# Each Variation names one axis along which a sugar can vary, with the form names allowed on it.
# 'RING' and 'BIS' are placeholders for the implied ring / bisphosphate form of a given sugar.
VARIATIONS = {
    'DL': Variation('DL', ['D', 'L']),

    # 'deoxy': Variation('deoxy', ['deoxy']),
    'phospho': Variation(
        'phospho',
        ['1P', '2P', '3P', '4P', '5P', '6P', '7P', 'bis15', 'bis16', 'bis17', 'BIS'],
    ),

    'ring-chain': Variation('ring-chain', ['open', 'r6', 'r5', 'RING']),
    'ring-linkage': Variation('ring-linkage', ['α', 'β']),

    'protons': Variation('protons', ['full', '2-', '4-']),
}

# Reverse index: form name -> the Variation it belongs to.
# Assumption: form names are unique _among all variations referred to in this group_.
form_variant = {
    form_name: variation
    for variation in VARIATIONS.values()
    for form_name in variation.form_names
}

# One row per sugar:
# (code, name stem, carbon count, implied ring form, implied bisphosphate form, default stereoisomer)
# The implied forms are what the 'RING' / 'BIS' placeholders resolve to for that sugar;
# None means the sugar defines no substitution for that placeholder.
_SUGAR_ROWS = [
    # Hexoses
    ('All', 'all', 6, 'r6', 'bis16', 'D'),
    ('Alt', 'altr', 6, 'r6', 'bis16', 'L'),
    ('Glc', 'gluc', 6, 'r6', 'bis16', 'D'),
    ('Man', 'mann', 6, 'r6', 'bis16', 'D'),
    ('Gul', 'gul', 6, 'r6', 'bis16', 'D'),
    ('Ido', 'id', 6, 'r6', 'bis16', 'L'),
    ('Gal', 'galact', 6, 'r6', 'bis16', 'D'),
    ('Tal', 'tal', 6, 'r6', 'bis16', 'D'),

    ('Psi', 'psic', 6, 'r5', 'bis16', 'D'),
    ('Fru', 'fruct', 6, 'r5', 'bis16', 'D'),
    ('Sor', 'sorb', 6, 'r5', 'bis16', 'L'),
    ('Tag', 'tagat', 6, 'r5', 'bis16', 'D'),

    # Pentoses
    ('Ara', 'arabin', 5, 'r6', 'bis15', 'L'),  # r6???
    ('Lyx', 'lyx', 5, 'r5', 'bis15', 'D'),
    ('Rib', 'rib', 5, 'r5', 'bis15', 'D'),
    ('Xyl', 'xyl', 5, 'r6', 'bis15', 'D'),  # r6???

    ('Rul', 'ribul', 5, 'r5', 'bis15', 'D'),
    ('Xul', 'xylul', 5, 'r5', 'bis15', 'D'),

    # Tetroses
    ('Ery', 'erythr', 4, 'r5', None, 'D'),
    ('Tho', 'thre', 4, 'r5', None, 'D'),
    ('Eul', 'erythrul', 4, 'r5', None, 'D'),

    # Heptose
    ('Sed', 'sedoheptul', 7, 'r6', 'bis17', 'D'),
]

# Expand the rows into the per-sugar dicts, keyed by sugar code.
sugars = {}
for _code, _stem, _carbons, _ring, _bis, _default in _SUGAR_ROWS:
    _sub = {'RING': _ring}
    if _bis is not None:
        _sub['BIS'] = _bis
    sugars[_code] = {
        'code': _code,
        'stem': _stem,
        'carbons': _carbons,
        'sub': _sub,
        'default': (_default,),
    }

# Form-name generation patterns, in the order they should be applied.
# Each tag maps to one or more format patterns; '{}' receives the name built so far.
name_gen = {
    'D': ['D-{}'],
    'L': ['L-{}'],

    # 'deoxy': ['deoxy-{}', 'deoxy{}'],

    'open': ['aldehydo-{}', 'keto-{}'],
    'r6': ['{}opyran'],
    'r5': ['{}ofuran'],
    'RING': ['{}RING'],

    'α': ['alpha-{}'],
    'β': ['beta-{}'],

    'ose': ['{}ose'],

    # '1P' .. '7P': a single phosphate at the numbered position.
    **{f'{n}P': [f'{{}} {n}-phosphate'] for n in range(1, 8)},
    # 'bis15' .. 'bis17': bisphosphates, spelled with either a space or a hyphen.
    **{
        f'bis1{n}': [f'{{}} 1,{n}-bisphosphate', f'{{}}-1,{n}-bisphosphate']
        for n in (5, 6, 7)
    },
    'BIS': ['{} BISbisphosphate', '{}-BISbisphosphate'],

    'full': ['{}'],  # This is the ChEBI name, unspecified = fully protonated
    '2-': ['{}(2-)', '{}[2-]'],
    '4-': ['{}(4-)', '{}[4-]'],
}


def generate_ringchain_tautomers():
    """Yield (ring-chain form, ring-linkage form or None) pairs.

    Every ring-chain form except the 'RING' placeholder is yielded once
    without a linkage; every form except 'open' is then yielded once per
    ring-linkage form.
    """
    linkages = VARIATIONS['ring-linkage'].form_names
    for ring in VARIATIONS['ring-chain'].form_names:
        if ring != 'RING':
            yield (ring, None)
        if ring == 'open':
            # The open-chain form is never paired with a ring linkage.
            continue
        for linkage in linkages:
            yield (ring, linkage)
            
def generate_phospho_forms():
    """Yield every (phospho form, protonation form) pair.

    The protonation state is always specified, so the result is the full
    cross product, phospho form varying slowest.
    """
    yield from itertools.product(
        VARIATIONS['phospho'].form_names,
        VARIATIONS['protons'].form_names,
    )

def generate_all_forms():
    """Yield a tuple of form names for each potentially valid form of a typical sugar.

    The first tuple is empty (the root form). All other tuples start with a
    D/L stereoisomer and list the remaining form names in a fixed order:
    phospho, ring-chain, ring-linkage, protons.
    """
    def compact(*parts):
        # Drop the slots that were left unspecified.
        return tuple(part for part in parts if part is not None)

    # Root form, unspecified over all types of variation.
    yield ()

    # The DL stereoisomer is considered specified for every other form.
    for stereo in VARIATIONS['DL'].form_names:
        yield (stereo,)

        # Unphosphorylated form exhibits ring-chain tautomerism.
        yield from (
            compact(stereo, ring, link)
            for ring, link in generate_ringchain_tautomers()
        )

        # Phosphorylated forms always carry a protonation state, and may or may not
        # specify tautomerization. The order of precedence is maintained.
        for phos, prot in generate_phospho_forms():
            yield compact(stereo, phos, prot)
            for ring, link in generate_ringchain_tautomers():
                yield compact(stereo, phos, ring, link, prot)


def build_form_names(sugar, form):
    """Return the candidate names for one form of a sugar.

    Starts from the sugar's stem and applies the patterns of every tag in
    `form` (plus the always-present 'ose' suffix). A tag with several
    patterns multiplies the number of candidates.
    """
    active = {'ose', *form}
    candidates = [sugar['stem']]

    # Patterns must be applied in the order the tags appear in name_gen.
    for tag in name_gen:
        if tag not in active:
            continue
        expanded = []
        for pattern in name_gen[tag]:
            expanded.extend(pattern.format(candidate) for candidate in candidates)
        candidates = expanded
    return candidates


def build_all_names(sugar):
    """Map each actual form of the sugar to the list of names that may denote it.

    Placeholder form names listed in sugar['sub'] are resolved to the form
    they imply for this sugar, so e.g. 'β-D-glucose' is filed under
    (D, r6, β). Names generated for a placeholder form and for the explicit
    form therefore end up in the same list.
    """
    implied = sugar['sub']
    names_by_form = collections.defaultdict(list)
    for form in generate_all_forms():
        candidates = build_form_names(sugar, form)

        # Erase the placeholder text from the generated names...
        for placeholder in (tag for tag in form if tag in implied):
            candidates = [candidate.replace(placeholder, '') for candidate in candidates]
        # ...and file them under the form the placeholder stands for.
        resolved = tuple(implied.get(tag, tag) for tag in form)

        names_by_form[resolved].extend(candidates)

    return names_by_form


def map_compounds(sugar):
    """Map the forms of one sugar to reference ChEBI compounds.

    Generates every candidate name for every form, looks the names up in
    ref.CHEBI (preferred name first, then AKA), and keeps for each compound
    only the most general form that hit it. Compounds reachable from more
    than one form are reported on stdout.

    Returns a dict of key -> decoded Molecule, where key is the sugar code
    followed by the form names.

    Raises ValueError if the same generated name denotes two different forms.
    """
    code = sugar['code']

    # name -> key, so a hit can be traced back to the form it denotes. No ambiguity, please.
    name_key = {}
    for form, candidates in build_all_names(sugar).items():
        key = (code,) + form
        for name in candidates:
            if name in name_key:
                raise ValueError(f'Name collision: {name} = {name_key[name]} or {key}')
            name_key[name] = key

    all_names = list(name_key)

    def lookup(field):
        # NOTE(review): the query uses a strength-1 collation, while the lookups into
        # name_key below are exact string matches -- confirm the reference names always
        # match the generated spelling exactly.
        return KB.client.ref.CHEBI.find({field: {'$in': all_names}}).collation(
            {'locale': 'en', 'strength': 1})

    # Hits by preferred name come first. They are never ambiguous here.
    found = collections.defaultdict(list)
    for doc in lookup('name'):
        found[name_key[doc['name']]].append(doc)
    # Fall back to AKA. There are cases where the same name is an AKA for multiple forms.
    for doc in lookup('aka'):
        for alias in doc['aka']:
            if alias in name_key:
                found[name_key[alias]].append(doc)

    # Walk the keys most general first; the first key to reach a compound claims it,
    # and every later key reaching the same compound is only recorded as an ambiguity.
    chebi_compounds = {}
    compound_key = collections.defaultdict(list)
    for key in sorted(found):
        # Only the first hit for this key is considered.
        compound = codecs.CODECS[Molecule].decode(found[key][0])
        claimed_by = compound_key[compound]
        if not claimed_by:
            chebi_compounds[key] = compound
        claimed_by.append(key)

    for compound, keys in compound_key.items():
        if len(keys) <= 1:
            continue
        print(f'{compound.name} [{compound._id}]'
              f' may be {" or ".join(".".join(key) for key in keys)}')

    return chebi_compounds


def form_grid(compounds):
    """Tabulate compounds as a DataFrame of forms (rows) by sugar codes (columns).

    `compounds` maps (code, *form) keys to objects with `_id` and `name`.
    Each cell reads '[<id>] <name>'; combinations with no compound show '-'.
    Rows are labelled with the dot-joined form and sorted by that label.
    """
    by_form = collections.defaultdict(dict)
    for key, compound in sorted(compounds.items(), key=lambda item: item[0]):
        code, *form = key
        by_form['.'.join(form)][code] = f'[{compound._id}] {compound.name}'

    labels = sorted(by_form)
    rows = [by_form[label] for label in labels]
    return pd.DataFrame(rows, index=labels).fillna('-')

## Do the actual mapping to ChEBI

In [3]:
%%time
# Map every sugar in turn and pool the results into a single key -> compound lookup.
chebi_compounds = {}
for code in sugars:
    chebi_compounds.update(map_compounds(sugars[code]))

# Manual overrides for cases where the naming in ChEBI does not quite adhere to the strict rules.
# A ChEBI id assigns that compound to the key; None removes the key from the mapping.
overrides = {
    ('Tag', 'D', '1P', 'full'): None,  # pyranose
    ('Tag', 'D', '1P', 'r6', 'full'): 138801,
    ('Tag', 'D', '1P', '2-'): None,  # pyranose
    ('Tag', 'D', '1P', 'r6', '2-'): 138150,
    ('Tag', 'D', '6P', '2-'): None,  # keto-
    ('Tag', 'D', '6P', 'open', '2-'): 134283,
    ('Tag', 'L', '6P', '2-'): None,  # keto-
    ('Tag', 'L', '6P', 'open', '2-'): 134284,

    ('Sed',): None,
    ('Sed', 'D'): 16802,  # D assumed
    ('Sed', 'D', '1P', 'full'): 9082,  # D assumed
    ('Sed', 'D', '7P', 'full'): 15721,  # D assumed
    ('Sed', 'D', '7P', '2-'): 57483,  # D assumed
    ('Sed', 'D', 'bis17'): 17969,  # D assumed
    ('Sed', 'D', 'bis17', '4-'): 58335,  # D assumed
}
for key, _id in overrides.items():
    if _id is None:
        chebi_compounds.pop(key)
        continue
    chebi_compounds[key] = KB.get(KB.CHEBI, _id)

print()
print(f'Grand total: {len(chebi_compounds)} sugars systematically mapped.')

D-tagatopyranose 1-phosphate(2-) [138150] may be Tag.D.1P.2- or Tag.D.1P.r6.2-
D-tagatopyranose 1-phosphate [138801] may be Tag.D.1P.full or Tag.D.1P.r6.full
keto-D-tagatose 6-phosphate(2-) [134283] may be Tag.D.6P.2- or Tag.D.6P.open.2-
keto-L-tagatose 6-phosphate(2-) [134284] may be Tag.L.6P.2- or Tag.L.6P.open.2-
sedoheptulose [16802] may be Sed or Sed.D

Grand total: 430 sugars systematically mapped.
CPU times: user 386 ms, sys: 2.14 ms, total: 388 ms
Wall time: 625 ms


## Translate to KB compounds
- Anchor on deprotonated forms of sugar-phosphates
- Systematic ID based on sugar + form
- Xref to ChEBI ID

In [4]:
def build_kb_compound(key, chebi_compound):
    """Translate a mapped ChEBI compound into a KB Molecule.

    Args:
        key: tuple of the sugar code followed by the form names; the last
            element may be a protonation form.
        chebi_compound: the reference compound mapped to `key`.

    Returns:
        (key, Molecule). The returned key differs from the input only for
        deprotonated forms, where the trailing protonation form is dropped so
        that the deprotonated compound becomes the anchor for that form.

    Naming follows KB conventions:
        - fully protonated: the ChEBI name plus ' (fully protonated)'
        - deprotonated: a generated name without the charge suffix
        - no protonation form in the key: the ChEBI name as is
    """
    sugar = sugars[key[0]]
    fprot = key[-1] if key[-1] in VARIATIONS['protons'].form_names else None

    # Update key and name to match KB conventions
    if fprot == 'full':
        name = chebi_compound.name + ' (fully protonated)'
    elif fprot is not None:
        key = key[:-1]
        candidates = build_form_names(sugar, key[1:])
        # Several candidates exist when a tag has more than one pattern, notably
        # 'open' (aldehydo- / keto-). Taking the first one blindly names open-chain
        # ketoses 'aldehydo-...', so prefer the candidate the reference itself uses
        # (its name, then its AKAs, minus the charge suffix) and only fall back to
        # the first candidate when none agrees.
        # assumes chebi_compound.aka is None or an iterable of strings -- TODO confirm
        references = [chebi_compound.name, *(chebi_compound.aka or [])]
        name = next(
            (
                candidate
                for reference in references
                if isinstance(reference, str)
                for candidate in candidates
                if reference.casefold().startswith(candidate.casefold())
            ),
            candidates[0],
        )
    else:
        name = chebi_compound.name

    shorthand = '.'.join(key)
    # Always cross-reference the ChEBI entry itself, plus whatever it already points to.
    xrefs = {DbXref('CHEBI', chebi_compound._id)}
    if chebi_compound.xrefs:
        xrefs.update(chebi_compound.xrefs)

    return key, Molecule(
        _id=shorthand,
        name=name,
        shorthand=shorthand,
        description=chebi_compound.description,
        aka=chebi_compound.aka,
        xrefs=xrefs,
        formula=chebi_compound.formula,
        mass=chebi_compound.mass,
        charge=chebi_compound.charge,
        inchi=chebi_compound.inchi,
    )
        
# KB key -> KB Molecule for every mapped ChEBI compound.
# When two reference compounds translate to the same KB key, the later one wins.
kb_compounds = dict(
    build_kb_compound(key, chebi_compound)
    for key, chebi_compound in chebi_compounds.items()
)
    

## Build Specialization relationships. Fill in any missing canonical forms.
- D and L forms are canonical
- Phosphosugars with no further specified form are canonical
- Remaining variation (esp ring-chain tautomerism) is with respect to canonical forms

In [5]:
# Each entry names the variations that together identify a canonical form ('groups')
# and the variations that are expressed relative to that canonical form ('variations').
canon = [
    {
        'groups': ['DL'],
        'variations': ['ring-chain', 'ring-linkage'],
    },
    {
        'groups': ['DL', 'phospho'],
        'variations': ['ring-chain', 'ring-linkage', 'protons'],
    },
]

def build_canonical(sugar, form):
    """Build a new Molecule for a canonical form that is missing from the reference.

    Args:
        sugar: one entry of `sugars`.
        form: tuple of form names; its last element decides how many
            phosphate groups the compound carries (none, one or two).

    Returns:
        A Molecule with a systematic id/shorthand ('<code>.<form>...'), a
        generated name, and formula, mass and charge looked up by
        (carbon count, phosphate count). The charge is -2 per phosphate.

    Raises:
        KeyError: if no formula is tabulated for the (carbons, phosphates)
            combination.
    """
    # Keyed by (carbons, phosphates). Phosphorylated entries describe the
    # deprotonated form, consistent with the -2 charge per phosphate below.
    # The zero-phosphate entries are the plain CnH2nOn sugars, so that an
    # unphosphorylated canonical form can be built as well.
    formula = {
        (4, 0): 'C4H8O4',
        (4, 1): 'C4H7O7P',
        (5, 0): 'C5H10O5',
        (5, 1): 'C5H9O8P',
        (5, 2): 'C5H8O11P2',
        (6, 0): 'C6H12O6',
        (6, 1): 'C6H11O9P',
        (6, 2): 'C6H10O12P2',
        (7, 0): 'C7H14O7',
        (7, 1): 'C7H13O10P',
        (7, 2): 'C7H12O13P2',
    }
    mass = {
        (4, 0): 120.1039,
        (4, 1): 198.0679,
        (5, 0): 150.1299,
        (5, 1): 228.0939,
        (5, 2): 306.0579,
        (6, 0): 180.1559,
        (6, 1): 258.1199,
        (6, 2): 336.0839,
        (7, 0): 210.1819,
        (7, 1): 288.1459,
        (7, 2): 366.1099,
    }
    carbons = sugar['carbons']
    if form[-1] in ('1P', '2P', '3P', '4P', '5P', '6P', '7P'):
        phosphates = 1
    elif form[-1] in ('bis15', 'bis16', 'bis17'):
        phosphates = 2
    else:
        phosphates = 0

    name = build_form_names(sugar, form)[0]
    key = (sugar['code'],) + form
    shorthand = '.'.join(key)
    return Molecule(
        _id=shorthand,
        name=name,
        shorthand=shorthand,
        formula=formula[carbons, phosphates],
        mass=mass[carbons, phosphates],
        charge=-2 * phosphates,
    )


# For every canon group: link each observed subform to its canonical form, and make sure
# the canonical form itself exists and lists the variations actually observed for it.
for group_def in canon:
    # Form-name choices that identify a canonical form, and the optional (None = unspecified)
    # choices for each remaining variation. Both depend only on the group definition.
    group_forms = [VARIATIONS[v].form_names for v in group_def['groups']]
    subform_choices = [[None] + VARIATIONS[v].form_names for v in group_def['variations']]

    # Possible canonical forms from all combinations in the group
    for code in sugars:
        for canonical_form in itertools.product(*group_forms):
            canonical_key = (code,) + canonical_form
            observed = collections.defaultdict(set)

            # Find all subforms from combinations of the remaining variations
            for subform in itertools.product(*subform_choices):
                subform = tuple(f for f in subform if f is not None)
                if not subform:
                    # None for all remaining variations is just the canonical form itself; skip.
                    continue

                compound = kb_compounds.get(canonical_key + subform)
                if compound is None:
                    continue
                compound.canonical_form = Specialization('.'.join(canonical_key), subform, compound._id)
                for f in subform:
                    observed[form_variant[f].name].add(f)

            # A possible becomes an actual canonical form if any of its subforms was observed
            if observed:
                canonical = kb_compounds.get(canonical_key)
                # An actual canonical form must exist, even if overlooked in the reference
                if canonical is None:
                    print(f'Canonical form {canonical_key} is not defined in the reference. Creating...')
                    canonical = build_canonical(sugars[code], canonical_form)
                    kb_compounds[canonical_key] = canonical
                # Observed forms are collected in sets; list them in the order their variation
                # declares them, so the stored result does not depend on set iteration order.
                canonical.variations = [
                    Variation(name, sorted(forms, key=lambda f: form_variant[f].form_names.index(f)))
                    for name, forms in observed.items()
                ]

# Finally, point each unspecified (root) sugar at the form it stands for by default.
for code, sugar in sugars.items():
    root = kb_compounds.get((code,))
    if root is None:
        # No root compound for this sugar, so there is nothing to attach a default to.
        continue
    default_form = sugar['default']
    target = kb_compounds[(code,) + default_form]
    root.default_form = Specialization(code, default_form, target._id)
            

Canonical form ('Glc', 'D', '1P') is not defined in the reference. Creating...
Canonical form ('Glc', 'D', '3P') is not defined in the reference. Creating...
Canonical form ('Glc', 'D', '6P') is not defined in the reference. Creating...
Canonical form ('Man', 'D', '1P') is not defined in the reference. Creating...
Canonical form ('Man', 'D', '6P') is not defined in the reference. Creating...
Canonical form ('Gal', 'D', '1P') is not defined in the reference. Creating...
Canonical form ('Gal', 'D', '6P') is not defined in the reference. Creating...
Canonical form ('Fru', 'D', '1P') is not defined in the reference. Creating...
Canonical form ('Fru', 'D', '2P') is not defined in the reference. Creating...
Canonical form ('Fru', 'D', 'bis16') is not defined in the reference. Creating...
Canonical form ('Sor', 'D', '1P') is not defined in the reference. Creating...
Canonical form ('Sor', 'D', 'bis16') is not defined in the reference. Creating...
Canonical form ('Tag', 'D', '1P') is not defin

In [6]:
# Spot check: show the attributes of one of the canonical forms.
vars(kb_compounds['Glc', 'D', '3P'])

{'_id': 'Glc.D.3P',
 'name': 'D-glucose 3-phosphate',
 'shorthand': 'Glc.D.3P',
 'description': None,
 'aka': None,
 'xrefs': None,
 'formula': 'C6H11O9P',
 'mass': 258.1199,
 'charge': -2,
 'inchi': None,
 'variations': [Variation(name='protons', form_names=['full']),
  Variation(name='ring-chain', form_names=['r6', 'open']),
  Variation(name='ring-linkage', form_names=['α'])],
 'canonical_form': None,
 'default_form': None}

## Put it officially in the KB

In [7]:
%%time
# Store every generated compound in the KB, bypassing the cache.
for key in kb_compounds:
    KB.put(KB.compounds, kb_compounds[key], bypass_cache=True)


CPU times: user 232 ms, sys: 10.8 ms, total: 243 ms
Wall time: 360 ms
