# Init

In [1]:
import projectpath

import collections
import itertools
import os

import ipywidgets as widgets
import ipysheet
import pandas as pd

from kb import kb
from scheme import Molecule, Reaction, DbXref, Variation, Specialization

REFDB = kb.REFDB
KB = kb.KB

# Identify all forms of Glucose (and other sugars)

### NOTES
- The intent is not for this notebook or any script derived from it to be able to generate the KB unsupervised across all cases, but to assist a thinking scientist in covering a lot of ground quickly.
- Power tools, not robots.
- Scientific judgement is the final word.

## Variations and Specialization relationships
- D and L forms are canonical
- For most sugars, D is the default (call out the exceptions?)
    - For phospho forms, neutral is canonical but 2- is default for each phosphate @pH=7.3
- Remaining variation (esp ring-chain tautomerism) is with respect to canonical forms


In [2]:
VARIATIONS = {
    'DL': Variation('DL', ['D', 'L']),
    'ring-chain': Variation('ring-chain', ['open', 'r6a', 'r6b', 'r5a', 'r5b']),
    
    # Modification, not isomerization
    'phospho': Variation('phospho', ['1P', '2P', '3P', '4P', '5P', '6P', 'bis']),
    'deoxy': Variation('deoxy', ['deoxy']),
    
    # For sugars, these are only relevant for phospho- forms
    'charge': Variation('charge', ['4-', '2-', '0']),
}

variations = ['DL', 'phospho', 'ring-chain', 'charge']

## Generate possible names and map to reference ChEBI compounds
- Monosaccharide code reference: https://www.genome.jp/kegg/catalog/codes2.html
- Name could hit preferred name or AKA
    - Preferred name takes priority
    - More general form takes priority
- For hexoses, pyranose is semi-canonical, e.g. 'β-D-glucose' is 'β-D-glucopyranose'. But for pentoses we assume the furanose form, e.g 'β-D-ribose' is 'β-D-ribofuranose'. Ribopyranose is still possible. This could be trickyish.

In [82]:
code = 'Hep'
stem = 'hept'
carbons = 7

In [83]:
# Form-name generation patterns, in the order they should be applied.
name_gen = {
    'D': ['D-{}'],
    'L': ['L-{}'],

    'deoxy': ['deoxy-{}', 'deoxy{}'],

    'open': ['aldehydo-{}', 'keto-{}'],
    # 'r6a': ['alpha-{}opyran'],  # too ambiguous: 'alpha-{}'],
    # 'r6b': ['beta-{}opyran'],  # too ambiguous: 'beta-{}'],
    'r6a': ['alpha-{}opyran', 'alpha-{}'],  # accept the ambiguity and resolve it later
    'r6b': ['beta-{}opyran', 'beta-{}'],
    'r5a': ['alpha-{}ofuran'],
    'r5b': ['beta-{}ofuran'],

    'sugar': ['{}ose'],

    '1P': ['{} 1-phosphate', '{} 1 phosphate'],
    '2P': ['{} 2-phosphate', '{} 2 phosphate'],
    '3P': ['{} 3-phosphate', '{} 3 phosphate'],
    '4P': ['{} 4-phosphate', '{} 4 phosphate'],
    '5P': ['{} 5-phosphate', '{} 5 phosphate'],
    '6P': ['{} 6-phosphate', '{} 6 phosphate'],
    'bis': ['{} bisphosphate',
            '{}-bisphosphate',
            '{} 1,6-bisphosphate',
            '{}-1,6-bisphosphate',
            '{} 1,5-bisphosphate',
            '{}-1,5-bisphosphate'],
    
    '0': ['{}(0)', '{}(neutral)'],  # No modifier is indistinguishable from the parent form.
    '2-': ['{}(2-)', '{}[2-]'],
    '4-': ['{}(4-)', '{}[4-]'],
}
if carbons <= 5:
    name_gen.update({
        'r6a': ['alpha-{}opyran'],
        'r6b': ['beta-{}opyran'],
        'r5a': ['alpha-{}ofuran', 'alpha-{}'],
        'r5b': ['beta-{}ofuran', 'beta-{}'],
    })

def generate_names(stem, form):
    names = [stem]
    tags = {'sugar'} | {tag for tag in form if tag is not None}

    # Generation steps need to be applied in the order they appear in the dict
    for tag, patterns in name_gen.items():
        if tag in tags:
            names = [pat.format(name) for pat in patterns for name in names]
    return names

def build_shorthand(code, form):
    tokens = [code]
    tokens.extend(token for token in form if token)
    return '.'.join(tokens)

# Generate all possible names for each form, and map name back to form.
possible_names = {}
for form in itertools.product(*[[None] + VARIATIONS[v].form_names for v in variations]):
    form = tuple(f for f in form if f is not None)
    possible_names[form] = generate_names(stem, form)
name_form = {}
for form, names in possible_names.items():
    for name in names:
        if name not in name_form:
            name_form[name] = form
        else:
            print(f'Name collision: {name} = {name_form[name]} or {form}')

# Find by name first
all_names = list(name_form.keys())
found = collections.defaultdict(list)
for doc in REFDB.CHEBI.find({'name': {'$in': all_names}}).collation({'locale': 'en', 'strength': 1}):
    found[name_form[doc['name']]].append(doc)
# Fall back to AKA. There are cases where the same name is an AKA for multiple forms.
for doc in REFDB.CHEBI.find({'aka': {'$in': all_names}}).collation({'locale': 'en', 'strength': 1}):
    for name in doc['aka']:
        if name in name_form:
            found[name_form[name]].append(doc)

# Map form -> compound, most general first. Also track compound -> form to detect ambiguities.
mapped_compound = {}
compound_form = collections.defaultdict(list)
for form, docs in sorted(found.items(), key=lambda kv: [x if x else '' for x in kv[0]]):
    # Only the first hit.
    compound = kb.CODECS[Molecule].decode(docs[0])
    # Only the most general form.
    if compound not in compound_form:
        mapped_compound[form] = compound
    # But keep track of form ambiguities
    compound_form[compound].append(form)

# Switch over to KB id scheme, but keep the xref to ChEBI.
for form, compound in mapped_compound.items():
    if compound.xrefs == None:
        compound.xrefs = set()
    compound.xrefs.add(DbXref('CHEBI', compound._id))
    compound.shorthand = build_shorthand(code, form)
    compound._id = compound.shorthand

df = pd.DataFrame(({
    'form': f'({", ".join(f for f in form if f)})',
    'name': compound.name,
    'formula': compound.formula if compound.formula else '',
    'charge': compound.charge if compound.charge is not None else '',
    'xrefs': ', '.join(str(xref) for xref in sorted(compound.xrefs)) if compound.xrefs else '',
} for form, compound in mapped_compound.items()))
widget = ipysheet.from_dataframe(df)
widget

Sheet(cells=(Cell(column_end=0, column_start=0, numeric_format=None, row_end=0, row_start=0, squeeze_row=False…

### Build Specialization relationships. Fill in any missing canonical forms.

In [84]:
canon = [
    {'groups': ['DL'], 'variations': ['ring-chain']},
    {'groups': ['DL', 'phospho'], 'variations': ['ring-chain', 'charge']},
]

for group_def in canon:
    variant_map = {}
    for v in group_def['groups']:
        for f in VARIATIONS[v].form_names:
            variant_map[f] = v
    for v in group_def['variations']:
        for f in VARIATIONS[v].form_names:
            variant_map[f] = v

    # Possible canonical forms from all combinations in the group
    for canonical_form in itertools.product(*(VARIATIONS[v].form_names
                                              for v in group_def['groups'])):
        canonical_id = build_shorthand(code, canonical_form)
        observed = collections.defaultdict(set)
        
        # Find all subforms from combinations of the remaining variations
        for subform in itertools.product(*([None] + VARIATIONS[v].form_names 
                                           for v in group_def['variations'])):
            subform = tuple(f for f in subform if f is not None)
            if not subform:
                # None for all remaining variations is just the canonical form itself; skip.
                continue
            
            specific_variant = mapped_compound.get(canonical_form + subform)
            if specific_variant:
                specific_variant.canonical_form = Specialization(canonical_id, subform, specific_variant._id)
                for f in subform:
                    observed[variant_map[f]].add(f)

        # A possible becomes an actual canonical form if any of its subforms was observed
        if observed:
            canonical = mapped_compound.get(canonical_form)
            # An actual canonical form must exist, even if overlooked in the reference
            if canonical is None:
                print(f'Canonical form {canonical_form} is not defined in the reference. Creating...')
                canonical = Molecule(_id=canonical_id, name=possible_names[canonical_form][0])
                mapped_compound[canonical_form] = canonical
            # Keep track of its observed variations
            # canonical.variations = [VARIATIONS[v]
            #                         for v in group_def['variations']
            #                         if v in observed]
            canonical.variations = [Variation(k, v) for k, v in observed.items()]

# Finally, set the defaults. Figure out a more general/scalable way to do this...
root = mapped_compound[()]
root.default_form = Specialization(root._id, ('D',), mapped_compound['D',]._id)

# root.variations ...? only isomers, or isomers + mods? is it then canonical for anything?

# Default for phospho- forms is 2- * num_phosphates. But may be a specific ring-chain tautomer.
# And that may vary by the sugar. Skip doing this automatically for now.


KeyError: ('D',)

## Put it officially in the KB

In [81]:
for compound in mapped_compound.values():
    KB.compounds.insert_one(kb.CODECS[Molecule].encode(compound))


In [127]:
changes = []
def log_it(change):
    changes.append(change)

class CompoundEditor:
    def __init__(self, compound=None):
        layout = widgets.Layout(width='auto')
        self._name = widgets.Text(placeholder='Compound name', layout=layout)
        self._shorthand = widgets.Text(placeholder='Shorthand', layout=layout)
        self._formula = widgets.Text(placeholder='Chemical formula', layout=widgets.Layout(width='50%'))
        self._mass = widgets.FloatText(placeholder='Mass', layout=widgets.Layout(width='50%'))
        self._aka = widgets.Textarea(placeholder='AKA', layout=layout)
        self._xrefs = widgets.Textarea(placeholder='Xrefs', layout=layout)
        self._description = widgets.Textarea(placeholder='Description', layout=layout)

        # self._name.observe(self._apply_name, 'value')
        # self._shorthand.observe(self._apply_shorthand, 'value')
        # self._formula.observe(self._apply_formula, 'value')
        # self._mass.observe(self._apply_mass, 'value')
        # self._aka.observe(self._apply_aka, 'value')
        # self._xrefs.observe(self._apply_xrefs, 'value')
        # self._description.observe(self._apply_description, 'value')
        self._description.observe(log_it, names='All', type='All')

        self._widget = widgets.VBox([
            self._name,
            self._shorthand,
            widgets.HBox([self._formula, self._mass]),
            self._aka,
            self._xrefs,
            self._description,
        ], layout=widgets.Layout(width='450px'))
        self.set_compound(compound)

    def set_compound(self, compound):
        self._compound = None  # No value change events will be applied
        if compound:
            self._name.value = compound.name
            self._shorthand.value = compound.shorthand or ''
            self._description.value = compound.description or ''
            self._formula.value = compound.formula or ''
            self._aka.value = '\n'.join(compound.aka) if compound.aka else ''
            self._xrefs = '\n'.join(str(xref) for xref in compound.xrefs) if compound.xrefs else ''
            self._compound = compound

    def _apply_name(self, change):
        if self._compound:
            self._compound.name = change['new']

    def _apply_shorthand(self, change):
        if self._compound:
            self._compound.shorthand = change['new']

    def _apply_description(self, change):
        if self._compound:
            self._compound.description = change['new']

    def _apply_formula(self, change):
        if self._compound:
            self._compound.formula = change['new']

    def _apply_mass(self, change):
        if self._compound:
            self._compound.mass = change['new']

    def _apply_aka(self, change):
        if self._compound:
            self._compound.aka = change['new'].split('\n')

    def _apply_xrefs(self, change):
        if self._compound:
            self._compound.xrefs = {DbXref.from_string(xref) for xref in change['new'].split('\n')}

    @property
    def widget(self):
        return self._widget


In [128]:
CompoundEditor(compound).widget

VBox(children=(Text(value='fructose 6-phosphate', layout=Layout(width='auto'), placeholder='Compound name'), T…

In [129]:
changes

[]

In [132]:
traitlets.All

NameError: name 'traitlets' is not defined

In [None]:
%%time
mapped_compounds = {}
for root, shorthand in [
    ('all', 'All'),
    ('altr', 'Alt'),
    ('gluc', 'Glc'),
    ('mann', 'Man'),
    ('gul', 'Gul'),
    ('id', 'Ido'),
    ('galact', 'Gal'),
    ('tal', 'Tal'),
    
    ('psic', 'Psi'),
    ('fruct', 'Fru'),
    ('sorb', 'Sor'),
    ('tagat', 'Tag'),
]:
    mapped_compounds.update(map_forms(root, shorthand, forms, name_gen))

for shorthand, compound in mapped_compounds.items():
    compound.shorthand = shorthand

print(f'{len(mapped_compounds)} total hexoses and hexose-phosphates organized')

- Overall this looks really good, at least modestly scalable
- Looming challenges:
    - Which form combos are present or absent. ChEBI is highly curated and _mostly_ systematic, but there are definite holes
    - Form combos should be unordered, but shorthand strings have a set order. Glc.D.r6β.6P.2- maybe makes more sense as Glc.D.6P.r6β.2- or Glc.D.6P.2-.r6β.
        - Less important for more canonical forms, e.g. Glc.D.6P
        
**Next:** read in some RHEA reactions and map to canonical forms

# Explore is_a relationships

In [None]:
parents = collections.defaultdict(list)
children = collections.defaultdict(list)
for row in pd.read_csv(os.path.join(chebi_dir, 'relation.tsv'), sep='\t').itertuples():
    if row.TYPE == 'is_a' and row.INIT_ID in compounds and row.FINAL_ID in compounds:
        parents[row.FINAL_ID].append(row.INIT_ID)
        children[row.INIT_ID].append(row.FINAL_ID)

print(f'{len(parents)} entries have parents; {len(children)} entries have children')

In [None]:
parent_dist = collections.Counter(len(v) for v in parents.values())
sorted(parent_dist.items())

In [None]:
children_dist = collections.Counter(len(v) for v in children.values())
sorted(children_dist.items())

## Start from D-glucose [17634], and collect all descendents

- We could start from (DL-) glucose [17234], but this parent itself is stereochemically undefined and therefore not biologically relevant. It also includes children such as (DL-) glucopyranose, which is of likewise dubious value for biological modeling.

In [None]:
glucose_ids = set()
pending = [17634]
while pending:
    id = pending.pop()
    if id not in glucose_ids:
        glucose_ids.add(id)
        pending.extend(children[id])

glucoses = [
    kb.CODECS[Molecule].decode(kb.KB.compounds.find_one(id)) for id in glucose_ids
]

glucoses_df = pd.DataFrame([
    {'id': c.id, 'name': c.name, 'formula': c.formula, 'inchi': c.inchi}
    for c in glucoses
]).set_index('id')
glucoses_df

In [None]:
glucoses_df.to_csv(os.path.join(chebi_dir, 'glucoses.tsv'), sep='\t')

### Quick conclusions

- This does seem to find more or less all the forms of glucose we want for modeling
- Their relationships are intuitively clear
- These relationships are also there in the inchi strings, but not in a way that is trivial to infer in code.
- So. Just dump them all (meaning all simple sugars, however we define that) to a spreadsheet, assuming the total is manageable. Then manually annotate them according to a rational isomer addressing scheme.

In [None]:
# Take all descendents of hexose [18133]

hexose_ids = set()
pending = [18133]
while pending:
    id = pending.pop()
    if id not in hexose_ids:
        hexose_ids.add(id)
        pending.extend(children[id])
print(f'{len(hexose_ids)} hexose descendents')

hexoses = [
    kb.CODECS[Molecule].decode(kb.KB.compounds.find_one(id)) for id in hexose_ids
]

hexoses_df = pd.DataFrame([
    {'id': c.id, 'name': c.name, 'formula': c.formula, 'inchi': c.inchi}
    for c in hexoses
]).set_index('id')
print(hexoses_df.shape)

# hexoses_df.to_csv(os.path.join(chebi_dir, 'hexoses.tsv'), sep='\t')