# Init

In [1]:
import collections
import itertools
import pandas as pd

from mosmo.knowledge import codecs, kb
from mosmo.model import Molecule, Reaction, DbXref, Variation, Specialization

KB = kb.configure_kb()
CHEBI = KB.client[KB.CHEBI.client_db][KB.CHEBI.collection]

# Identify all forms of all nucleotides

### NOTES
- The intent is not for this notebook or any script derived from it to be able to generate the KB unsupervised across all cases, but to assist a thinking scientist in covering a lot of ground quickly.
- Power tools, not robots.
- Scientific judgement is the final word.

### On protons
- ChEBI standardizes on the charge-balanced form across the board, even though for many of the species we are interested in, specifically phosphorylated forms, the dominant form at pH=7.3 is ionic.
    - e.g. 'ATP' refers to the fully protonated form, while the dominant form is 'ATP(4-)'
- Our goal is intuitive behavior for the biological modeler, so we will standardize on the expected form under biological conditions.
    - 'ATP' refers to the form that has lost four protons. The fully protonated form is understood, but treated as a variation on this otherwise expected form, i.e. 'ATP (fully protonated)'


## Generate possible names for each form of each base / nucleoside / nucleotide

### Define the space of possible forms for any nucleotide

In [2]:
# The axes along which a nucleotide can vary, keyed by variation name. Each
# Variation is constructed with that same name plus its list of form names.
VARIATIONS = {
    name: Variation(name, form_names)
    for name, form_names in [
        ('conjugation', ['base', 'ribo', 'deoxy']),
        ('phosphates', ['mono', 'di', 'tri', 'cyc35', 'cyc23']),  # + None
        ('chelation', ['mg']),  # + None
        ('protons', ['full', '1-', '2-', '3-', '4-']),
    ]
}

def generate_forms():
    """Yields tuples for all potentially valid forms of a nucleobase, nucleoside, or nucleotide.

    Each tuple lists only the variations that apply, in order of precedence:
    (conjugation,), (conjugation, phosphates, protons), or
    (conjugation, phosphates, chelation, protons).
    """
    phosphate_forms = VARIATIONS['phosphates'].form_names
    proton_forms = VARIATIONS['protons'].form_names
    chelation_forms = VARIATIONS['chelation'].form_names

    for conjugation in VARIATIONS['conjugation'].form_names:
        # The bare conjugation: a 1-tuple with no further variation.
        yield (conjugation,)

        # Only nucleoside forms may carry phosphates, a protonation state, and
        # sometimes chelation; the free base stops here.
        if conjugation == 'base':
            continue

        for phosphates, protons in itertools.product(phosphate_forms, proton_forms):
            yield (conjugation, phosphates, protons)
            # Chelation sits ahead of protons to maintain order of precedence.
            yield from ((conjugation, phosphates, chelation, protons)
                        for chelation in chelation_forms)


### Define forms of actual nucleotides, with range of possible 'standardized' names and codes

In [3]:
# Names are almost but not quite systematic. E.g. guanosine and adenosine are nucleosides but
# cytosine is the free base where cytidine is the nucleoside. Annoying, but it is what it is.
# One record per base, keyed by its one-letter code. 'stem' and 'scheme' select
# and fill the name templates; 'code3' is the three-letter code of the free base.
bases = {
    code: {'code': code, 'code3': code3, 'stem': stem, 'scheme': scheme}
    for code, code3, stem, scheme in [
        ('A', 'Ade', 'aden', 'osine'),
        ('C', 'Cyt', 'cyt', 'idine'),
        ('G', 'Gua', 'guan', 'osine'),
        ('I', 'Hyp', 'in', 'osine'),
        ('T', 'Thy', 'thym', 'thy'),
        ('U', 'Ura', 'ur', 'idine'),
    ]
}

# Shorthand-code templates, keyed by form name. build_code() applies the template
# for each element of a form in turn, feeding in the code built so far as {code};
# e.g. ('deoxy', 'tri', '4-') on 'A' gives 'dA' -> 'dATP' -> 'dATP(4-)'.
code_gen = {
    # Conjugation.
    # 'base' and 'ribo' do not modify the code (and so code is ambiguous in that context)
    'deoxy': 'd{code}',

    # Phosphates.
    'mono': '{code}MP',
    'di': '{code}DP',
    'tri': '{code}TP',
    'cyc35': 'c{code}MP',
    'cyc23': "c23{code}MP",
    
    # Chelation.
    'mg': 'Mg{code}',

    # Protons.
    'full': '{code}',  # ChEBI convention: unspecified = fully protonated
    '1-': '{code}(1-)',
    '2-': '{code}(2-)',
    '3-': '{code}(3-)',
    '4-': '{code}(4-)',
}
# Root-name templates, keyed first by a base's 'scheme' and then by conjugation.
# build_names() formats every template in a list (with 'stem' and 'code' available)
# and keeps them in list order. That order matters: build_kb_compound() takes the
# first generated name as the KB name of a dominant form.
root_gen = {
    # A, G and I: '<stem>ine' is the free base, '<stem>osine' the nucleoside.
    'osine': {
        'base': ['{stem}ine'],
        'ribo': ['{stem}osine'],
        'deoxy': ['deoxy{stem}osine', "2'-deoxy{stem}osine"],
    },
    # C and U: both base templates are generated for both letters ('cytosine' and
    # 'cytacil', 'urosine' and 'uracil'); the non-existent names are expected to
    # match nothing in the reference.
    'idine': {
        'base': ['{stem}osine', '{stem}acil'],  # Special case for uracil
        'ribo': ['{stem}idine'],
        'deoxy': ['deoxy{stem}idine', "2'-deoxy{stem}idine", 'deoxy{stem}osine', "2'-deoxy{stem}osine"],
    },
    # T: the unqualified '<stem>idine' name is listed under 'deoxy'; the ribo form
    # takes an explicit 'ribo' prefix.
    'thy': {
        'base': ['{stem}ine'],
        'ribo': ['ribo{stem}idine'],
        'deoxy': ['{stem}idine', 'deoxy{stem}idine', "2'-deoxy{stem}idine"],
    }
}
# Name patterns for each form beyond the conjugation, keyed by form name.
# build_names() applies every pattern to every name generated so far, passing that
# name as {name} along with {code} (the base + conjugation code) and {stem}.
# Patterns are tried in list order, so the first pattern yields the first name.
name_gen = {
    # Phosphates: with and without the 5' locant, space- or hyphen-separated.
    'mono': ["{name} monophosphate", "{name} 5'-monophosphate", "{name} phosphate", "{name} 5'-phosphate", 
             "{name}-monophosphate", "{name}-5'-monophosphate", "{name}-phosphate", "{name}-5'-phosphate", ],
    'di': ["{name} diphosphate", "{name} 5'-diphosphate", "{name}-diphosphate", "{name}-5'-diphosphate", ],
    'tri': ["{name} triphosphate", "{name} 5'-triphosphate", "{name}-triphosphate", "{name}-5'-triphosphate", ],
    # The last pattern of each cyclic form is built from the code rather than the name.
    'cyc23': ["{name} 2',3'-cyclic monophosphate", "{name}-2',3'-cyclic monophosphate", "2',3'-cyclic {code}MP", ],
    'cyc35': ["{name} 3',5'-cyclic monophosphate", "{name}-3',5'-cyclic monophosphate", "3',5'-cyclic {code}MP", ],

    # Chelation.
    'mg': ['magnesium {name}', 'magnesium-{name}', 'magnesium2+-{name}'],

    # Protons: a charge suffix, except for the fully protonated form.
    'full': ['{name}'],  # ChEBI convention: unspecified = fully protonated
    '1-': ['{name}(1-)'],
    '2-': ['{name}(2-)'],
    '3-': ['{name}(3-)'],
    '4-': ['{name}(4-)'],
}

def build_code(base, form):
    """Returns the shorthand code for one form of a base.

    The free base, i.e. form == ('base',), gets the three-letter code. Anything
    else starts from the one-letter code and is wrapped by the code_gen template
    of each form element in turn; elements without a template leave it unchanged.
    """
    if form == ('base',):
        return base['code3']

    result = base['code']
    for element in form:
        template = code_gen.get(element)
        if template is not None:
            result = template.format(code=result)
    return result

def build_names(base, form):
    """Returns every candidate name for one form of a base, in generation order.

    form[0] is the conjugation; together with the base it fixes the code and the
    root name(s). Each later form element that has patterns in name_gen expands
    the running list: every pattern is applied to every name so far, pattern-major,
    and duplicates are dropped keeping the first occurrence.
    """
    parms = {'code': build_code(base, form[:1]), 'stem': base['stem']}
    candidates = [root.format(**parms) for root in root_gen[base['scheme']][form[0]]]

    # Further variation, if any, is layered on top of the root names.
    for element in form[1:]:
        patterns = name_gen.get(element)
        if patterns is None:
            continue
        # dict.fromkeys keeps the first occurrence of each name, in order.
        candidates = list(dict.fromkeys(
            pattern.format(name=candidate, **parms)
            for pattern in patterns
            for candidate in candidates
        ))
    return candidates

## Find compounds in the reference (ChEBI) matching these generated names
- Map back to the form associated with each
- Lay out a grid of all identified forms for each base

In [4]:
def map_compounds(base):
    """Finds the ChEBI compound, if any, for every generated form of one base.

    Generates the codes and names of all forms of `base`, queries ChEBI for
    compounds whose name is any of them, and maps each hit back to its form.

    Args:
        base: one record from `bases` (keys 'code', 'code3', 'stem', 'scheme').

    Returns:
        A dict from key, i.e. (base code,) + form tuple, to the decoded ChEBI compound.
    """
    print(base['code'])
    codes = {}
    names = {}
    for form in generate_forms():
        key = (base['code'],)  + form
        codes[key] = build_code(base, form)
        names[key] = build_names(base, form)

    # Reverse the mapping to identify the form denoted by a given name or code.
    name_key = {}
    # Poor outcomes for three-letter codes, too many collisions with unrelated compounds
    for key, code in codes.items():
        # Skip single-letter codes for (code, conjugation) keys as redundant, and too short
        if len(key) > 2:
            name_key[code] = key
    for key, form_names in names.items():
        for name in form_names:
            if name not in name_key:
                name_key[name] = key
            else:
                print(f'Name collision: {name} can be {name_key[name]} or {key}')

    # The query below ignores case (collation strength 1), so a hit may be spelled
    # differently from the name that was generated. Keep a case-folded index to
    # resolve such hits instead of failing on the exact-spelling lookup.
    folded_key = {}
    for name, key in name_key.items():
        folded_key.setdefault(name.casefold(), (name, key))

    # Find compounds in CHEBI, by name first. Hits by name are never ambiguous here.
    all_names = set(name_key.keys())
    chebi_compounds = {}
    for doc in CHEBI.find({'name': {'$in': list(all_names)}}).collation({'locale': 'en', 'strength': 1}):
        compound = KB.CHEBI.codec.decode(doc)
        name = compound.name
        if name in name_key:
            # Exact spelling: authoritative, and replaces any inexact hit for this form.
            chebi_compounds[name_key[name]] = compound
            all_names.discard(name)
            continue

        generated, key = folded_key.get(name.casefold(), (None, None))
        if key is None:
            # Matched by the collation in some way other than case; cannot be mapped.
            print(f'  Skipping "{name}": no generated name corresponds to it')
            continue
        print(f'  Inexact match: "{name}" taken as "{generated}"')
        # Never displace a compound that matched a generated name exactly.
        chebi_compounds.setdefault(key, compound)
        all_names.discard(generated)
    print(f'  {len(chebi_compounds)} forms found by name')

    # All the real compounds are found by name. Using AKA scores one more real hit, but one bad one too. So skip it.

    # found = collections.defaultdict(set)
    # for doc in CHEBI.find({'aka': {'$in': list(all_names)}}).collation({'locale': 'en', 'strength': 1}):
    #     compound = KB.CHEBI.codec.decode(doc)
    #     if name_key.get(compound.name) in chebi_compounds:
    #         # This compound was already found by name, so ignore its aka's
    #         continue
    #     for name in doc['aka']:
    #         # Do not clobber anything previously found by name
    #         if name in name_key and name_key[name] not in chebi_compounds:
    #             print(f'{compound} "{name}" => {name_key[name]}')
    #             found[name_key[name]].add(compound)
    # print(f'  {len(found)} additional forms found by aka')

    # # Map key -> compound, most general first. Also track compound -> key to detect ambiguities.
    # compound_key = collections.defaultdict(list)
    # for key in sorted(found): # General to specific
    #     compounds = found[key]
    #     if len(compounds) > 1:
    #         print(f'    {key} => {compounds}')
    #     compound = compounds.pop()

    #     # Don't map the same compound to multiple forms.
    #     if compound not in compound_key:
    #         chebi_compounds[key] = compound
    #     # But keep track of form ambiguities
    #     compound_key[compound].append(key)

    # for compound, keys in compound_key.items():
    #     if len(keys) > 1:
    #         print(f'{compound.name} [{compound.id}]'
    #               f' may be {" or ".join(".".join(key) for key in keys)}')

    return chebi_compounds

In [5]:
%%time
# Map every base in turn, pooling the results into one key -> ChEBI compound dict.
chebi_compounds = {}
for base in bases.values():
    found_for_base = map_compounds(base)
    chebi_compounds.update(found_for_base)
    print()

# Manual override(s) for known issues
manual_overrides = {
    ('I', 'base'): 'CHEBI:17368',  # hypoxanthine
    ('T', 'deoxy', 'mono', '1-'): 'CHEBI:46960',  # named dTMP(-) instead of dTMP(1-)
}
for key, chebi_id in manual_overrides.items():
    chebi_compounds[key] = KB(chebi_id)

print(f'Grand total: {len(chebi_compounds)} nucleotides and bases systematically mapped.')

A
  26 forms found by name

C
  21 forms found by name

G
  22 forms found by name

I
  17 forms found by name

T
  17 forms found by name

U
  21 forms found by name

Grand total: 126 nucleotides and bases systematically mapped.
CPU times: user 45.3 ms, sys: 3.84 ms, total: 49.2 ms
Wall time: 809 ms


In [6]:
# Lay the mapped compounds out as a grid: one row per form, one column per base.
form_grid = collections.defaultdict(dict)
for key in sorted(chebi_compounds):
    letter, *form_parts = key
    compound = chebi_compounds[key]
    form_grid['.'.join(form_parts)][letter] = f'[{compound.id}] {compound.name}'

forms = sorted(form_grid)
grid_rows = [form_grid[row_label] for row_label in forms]
# Left as the final expression so the notebook displays the table.
pd.DataFrame(grid_rows, index=forms).fillna('-')

Unnamed: 0,A,C,G,I,T,U
base,[16708] adenine,[16040] cytosine,[16235] guanine,[17368] hypoxanthine,[17821] thymine,[17568] uracil
deoxy,[17256] 2'-deoxyadenosine,[15698] 2'-deoxycytidine,[17172] 2'-deoxyguanosine,[28997] 2'-deoxyinosine,[17748] thymidine,[16450] 2'-deoxyuridine
deoxy.cyc35.full,"[28074] 3',5'-cyclic dAMP",-,-,-,"[75183] 3',5'-cyclic dTMP",-
deoxy.di.3-,[57667] dADP(3-),[58593] dCDP(3-),[58595] dGDP(3-),[62286] 2'-deoxyinosine 5'-diphosphate(3-),[58369] dTDP(3-),[60471] dUDP(3-)
deoxy.di.full,[16174] dADP,[28846] dCDP,[28862] dGDP,[28823] 2'-deoxyinosine-5'-diphosphate,[18075] dTDP,[28850] dUDP
deoxy.mono.1-,-,-,-,-,[46960] dTMP(-),-
deoxy.mono.2-,[58245] 2'-deoxyadenosine 5'-monophosphate(2-),[57566] 2'-deoxycytosine 5'-monophosphate(2-),[57673] 2'-deoxyguanosine 5'-monophosphate(2-),[61194] 2'-deoxyinosine 5'-phosphate(2-),[63528] dTMP(2-),[246422] dUMP(2-)
deoxy.mono.full,[17713] 2'-deoxyadenosine 5'-monophosphate,[15918] 2'-deoxycytosine 5'-monophosphate,[16192] 2'-deoxyguanosine 5'-monophosphate,[28806] 2'-deoxyinosine-5'-monophosphate,[17013] dTMP,[17622] dUMP
deoxy.tri.3-,[495505] dATP(3-),[57724] dCTP(3-),[57794] dGTP(3-),-,[58370] dTTP(3-),[58212] dUTP(3-)
deoxy.tri.4-,[61404] dATP(4-),[61481] dCTP(4-),[61429] dGTP(4-),[61382] dITP(4-),[37568] dTTP(4-),[61555] dUTP(4-)


## Translate to KB compounds
- Anchor on deprotonated forms of nucleotides
- Systematic IDs
- Xref to ChEBI ID

In [7]:
# Assumption: form names are unique _among all variations referred to in this group_.
# Reverse lookup from a form name to the Variation that declares it.
form_variant = {
    form_name: variation
    for variation in VARIATIONS.values()
    for form_name in variation.form_names
}

# Protonation state taken as the default for each phosphate form.
dominant_form = dict(mono='2-', di='3-', tri='4-', cyc23='1-', cyc35='1-')

def systematic_id(base, form):
    """Returns the lower-case, dot-separated KB id for one form of a base."""
    if len(form) != 1:
        # For phospho forms the code covers base, conjugation, and phosphates.
        # Anything past those two elements is appended as further id segments.
        segments = (build_code(base, form[:2]),) + form[2:]
    else:
        # For base and nucleoside forms the simple ACGT code is ambiguous, so the
        # id spells out the three-letter code and the explicit form.
        segments = (base['code3'], form[0])
    return '.'.join(segments).lower()

def build_kb_compound(key, chebi_compound):
    """Builds the KB Molecule for one mapped ChEBI compound.

    Args:
        key: (base code,) + form tuple under which the compound was mapped.
        chebi_compound: the ChEBI compound found for that key.

    Returns:
        (key, molecule). The returned key has its protonation element dropped when
        that protonation is the dominant one for the form's phosphates.
    """
    base = bases[key[0]]
    # Index this key's form elements by the name of the variation each belongs to.
    form = {}
    for element in key[1:]:
        form[form_variant[element].name] = element

    # Update key and name to match KB conventions
    name = chebi_compound.name
    protons = form.get('protons')
    if protons == 'full':
        name += ' (fully protonated)'
    elif protons is not None and protons == dominant_form[form['phosphates']]:
        # The dominant protonation state is the default: drop it from the key and
        # use the first generated name for what remains.
        key = key[:-1]
        name = build_names(base, key[1:])[0]

    # Cross-reference the ChEBI entry itself, plus whatever it already references.
    xrefs = {chebi_compound.ref(), *(chebi_compound.xrefs or ())}

    kb_form = key[1:]
    molecule = Molecule(
        id=systematic_id(base, kb_form),
        name=name,
        shorthand=build_code(base, kb_form),
        description=chebi_compound.description,
        aka=chebi_compound.aka,
        xrefs=xrefs,
        formula=chebi_compound.formula,
        mass=chebi_compound.mass,
        charge=chebi_compound.charge,
        inchi=chebi_compound.inchi,
    )
    return key, molecule
        
# Translate every mapped ChEBI compound, re-keying by the key build_kb_compound returns.
kb_compounds = dict(
    build_kb_compound(chebi_key, compound)
    for chebi_key, compound in chebi_compounds.items()
)
    

### Build Specialization relationships where possible.

In [8]:
# Each entry names the variations that together identify a canonical form
# ('groups') and the remaining variations by which it may be specialized.
canon = [
    {'groups': ['conjugation', 'phosphates'], 'variations': ['chelation', 'protons']},
    # Free base and nucleoside forms are canonical, but have no additional variation.
    # Opted against: {'groups': ['conjugation'], 'variations': ['phosphates', 'chelation', 'protons']},
]

# Assumption: form names are unique _among all variations referred to in this group_.
form_variant = {}
for v in VARIATIONS.values():
    for f in v.form_names:
        form_variant[f] = v

for group_def in canon:
    # Possible canonical forms from all combinations in the group
    for canonical_form in itertools.product(*(VARIATIONS[v].form_names for v in group_def['groups'])):
        for base in bases:
            canonical_key = (base,) + canonical_form
            canonical = kb_compounds.get(canonical_key)
            # Variation name -> the form names actually seen among this canonical's subforms.
            observed = collections.defaultdict(set)

            # Find all subforms from combinations of the remaining variations
            for subform in itertools.product(*([None] + VARIATIONS[v].form_names 
                                               for v in group_def['variations'])):
                subform = tuple(f for f in subform if f is not None)
                if not subform:
                    # None for all remaining variations is just the canonical form itself; skip.
                    continue

                specific_variant = kb_compounds.get(canonical_key + subform)
                if specific_variant:
                    for f in subform:
                        observed[form_variant[f].name].add(f)
                    if canonical is not None:
                        specific_variant.canonical_form = Specialization(
                            canonical.id, subform, specific_variant.id)

            if observed:
                if canonical is not None:
                    # List the observed forms in their declared order. Iterating the set
                    # directly gives an order that changes from one interpreter run to the
                    # next (string hashing is randomized), which would make the data written
                    # to the KB non-reproducible.
                    canonical.variations = [
                        Variation(name, [f for f in VARIATIONS[name].form_names if f in seen])
                        for name, seen in observed.items()
                    ]
                else:
                    print(f'Canonical form {canonical_key} is not defined in the reference.')


Canonical form ('A', 'deoxy', 'cyc35') is not defined in the reference.
Canonical form ('T', 'deoxy', 'cyc35') is not defined in the reference.


## Put it officially in the KB

_uncomment to run_

In [9]:
# with KB.unlock(KB.compounds):
#     for compound in kb_compounds.values():
#         KB.put(KB.compounds, compound)