# Init

In [1]:
import projectpath

import collections
import itertools
import os

import ipywidgets as widgets
import ipysheet
import pandas as pd

from knowledge import codecs, kb
from model.core import Molecule, Reaction, DbXref, Variation, Specialization

# Knowledge-base handle shared by the cells below: the ChEBI reference collection is read
# through KB.client.ref.CHEBI and the finished compounds are written with KB.put.
KB = kb.configure_kb()

# Identify all forms of all nucleotides

### NOTES
- The intent is not for this notebook or any script derived from it to be able to generate the KB unsupervised across all cases, but to assist a thinking scientist in covering a lot of ground quickly.
- Power tools, not robots.
- Scientific judgement is the final word.

### On protons
- ChEBI standardizes on the charge-balanced form across the board, even though for many of the species we are interested in, specifically phosphorylated forms, the dominant form at pH=7.3 is ionic.
    - e.g. 'ATP' refers to the fully protonated form, while the dominant form is 'ATP(4-)'
- Our goal is intuitive behavior for the biological modeler. So we will standardize on the expected form under biological conditions
    - 'ATP' refers to the form that has lost four protons. The fully protonated form is understood, but treated as a variation on this otherwise expected form, i.e. 'ATP (fully protonated)'


## Generate possible names for each form of each base / nucleoside / nucleotide

In [2]:
# The independent axes along which a base / nucleoside / nucleotide can vary.
VARIATIONS = dict(
    type=Variation('type', ['base', 'ribo', 'deoxy']),
    phosphates=Variation('phosphates', ['mono', 'di', 'tri', 'cyc35', 'cyc23']),  # + None
    chelation=Variation('chelation', ['mg']),  # + None
    protons=Variation('protons', ['full', '1-', '2-', '3-', '4-']),
)

# Reverse index: form name -> the Variation it belongs to.
# Assumption: form names are unique _among all variations referred to in this group_;
# if two variations shared a form name, the later variation would win here.
form_variant = {
    form_name: variation
    for variation in VARIATIONS.values()
    for form_name in variation.form_names
}

# Nucleoside naming is almost, but not quite, systematic: adenosine and guanosine are
# nucleosides, yet cytosine is a free base whose nucleoside is cytidine. Each base therefore
# carries a 'scheme' that selects the applicable set of root-name patterns in root_gen.
bases = {
    code: {'code': code, 'code3': code3, 'stem': stem, 'scheme': scheme}
    for code, code3, stem, scheme in (
        ('A', 'Ade', 'aden', 'osine'),
        ('C', 'Cyt', 'cyt', 'idine'),
        ('G', 'Gua', 'guan', 'osine'),
        ('T', 'Thy', 'thym', 'thy'),
        ('U', 'Ura', 'ur', 'idine'),
    )
}

# Shorthand-code pattern per form; each pattern wraps the code built so far.
# 'base' and 'ribo' are absent on purpose: they do not modify the code, which is why the
# bare letter is ambiguous in that context.
code_gen = dict(
    deoxy='d{code}',
    mono='{code}MP',
    di='{code}DP',
    tri='{code}TP',
    cyc35='c{code}MP',
    cyc23='c23{code}MP',
    mg='Mg{code}',
    full='{code}',  # ChEBI convention: unspecified = fully protonated
)
# Charged forms '1-' .. '4-' append the charge in parentheses, e.g. '{code}(2-)'.
code_gen.update({f'{n}-': f'{{code}}({n}-)' for n in range(1, 5)})

# Root-name patterns, selected by naming scheme and then by type.
root_gen = dict(
    osine=dict(
        base=['{stem}ine'],
        ribo=['{stem}osine'],
        deoxy=['deoxy{stem}osine', "2'-deoxy{stem}osine"],
    ),
    idine=dict(
        # Two patterns because the scheme is shared: '{stem}osine' yields cytosine and
        # '{stem}acil' is the special case that yields uracil.
        base=['{stem}osine', '{stem}acil'],
        ribo=['{stem}idine'],
        deoxy=['deoxy{stem}idine', "2'-deoxy{stem}idine"],
    ),
    thy=dict(
        base=['{stem}ine'],
        ribo=['ribo{stem}idine'],
        deoxy=['{stem}idine', 'deoxy{stem}idine', "2'-deoxy{stem}idine"],
    ),
)

# Spelled-out name patterns per form; '{name}' is a name built so far and '{code}' is the
# letter-plus-type code. Linear phosphates come with and without the 5' locant.
name_gen = {
    form: ["{name} 5'-" + suffix, "{name} " + suffix]
    for form, suffix in (
        ('mono', 'monophosphate'),
        ('di', 'diphosphate'),
        ('tri', 'triphosphate'),
    )
}
name_gen.update({
    'cyc23': ["{name} 2',3'-cyclic monophosphate", "2',3'-cyclic {code}MP"],
    'cyc35': ["{name} 3',5'-cyclic monophosphate", "3',5'-cyclic {code}MP"],
    'mg': ['magnesium {name}', 'magnesium-{name}', 'magnesium2+-{name}'],
    'full': ['{name}'],  # ChEBI convention: unspecified = fully protonated
})
name_gen.update({f'{n}-': [f'{{name}}({n}-)'] for n in range(1, 5)})

def generate_forms():
    """Yield a tuple for every potentially valid form of a nucleobase, nucleoside, or nucleotide.

    Tuples have one of three shapes, always in this order of precedence:
    ``(type,)``, ``(type, phosphates, protons)`` or
    ``(type, phosphates, chelation, protons)``.
    """
    kinds = VARIATIONS['type'].form_names
    phosphate_forms = VARIATIONS['phosphates'].form_names
    proton_forms = VARIATIONS['protons'].form_names
    chelation_forms = VARIATIONS['chelation'].form_names

    for kind in kinds:
        # The bare type on its own: a 1-tuple with no further variation.
        yield (kind,)

        # A free base carries no phosphates, hence no protonation state or chelation either.
        if kind == 'base':
            continue

        for phos in phosphate_forms:
            for prot in proton_forms:
                yield (kind, phos, prot)
                # The chelated variants of the same form; chelation precedes protons.
                for chel in chelation_forms:
                    yield (kind, phos, chel, prot)

def build_code(base, form):
    """Return the shorthand code for `form` of `base`.

    Starts from ``base['code']`` and wraps it with the `code_gen` pattern of each form
    name in turn; form names without a pattern leave the code unchanged.
    """
    code = base['code']
    patterns = (code_gen[form_name] for form_name in form if form_name in code_gen)
    for pattern in patterns:
        code = pattern.format(code=code)
    return code

def build_names(base, form):
    """Return the candidate spelled-out names for `form` of `base`, without duplicates.

    `form[0]` is the type; together with the base it fixes the code and the root name(s).
    Every later form name that has patterns in `name_gen` expands each current name through
    each pattern; form names without patterns are ignored.
    """
    fields = {'code': build_code(base, form[:1]), 'stem': base['stem']}
    current = [root.format(**fields) for root in root_gen[base['scheme']][form[0]]]

    for modifier in form[1:]:
        patterns = name_gen.get(modifier)
        if patterns is None:
            continue
        # dict.fromkeys drops repeats while keeping first-seen order (pattern-major).
        current = list(dict.fromkeys(
            pattern.format(name=existing, **fields)
            for pattern in patterns
            for existing in current
        ))
    return current

# Every (letter, *form) key, in base order and then form-generation order.
all_keys = [(letter,) + form for letter in bases for form in generate_forms()]
codes = {key: build_code(bases[key[0]], key[1:]) for key in all_keys}
names = {key: build_names(bases[key[0]], key[1:]) for key in all_keys}

# Reverse index: code or spelled-out name -> key.
# Base and nucleoside codes (keys of length <= 2) are skipped as redundant, and too short anyway.
name_key = {code: key for key, code in codes.items() if len(key) > 2}
for key, candidates in names.items():
    for candidate in candidates:
        if candidate in name_key:
            # First claimant keeps the name; report the clash for manual review.
            print(f'Name collision: {candidate} can be {name_key[candidate]} or {key}')
        else:
            name_key[candidate] = key


## Find compounds in the reference (ChEBI) matching these generated names
- Map back to the form associated with each
- Lay out a grid of all identified forms for each base

In [3]:
# Find by name first, matching either the code or a spelled-out name
all_names = list(name_key.keys())

# Both queries below use collation strength 1, which ignores case. A returned name can
# therefore differ in case from the generated one, so keep a case-folded index to resolve
# such hits. On a case-folded clash the first generated name wins.
name_key_folded = {}
for name, key in name_key.items():
    name_key_folded.setdefault(name.casefold(), key)

found = collections.defaultdict(list)
for doc in KB.client.ref.CHEBI.find({'name': {'$in': all_names}}).collation({'locale': 'en', 'strength': 1}):
    # Exact match first; fall back to the case-folded index rather than raising KeyError.
    key = name_key.get(doc['name'])
    if key is None:
        key = name_key_folded.get(doc['name'].casefold())
    if key is None:
        # The collation matched on something other than case; leave it for manual review.
        print(f"Unresolved name match: {doc['name']}")
        continue
    found[key].append(doc)
# Fall back to AKA. There are cases where the same name is an AKA for multiple forms.
# NOTE(review): AKA hits are still resolved by exact spelling only, so an AKA that matched
# the query solely through case-insensitivity is ignored here; confirm that is intended.
for doc in KB.client.ref.CHEBI.find({'aka': {'$in': all_names}}).collation({'locale': 'en', 'strength': 1}):
    for name in doc['aka']:
        if name in name_key:
            found[name_key[name]].append(doc)

# Map key -> compound, most general first. Also track compound -> key to detect ambiguities.
chebi_compounds = {}
compound_key = collections.defaultdict(list)
for key in sorted(found):
    docs = found[key]
    # Only the first hit.
    compound = codecs.CODECS[Molecule].decode(docs[0])
    # Only the most general form: a compound already claimed by an earlier key is not reassigned.
    if compound not in compound_key:
        chebi_compounds[key] = compound
    # But keep track of form ambiguities
    compound_key[compound].append(key)

for compound, keys in compound_key.items():
    if len(keys) > 1:
        print(f'{compound.name} [{compound._id}]'
              f' may be {" or ".join(".".join(key) for key in keys)}')

# Manual overrides resolving known issues
# The compound found under C.ribo.di.2- is re-filed as C.ribo.di.3-. This pop has no default,
# so it raises KeyError if that hit is missing -- presumably deliberate, to flag a changed
# reference; TODO confirm.
chebi_compounds['C', 'ribo', 'di', '3-'] = chebi_compounds.pop(('C', 'ribo', 'di', '2-'))
# Drop the T.ribo.tri.3- hit if present.
chebi_compounds.pop(('T', 'ribo', 'tri', '3-'), None)

# Grid for display: one row per form (dotted form names), one column per base letter.
form_grid = collections.defaultdict(dict)
for key in sorted(chebi_compounds.keys()):
    compound = chebi_compounds[key]
    form_grid['.'.join(key[1:])][key[0]] = f'[{compound._id}] {compound.name}'

forms = sorted(form_grid.keys())
pd.DataFrame([form_grid[form] for form in forms], index=forms).fillna('-')

2'-deoxycytosine 5'-monophosphate(2-) [57566] may be C.deoxy.mono.2- or C.deoxy.mono.full
CDP(3-) [58069] may be C.ribo.di.2- or C.ribo.di.3-
3',5'-cyclic dTMP [75183] may be T.deoxy.cyc35.full or T.ribo.cyc35.full


Unnamed: 0,A,C,G,T,U
base,[16708] adenine,[16040] cytosine,[16235] guanine,[17821] thymine,[17568] uracil
deoxy,[17256] 2'-deoxyadenosine,[15698] 2'-deoxycytidine,[17172] 2'-deoxyguanosine,[17748] thymidine,[16450] 2'-deoxyuridine
deoxy.cyc35.full,"[28074] 3',5'-cyclic dAMP",-,-,"[75183] 3',5'-cyclic dTMP",-
deoxy.di.3-,[57667] dADP(3-),[58593] dCDP(3-),[58595] dGDP(3-),[58369] dTDP(3-),[60471] dUDP(3-)
deoxy.di.full,[16174] dADP,[28846] dCDP,[28862] dGDP,[18075] dTDP,[28850] dUDP
deoxy.mono.2-,[58245] 2'-deoxyadenosine 5'-monophosphate(2-),[57566] 2'-deoxycytosine 5'-monophosphate(2-),[57673] 2'-deoxyguanosine 5'-monophosphate(2-),[63528] dTMP(2-),[246422] dUMP(2-)
deoxy.mono.full,[17713] 2'-deoxyadenosine 5'-monophosphate,-,[16192] 2'-deoxyguanosine 5'-monophosphate,[15245] thymidine 5'-monophosphate,[17622] dUMP
deoxy.tri.3-,[495505] dATP(3-),[57724] dCTP(3-),[57794] dGTP(3-),[58370] dTTP(3-),[58212] dUTP(3-)
deoxy.tri.4-,[61404] dATP(4-),[61481] dCTP(4-),[61429] dGTP(4-),[37568] dTTP(4-),[61555] dUTP(4-)
deoxy.tri.full,[16284] dATP,[16311] dCTP,[16497] dGTP,[18077] dTTP,[17625] dUTP


### To be resolved:
- [RESOLVED] Misannotated (?) CDP(3-) clobbers CDP(2-)
  - Appears to be intentional: C9H12N3O11P2 = CDP(3-) explicitly has CDP(2-) as a synonym, and C9H13N3O11P2 = CDP(2-) is entirely missing
- [RESOLVED] Misannotated (?) cAMP(1-) clobbers cAMP(0)
  - Not intentional: cAMP is CHEBI:17489, annotated as the conjugate acid of CHEBI:58165
- [RESOLVED] TTP(3-) is not a nucleotide at all, but a colliding abbreviation of Thiamine Triphosphate.
- [RESOLVED] cUMP => 2',3'-cyclic UMP; cGMP(1-) => 2',3', while cGMP => 3',5'. Generally clarify 2',3' vs 3',5' across the board.
  - Multiple 2',3' cyclic NMPs, add as a valid case
  - But "cNMP" should denote 3',5' form (i.e. what most people would expect). In the case of cGMP(1-) it is given as a synonym for 2',3', but _not_ for 3',5'. Assume this is wrong.
- 3',5'-cyclic UMP(1-) is missing from our refdb, but present in ChEBI (CHEBI:184387).
    - Status = S (SUBMITTER) in the downloaded compounds.tsv, i.e. third-party. We can choose to include this status in the ChEBI load, but if it's rare we can hold off on that decision.
- Similar story for TMP(2-) [CHEBI:45394]
- Mg²⁺ coverage is paltry. This is a ChEBI shortcoming, but maybe in our own KB we can be more comprehensive, e.g. the default form of _any_ NTP should be MgNTP(2-), even if we routinely anchor on NTP as canonical for modeling purposes.

## Translate to KB compounds
- Anchor on deprotonated forms of nucleotides
- Systematic IDs
- Xref to ChEBI ID

In [4]:
# Protonation state treated as the default (unnamed) form for each phosphate form.
dominant_form = dict(mono='2-', di='3-', tri='4-', cyc23='1-', cyc35='1-')


def systematic_id(base, form):
    """Return the lower-case, dot-separated systematic ID for `form` of `base`.

    A single-element form (base or nucleoside) uses the three-letter code plus the explicit
    type, because the simple ACGT code is ambiguous there. Any other form uses the shorthand
    code of its first two elements (which covers base, type, and phosphates), extended with
    the remaining form names.
    """
    if len(form) == 1:
        parts = (base['code3'], form[0])
    else:
        parts = (build_code(base, form[:2]),) + form[2:]
    return '.'.join(parts).lower()

def build_kb_compound(key, chebi_compound):
    """Translate a ChEBI compound into a KB Molecule following KB conventions.

    Args:
        key: tuple of (base letter, *form names) under which the compound was found.
        chebi_compound: the decoded ChEBI compound.

    Returns:
        A ``(key, Molecule)`` pair. The returned key has its protonation state dropped when
        that state is the dominant one for the compound's phosphate form.
    """
    base = bases[key[0]]
    form = {form_variant[form_name].name: form_name for form_name in key[1:]}
    protons = form.get('protons')

    # Update key and name to match KB conventions
    name = chebi_compound.name
    if protons == 'full':
        # The fully protonated form is a named variation, not the default.
        name += ' (fully protonated)'
    elif protons is not None and protons == dominant_form[form['phosphates']]:
        # The dominant form is the default: drop the charge from the key and use the
        # first generated name for the shortened form.
        key = key[:-1]
        name = build_names(base, key[1:])[0]

    # Always cross-reference the ChEBI entry itself, plus whatever xrefs it carries.
    xrefs = {DbXref('CHEBI', chebi_compound._id), *(chebi_compound.xrefs or ())}

    kb_form = key[1:]
    molecule = Molecule(
        _id=systematic_id(base, kb_form),
        name=name,
        shorthand=build_code(base, kb_form),
        description=chebi_compound.description,
        aka=chebi_compound.aka,
        xrefs=xrefs,
        formula=chebi_compound.formula,
        mass=chebi_compound.mass,
        charge=chebi_compound.charge,
        inchi=chebi_compound.inchi,
    )
    return key, molecule
        
# KB compounds indexed by the (possibly shortened) key that build_kb_compound returns.
kb_compounds = dict(
    build_kb_compound(chebi_key, compound)
    for chebi_key, compound in chebi_compounds.items()
)
    

### Build Specialization relationships where possible.

In [5]:
# Each entry names the variations that together identify a canonical form ('groups') and the
# remaining variations that may specialize it ('variations').
canon = [
    {'groups': ['type', 'phosphates'], 'variations': ['chelation', 'protons']},
    # Free base and nucleoside forms are canonical, but have no additional variation.
    # Opted against: {'groups': ['type'], 'variations': ['phosphates', 'chelation', 'charge']},
]

for group_def in canon:
    # Declared position of every form name in the specializing variations. Used to list the
    # observed forms in a stable order instead of arbitrary set order.
    form_rank = {
        form_name: rank
        for v in group_def['variations']
        for rank, form_name in enumerate(VARIATIONS[v].form_names)
    }

    # Possible canonical forms from all combinations in the group
    for base in bases:
        for canonical_form in itertools.product(*(VARIATIONS[v].form_names for v in group_def['groups'])):
            canonical_key = (base,) + canonical_form
            canonical = kb_compounds.get(canonical_key)
            # variation name -> form names actually present in the KB for this canonical key
            observed = collections.defaultdict(set)

            # Find all subforms from combinations of the remaining variations
            for subform in itertools.product(*([None] + VARIATIONS[v].form_names
                                               for v in group_def['variations'])):
                subform = tuple(f for f in subform if f is not None)
                if not subform:
                    # None for all remaining variations is just the canonical form itself; skip.
                    continue

                specific_variant = kb_compounds.get(canonical_key + subform)
                if specific_variant is not None:
                    for f in subform:
                        observed[form_variant[f].name].add(f)
                    if canonical is not None:
                        # Link the specific form back to its canonical form.
                        specific_variant.canonical_form = Specialization(
                            canonical._id, subform, specific_variant._id)

            if observed:
                if canonical is not None:
                    canonical.variations = [
                        Variation(name, sorted(seen, key=form_rank.__getitem__))
                        for name, seen in observed.items()
                    ]
                else:
                    print(f'Canonical form {canonical_key} is not defined in the reference.')


Canonical form ('A', 'deoxy', 'cyc35') is not defined in the reference.
Canonical form ('T', 'ribo', 'mono') is not defined in the reference.
Canonical form ('T', 'deoxy', 'cyc35') is not defined in the reference.
Canonical form ('U', 'ribo', 'cyc35') is not defined in the reference.


## Put it officially in the KB

In [6]:
# Write every translated compound into the KB's compounds collection.
for finished_compound in kb_compounds.values():
    KB.put(KB.compounds, finished_compound)
