# Init

In [1]:
import projectpath

import collections
import itertools
import os

import ipywidgets as widgets
import ipysheet
import pandas as pd

from kb import kb
from scheme import Molecule, Reaction, DbXref, Variation, Specialization

REFDB = kb.REFDB
KB = kb.KB

# Identify all forms of all nucleotides

### NOTES
- The intent is not for this notebook or any script derived from it to be able to generate the KB unsupervised across all cases, but to assist a thinking scientist in covering a lot of ground quickly.
- Power tools, not robots.
- Scientific judgement is the final word.

## Generate possible names for each form of each base / nucleoside / nucleotide

In [2]:
# Axes along which a form can vary, as (axis name, form names) rows.
# A "+ None" note marks an axis that may also be absent from a form.
_variation_table = (
    ('type', ('base', 'ribo', 'deoxy')),
    ('phosphates', ('mono', 'di', 'tri', 'cyc35', 'cyc23')),  # + None
    ('chelation', ('mg',)),  # + None
    # + None/0. Technically None != 0, but ChEBI does not distinguish.
    ('charge', ('5-', '4-', '3-', '2-', '1-')),
)

# Variation objects keyed by axis name; each gets its own fresh list of form names.
VARIATIONS = {
    axis: Variation(axis, list(form_names))
    for axis, form_names in _variation_table
}

# Names are almost but not quite systematic. E.g. guanosine and adenosine are nucleosides but
# cytosine is the free base where cytidine is the nucleoside. Annoying, but it is what it is.
# Per-base naming data, keyed by one-letter code: the three-letter code, the
# stem that names are built on, and the name_gen suffix scheme(s) to try.
bases = {
    letter: {'code3': code3, 'stem': stem, 'scheme': schemes}
    for letter, code3, stem, schemes in (
        ('A', 'Ade', 'aden', ['os']),
        ('G', 'Gua', 'guan', ['os']),
        ('C', 'Cyt', 'cyt', ['id']),
        ('T', 'Thy', 'thym', ['os', 'id']),
        ('U', 'Ura', 'ur', ['id']),
    )
}

# Shorthand-code templates keyed by form name; each one wraps the code built so far.
# 'base' and 'ribo' have no entry: they do not modify the code (and so code is
# ambiguous in that context).
code_gen = dict(
    [
        ('deoxy', 'd{}'),

        ('mono', '{}MP'),
        ('di', '{}DP'),
        ('tri', '{}TP'),
        ('cyc35', 'c{}MP'),
        ('cyc23', 'c23{}MP'),

        ('mg', 'Mg{}'),
    ]
    # Charged forms append the charge in parentheses: '{}(1-)' through '{}(5-)'.
    + [(f'{n}-', f'{{}}({n}-)') for n in range(1, 6)]
)
def _shared_name_patterns():
    """Return a fresh dict of the name patterns common to both naming schemes.

    The two schemes differ only in how the base / ribo / deoxy stem is
    suffixed; phosphate, chelation and charge patterns are the same. A new
    dict (with new lists) is built on every call so the schemes share no state.
    """
    return {
        'mono': ["{} 5'-monophosphate", "{} monophosphate"],
        'di': ["{} 5'-diphosphate", "{} diphosphate"],
        'tri': ["{} 5'-triphosphate", "{} triphosphate"],
        'cyc35': ["{} 3',5'-cyclic monophosphate"],
        'cyc23': ["{} 2',3'-cyclic monophosphate"],

        'mg': ['magnesium {}', 'magnesium-{}', 'magnesium2+-{}'],

        '1-': ['{}(1-)'],
        '2-': ['{}(2-)'],
        '3-': ['{}(3-)'],
        '4-': ['{}(4-)'],
        '5-': ['{}(5-)'],
    }


# Name patterns per naming scheme, then per form name. Each pattern wraps the
# name built so far; several patterns for one form give alternative spellings.
name_gen = {
    'os': {
        'base': ['{}ine'],
        'ribo': ['{}osine'],
        'deoxy': ['deoxy{}osine', "2'-deoxy{}osine"],
        **_shared_name_patterns(),
    },
    'id': {
        'base': ['{}osine', '{}acil'],  # Special case for uracil
        'ribo': ['{}idine', 'ribo{}idine'],  # Special case for ribothymidine
        'deoxy': ['deoxy{}idine', "2'-deoxy{}idine"],
        **_shared_name_patterns(),
    },
}

def build_code(code, form):
    """Return the shorthand code for `form`, starting from the bare `code`.

    Each element of `form` is looked up in code_gen and its template is applied
    to the code built so far, in order. Elements without a template (including
    None) go through the identity template '{}'.
    """
    templates = (code_gen.get(variant, '{}') for variant in form)
    for template in templates:
        code = template.format(code)
    return code

def build_names(stem, form, scheme):
    """Return every candidate name for `form` of `stem` under one naming scheme.

    Starting from the bare stem, each element of `form` that has patterns in
    name_gen[scheme] expands every name built so far with every pattern.
    Elements with no patterns (including None) are skipped.
    """
    candidates = [stem]
    for variant in form:
        patterns = name_gen[scheme].get(variant)
        if patterns is None:
            continue
        # Patterns vary slowest, names fastest, so results group by pattern.
        candidates = [
            pattern.format(candidate)
            for pattern, candidate in itertools.product(patterns, candidates)
        ]
    return candidates

def generate_forms():
    """Yield every form tuple to consider for a base.

    Each type first yields the bare 1-tuple ``(type,)``. Every type other than
    'base' then yields ``(type, phosphates, chelation, charge)`` for each
    phosphate form, with charge and chelation each either None or one of their
    declared forms.
    """
    # Too handcrafted atm. Need to capture conditionality more methodically somehow.
    phosphate_forms = VARIATIONS['phosphates'].form_names
    charge_options = [None] + VARIATIONS['charge'].form_names
    chelation_options = [None] + VARIATIONS['chelation'].form_names

    for ftype in VARIATIONS['type'].form_names:
        yield (ftype,)  # == (ftype, None, None, None)
        if ftype == 'base':
            continue
        # All cases below have phosphates, but not necessarily charge or chelation.
        # Chelation varies fastest, then charge, then phosphates.
        for fphos, fcharge, fchel in itertools.product(
                phosphate_forms, charge_options, chelation_options):
            yield ftype, fphos, fchel, fcharge

# Shorthand code and candidate names for every (base, *form) key. None
# placeholders in a form are dropped from the key.
code = {}
names = {}
for base, details in bases.items():
    for form in generate_forms():
        key = (base, *(f for f in form if f))
        code[key] = build_code(base, form)
        names[key] = [
            name
            for scheme in details['scheme']
            for name in build_names(details['stem'], form, scheme)
        ]

# Just no good way to do this in the current scheme, building name(s) from an abbreviation.
# But this appears to resolve ambiguities around the various cyclic NMPs.
for base in bases:
    nmp = code[base, 'ribo', 'mono']
    names.update({
        (base, 'ribo', 'cyc23'): [f"2',3'-cyclic {nmp}"],
        (base, 'ribo', 'cyc23', '1-'): [f"2',3'-cyclic {nmp}(1-)"],
        (base, 'ribo', 'cyc35'): [f"3',5'-cyclic {nmp}"],
        (base, 'ribo', 'cyc35', '1-'): [f"3',5'-cyclic {nmp}(1-)"],
    })

# ribo-T is valid, but the assumption that 'thymidine' means deoxy creates ambiguity.
# Drop every ribo form of T from both tables.
for form in generate_forms():
    if form[0] != 'ribo':
        continue
    key = ('T', *filter(None, form))
    code.pop(key)
    names.pop(key)

# Reverse lookup from code or name to key. Codes are loaded first; a name that
# is already taken keeps its first key and the clash is reported.
name_key = {shorthand: key for key, shorthand in code.items()}
for key, candidates in names.items():
    for name in candidates:
        if name in name_key:
            print(f'Name collision: {name} can be {name_key[name]} or {key}')
        else:
            name_key[name] = key


## Find compounds in the reference (ChEBI) matching these generated names
- Map back to the form associated with each
- Lay out a grid of all identified forms for each base

In [3]:
# Find by name first, matching either the code or a spelled-out name
all_names = list(name_key)

# The queries below use a case-insensitive collation (strength 1), so a hit may
# differ from the generated name in case. Keep a casefolded index as a fallback
# so that such a hit resolves to its key instead of raising KeyError.
folded_name_key = {}
for name, key in name_key.items():
    folded_name_key.setdefault(name.casefold(), key)

found = collections.defaultdict(list)
for doc in REFDB.CHEBI.find({'name': {'$in': all_names}}).collation({'locale': 'en', 'strength': 1}):
    # Exact match first, so runs that already worked resolve exactly as before.
    key = name_key.get(doc['name'])
    if key is None:
        key = folded_name_key.get(doc['name'].casefold())
    if key is None:
        # The collation matched on something casefolding does not cover.
        print(f'Unresolved name match: {doc["name"]}')
        continue
    found[key].append(doc)
# Fall back to AKA. There are cases where the same name is an AKA for multiple forms.
# Name hits were appended first, so they stay at the front of each list.
# NOTE(review): this lookup is exact, so an AKA matched only through the
# case-insensitive collation is silently skipped here -- confirm that is intended.
for doc in REFDB.CHEBI.find({'aka': {'$in': all_names}}).collation({'locale': 'en', 'strength': 1}):
    for name in doc['aka']:
        if name in name_key:
            found[name_key[name]].append(doc)

# Map key -> compound, most general first. Also track compound -> key to detect ambiguities.
mapped_compound = {}
compound_key = collections.defaultdict(list)
for key in sorted(found):
    docs = found[key]
    # Only the first hit.
    compound = kb.CODECS[Molecule].decode(docs[0])
    # Only the most general form.
    if compound not in compound_key:
        mapped_compound[key] = compound
    # But keep track of form ambiguities
    compound_key[compound].append(key)

# Report every compound that was reached from more than one form key.
for compound, keys in compound_key.items():
    if len(keys) > 1:
        print(f'{compound.name} [{compound._id}]'
              f' may be {" or ".join(".".join(key) for key in keys)}')

# Grid of identified compounds: one row per form, one column per base.
form_grid = collections.defaultdict(dict)
for key in sorted(mapped_compound):
    compound = mapped_compound[key]
    form_grid['.'.join(key[1:])][key[0]] = f'[{compound._id}] {compound.name}'

forms = sorted(form_grid)
pd.DataFrame([form_grid[form] for form in forms], index=forms).fillna('-')

CDP(3-) [58069] may be C.ribo.di.2- or C.ribo.di.3-


Unnamed: 0,A,C,G,T,U
base,[16708] adenine,[16040] cytosine,[16235] guanine,[17821] thymine,[17568] uracil
deoxy,[17256] 2'-deoxyadenosine,[15698] 2'-deoxycytidine,[17172] 2'-deoxyguanosine,[17748] thymidine,[16450] 2'-deoxyuridine
deoxy.di,[16174] dADP,[28846] dCDP,[28862] dGDP,[18075] dTDP,[28850] dUDP
deoxy.di.3-,[57667] dADP(3-),[58593] dCDP(3-),[58595] dGDP(3-),[58369] dTDP(3-),[60471] dUDP(3-)
deoxy.mono,[17713] 2'-deoxyadenosine 5'-monophosphate,[15918] 2'-deoxycytosine 5'-monophosphate,[16192] 2'-deoxyguanosine 5'-monophosphate,[17013] dTMP,[17622] dUMP
deoxy.mono.2-,[58245] 2'-deoxyadenosine 5'-monophosphate(2-),[57566] 2'-deoxycytosine 5'-monophosphate(2-),[57673] 2'-deoxyguanosine 5'-monophosphate(2-),[63528] dTMP(2-),[246422] dUMP(2-)
deoxy.tri,[16284] dATP,[16311] dCTP,[16497] dGTP,[18077] dTTP,[17625] dUTP
deoxy.tri.3-,[495505] dATP(3-),[57724] dCTP(3-),[57794] dGTP(3-),[58370] dTTP(3-),[58212] dUTP(3-)
deoxy.tri.4-,[61404] dATP(4-),[61481] dCTP(4-),[61429] dGTP(4-),[37568] dTTP(4-),[61555] dUTP(4-)
ribo,[16335] adenosine,[17562] cytidine,[16750] guanosine,-,[16704] uridine


### To be resolved:
- Misannotated (?) CDP(3-) clobbers CDP(2-)
  - Appears to be intentional: C9H12N3O11P2 = CDP(3-) explicitly has CDP(2-) as a synonym, and C9H13N3O11P2 = CDP(2-) is entirely missing
- [RESOLVED] Misannotated (?) cAMP(1-) clobbers cAMP(0)
  - Not intentional: cAMP is CHEBI:17489, annotated as the conjugate acid of CHEBI:58165
- [RESOLVED] cUMP => 2',3'-cyclic UMP; cGMP(1-) => 2',3', while cGMP => 3',5'. Generally clarify 2',3' vs 3',5' across the board.
  - Multiple 2',3' cyclic NMPs, add as a valid case
  - But "cNMP" should denote 3',5' form (i.e. what most people would expect). In the case of cGMP(1-) it is given as a synonym for 2',3', but _not_ for 3',5'. Assume this is wrong.
- 3',5'-cyclic UMP(1-) is missing from our refdb. But it seems to have been added recently to ChEBI (CHEBI:184387), so this should resolve when we update.
- Rescue riboT ?
- Mg²⁺ coverage is paltry. This is a ChEBI shortcoming, but maybe in our own KB we can be more comprehensive, e.g. the default form of _any_ NTP should be MgNTP(2-), even if we routinely anchor on NTP as canonical for modeling purposes.

In [4]:
# Manual override: the reference only has CDP(3-) [58069], which also matched
# as CDP(2-). File it under the 3- key and drop any 2- entry.
cdp_2minus = ('C', 'ribo', 'di', '2-')
cdp_3minus = ('C', 'ribo', 'di', '3-')
mapped_compound.pop(cdp_2minus, None)
mapped_compound[cdp_3minus] = kb.CODECS[Molecule].decode(REFDB.CHEBI.find_one(58069))

# Rebuild the grid of identified compounds: one row per form, one column per base.
form_grid = collections.defaultdict(dict)
for key, compound in sorted(mapped_compound.items()):
    row_label = '.'.join(key[1:])
    form_grid[row_label][key[0]] = f'[{compound._id}] {compound.name}'

forms = sorted(form_grid)
grid_rows = [form_grid[row_label] for row_label in forms]
pd.DataFrame(grid_rows, index=forms).fillna('-')

Unnamed: 0,A,C,G,T,U
base,[16708] adenine,[16040] cytosine,[16235] guanine,[17821] thymine,[17568] uracil
deoxy,[17256] 2'-deoxyadenosine,[15698] 2'-deoxycytidine,[17172] 2'-deoxyguanosine,[17748] thymidine,[16450] 2'-deoxyuridine
deoxy.di,[16174] dADP,[28846] dCDP,[28862] dGDP,[18075] dTDP,[28850] dUDP
deoxy.di.3-,[57667] dADP(3-),[58593] dCDP(3-),[58595] dGDP(3-),[58369] dTDP(3-),[60471] dUDP(3-)
deoxy.mono,[17713] 2'-deoxyadenosine 5'-monophosphate,[15918] 2'-deoxycytosine 5'-monophosphate,[16192] 2'-deoxyguanosine 5'-monophosphate,[17013] dTMP,[17622] dUMP
deoxy.mono.2-,[58245] 2'-deoxyadenosine 5'-monophosphate(2-),[57566] 2'-deoxycytosine 5'-monophosphate(2-),[57673] 2'-deoxyguanosine 5'-monophosphate(2-),[63528] dTMP(2-),[246422] dUMP(2-)
deoxy.tri,[16284] dATP,[16311] dCTP,[16497] dGTP,[18077] dTTP,[17625] dUTP
deoxy.tri.3-,[495505] dATP(3-),[57724] dCTP(3-),[57794] dGTP(3-),[58370] dTTP(3-),[58212] dUTP(3-)
deoxy.tri.4-,[61404] dATP(4-),[61481] dCTP(4-),[61429] dGTP(4-),[37568] dTTP(4-),[61555] dUTP(4-)
ribo,[16335] adenosine,[17562] cytidine,[16750] guanosine,-,[16704] uridine


## Assign systematic IDs, shorthand, and ChEBI xref

In [5]:
def systematic_id(base, form):
    """Return the lowercase, dot-separated systematic id for `form` of `base`.

    `base` is a one-letter key of `bases`; `form` is the form tuple without the
    leading base letter.
    """
    if len(form) == 1:
        # For base, ribo, deoxy forms the simple ACGT code is ambiguous. Build id based on the explicit form.
        parts = [bases[base]['code3'], form[0]]
    else:
        # For phospho forms the code covers base, type, and phosphates. Only extend it for additional variation.
        parts = [code[base, form[0], form[1]], *form[2:]]
    return '.'.join(parts).lower()
    
# Give every mapped compound its systematic id and shorthand, keeping the id it
# arrived with as a CHEBI cross-reference.
for key in sorted(mapped_compound):
    compound = mapped_compound[key]
    new_id = systematic_id(key[0], key[1:])
    # Skip the xref once the id is already systematic (e.g. this cell is re-run),
    # otherwise the systematic id itself would be recorded as a CHEBI xref.
    if compound._id != new_id:
        if compound.xrefs is None:
            compound.xrefs = set()
        compound.xrefs.add(DbXref('CHEBI', compound._id))
        compound._id = new_id
    compound.shorthand = code[key]


### Build Specialization relationships. Fill in any missing canonical forms.

In [6]:
# Which variations define a canonical form ('groups') and which variations are
# treated as specializations of that canonical form ('variations').
canon = [
    {'groups': ['type', 'phosphates'], 'variations': ['chelation', 'charge']},
    # Free base and nucleoside forms are canonical, but have no additional variation.
    # Opted against: {'groups': ['type'], 'variations': ['phosphates', 'chelation', 'charge']},
]

for group_def in canon:
    # Map each form name back to the variation it belongs to.
    # Assumption: form names are unique _among all variations referred to in this group_.
    variant_map = {
        f: v
        for v in group_def['groups'] + group_def['variations']
        for f in VARIATIONS[v].form_names
    }

    # Possible canonical forms from all combinations in the group
    for base in bases:
        for form in itertools.product(*(VARIATIONS[v].form_names for v in group_def['groups'])):
            canonical_key = (base,) + form
            observed = collections.defaultdict(set)

            # Find all subforms from combinations of the remaining variations
            for subform in itertools.product(*([None] + VARIATIONS[v].form_names
                                               for v in group_def['variations'])):
                subform = tuple(f for f in subform if f is not None)
                if not subform:
                    # None for all remaining variations is just the canonical form itself; skip.
                    continue

                specific_variant = mapped_compound.get(canonical_key + subform)
                if specific_variant is not None:
                    specific_variant.canonical_form = Specialization(
                        systematic_id(base, form), subform, specific_variant._id)
                    for f in subform:
                        observed[variant_map[f]].add(f)

            # A possible becomes an actual canonical form if any of its subforms was observed
            if observed:
                canonical = mapped_compound.get(canonical_key)
                # An actual canonical form must exist, even if overlooked in the reference
                if canonical is None:
                    print(f'Canonical form {canonical_key} is not defined in the reference. Creating...')
                    # Use the first generated candidate name for the new compound.
                    canonical = Molecule(_id=systematic_id(base, form), name=names[canonical_key][0])
                    mapped_compound[canonical_key] = canonical
                # List observed forms in their declared VARIATIONS order, so the
                # result does not depend on set iteration order.
                canonical.variations = [
                    Variation(k, [f for f in VARIATIONS[k].form_names if f in v])
                    for k, v in observed.items()
                ]

                # Again setting the default form automatically gets very handcrafted. Again defer for now.



## Put it officially in the KB

In [8]:
# Encode each mapped compound with the Molecule codec and insert it into the
# KB, one document per compound.
molecule_codec = kb.CODECS[Molecule]
for encoded in map(molecule_codec.encode, mapped_compound.values()):
    KB.compounds.insert_one(encoded)


In [None]:
# Tally how many entries of `parents` have each value length.
parent_dist = collections.Counter(map(len, parents.values()))
sorted(parent_dist.items())

In [None]:
# Tally how many entries of `children` have each value length.
children_dist = collections.Counter(map(len, children.values()))
sorted(children_dist.items())

## Start from D-glucose [17634], and collect all descendants

- We could start from (DL-) glucose [17234], but this parent itself is stereochemically undefined and therefore not biologically relevant. It also includes children such as (DL-) glucopyranose, which is likewise of dubious value for biological modeling.

In [None]:
# Collect 17634 and everything reachable from it through `children`, using an
# explicit stack; ids already collected are not expanded again.
glucose_ids = set()
pending = [17634]
while pending:
    chebi_id = pending.pop()  # named so the builtin `id` is not shadowed
    if chebi_id in glucose_ids:
        continue
    glucose_ids.add(chebi_id)
    pending.extend(children[chebi_id])

# Load each collected id from the KB and decode it as a Molecule.
glucoses = [
    kb.CODECS[Molecule].decode(kb.KB.compounds.find_one(chebi_id)) for chebi_id in glucose_ids
]

# NOTE(review): every other cell in this notebook reads the identifier as
# `Molecule._id`, so `_id` is used here as well -- confirm against scheme.Molecule.
glucoses_df = pd.DataFrame([
    {'id': c._id, 'name': c.name, 'formula': c.formula, 'inchi': c.inchi}
    for c in glucoses
]).set_index('id')
glucoses_df

In [None]:
# Write the glucose table as tab-separated text under chebi_dir.
glucoses_path = os.path.join(chebi_dir, 'glucoses.tsv')
glucoses_df.to_csv(glucoses_path, sep='\t')

### Quick conclusions

- This does seem to find more or less all the forms of glucose we want for modeling
- Their relationships are intuitively clear
- These relationships are also there in the inchi strings, but not in a way that is trivial to infer in code.
- So. Just dump them all (meaning all simple sugars, however we define that) to a spreadsheet, assuming the total is manageable. Then manually annotate them according to a rational isomer addressing scheme.

In [None]:
# Take all descendants of hexose [18133]

# Collect 18133 and everything reachable from it through `children`, using an
# explicit stack; ids already collected are not expanded again.
hexose_ids = set()
pending = [18133]
while pending:
    chebi_id = pending.pop()  # named so the builtin `id` is not shadowed
    if chebi_id in hexose_ids:
        continue
    hexose_ids.add(chebi_id)
    pending.extend(children[chebi_id])
print(f'{len(hexose_ids)} hexose descendents')

# Load each collected id from the KB and decode it as a Molecule.
hexoses = [
    kb.CODECS[Molecule].decode(kb.KB.compounds.find_one(chebi_id)) for chebi_id in hexose_ids
]

# NOTE(review): every other cell in this notebook reads the identifier as
# `Molecule._id`, so `_id` is used here as well -- confirm against scheme.Molecule.
hexoses_df = pd.DataFrame([
    {'id': c._id, 'name': c.name, 'formula': c.formula, 'inchi': c.inchi}
    for c in hexoses
]).set_index('id')
print(hexoses_df.shape)

# hexoses_df.to_csv(os.path.join(chebi_dir, 'hexoses.tsv'), sep='\t')