In [1]:
import projectpath

import collections
import os
from typing import Iterable

import pandas as pd

from kb import kb
from scheme import Molecule, Reaction, DbCrossRef

chebi_dir = "/home/fdrusso/work/data/chebi"

# Put all this in db=ebi
ebi = kb.Connection().client.ebi


# Load ChEBI data verbatim

In [2]:
compounds = {}
for row in pd.read_csv(os.path.join(chebi_dir, "compounds.tsv"), sep="\t").itertuples():
    # Policy decision: use primary ID only; let the source worry about obsolete IDs.
    if row.STATUS == "C" and pd.isnull(row.PARENT_ID):
        compounds[row.ID] = Molecule(_id=row.ID, name=row.NAME)
print(f"{len(compounds)} valid compounds")

59583 valid compounds


In [3]:
compound_names = collections.defaultdict(set)
for row in pd.read_csv(os.path.join(chebi_dir, "names.tsv"), sep="\t").itertuples():
    if row.COMPOUND_ID in compounds:
        compound_names[row.COMPOUND_ID].add(row.NAME)
for compound_id, names in compound_names.items():
    compound = compounds[compound_id]
    compound.aka = list(names - {compound.name})

for row in pd.read_csv(os.path.join(chebi_dir, "chemical_data.tsv"), sep="\t").itertuples():
    if row.COMPOUND_ID in compounds:
        if row.TYPE == "MASS":
            compounds[row.COMPOUND_ID].mass = float(row.CHEMICAL_DATA)
        elif row.TYPE == "CHARGE":
            compounds[row.COMPOUND_ID].charge = int(row.CHEMICAL_DATA)
        elif row.TYPE == "FORMULA":
            compounds[row.COMPOUND_ID].formula = row.CHEMICAL_DATA

for row in pd.read_csv(os.path.join(chebi_dir, "chebiId_inchi.tsv"), sep="\t").itertuples():
    if row.CHEBI_ID in compounds:
        compounds[row.CHEBI_ID].inchi = row.InChI

In [4]:
# This cell is NOT idempotent

# Select db's only
db_mapping = {
    "KEGG GLYCAN accession": "KEGG",
    "KEGG DRUG accession": "KEGG",
    "Wikipedia accession": "WIKI",
    "MetaCyc accession": "METACYC",
    "KEGG COMPOUND accession": "KEGG",
    "CAS Registry Number": "CAS",
    "LINCS accession": "LINCS",
}

compound_xrefs = collections.defaultdict(set)
for row in pd.read_csv(os.path.join(chebi_dir, "database_accession.tsv"), sep="\t").itertuples():
    if row.COMPOUND_ID in compounds and row.TYPE in db_mapping:
        compound_xrefs[row.COMPOUND_ID].add(DbCrossRef(db_mapping[row.TYPE], row.ACCESSION_NUMBER))

for compound_id, xrefs in compound_xrefs.items():
        compounds[compound_id].crossref = list(xrefs)


In [5]:
compounds[17634]

Molecule [17634] D-glucose
  formula: C6H12O6
  mass: 180.15588 Da
  charge: +0

In [6]:
compounds[42758]

Molecule [42758] aldehydo-D-glucose
  formula: C6H12O6
  mass: 180.15588 Da
  charge: +0

# Put it in Mongo

In [7]:
kb.CODECS[Molecule].encode(compounds[42758])

{'_id': 42758,
 'name': 'aldehydo-D-glucose',
 'aka': ['Dextrose',
  'D(+)-Glucose',
  '(2R,3S,4R,5R)-2,3,4,5,6-pentahydroxyhexanal',
  'aldehydo-D-gluco-hexose',
  'D-glucose',
  'Glucose',
  'D-GLUCOSE IN LINEAR FORM',
  'WURCS=2.0/1,1,0/[o2122h]/1/'],
 'crossref': [{'db': 'WIKI', 'id': 'Glucose'}, {'db': 'CAS', 'id': '50-99-7'}],
 'formula': 'C6H12O6',
 'mass': 180.15588,
 'charge': 0,
 'inchi': 'InChI=1S/C6H12O6/c7-1-3(9)5(11)6(12)4(10)2-8/h1,3-6,8-12H,2H2/t3-,4+,5+,6+/m0/s1'}

## Big write -- wipe and replace

In [8]:
%%time
# Put this in db=ebi, collection=chebi
ebi.chebi.drop()
for compound_id, compound in compounds.items():
    doc = kb.CODECS[Molecule].encode(compound)
    ebi.chebi.insert_one(doc)

CPU times: user 9.69 s, sys: 688 ms, total: 10.4 s
Wall time: 13.5 s


In [9]:
list(ebi.chebi.find({"aka": "ribose"}).collation({"locale": 'en', "strength": 1}))

[{'_id': 27476,
  'name': 'beta-D-ribopyranose',
  'aka': ['RIBOSE(PYRANOSE FORM)',
   'beta-D-Ribopyranose',
   'WURCS=2.0/1,1,0/[a222h-1b_1-5]/1/',
   'Ribose'],
  'crossref': [{'db': 'CAS', 'id': '7296-60-8'},
   {'db': 'KEGG', 'id': 'C08353'}],
  'formula': 'C5H10O5',
  'mass': 150.1299,
  'charge': 0,
  'inchi': 'InChI=1S/C5H10O5/c6-2-1-10-5(9)4(8)3(2)7/h2-9H,1H2/t2-,3-,4-,5-/m1/s1'},
 {'_id': 45506,
  'name': 'alpha-D-ribose',
  'aka': ['RIBOSE',
   'alpha-D-ribofuranose',
   'alpha-D-Rib',
   'WURCS=2.0/1,1,0/[a222h-1a_1-4]/1/'],
  'formula': 'C5H10O5',
  'mass': 150.1299,
  'charge': 0,
  'inchi': 'InChI=1S/C5H10O5/c6-1-2-3(7)4(8)5(9)10-2/h2-9H,1H2/t2-,3-,4-,5+/m1/s1'},
 {'_id': 47013,
  'name': 'D-ribofuranose',
  'aka': ['ribose',
   'D-ribose',
   'D-Ribose',
   '(3R,4S,5R)-5-(hydroxymethyl)tetrahydrofuran-2,3,4-triol',
   'WURCS=2.0/1,1,0/[a222h-1x_1-4]/1/'],
  'crossref': [{'db': 'KEGG', 'id': 'C00121'},
   {'db': 'CAS', 'id': '50-69-1'},
   {'db': 'CAS', 'id': '613-83-2'}

# Load RHEA master reactions verbatim

RHEA is organized around 'quartets'
- Master - indeterminate or unspecified direction
- irreversible left -> right
- irreversible right -> left
- explicitly reversible

Not clear what is gained by this representation vs say a reversibility attribute. One possibility is it's all about the cross-references to other reaction DBs. Need to explore a bit more.

## Main RHEA reaction definitions are in RDF

In [10]:
import rdflib
from rdflib.namespace import RDFS

RH = rdflib.namespace.Namespace("http://rdf.rhea-db.org/")
rhea_dir = "/home/fdrusso/work/data/rhea"

In [11]:
%time rhea_rdf = rdflib.Graph().parse(os.path.join(rhea_dir, "rhea.rdf"))

rhea_rdf.bind("rh", RH)
rhea_rdf.bind("rdfs", RDFS)
rhea_rdf.bind("ch", rdflib.namespace.Namespace("http://purl.obolibrary.org/obo/"))
rhea_rdf.bind("ch2", rdflib.namespace.Namespace("http://purl.obolibrary.org/obo/chebi#"))
rhea_rdf.bind("ch3", rdflib.namespace.Namespace("http://purl.obolibrary.org/obo/chebi/"))
rhea_rdf.bind("up", rdflib.namespace.Namespace("http://purl.uniprot.org/core/"))
rhea_rdf.bind("ec", rdflib.namespace.Namespace("http://purl.uniprot.org/enzyme/"))
rhea_rdf.bind("pubmed", rdflib.namespace.Namespace("http://rdf.ncbi.nlm.nih.gov/pubmed/"))

rhea_rdf.bind("ECOCYC", rdflib.namespace.Namespace("http://identifiers.org/biocyc/ECOCYC:"))
rhea_rdf.bind("METACYC", rdflib.namespace.Namespace("http://identifiers.org/biocyc/METACYC:"))
rhea_rdf.bind("KEGG", rdflib.namespace.Namespace("http://identifiers.org/kegg.reaction/"))
rhea_rdf.bind("REACTOME", rdflib.namespace.Namespace("http://identifiers.org/reactome/"))


CPU times: user 2min 41s, sys: 836 ms, total: 2min 42s
Wall time: 2min 42s


## Pull it into a more workable structure

In [12]:
scalars = {
    RH.id,
    RDFS.label,
    RH.equation,
    RH.status,
    RH.isTransport,
    
    RH.curatedOrder,
    
    RH.accession,
    RH.name,
    RH.formula,
    RH.charge,
    
    RH.location,
    RH.position,
    RH.polymerizationIndex,
    RH.underlyingChebi,
}
lists = {
    RH.ec,
    RDFS.seeAlso,
}
objects = {
    RH.bidirectionalReaction,
    RH.directionalReaction,

    RH.side,

    RH.contains1,
    RH.contains2,
    RH.contains3,
    RH.contains4,
    RH.contains5,
    RH.contains6,
    RH.contains7,
    RH.contains8,
    RH.contains9,
    RH.contains10,
    RH.contains11,
    RH.contains12,
    RH.contains13,
    RH.contains14,
    RH.contains15,
    RH.contains16,
    RH.contains17,
    RH.contains18,
    RH.contains19,
    RH.contains20,
    
    RH.contains21,
    RH.contains22,
    RH.contains24,
    RH.contains26,
    RH.contains27,
    RH.contains28,
    RH.contains32,
    RH.contains40,
    RH.containsN,
    RH.contains2n,
    RH.containsNplus1,
    RH.containsNminus1,

    RH.compound,
    RH.reactivePart,
}
drop = {
    RDFS.comment,
    RDFS.subClassOf,
    RH.citation,
    RH.chebi,
    RH.contains,
    RH.htmlEquation,
    RH.htmlName,
    RH.isChemicallyBalanced,
    RH.products,
    RH.substrates,
    RH.substratesOrProducts,
    RH.transformableTo,
}

def extract_value(g, o):
    if type(o) == rdflib.Literal:
        return o.toPython()
    else:
        return o.n3(g.namespace_manager)
    
def extract_object(g, s):
    result = {}
    for p, o in g[s]:
        if p in scalars:
            p = extract_value(g, p)
            o = extract_value(g, o)
            result[p] = o
        elif p in lists:
            p = extract_value(g, p)
            o = extract_value(g, o)
            if p in result:
                result[p].append(o)
            else:
                result[p] = [o]
        elif p in objects:
            p = extract_value(g, p)
            o = extract_object(g, o)
            if p in result:
                result[p].append(o)
            else:
                result[p] = [o]
        elif p not in drop:
            print(f"Ignoring {extract_value(g, s)} {extract_value(g, p)}")
    return result
    

## Coerce master reactions (only) into the KB Reaction structure

In [13]:
def to_dbxref(rhea_xref):
    # Special cases
    for prefix, db in [
        ("ch:GO_", "GO"),
    ]:
        if rhea_xref.startswith(prefix):
            return DbCrossRef(db, rhea_xref[len(prefix):])

    # Generally otherwise just split on a colon
    parts = rhea_xref.split(":")
    if len(parts) == 2:
        return DbCrossRef(parts[0].upper(), parts[1])
    else:
        return DbCrossRef("RHEA", rhea_xref)
    
    
def to_reaction(rhea_rxn):
    crossref = []
    if "rh:ec" in rhea_rxn:
        crossref.extend(to_dbxref(rhea_xref) for rhea_xref in rhea_rxn["rh:ec"])
    if "rdfs:seeAlso" in rhea_rxn:
        crossref.extend(to_dbxref(rhea_xref) for rhea_xref in rhea_rxn["rdfs:seeAlso"])
    # Lump in the rest of the quartet's xrefs -- not rigorous by RHEA standards, but reasonable for us.
    for key in ["rh:directionalReaction", "rh:bidirectionalReaction"]:
        for other_reaction in rhea_rxn.get(key, []):
            if "rh:ec" in other_reaction:
                crossref.extend(to_dbxref(rhea_xref) for rhea_xref in other_reaction["rh:ec"])
            if "rdfs:seeAlso" in other_reaction:
                crossref.extend(to_dbxref(rhea_xref) for rhea_xref in other_reaction["rdfs:seeAlso"])
    
    multipliers = [None, -1, +1]  # curatedOrder -> stoichiometry sign, 1-based
    stoichiometry = {}
    for side in rhea_rxn["rh:side"]:
        multiplier = multipliers[side["rh:curatedOrder"]]
        for count in range(1, 21):
            key = "rh:contains" + str(count)
            for rhea_compound in side.get(key, []):
                chebi_id = rhea_compound["rh:compound"][0]["rh:accession"]
                if chebi_id.startswith("CHEBI:"):
                    # We'll use the in-memory molecules for now, but this will utimately be a KB lookup
                    compound = compounds[int(chebi_id[6:])]
                    stoichiometry[compound] = multiplier * count
                else:
                    raise ValueError(f"Unrecognized compound ID {chebi_id}")
    
    reversible = False
    if "rh:bidirectionalReaction" in rhea_rxn and rhea_rxn["rh:bidirectionalReaction"][0].get("rdfs:seeAlso"):
        reversible = True
    
    return Reaction(
        _id = rhea_rxn["rh:id"],
        name = rhea_rxn["rdfs:label"],
        crossref = crossref or None,
        stoichiometry = stoichiometry,
        reversible = reversible,
    )

to_reaction(extract_object(rhea_rdf, RH["16109"]))

Reaction(_id=16109, name='ATP + beta-D-fructose 6-phosphate = ADP + beta-D-fructose 1,6-bisphosphate + H(+)', shorthand=None, aka=None, crossref=[EC:2.7.1.11, GO:0003872, ECOCYC:6PFRUCTPHOS-RXN, METACYC:6PFRUCTPHOS-RXN, REACTOME:R-HSA-70467.5, KEGG:R00756], stoichiometry={Molecule [30616] ATP(4-)
  formula: C10H12N5O13P3
  mass: 503.14946 Da
  charge: -4: -1, Molecule [57634] beta-D-fructofuranose 6-phosphate(2-)
  formula: C6H11O9P
  mass: 258.1199 Da
  charge: -2: -1, Molecule [456216] ADP(3-)
  formula: C10H12N5O10P2
  mass: 424.1773 Da
  charge: -3: 1, Molecule [32966] beta-D-fructofuranose 1,6-bisphosphate(4-)
  formula: C6H10O12P2
  mass: 336.08392 Da
  charge: -4: 1, Molecule [15378] hydron
  formula: H
  mass: 1.00794 Da
  charge: +1: 1}, catalyst=None, reversible=True)

In [14]:
reactions = {}
skipped = {}
for s, _, _ in rhea_rdf.triples((None, RDFS.subClassOf, RH.Reaction)):
    rhea_rxn = extract_object(rhea_rdf, s)
    if rhea_rxn["rh:status"] == "rh:Approved":
        try:
            reaction = to_reaction(rhea_rxn)
            reactions[reaction._id] = reaction
        
        except Exception as e:
            skipped[s] = (rhea_rxn, e)

print(f"{len(reactions)} reactions parsed succesfully, {len(skipped)} skipped")

10093 reactions parsed succesfully, 3794 skipped


In [16]:
kb.CODECS[Reaction].decode(kb.CODECS[Reaction].encode(reactions[11816]))

Reaction(_id=11816, name='aldehydo-D-glucose 6-phosphate = keto-D-fructose 6-phosphate', shorthand=None, aka=None, crossref=[EC:5.3.1.9, GO:0004347, KEGG:R00771, ECOCYC:PGLUCISOM-RXN, METACYC:PGLUCISOM-RXN], stoichiometry={Molecule [57584] aldehydo-D-glucose 6-phosphate(2-): -1, Molecule [57579] D-fructose 6-phosphate(2-): 1}, catalyst=None, reversible=True)

## Wipe and replace rhea

In [17]:
%%time
# Put this in db=ebi, collection=rhea
ebi.rhea.drop()
for reaction_id, reaction in reactions.items():
    doc = kb.CODECS[Reaction].encode(reaction)
    ebi.rhea.insert_one(doc)

CPU times: user 2.35 s, sys: 183 ms, total: 2.53 s
Wall time: 3.34 s
