In [1]:
import projectpath

import collections
import os
from typing import Iterable

import pandas as pd
import pymongo

from mosmo.knowledge import kb
from mosmo.model import Molecule, Reaction, KbEntry, DbXref

# Directory holding the ChEBI flat-file (TSV) dump. Set the CHEBI_DIR environment variable to
# point at a different location; the original hard-coded path remains the default.
chebi_dir = os.environ.get('CHEBI_DIR', '/home/fdrusso/work/data/chebi')

# Knowledge-base handle; the cells below reach its datasets (KB.CHEBI, KB.EC, KB.RHEA) and the
# underlying database client through this object.
KB = kb.configure_kb()

# Load ChEBI data verbatim

In [2]:
# Policy decision: keep confirmed entries (STATUS 'C') under their primary ID only (no PARENT_ID);
# let the source worry about obsolete IDs.
compounds = {
    entry.ID: Molecule(id=entry.ID, name=entry.NAME)
    for entry in pd.read_csv(os.path.join(chebi_dir, 'compounds.tsv'), sep='\t', dtype={'ID': str}).itertuples()
    if entry.STATUS == 'C' and pd.isnull(entry.PARENT_ID)
}
print(f'{len(compounds)} valid compounds')

59708 valid compounds


In [3]:
# Synonyms: collect every listed name per kept compound, then store all but the primary name as `aka`.
compound_names = collections.defaultdict(set)
for entry in pd.read_csv(os.path.join(chebi_dir, 'names.tsv'), sep='\t', dtype={'COMPOUND_ID': str}).itertuples():
    if entry.COMPOUND_ID not in compounds:
        continue
    compound_names[entry.COMPOUND_ID].add(entry.NAME)
for chebi_id, synonyms in compound_names.items():
    molecule = compounds[chebi_id]
    molecule.aka = list(synonyms - {molecule.name})

# Chemical data: the row TYPE selects which Molecule attribute is set and how CHEMICAL_DATA is parsed.
# Rows of any other TYPE are ignored.
chemical_fields = {
    'MASS': ('mass', float),
    'CHARGE': ('charge', int),
    'FORMULA': ('formula', lambda text: text),
}
for entry in pd.read_csv(os.path.join(chebi_dir, 'chemical_data.tsv'), sep='\t', dtype={'COMPOUND_ID': str}).itertuples():
    if entry.COMPOUND_ID in compounds and entry.TYPE in chemical_fields:
        attribute, parse = chemical_fields[entry.TYPE]
        setattr(compounds[entry.COMPOUND_ID], attribute, parse(entry.CHEMICAL_DATA))

# Structures: attach the InChI string to each kept compound that has one.
for entry in pd.read_csv(os.path.join(chebi_dir, 'chebiId_inchi.tsv'), sep='\t', dtype={'CHEBI_ID': str}).itertuples():
    molecule = compounds.get(entry.CHEBI_ID)
    if molecule is not None:
        molecule.inchi = entry.InChI

In [4]:
# Cross-references are kept for a chosen set of source databases only. The mapping translates the
# accession TYPE found in the ChEBI dump into the all-caps db name used for the DbXref.
db_mapping = {
    'CAS Registry Number': 'CAS',
    'KEGG COMPOUND accession': 'KEGG',
    'KEGG GLYCAN accession': 'KEGG',
    'KEGG DRUG accession': 'KEGG',
    'MetaCyc accession': 'METACYC',
    'LINCS accession': 'LINCS',
    'Wikipedia accession': 'WIKI',
}

compound_xrefs = collections.defaultdict(set)
accession_path = os.path.join(chebi_dir, 'database_accession.tsv')
for entry in pd.read_csv(accession_path, sep='\t', dtype={'COMPOUND_ID': str}).itertuples():
    db = db_mapping.get(entry.TYPE)
    if db is not None and entry.COMPOUND_ID in compounds:
        compound_xrefs[entry.COMPOUND_ID].add(DbXref(db, entry.ACCESSION_NUMBER))

# Attach the collected set of xrefs to each compound that has any.
for chebi_id, xrefs in compound_xrefs.items():
    compounds[chebi_id].xrefs = xrefs


In [5]:
# Spot-check one well-known entry (ChEBI 17634) after loading.
compounds['17634']

[17634] D-glucose

# Put it in KB.CHEBI

In [6]:
%%time
# Rebuild the ChEBI collection from scratch: drop it, store every in-memory compound, then index.
collection = KB.client[KB.CHEBI.client_db][KB.CHEBI.collection]
collection.drop()
for molecule in compounds.values():
    KB.put(KB.CHEBI, molecule, bypass_cache=True)

# All indexes share one collation (en_US, strength 1).
loose_collation = pymongo.collation.Collation(locale='en_US', strength=1)
for field in ('name', 'aka'):
    collection.create_index(field, name=field, collation=loose_collation)
collection.create_index(
    [('xrefs.id', pymongo.ASCENDING), ('xrefs.db', pymongo.ASCENDING)],
    name='xrefs',
    collation=loose_collation,
)

CPU times: user 20.9 s, sys: 1.22 s, total: 22.1 s
Wall time: 36.2 s


'xrefs'

In [7]:
# Spot-check: look up 'ribose' in the freshly stored KB.CHEBI collection.
KB.find(KB.CHEBI, 'ribose')

[[33942] ribose,
 [27476] beta-D-ribopyranose,
 [45506] alpha-D-ribose,
 [47013] D-ribofuranose]

# Load RHEA master reactions verbatim

RHEA is organized around 'quartets': each reaction is represented by four related entries:
- Master - indeterminate or unspecified direction
- irreversible left -> right
- irreversible right -> left
- explicitly reversible

It is not clear what this representation gains over, say, a single reversibility attribute on one reaction. One possibility is that it exists mainly to carry the cross-references to other reaction DBs, which may differ by direction. **TODO:** explore this further.

## Main RHEA reaction definitions are in RDF

In [8]:
import rdflib
from rdflib.namespace import RDFS

# Namespace for RHEA's own RDF terms; used below both for predicates (e.g. RH.id, RH.side)
# and to address individual reactions (e.g. RH['13713']).
RH = rdflib.namespace.Namespace('http://rdf.rhea-db.org/')
# Directory holding the RHEA download (rhea.rdf). Set the RHEA_DIR environment variable to point
# at a different location; the original hard-coded path remains the default.
rhea_dir = os.environ.get('RHEA_DIR', '/home/fdrusso/work/data/rhea')

In [9]:
%time rhea_rdf = rdflib.Graph().parse(os.path.join(rhea_dir, 'rhea.rdf'))

rhea_rdf.bind('rh', RH)
rhea_rdf.bind('rdfs', RDFS)
rhea_rdf.bind('ch', rdflib.namespace.Namespace('http://purl.obolibrary.org/obo/'))
rhea_rdf.bind('ch2', rdflib.namespace.Namespace('http://purl.obolibrary.org/obo/chebi#'))
rhea_rdf.bind('ch3', rdflib.namespace.Namespace('http://purl.obolibrary.org/obo/chebi/'))

rhea_rdf.bind('UNIPROT', rdflib.namespace.Namespace('http://purl.uniprot.org/core/'))
rhea_rdf.bind('EC', rdflib.namespace.Namespace('http://purl.uniprot.org/enzyme/'))
rhea_rdf.bind('PUBMED', rdflib.namespace.Namespace('http://rdf.ncbi.nlm.nih.gov/pubmed/'))
rhea_rdf.bind('ECOCYC', rdflib.namespace.Namespace('http://identifiers.org/biocyc/ECOCYC:'))
rhea_rdf.bind('METACYC', rdflib.namespace.Namespace('http://identifiers.org/biocyc/METACYC:'))
rhea_rdf.bind('KEGG', rdflib.namespace.Namespace('http://identifiers.org/kegg.reaction/'))
rhea_rdf.bind('REACT', rdflib.namespace.Namespace('http://identifiers.org/reactome/'))
rhea_rdf.bind('MACIE', rdflib.namespace.Namespace('http://identifiers.org/macie/'))

CPU times: user 2min 15s, sys: 1.21 s, total: 2min 16s
Wall time: 2min 16s


## Pull it into a more workable structure

In [10]:
# Extraction table consulted by extract_object: predicate -> (kind, is_list).
#   kind 'value'  -> the object is converted with extract_value
#   kind 'object' -> the object is expanded recursively with extract_object
#   is_list True  -> repeated occurrences are collected into a list
#   is_list False -> a second occurrence on the same subject is treated as an error
extract_predicate = {
    # Attributes read directly off a reaction
    RH.id: ("value", False),
    RDFS.label: ("value", False),
    RDFS.comment: ("value", False),
    RH.status: ("value", False),
    RH.ec: ("value", True),
    RDFS.seeAlso: ("value", True),

    # Equation text, transport flag, and links to the other reactions of the quartet
    RH.equation: ("value", False),
    RH.isTransport: ("value", False),
    RH.bidirectionalReaction: ("object", False),
    RH.directionalReaction: ("object", True),

    # Reaction sides; curatedOrder is later used to pick the stoichiometry sign
    RH.side: ("object", True),
    RH.curatedOrder: ("value", False),

    # Participants and the compounds they refer to
    RH.compound: ("object", False),
    RH.accession: ("value", False),
    RH.name: ("value", False),
    RH.reactivePart: ("object", True),
    RH.location: ("value", False),
    RH.position: ("value", False),
    RH.polymerizationIndex: ("value", False),
    RH.underlyingChebi: ("value", False),
}

# Special case for the funky containsXXX predicates: each sub-property of rh:contains carries its
# own rh:coefficient. The coefficient is stored exactly as toPython() returns it -- it is
# deliberately not coerced to int here; consumers convert it when they need a number.
contains_count = {
    contains: rhea_rdf.value(contains, RH.coefficient).toPython()
    for contains in rhea_rdf.subjects(RDFS.subPropertyOf, RH.contains)
}

# Predicates that are intentionally ignored (redundant). extract_object skips these silently;
# any predicate that is neither extracted, a containsXXX predicate, nor listed here is reported
# with an 'Ignoring ...' message.
drop = {
    RDFS.subClassOf,
    RH.charge,
    RH.chebi,
    RH.citation,
    RH.contains,
    RH.formula,
    RH.htmlEquation,
    RH.htmlName,
    RH.isChemicallyBalanced,
    RH.products,
    RH.substrates,
    RH.substratesOrProducts,
    RH.transformableTo,
}

def extract_value(g, o):
    """Render a single RDF term as a plain Python value.

    Args:
        g: The graph whose namespace manager supplies the prefixes used for non-literal terms.
        o: The RDF term to convert.

    Returns:
        `o.toPython()` for literals; otherwise the term's N3 rendering using `g`'s namespace manager.
    """
    # isinstance (rather than an exact type comparison) also accepts Literal subclasses.
    if isinstance(o, rdflib.Literal):
        return o.toPython()
    return o.n3(g.namespace_manager)
    
def extract_object(g, s):
    """Recursively collect the properties of subject `s` into a nested dict.

    Each (predicate, object) pair of `g[s]` is handled in one of four ways:
      - predicate in `extract_predicate`: stored under the predicate's rendered name, either as a
        plain value or as a recursively extracted dict, and either singly or appended to a list,
        as that table specifies;
      - predicate in `contains_count`: appended to the 'rh:contains' list as
        {'count': <coefficient>, 'object': <recursively extracted dict>};
      - predicate in `drop`: skipped silently;
      - anything else: reported with an 'Ignoring ...' message and skipped.

    Args:
        g: The RDF graph to read from.
        s: The subject node whose properties are extracted.

    Returns:
        A dict keyed by rendered predicate name.

    Raises:
        ValueError: If a single-valued predicate occurs more than once on `s`, or if an entry of
            `extract_predicate` names an unknown extraction kind.
    """
    result = {}
    for p, o in g[s]:
        if p in extract_predicate:
            otype, is_list = extract_predicate[p]
            # Keep the rendered key in its own variable. Rebinding `p` to the string made the
            # duplicate-key error below call extract_value() on a str, which raised
            # AttributeError instead of the intended ValueError.
            key = extract_value(g, p)
            if otype == 'value':
                value = extract_value(g, o)
            elif otype == 'object':
                value = extract_object(g, o)
            else:
                raise ValueError(f'Misconfigured predicate {key}')

            if is_list:
                result.setdefault(key, []).append(value)
            else:
                if key in result:
                    raise ValueError(f'Clobbered {extract_value(g, s)} {key}')
                result[key] = value

        elif p in contains_count:
            # All containsXXX predicates are folded into one list; the coefficient that the
            # specific predicate stands for is kept alongside the participant.
            participant = {'count': contains_count[p], 'object': extract_object(g, o)}
            result.setdefault('rh:contains', []).append(participant)

        elif p not in drop:
            print(f'Ignoring {extract_value(g, s)} {extract_value(g, p)}')

    return result
    

In [11]:
%%time
# Every subject declared as a subclass of rh:Reaction is extracted into a nested dict.
reaction_ids = [*rhea_rdf.subjects(RDFS.subClassOf, RH.Reaction)]
extracted = []
for reaction_id in reaction_ids:
    extracted.append(extract_object(rhea_rdf, reaction_id))

print(f"{len(reaction_ids)} primary reactions, {len(extracted)} extracted successfully")

14040 primary reactions, 14040 extracted successfully
CPU times: user 21.5 s, sys: 115 ms, total: 21.6 s
Wall time: 21.6 s


## Coerce master reactions (only) into the KB Reaction structure

In [17]:
def to_dbxref(rhea_xref):
    """Turn a prefixed RHEA reference string into a DbXref.

    References starting with 'ch:GO_' or 'ch:CHEBI_' are mapped to the 'GO' / 'CHEBI' databases,
    with the remainder of the string as the id. Everything else is handed to DbXref.from_str.
    """
    # Special cases, checked in this order
    special_prefixes = {'ch:GO_': 'GO', 'ch:CHEBI_': 'CHEBI'}
    matched = next((prefix for prefix in special_prefixes if rhea_xref.startswith(prefix)), None)
    if matched is not None:
        return DbXref(special_prefixes[matched], rhea_xref[len(matched):])

    # Generally otherwise just parse it normally
    return DbXref.from_str(rhea_xref)


def to_reaction(rhea_rxn):
    """Build a KB `Reaction` from one extracted RHEA master reaction.

    Args:
        rhea_rxn: Nested dict for a master reaction, in the shape produced by `extract_object`
            (keys are rendered predicate names such as 'rh:id', 'rh:side', 'rdfs:label').

    Returns:
        A `Reaction` with the RHEA id as a string, a display name, the pooled xrefs of the whole
        quartet (None when there are none), signed stoichiometry keyed by entries of the
        in-memory `compounds` dict, and a reversibility flag.

    Raises:
        ValueError: If a participant has neither a ChEBI accession nor an underlying ChEBI id,
            or its count cannot be converted to an integer.
        KeyError: If a required key is missing from `rhea_rxn`, or a referenced ChEBI id is not
            in `compounds`.
    """
    def rxn_xrefs(rxn):
        """Yield DbXrefs for the EC numbers and seeAlso links of one (possibly absent) reaction."""
        if rxn:
            for xref in rxn.get('rh:ec', []):
                yield to_dbxref(xref)
            for xref in rxn.get('rdfs:seeAlso', []):
                yield to_dbxref(xref)

    # Lump together all of the quartet's xrefs -- not rigorous by RHEA standards, but reasonable for us.
    xrefs = set(rxn_xrefs(rhea_rxn))
    # Default to an empty list so a record without directional reactions does not fail with a
    # TypeError from iterating None.
    for subrxn in rhea_rxn.get('rh:directionalReaction', []):
        xrefs.update(rxn_xrefs(subrxn))
    xrefs.update(rxn_xrefs(rhea_rxn.get('rh:bidirectionalReaction')))

    # RHEA does not provide nice names on its own; get the name from EC if possible.
    # Sorted by id so that the EC entry supplying the name does not depend on set iteration order.
    ec_nums = sorted((xref for xref in xrefs if xref.db == 'EC'), key=lambda xref: xref.id)
    if ec_nums:
        name = KB.get(KB.EC, ec_nums[0].id).name
        # Mark names as ambiguous where there are multiple ECs.
        if len(ec_nums) > 1:
            name = '* ' + name
    else:
        # Fallback: use the RHEA-provided label, which is just the reaction formula.
        name = rhea_rxn['rdfs:label']

    # Reversible when the quartet's bidirectional reaction carries at least one seeAlso reference.
    reversible = bool(rhea_rxn.get('rh:bidirectionalReaction', {}).get('rdfs:seeAlso'))

    multipliers = [None, -1, +1]  # curatedOrder -> stoichiometry sign, 1-based
    stoichiometry = {}
    for side in rhea_rxn['rh:side']:
        multiplier = multipliers[side['rh:curatedOrder']]
        # Default to an empty list so a side without participants is simply skipped.
        for participant in side.get('rh:contains', []):
            rhea_compound = participant['object']['rh:compound']
            accession = to_dbxref(rhea_compound['rh:accession'])
            compound = None
            if accession.db == 'CHEBI':
                # We'll use the in-memory molecules for now, but this will ultimately be a KB lookup
                compound = compounds[accession.id]
            elif 'rh:underlyingChebi' in rhea_compound:
                accession = to_dbxref(rhea_compound['rh:underlyingChebi'])
                compound = compounds[accession.id]

            if compound is None:
                raise ValueError(f'Unrecognized compound ID {accession}')
            # NOTE(review): a compound that appears more than once in the reaction overwrites its
            # earlier coefficient here -- confirm whether that is intended.
            stoichiometry[compound] = multiplier * int(participant['count'])

    return Reaction(
        id = str(rhea_rxn['rh:id']),
        name = name,
        xrefs = xrefs or None,
        stoichiometry = stoichiometry,
        reversible = reversible,
    )

# Smoke test: extract a single reaction (RHEA 13713) straight from the graph and convert it.
to_reaction(extract_object(rhea_rdf, RH['13713']))

[13713] 16389 + 30031 <=> 17976 + 29806

In [18]:
%%time
# Convert every approved master reaction. Failures are not fatal: the offending record and the
# exception are kept in `skipped` (keyed by RHEA id) for later inspection.
reactions = {}
skipped = {}
for rhea_rxn in extracted:
    if rhea_rxn['rh:status'] != 'rh:Approved':
        continue
    try:
        converted = to_reaction(rhea_rxn)
        reactions[converted.id] = converted
    except Exception as error:
        skipped[rhea_rxn['rh:id']] = (rhea_rxn, error)

print(f'{len(reactions)} reactions parsed succesfully, {len(skipped)} skipped')

10257 reactions parsed succesfully, 3630 skipped
CPU times: user 775 ms, sys: 12 ms, total: 787 ms
Wall time: 815 ms


## Store to KB.RHEA

In [19]:
%%time
# Rebuild the RHEA collection from scratch: drop it, store every parsed reaction, then index.
collection = KB.client[KB.RHEA.client_db][KB.RHEA.collection]
collection.drop()
for rxn in reactions.values():
    KB.put(KB.RHEA, rxn, bypass_cache=True)

# Both indexes share one collation (en_US, strength 1).
loose_collation = pymongo.collation.Collation(locale='en_US', strength=1)
collection.create_index('name', name='name', collation=loose_collation)
collection.create_index(
    [('xrefs.id', pymongo.ASCENDING), ('xrefs.db', pymongo.ASCENDING)],
    name='xrefs',
    collation=loose_collation,
)

CPU times: user 6.76 s, sys: 295 ms, total: 7.06 s
Wall time: 11.7 s


'xrefs'