Integrate MetaNetX and BioCyc annotations into the MetaFlux SBML export.

In [1]:
# use -python-local-only-non-strict when calling PathwayTools
import pythoncyc
from pythoncyc.PToolsFrame import Symbol, PFrame
from pythoncyc.PTools import sendQueryToPTools, PToolsError
from IPython.display import display, HTML
import json
import urllib
import time
pythoncyc.all_orgids()

['|GCF_004798725|', '|LRT|', '|LRU|', '|META|', '|ECOLI|']

In [2]:
import cobra
import os
cobra.Configuration.solver = 'glpk'

def get_biocyc_id(met: cobra.Metabolite):
    biocyc_id = met.annotation.get('biocyc')
    if biocyc_id:
        biocyc_id = biocyc_id[biocyc_id.index(':')+1:]
    return biocyc_id

In [3]:
with open("metanetx_annotations.json") as fp:
    metanetx_annotations = json.load(fp)

In [4]:
species = 'lumbricus_terrestris'

In [5]:
if species == 'lumbricus_terrestris':
    pgdb = pythoncyc.select_organism('lrt')
else:
    pgdb = pythoncyc.select_organism('lru')
metacyc = pythoncyc.select_organism('meta')

In [6]:
model = cobra.io.read_sbml_model(os.path.join(species, "MetaFlux/SBML_export.xml"))

In [7]:
import pandas as pd
bigg_metabolites = pd.read_csv("/scratch/vonkamp/gwdg_owncloud/cnapy-projects/a_woodii/Models_Acetogens/functions/bigg_models_metabolites.txt", sep='\t')

In [8]:
# validate MetaNetX mapping and create BiGG mapping
biocyc2bigg = dict()
native = 0
bigg_met = cobra.Metabolite("B")
bigg_results = dict()
#with open("/scratch/vonkamp/gwdg_owncloud/lumbricus_integrate_koala/bigg_results.json") as fp:
#    bigg_results = json.load(fp)
for m,v in metanetx_annotations.items():
    try:
        b = [metacyc.get_slot_values(m, "DBLINKS").get("|BIGG|")[0]]
    except:
        b = None
    if b:
        biocyc2bigg[m] = b
        native += 1
        continue
    b = [a for a in v if a.startswith('https://biocyc.org/compound')]
    if len(b) > 0:
        b = [a[a.index('=')+1:] for a in b] # many have multiple mappings back to MetaCyc
        if m not in b:
            print(m, b)
    else:
        print(m, b)
    b = {a[43:].replace('M_', '').replace('-', '__') for a in v if a.startswith('http://bigg.ucsd.edu/universal/metabolites/')}
    valid = []
    for a in b:
        if a in bigg_results:
            result = bigg_results[a]
            db_links = result['database_links']
            if 'BioCyc' not in db_links:
                print(a, "does not link back to BioCyc")
            else:
                db_links = db_links.get('BioCyc', [])
                if len(db_links) == 1 and db_links[0]['id'][5:] == m: # unique link back to BioCyc
                    valid.append(a)
                else: # check if a variant with matching formula and charge exists
                    for formula,charge in zip(result['formulae'], result['charges']):
                        try:
                            biocyc_charge = sum(c for _,c in metacyc.get_slot_values(m, 'ATOM-CHARGES'))
                            if charge == biocyc_charge:
                                biocyc_elements = {e[1] if len(e) == 3 else e[1]+e[2].lower(): int(c[0]) for e,c in metacyc.get_slot_values(m, 'CHEMICAL-FORMULA').items()}
                                bigg_met.formula = formula
                                if bigg_met.elements == biocyc_elements:
                                    valid.append(a)
                        except:
                            pass
            continue
        result = bigg_metabolites[bigg_metabolites.universal_bigg_id == a]["database_links"]
        if len(result) == 0:
            print(a, "appears obsolete")
        else:
            try:
                result = [s.split("META:")[1] for s in result.iloc[0].split(';') if "META:" in s]
                if m in result:
                    valid.append(a)
                #if len(result) == 1 and result[0] == m: # unique link back to BioCyc
                #    valid.append(a)
                #else:
                #    print(m, "has candidates", result)
            except:
                print("Cannot resolve", m)
        # try:
        #     if a in bigg_results:
        #         result = bigg_results[a]
        #     else:
        #         result = urllib.request.urlopen('http://bigg.ucsd.edu/api/v2/models/universal/metabolites/'+a)
        #         data = result.read()
        #         result = json.loads(data.decode(result.info().get_content_charset('utf-8')))
        #         bigg_results[a] = result
        #         time.sleep(0.1)
        #     db_links = result['database_links']
        #     # if 'BioCyc' not in db_links:
        #     #     print(a, "does not link back to BioCyc")
        #     # else:
        #     db_links = db_links.get('BioCyc', [])
        #     if len(db_links) == 1 and db_links[0]['id'][5:] == m: # unique link back to BioCyc
        #         valid.append(a)
        #     else: # check if a variant with matching formula and charge exists
        #         for formula,charge in zip(result['formulae'], result['charges']):
        #             biocyc_charge = sum(c for _,c in meta.get_slot_values(m, 'ATOM-CHARGES'))
        #             if charge == biocyc_charge:
        #                 biocyc_elements = {e[1] if len(e) == 3 else e[1]+e[2].lower(): int(c[0]) for e,c in meta.get_slot_values(m, 'CHEMICAL-FORMULA').items()}
        #                 bigg_met.formula = formula
        #                 if bigg_met.elements == biocyc_elements:
        #                     valid.append(a)
        #         # if len(db_links) > 1:
        #         #     print(a, "does not link back uniquely to", m)
        #         # else:
        #         #     if db_links[0]['id'][5:] == m:
        #         #         valid.append(a)
        #         #     else:
        #         #         print(a, "does not link back to", m, "but", db_links[0])
        # except:
        #     print(a, "appears obsolete")
    biocyc2bigg[m] = valid
    if len(valid) > 1:
        print(m, valid)
native


Cannot resolve 5-PHOSPHONOOXY-L-LYSINE
CPD-25508 []
CPD-25450 []
Cannot resolve PHYTOSPINGOSINE
Cannot resolve CPD-9720
CPD-26479 []
Cannot resolve CPD-14276
Cannot resolve CPD-8343
Cannot resolve CPD-8609
Cannot resolve CPD-10505
CPD-24938 []
Cannot resolve CPD-9871
CPD-24959 []
Oxidized-CymA-Proteins []
CPD-24553 []
Alpha-Tubulin-L-Lysine []
Cannot resolve CPD-547
CPD-24844 []
Cannot resolve CPD-17368
CPD-26676 []
CPD-24554 []
Cannot resolve ALPHA-GLUCOSE-16-BISPHOSPHATE
CPD-27098 []
Cannot resolve CPD-19740
CPD-24926 []
CPD-25877 []
Cannot resolve ARACHIDONYL-COA
Cannot resolve 2-OXOSUCCINAMATE
CPD-26482 []
Cannot resolve GLC
Cannot resolve CPD-14018
Cannot resolve CPD-13534
Cannot resolve CPD-9406
Cannot resolve OXALO-SUCCINATE
MOCS3-Cysteine []
Ditrans-polycis-polyprenals []
Globotriosylceramides []
Cannot resolve 2-KETO-GLUTARAMATE
Cannot resolve CPD-19741
LIPOIC-ACID ['lipt', 'lipoate']
Cannot resolve PENTANOYLCOA-CPD
Cannot resolve PENTANOYLCOA-CPD
Cannot resolve CPD-14283
CPD-

316

In [9]:
print(len(biocyc2bigg), sum(1 for v in biocyc2bigg.values() if v))

1250 489


In [10]:
for m in model.metabolites:
    biocyc_id = get_biocyc_id(m)
    if biocyc_id:
        a = biocyc2bigg.get(biocyc_id, None)
        if a:
            m.annotation['bigg.metabolite'] = a
            if len(a) > 1:
                print(m.id, a)

ALPHA_GLUCOSE_c ['glc__aD', 'Glc_aD']
AMMONIA_c ['nh4', 'nh3']
CPD_170_c ['stchs', 'stys']
LIPOIC_ACID_c ['lipt', 'lipoate']


In [11]:
# # resolve ambiguous mappings so that they refer to the same metabolite in iCEL1314 (must have one of the BiGG IDs and same KEGG ID)
# model.metabolites.get_by_id('2_KETO_GLUTARAMATE_c').annotation['iCEL1314'] = 'akgm_c' # not in BiGG
# model.metabolites.OLEOYL_COA_c.annotation['bigg.metabolite'] = 'odecoa' # C00510

In [12]:
for m in model.metabolites:
    biocyc_id = get_biocyc_id(m)
    if biocyc_id:
        try:
            pf = PFrame(biocyc_id, metacyc, getFrameData=True)
            if pf.synonyms is None:
                m.annotation['metacyc.synonyms'] = []
            else:
                m.annotation['metacyc.synonyms'] = pf.synonyms
        except:
            try:
                m.annotation['metacyc.synonyms'] = sendQueryToPTools(
                    f'(get-slot-values (nth 0 (with-organism (:org-id \'meta) (get-frame-labeled "{biocyc_id}"))) \'synonyms)')
            except:
                m.annotation['metacyc.synonyms'] = []
    else:
        m.annotation['metacyc.synonyms'] = []

In [13]:
# add BioCyc pathways to reaction annotation
for r in model.reactions:
    if not r.boundary:
        try:
            if '/' in r.name:
                idx = r.name.find('RXN')
                if idx == 0:
                    idx = r.name.find('-', r.name.find('-') + 1) # find second dash
                    biocyc_id = r.name[:idx]
                else:
                    biocyc_id = r.name[:idx + 3]
            else:
                biocyc_id = r.name
            ps = [p[1:-1] for p in pgdb.get_slot_values(biocyc_id, 'IN-PATHWAY')]
            if ps:
                r.annotation['biocyc.pathway'] = ps
                ps = [pgdb.get_slot_value(p, 'COMMON-NAME') for p in ps]
                r.annotation['biocyc.pathway-name'] = [(p if p else 'n/a') for p in ps] # replace None where no common name was defined
        except PToolsError as e:
            print(biocyc_id, e)

BIOMASS-EQUATION An internal error occurred in the running Pathway Tools application: :error Object "BIOMASS-EQUATION" is not coercible to a frame for KB LRTBASE
RXN-13451[CCO An internal error occurred in the running Pathway Tools application: :error Object "RXN-13451[CCO" is not coercible to a frame for KB LRTBASE
RXN66-1[CCO An internal error occurred in the running Pathway Tools application: :error Object "RXN66-1[CCO" is not coercible to a frame for KB LRTBASE
RXN-13451[CCO An internal error occurred in the running Pathway Tools application: :error Object "RXN-13451[CCO" is not coercible to a frame for KB LRTBASE
RXN-13451[CCO An internal error occurred in the running Pathway Tools application: :error Object "RXN-13451[CCO" is not coercible to a frame for KB LRTBASE
RXN-13451[CCO An internal error occurred in the running Pathway Tools application: :error Object "RXN-13451[CCO" is not coercible to a frame for KB LRTBASE
RXN-13451[CCO An internal error occurred in the running Pathwa

In [14]:
cobra.io.write_sbml_model(model, os.path.join(species, "MetaFlux/SBML_export_augmented.sbml"))