# Metabolite annotation
Metabolites are annotated using the MetaNetX `chem_xref.tsv` cross-reference table (it can be downloaded from [here](https://www.metanetx.org/mnxdoc/mnxref.html)).

In [1]:
import re
from collections import defaultdict
from pathlib import Path
from typing import Tuple

import cobra
from datatable import dt, f, join, update

In [2]:
# ROOT is the parent of the current working directory; input files are resolved against it.
ROOT = Path.cwd().parent
# The model path is kept as a plain string for the cobra reader/writer calls.
model_file = str(ROOT.joinpath("iMENI452.xml"))

In [3]:
# Parse the SBML file into a cobra model object.
model = cobra.io.read_sbml_model(model_file)

Scaling...
 A: min|aij| =  1.000e+00  max|aij| =  1.000e+00  ratio =  1.000e+00
Problem data seem to be well scaled


Gather all prepared metabolites in a dataframe matched to their identifiers in the model.

In [4]:
# Start from the raw metabolite identifiers exactly as they appear in the model.
mets_prepared = [met.id for met in model.metabolites]

In [5]:
# Peek at the first identifiers to see the naming scheme (hyphens, "[c]"/"[e]" suffixes).
mets_prepared[:5]

['ala-L[c]', 'nad[c]', 'h2o[c]', 'pyr[c]', 'nh4[c]']

In [6]:
# Parsing starts at line 352 of the file, presumably to jump over the MetaNetX comment
# header. NOTE(review): the header length may differ between releases — confirm against
# the downloaded chem_xref.tsv.
mnx = dt.fread(str(ROOT / "chem_xref.tsv"), skip_to_line=352)

In [7]:
# Inspect the first rows of the cross-reference table as loaded.
mnx.head()

Unnamed: 0_level_0,#source,ID,description
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,BIOMASS,BIOMASS,BIOMASS
1,mnx:BIOMASS,BIOMASS,BIOMASS
2,seed.compound:cpd11416,BIOMASS,Biomass
3,seedM:M_cpd11416,BIOMASS,secondary/obsolete/fantasy identifier
4,seedM:cpd11416,BIOMASS,Biomass
5,MNXM01,MNXM01,PMF||Translocated proton that acccounts for the Pr…
6,mnx:PMF,MNXM01,PMF||Translocated proton that acccounts for the Pr…
7,CHEBI:16234,MNXM02,hydroxide||HO-||HYDROXIDE ION||Hydroxide ion||OH(-…
8,CHEBI:29356,MNXM02,oxide(2-)||O(2-)||oxide
9,MNXM02,MNXM02,OH(-)||hydroxyde


The "bigg.metabolite" identifiers in MetaNetX do not include the compartment suffix.

In [8]:
# Rename the three loaded columns (#source, ID, description) to shorter names.
mnx.names = ["key", "mnx", "description"]

In [9]:
# Show a sample of the cross-reference keys that come from the BiGG metabolite namespace.
mnx[dt.re.match(f.key, r"bigg.metabolite.*"), f.key].head()

Unnamed: 0_level_0,key
Unnamed: 0_level_1,▪▪▪▪
0,bigg.metabolite:oh1
1,bigg.metabolite:h
2,bigg.metabolite:nadh
3,bigg.metabolite:grdp
4,bigg.metabolite:mercplaccys
5,bigg.metabolite:CE2176
6,bigg.metabolite:12dgr_SC
7,bigg.metabolite:triodthy
8,bigg.metabolite:4hdebrisoquine
9,bigg.metabolite:4mtolbutamide


We need a mapping from the BiGG metabolite identifiers used in the model to the normalized BiGG identifiers (without the compartment suffix).

In [10]:
# Captures everything that precedes a trailing "[c]" or "[e]" compartment suffix.
MET_PAT = re.compile(r"(.+)\[[ce]\]$")

In [11]:
# Sanity check of the pattern on a single identifier.
MET_PAT.sub(r"\1", "nad[c]")

'nad'

In [12]:
# Drop the compartment suffix from every identifier; identifiers without one are unchanged.
mets_prepared = [MET_PAT.sub(r"\1", met_id) for met_id in mets_prepared]

In addition, we have to express "ala-L" and "glc-D" as "ala__L" and "glc__D".

In [13]:
# Hyphen followed by one of D, L, S or R, capturing the letter. The pattern is not
# anchored, so it matches anywhere in the identifier.
DUNDER_PAT = re.compile(r"-([DLSR])")

In [14]:
# Sanity check: the hyphen becomes a double underscore.
DUNDER_PAT.sub(r"__\1", "ala-L")

'ala__L'

In [15]:
# Rewrite "-D", "-L", "-S" and "-R" as "__D", "__L", "__S" and "__R" in every identifier.
mets_prepared = [DUNDER_PAT.sub(r"__\1", met_id) for met_id in mets_prepared]

And "acon-C" and "ala-B" as "acon_C" and "ala_B".

In [16]:
# Hyphen followed by one of C, B or T, capturing the letter. The pattern is not
# anchored, so it matches anywhere in the identifier.
UNDER_PAT = re.compile(r"-([CBT])")

In [17]:
# Replace the hyphen matched by UNDER_PAT with a single underscore in every identifier.
mets_prepared = [UNDER_PAT.sub(r"_\1", met_id) for met_id in mets_prepared]

The same applies to a hyphen followed by a single trailing digit.

In [18]:
# Hyphen followed by a single digit at the very end of the identifier.
# NOTE(review): this rebinds UNDER_PAT; any cell run afterwards sees this pattern.
UNDER_PAT = re.compile(r"-([0-9])$")

In [19]:
# Replace the hyphen matched by the current UNDER_PAT with an underscore in every identifier.
mets_prepared = [UNDER_PAT.sub(r"_\1", met_id) for met_id in mets_prepared]

In [20]:
# Pair every normalized key with the original model identifier. Both sequences follow
# the iteration order of model.metabolites, so the rows stay aligned.
mets_prepared = dt.Frame(
    {
        "key": mets_prepared,
        "model_id": [metabolite.id for metabolite in model.metabolites],
    }
)

In [21]:
# Check that the normalized keys line up with the original model identifiers.
mets_prepared.head()

Unnamed: 0_level_0,key,model_id
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪
0,ala__L,ala-L[c]
1,nad,nad[c]
2,h2o,h2o[c]
3,pyr,pyr[c]
4,nh4,nh4[c]
5,nadh,nadh[c]
6,h,h[c]
7,glu__L,glu-L[c]
8,oaa,oaa[c]
9,akg,akg[c]


# Annotation MNX

In [22]:
# Namespace prefix carried by BiGG metabolite keys in the cross-reference table.
BIGG_PREFIX = "bigg.metabolite:"
# Keep only the BiGG rows and strip the prefix so the key matches the normalized model
# identifiers. This relies on `mnx.key` still holding the full "db:id" form.
# The regex is a raw string: "\." in a plain literal is an invalid escape sequence.
bigg_mnx = mnx[
    dt.re.match(f.key, r"^bigg\.metabolite:.+"),
    {"key": f.key[len(BIGG_PREFIX):], "mnx": f.mnx},
]
bigg_mnx.shape

(9087, 2)

In [23]:
# Declare the "key" column as the frame's key so that bigg_mnx can be used as the
# right-hand side of a join.
bigg_mnx.key = "key"

In [24]:
# Spot-check keys beginning with "hacon" to see how BiGG writes the "_C"/"_T" suffixes.
bigg_mnx[dt.re.match(f.key, r"hacon.*"), :]

Unnamed: 0_level_0,key,mnx
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪
0,hacon_C,MNXM920
1,hacon_T,MNXM162779


In [25]:
# Attach the MetaNetX ID to every model metabolite by joining on the shared "key" column.
mets_prepared = mets_prepared[:, :, join(bigg_mnx)]

In [26]:
# Inspect the joined table: each row now carries an "mnx" column.
mets_prepared.head()

Unnamed: 0_level_0,key,model_id,mnx
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,ala__L,ala-L[c],MNXM1105732
1,nad,nad[c],MNXM8
2,h2o,h2o[c],WATER
3,pyr,pyr[c],MNXM23
4,nh4,nh4[c],MNXM729302
5,nadh,nadh[c],MNXM10
6,h,h[c],MNXM1
7,glu__L,glu-L[c],MNXM741173
8,oaa,oaa[c],MNXM46
9,akg,akg[c],MNXM20


In [27]:
# List metabolites for which the join found no MetaNetX ID.
mets_prepared[dt.isna(f.mnx), :].head()

Unnamed: 0_level_0,key,model_id,mnx
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,mqn,mqn[c],
1,mql,mql[c],
2,btnp-s2,btnp-s2[c],
3,btnp,btnp[c],
4,ppant__R,ppant-R[c],
5,galactan,galactan[c],
6,polyacgal,polyacgal[c],
7,polyglcur,polyglcur[c],
8,fcd,fcd[e],
9,fgd,fgd[e],


The remaining metabolites cannot be easily translated; for instance, which menaquinone is `mqn`? We leave them unannotated here.

In [28]:
# Build a model_id -> MetaNetX ID lookup once (first occurrence wins; the value is None
# when the join found no match), then copy it into each metabolite's annotation.
model_ids, mnx_ids = mets_prepared[:, ["model_id", "mnx"]].to_list()
mnx_by_model_id = {}
for model_id, mnx_id in zip(model_ids, mnx_ids):
    mnx_by_model_id.setdefault(model_id, mnx_id)

for met in model.metabolites:
    if met.id in mnx_by_model_id:
        met.annotation["metanetx.chemical"] = mnx_by_model_id[met.id]

In [29]:
# Count the metabolites whose MetaNetX annotation is present and not None.
sum(
    1
    for met in model.metabolites
    if met.annotation.get("metanetx.chemical") is not None
)

625

In [30]:
# Total number of metabolites in the model, for comparison with the annotated count.
len(model.metabolites)

684

## Annotating the rest of dbs

In [31]:
# Create the "db" column with a placeholder value; the per-row database names are
# assigned further down, once the keys have been split.
mnx["db"] = "mnx"

In [32]:
def get_id_db(id_base: str) -> Tuple[str, str]:
    """Split a cross-reference key into its identifier and its source database.

    Parameters
    ----------
    id_base : str
        Key of the form ``"<db>:<id>"`` (e.g. ``"CHEBI:16234"``) or a bare
        identifier without any prefix (e.g. ``"MNXM01"``).

    Returns
    -------
    tuple of (str, str)
        ``(identifier, database)``. Only the first colon separates the two
        parts, so identifiers that themselves contain colons are kept whole.
        Keys without a colon are attributed to the ``"mnx"`` database.
    """
    db, sep, identifier = id_base.partition(":")
    if not sep:
        # No prefix at all: the whole key is the identifier.
        return id_base, "mnx"
    return identifier, db

In [33]:
# Extract the key column once as a Python list instead of indexing the frame row by row.
raw_keys = mnx["key"].to_list()[0]
ids_dbs = [get_id_db(raw_key) for raw_key in raw_keys]
ids = [el[0] for el in ids_dbs]
dbs = [el[1] for el in ids_dbs]
# Overwrite "key" with the bare identifier and "db" with its source database.
# Run this once per freshly loaded table: keys that are already split contain no
# prefix and would all be attributed to "mnx".
mnx["key"] = dt.Frame(ids)
mnx["db"] = dt.Frame(dbs)

In [34]:
# Inspect the table after splitting each key into identifier and database.
mnx

Unnamed: 0_level_0,key,mnx,description,db
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪
0,BIOMASS,BIOMASS,BIOMASS,mnx
1,BIOMASS,BIOMASS,BIOMASS,mnx
2,cpd11416,BIOMASS,Biomass,seed.compound
3,M_cpd11416,BIOMASS,secondary/obsolete/fantasy identifier,seedM
4,cpd11416,BIOMASS,Biomass,seedM
5,MNXM01,MNXM01,PMF||Translocated proton that acccounts for the Pr…,mnx
6,PMF,MNXM01,PMF||Translocated proton that acccounts for the Pr…,mnx
7,16234,MNXM02,hydroxide||HO-||HYDROXIDE ION||Hydroxide ion||OH(-…,CHEBI
8,29356,MNXM02,oxide(2-)||O(2-)||oxide,CHEBI
9,MNXM02,MNXM02,OH(-)||hydroxyde,mnx


Remove the rows that come from MetaNetX itself, keeping only cross-references to external databases.

In [35]:
# Drop every row whose database is "mnx"; only the other databases remain.
mnx = mnx[f.db != "mnx", :]

In [37]:
for met in model.metabolites:
    # .get() tolerates metabolites that lack the key, so the cell can be re-run safely.
    mnx_id = met.annotation.get("metanetx.chemical")

    # Group every identifier mapped to this MetaNetX ID by its source database.
    # Metabolites without a MetaNetX ID are skipped instead of filtering on None.
    xrefs = defaultdict(list)
    if mnx_id:
        xref_ids, xref_dbs = mnx[f.mnx == mnx_id, ["key", "db"]].to_list()
        for xref_db, xref_id in zip(xref_dbs, xref_ids):
            xrefs[xref_db].append(xref_id)

    # A single hit is stored as a plain string, several hits as a list.
    annotation = {
        xref_db: hits if len(hits) > 1 else hits[0] for xref_db, hits in xrefs.items()
    }
    if mnx_id:
        annotation["metanetx.chemical"] = mnx_id
    if "kegg.compound" in met.annotation:
        # A KEGG ID already present in the model takes precedence over the looked-up ones.
        kegg = met.annotation["kegg.compound"]
        if isinstance(kegg, list) and len(kegg) == 1:
            kegg = kegg[0]
        annotation["kegg.compound"] = kegg

    # Assign a plain dict: a defaultdict would silently create keys on later lookups.
    met.annotation = annotation

In [38]:
# Distinct database names left in the cross-reference table.
# NOTE(review): this rebinds `dbs`, which earlier held the per-row database list.
dbs = dt.unique(mnx["db"]).to_list()[0]

In [39]:
# Show the list of database names.
dbs

['CHEBI',
 'SLM',
 'bigg.metabolite',
 'biggM',
 'chebi',
 'envipath',
 'envipathM',
 'hmdb',
 'kegg.compound',
 'kegg.drug',
 'kegg.glycan',
 'keggC',
 'keggD',
 'keggG',
 'lipidmaps',
 'lipidmapsM',
 'metacyc.compound',
 'metacycM',
 'reactome',
 'reactomeM',
 'rheaG',
 'rheaP',
 'sabiork.compound',
 'sabiorkM',
 'seed.compound',
 'seedM',
 'slm']

In [40]:
for db in dbs:
    # Number of metabolites whose annotation has an entry for this database.
    n_annotated = sum(1 for met in model.metabolites if db in met.annotation)
    print(f"{db} annotated: {n_annotated}")

CHEBI annotated: 490
SLM annotated: 19
bigg.metabolite annotated: 625
biggM annotated: 625
chebi annotated: 490
envipath annotated: 109
envipathM annotated: 109
hmdb annotated: 380
kegg.compound annotated: 593
kegg.drug annotated: 94
kegg.glycan annotated: 2
keggC annotated: 476
keggD annotated: 94
keggG annotated: 2
lipidmaps annotated: 47
lipidmapsM annotated: 47
metacyc.compound annotated: 488
metacycM annotated: 488
reactome annotated: 307
reactomeM annotated: 307
rheaG annotated: 18
rheaP annotated: 0
sabiork.compound annotated: 433
sabiorkM annotated: 433
seed.compound annotated: 541
seedM annotated: 541
slm annotated: 19


In [41]:
# Count the metabolites that carry a "metanetx.chemical" entry after cross-referencing.
sum(1 for met in model.metabolites if "metanetx.chemical" in met.annotation)

625

In [42]:
for met in model.metabolites:
    # Rebuild the annotation without None-valued entries. `or {}` handles a missing
    # annotation; the original `met.annotation == {}` was a comparison with no effect.
    met.annotation = {
        key: value
        for key, value in (met.annotation or {}).items()
        if value is not None
    }

## SBO terms

Every metabolite should be annotated with "SBO:0000247".

In [43]:
# SBO term assigned to every metabolite in the model.
METABOLITE_SBO_TERM = "SBO:0000247"
for metabolite in model.metabolites:
    metabolite.annotation["sbo"] = METABOLITE_SBO_TERM

In [44]:
# NOTE(review): model_file is the same path the model was read from, so this overwrites
# the input file with the annotated model.
cobra.io.write_sbml_model(model, model_file)