# Install missing modules & load modules

In [None]:
!pip install cobra #install cobra only required for google colab 



In [None]:
import pandas as pd
import numpy as np
import cobra
import requests

# Load AraCore Model

In [None]:
# Download the AraCore SBML file from GitHub.
# Note: despite its name, `fileName` holds a URL, not a local path.
fileName = 'https://raw.githubusercontent.com/ma-blaetke/CBM_C3_C4_Metabolism/master/data/2018-23-05-mb-genC3.sbml'
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
r = requests.get(fileName, timeout=60)
# Stop here on an HTTP error instead of passing an error page on as SBML text.
r.raise_for_status()

In [None]:
#Create model
# Parse the body of the downloaded response `r` (SBML text) into a cobra model.
model = cobra.io.read_sbml_model(r.text)

In [None]:
# Display the summary of the loaded model.
model

0,1
Name,c3_model
Memory address,0x07f7d34e54350
Number of metabolites,413
Number of reactions,572
Number of groups,0
Objective expression,1.0*Ex_Suc - 1.0*Ex_Suc_reverse_fb96e
Compartments,"Chloroplast, Lumen, Cytosol, Mitochondrion, IntermembraneSpace, Peroxisome"


# Correct Compartment Naming in AraCore Model according to BiGG naming conventions

In [None]:
# Show the compartment symbols and names currently defined in the model.
model.compartments

{'c': 'Cytosol',
 'h': 'Chloroplast',
 'i': 'IntermembraneSpace',
 'l': 'Lumen',
 'm': 'Mitochondrion',
 'p': 'Peroxisome'}

In [None]:
# BiGG compartment symbols (keys) and their names (values), kept as a lookup reference.
# Presumably transcribed from http://bigg.ucsd.edu/compartments -- verify before relying on it.
bigg_compartments = {
    'c': 'cytosol',
    'e': 'extracellular space',
    'p': 'periplasm',
    'm': 'mitochondria',
    'x': 'peroxisome/glyoxysome',
    'r': 'endoplasmic reticulum',
    'v': 'vacuole',
    'n': 'nucleus',
    'g': 'golgi apparatus',
    'u': 'thylakoid',
    'l': 'lysosome',
    'h': 'chloroplast',
    'f': 'flagellum',
    's': 'eyespot',
    'im': 'intermembrane space of mitochondria',
    'cx': 'carboxyzome',
    'um': 'thylakoid membrane',
    'cm': 'cytosolic membrane',
    'i': 'inner mitochondrial compartment',
    'mm': 'mitochondrial intermembrane',
    'w': 'wildtype staph aureus',
    'y': 'cytochrome complex',
}

In [None]:
# Translation of AraCore compartment symbols (index) into BiGG-style symbols (values).
# BiGG compartment list: http://bigg.ucsd.edu/compartments
# Note: despite the `df_` prefix this object is a pandas Series, not a DataFrame.
df_compartment_mapping = pd.Series({
    'c': 'c',    # cytosol
    'h': 'h',    # chloroplast
    'm': 'm',    # mitochondria
    'p': 'x',    # peroxisome/glyoxysome
    'i': 'im',   # intermembrane space of mitochondria
    'l': 'ul',   # thylakoid lumen <<< NEW, not in BiGG compartment list
    'e': 'e',    # extracellular space << NEW, not yet in model
})

# Create Metabolite Table for AraCore Model

In [None]:
# Build the metabolite table: one row per metabolite of the AraCore model,
# one column per metabolite attribute that is read (id, name, formula, annotation).
df_metabolites_aracore = pd.DataFrame({
    column_name: [getattr(met_obj, attribute) for met_obj in model.metabolites]
    for column_name, attribute in (
        ("aracore_ids", "id"),
        ("aracore_name", "name"),
        ("aracore_formula", "formula"),
        ("aracore_annotations", "annotation"),
    )
})

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations
0,hnu_h,Photon,X,{}
1,PQ_h,Oxidized plastoquinone,C13H16O2,{}
2,H2O_h,"H2O, water",H2O,{}
3,H_h,"H+, proton",H,{}
4,PQH2_h,Reduced plastoquinone,C13H18O2,{}
5,O2_h,"O2, oxygen",O2,{}
6,H_l,"H+, proton",H,{}
7,PCox_h,Oxidized plastocyanin,X,{}
8,PCrd_h,Reduced plastocyanin,X,{}
9,Fdox_h,Oxidized ferredoxin,S8FeX,{}


In [None]:
# Count the distinct chemical formulas among the AraCore metabolites.
df_metabolites_aracore['aracore_formula'].nunique()

213

In [None]:
def _to_bigg_style_id(aracore_id):
    """Replace the compartment suffix (text after the last '_') with its mapped symbol."""
    id_parts = aracore_id.rsplit('_', 1)
    return f"{id_parts[0]}_{df_compartment_mapping[id_parts[-1]]}"


# Metabolite ids with translated compartment symbol, lower-cased as a whole.
df_metabolites_aracore['aracore_updated_ids'] = (
    df_metabolites_aracore['aracore_ids']
    .map(_to_bigg_style_id)
    .str.lower()
)

# Universal (compartment-free) ids: everything before the last '_' of the updated id.
df_metabolites_aracore['aracore_updated_universal_ids'] = (
    df_metabolites_aracore['aracore_updated_ids']
    .map(lambda updated_id: updated_id.rsplit('_', 1)[0])
)

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids
0,hnu_h,Photon,X,{},hnu_h,hnu
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o
3,H_h,"H+, proton",H,{},h_h,h
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2
5,O2_h,"O2, oxygen",O2,{},o2_h,o2
6,H_l,"H+, proton",H,{},h_ul,h
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox


# BIGG Metabolites

---



In [None]:
# Load the BiGG metabolite namespace table (tab-separated text file) from the BiGG server
bigg_metabolites_url = 'http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt'
df_metabolites_bigg = pd.read_csv(bigg_metabolites_url, sep='\t')

df_metabolites_bigg.head(25)

Unnamed: 0,bigg_id,universal_bigg_id,name,model_list,database_links,old_bigg_ids
0,12dgr120_c,12dgr120,"1,2-Diacyl-sn-glycerol (didodecanoyl, n-C12:0)",iEC1364_W; iEC1349_Crooks; iEC1356_Bl21DE3; iM...,MetaNetX (MNX) Chemical: http://identifiers.or...,12dgr120; 12dgr120[c]; 12dgr120_c; _12dgr120_c
1,12dgr140_c,12dgr140,"1,2-Diacyl-sn-glycerol (ditetradecanoyl, n-C14:0)",iECNA114_1301; iECSE_1348; iECO111_1330; iECOK...,MetaNetX (MNX) Chemical: http://identifiers.or...,12dgr140; 12dgr140[c]; 12dgr140_c; _12dgr140_c
2,12dgr180_c,12dgr180,"1,2-Diacyl-sn-glycerol (dioctadecanoyl, n-C18:0)",iECB_1328; iECDH10B_1368; iEcE24377_1341; iECD...,MetaNetX (MNX) Chemical: http://identifiers.or...,12dgr180; 12dgr180[c]; 12dgr180_c; _12dgr180_c
3,14glucan_c,14glucan,"1,4-alpha-D-glucan",iSFxv_1172; iUTI89_1310; iSSON_1240; iSbBS512_...,BioCyc: http://identifiers.org/biocyc/META:1-4...,14glucan; 14glucan_c
4,15dap_c,15dap,"1,5-Diaminopentane",iECUMN_1333; iLF82_1304; iETEC_1333; iECSF_132...,KEGG Compound: http://identifiers.org/kegg.com...,15dap; 15dap[c]; 15dap_c
5,23ddhb_c,23ddhb,"2,3-Dihydro-2,3-dihydroxybenzoate",iEC1372_W3110; iEC1368_DH5a; iCN900; iEC1364_W...,KEGG Compound: http://identifiers.org/kegg.com...,23ddhb; 23ddhb_c
6,23dhba_c,23dhba,"(2,3-Dihydroxybenzoyl)adenylate",iECs_1301; iECO111_1330; iECP_1309; iECIAI1_13...,KEGG Compound: http://identifiers.org/kegg.com...,23dhba; 23dhba_c
7,23dhbzs_c,23dhbzs,"2,3-dihydroxybenzoylserine",STM_v1_0; iY75_1357; iAF1260b; iML1515; iEC134...,KEGG Compound: http://identifiers.org/kegg.com...,23dhbzs; 23dhbzs_c
8,26dap_LL_c,26dap_LL,"LL-2,6-Diaminoheptanedioate",iLJ478; iAF1260b; STM_v1_0; iJN678; iY75_1357;...,KEGG Compound: http://identifiers.org/kegg.com...,26dap-LL[c]; 26dap_DASH_LL_c; 26dap_LL; 26dap_...
9,2agpe141_c,2agpe141,2-Acyl-sn-glycero-3-phosphoethanolamine (n-C14:1),iEC1344_C; iYS1720; iEC1368_DH5a; iEC1372_W311...,MetaNetX (MNX) Chemical: http://identifiers.or...,2agpe141; 2agpe141_c; _2agpe141_c


In [None]:
#Convert universal bigg id for metabolites to lower case
# (the updated AraCore ids are lower-cased too, so both can be compared directly)
df_metabolites_bigg['universal_bigg_id_lower'] = df_metabolites_bigg['universal_bigg_id'].str.lower()

In [None]:
#Convert string of database links into dictionaries of database identifier/symbol (key) and database-specific metabolite/annotation id (value)
def _parse_bigg_database_links(str_links):
    """Parse one ``database_links`` cell into ``{database: identifier}``.

    The cell is split on ';'. For every entry, the text after the first ':' is
    split on '/', and the last two segments become key (database) and value
    (identifier). If several entries name the same database, the last one wins.

    Cells that are already dictionaries are returned unchanged, so re-running
    this notebook cell does not wipe the parsed column. Any other non-string
    cell (e.g. a missing value) yields an empty dictionary.
    """
    if isinstance(str_links, dict):
        return str_links
    if not isinstance(str_links, str):
        return {}
    parsed_links = {}
    for db_link in str_links.split(';'):
        url_segments = db_link.split(':', 1)[-1].split('/')
        parsed_links[url_segments[-2]] = url_segments[-1]
    return parsed_links


df_metabolites_bigg['database_links'] = df_metabolites_bigg['database_links'].apply(_parse_bigg_database_links)

In [None]:
#All database keys in database_links
# Flatten the keys of all dictionaries in a single pass. Summing Python lists via
# Series.sum() copies the growing list once per row, which is quadratic in the row count.
np.unique([db_key for db_links in df_metabolites_bigg['database_links'] for db_key in db_links])

array(['biocyc', 'chebi', 'hmdb', 'inchikey', 'kegg.compound',
       'kegg.drug', 'kegg.glycan', 'lipidmaps', 'metanetx.chemical',
       'reactome', 'seed.compound'], dtype='<U17')

In [None]:
# Pull the ModelSEED compound id out of each parsed link dictionary;
# metabolites without a 'seed.compound' entry get None.
df_metabolites_bigg['seed.compound'] = df_metabolites_bigg['database_links'].apply(
    lambda dict_db_link: dict_db_link.get('seed.compound')
)
df_metabolites_bigg.head(25)

Unnamed: 0,bigg_id,universal_bigg_id,name,model_list,database_links,old_bigg_ids,universal_bigg_id_lower,seed.compound
0,12dgr120_c,12dgr120,"1,2-Diacyl-sn-glycerol (didodecanoyl, n-C12:0)",iEC1364_W; iEC1349_Crooks; iEC1356_Bl21DE3; iM...,{'metanetx.chemical': 'MNXM4939'},12dgr120; 12dgr120[c]; 12dgr120_c; _12dgr120_c,12dgr120,
1,12dgr140_c,12dgr140,"1,2-Diacyl-sn-glycerol (ditetradecanoyl, n-C14:0)",iECNA114_1301; iECSE_1348; iECO111_1330; iECOK...,{'metanetx.chemical': 'MNXM146479'},12dgr140; 12dgr140[c]; 12dgr140_c; _12dgr140_c,12dgr140,
2,12dgr180_c,12dgr180,"1,2-Diacyl-sn-glycerol (dioctadecanoyl, n-C18:0)",iECB_1328; iECDH10B_1368; iEcE24377_1341; iECD...,{'metanetx.chemical': 'MNXM4217'},12dgr180; 12dgr180[c]; 12dgr180_c; _12dgr180_c,12dgr180,
3,14glucan_c,14glucan,"1,4-alpha-D-glucan",iSFxv_1172; iUTI89_1310; iSSON_1240; iSbBS512_...,"{'biocyc': 'META:1-4-alpha-D-Glucan', 'metanet...",14glucan; 14glucan_c,14glucan,cpd21754
4,15dap_c,15dap,"1,5-Diaminopentane",iECUMN_1333; iLF82_1304; iETEC_1333; iECSF_132...,"{'kegg.compound': 'C01672', 'chebi': 'CHEBI:58...",15dap; 15dap[c]; 15dap_c,15dap,cpd01155
5,23ddhb_c,23ddhb,"2,3-Dihydro-2,3-dihydroxybenzoate",iEC1372_W3110; iEC1368_DH5a; iCN900; iEC1364_W...,"{'kegg.compound': 'C04171', 'chebi': 'CHEBI:87...",23ddhb; 23ddhb_c,23ddhb,cpd29666
6,23dhba_c,23dhba,"(2,3-Dihydroxybenzoyl)adenylate",iECs_1301; iECO111_1330; iECP_1309; iECIAI1_13...,"{'kegg.compound': 'C04030', 'chebi': 'CHEBI:57...",23dhba; 23dhba_c,23dhba,cpd02494
7,23dhbzs_c,23dhbzs,"2,3-dihydroxybenzoylserine",STM_v1_0; iY75_1357; iAF1260b; iML1515; iEC134...,"{'kegg.compound': 'C04204', 'chebi': 'CHEBI:70...",23dhbzs; 23dhbzs_c,23dhbzs,cpd15332
8,26dap_LL_c,26dap_LL,"LL-2,6-Diaminoheptanedioate",iLJ478; iAF1260b; STM_v1_0; iJN678; iY75_1357;...,"{'kegg.compound': 'C00666', 'chebi': 'CHEBI:63...",26dap-LL[c]; 26dap_DASH_LL_c; 26dap_LL; 26dap_...,26dap_ll,cpd00504
9,2agpe141_c,2agpe141,2-Acyl-sn-glycero-3-phosphoethanolamine (n-C14:1),iEC1344_C; iYS1720; iEC1368_DH5a; iEC1372_W311...,{'metanetx.chemical': 'MNXM3447'},2agpe141; 2agpe141_c; _2agpe141_c,2agpe141,


# ModelSeed Metabolites

In [None]:
# Load the ModelSEED compounds table (tab-separated) from the ModelSEEDDatabase GitHub repository
# NOTE(review): the saved output of this cell contains a warning fragment; if it is a pandas
# DtypeWarning about mixed column types, consider an explicit dtype= or low_memory=False -- TODO confirm

seed_metabolites_url = 'https://raw.githubusercontent.com/ModelSEED/ModelSEEDDatabase/master/Biochemistry/compounds.tsv'
df_metabolites_seed = pd.read_csv(seed_metabolites_url, sep='\t')

df_metabolites_seed.head(25)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,abbreviation,name,formula,mass,source,inchikey,charge,is_core,is_obsolete,linked_compound,is_cofactor,deltag,deltagerr,pka,pkb,abstract_compound,comprised_of,aliases,smiles,notes
0,cpd00001,h2o,H2O,H2O,18.0,Primary Database,XLYOFNOQVPJJNP-UHFFFAOYSA-N,0,1,0,,0,-37.54,0.18,1:1:15.70,1:1:-1.80,,,Name: H20; H2O; H3O+; HO-; Hydroxide ion; OH; ...,O,GC|EQ|EQU
1,cpd00002,atp,ATP,C10H13N5O13P3,504.0,Primary Database,ZKHQWZAMYRWXGA-KQYNXXCUSA-K,-3,1,0,,0,-548.85,0.36,1:14:12.60;1:22:3.29;1:26:0.90;1:29:7.42;1:30:...,1:6:-7.46;1:9:-1.06;1:14:-3.85;1:15:4.93,,,Name: ATP; Adenosine 5'-triphosphate; adenosin...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,GC|EQ|EQU
2,cpd00003,nad,NAD,C21H26N7O14P2,662.0,Primary Database,BAWFJGJZGIEFAR-NNYOXOHSSA-M,-1,1,0,,0,-286.41,1.59,1:6:11.94;1:17:1.85;1:18:2.28;1:25:11.38;1:35:...,1:6:-4.22;1:35:-3.85;1:37:-1.05;1:41:4.93;1:43...,,,Name: DPN; DPN+; DPN-ox; Diphosphopyridine nuc...,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,GC|EQ|EQU
3,cpd00004,nadh,NADH,C21H27N7O14P2,663.0,Primary Database,BOPGDPNILDQYTO-NNYOXOHSSA-L,-2,1,0,,0,-271.15,1.59,1:14:12.28;1:18:14.00;1:22:-7.46;1:26:-1.05;1:...,1:6:2.28;1:9:1.85;1:14:-3.85;1:15:4.93;1:18:-3...,,,Name: DPNH; NAD-reduced; NADH; NADH+H+; NADH2;...,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,GC|EQ|EQU
4,cpd00005,nadph,NADPH,C21H26N7O17P3,742.0,Primary Database,ACFIXJIJDZMPPO-NNYOXOHSSA-J,-4,1,0,,0,-483.1,1.62,1:18:0.90;1:19:5.78;1:26:0.66;1:30:3.26;1:40:1...,1:11:-7.46;1:12:-1.06;1:22:4.87;1:40:-3.78,,,Name: NADP(H); NADP-red; NADP-reduced; NADPH; ...,NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,GC|EQ|EQU
5,cpd00006,nadp,NADP,C21H25N7O17P3,741.0,Primary Database,XJLXINKUBYWONI-NNYOXOHSSA-K,-3,1,0,,0,-498.36,1.63,1:18:0.90;1:19:5.78;1:26:3.26;1:30:0.66;1:47:1...,1:11:-7.46;1:12:-1.06;1:22:4.87,,,Name: NADP; NADP(+); NADP+; NADP-ox; NADP-oxid...,NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,GC|EQ|EQU
6,cpd00007,o2,O2,O2,32.0,Primary Database,MYMOFIZGZYHOMD-UHFFFAOYSA-N,0,1,0,,0,3.92,0.71,,,,,Name: O2; Oxygen; dioxygen; oxygen; oxygen mol...,O=O,GC|EQ|EQU
7,cpd00008,adp,ADP,C10H13N5O10P2,425.0,Primary Database,XTWYTFMLZFPYCI-KQYNXXCUSA-L,-2,1,0,,0,-340.04,0.3,1:14:12.46;1:18:13.98;1:22:2.22;1:25:7.42;1:26...,1:6:-7.46;1:9:-1.05;1:14:-3.85;1:15:4.93;1:18:...,,,Name: ADP; Adenosine 5'-diphosphate; Adenosine...,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,GC|EQ|EQU
8,cpd00009,pi,Phosphate,HO4P,96.0,Primary Database,NBIIXXVUZAFLBC-UHFFFAOYSA-L,-2,1,0,,0,-252.51,0.18,1:2:12.90;1:3:1.80;1:4:6.95,,,,Name: H2PO4-; HPO4-2; HPO42-; Orthophosphate; ...,O=P([O-])([O-])O,GC|EQ|EQU
9,cpd00010,coa,CoA,C21H32N7O16P3S,764.0,Primary Database,RGJOEKWQDUBAIZ-IBOSZNHHSA-J,-4,1,0,,0,-429.53,1.87,1:22:0.92;1:23:5.94;1:26:0.83;1:30:3.27;1:48:1...,1:8:-7.46;1:9:-1.06;1:17:4.89,,,Name: CoA; CoA-SH; Coenzyme A; CoenzymeA; Coen...,CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,GC|EQ|EQU


In [None]:
# Lower-case the ModelSEED abbreviations so they can be compared with the lower-cased AraCore universal ids.
df_metabolites_seed['abbreviation_lower'] = df_metabolites_seed['abbreviation'].str.lower()

In [None]:
def _parse_seed_aliases(aliases):
    """Parse one ModelSEED ``aliases`` cell into ``{source: [alias, ...]}``.

    Steps:
      1) split on '|' to separate the individual "source: values" pairs,
      2) split each pair on the first ':' into source (key) and values,
      3) split the values on ';' and strip leading/trailing white space.

    If a source occurs more than once, the last occurrence wins.
    Cells that are already dictionaries are returned unchanged, so re-running
    this notebook cell does not wipe the parsed column. Any other non-string
    cell (e.g. a missing value) yields an empty dictionary.
    """
    if isinstance(aliases, dict):
        return aliases
    if not isinstance(aliases, str):
        return {}
    parsed_aliases = {}
    for alias_pair in aliases.split('|'):
        pair_fields = alias_pair.split(':', 1)
        # Without a ':' in the pair, key and values are both the whole pair text.
        alias_key, alias_values = pair_fields[0], pair_fields[-1]
        parsed_aliases[alias_key] = [alias_value.strip() for alias_value in alias_values.split(';')]
    return parsed_aliases


#Convert string of alias pairs into dictionaries of keys and lists of values
df_metabolites_seed['aliases'] = df_metabolites_seed['aliases'].apply(_parse_seed_aliases)

df_metabolites_seed.head(25)

Unnamed: 0,id,abbreviation,name,formula,mass,source,inchikey,charge,is_core,is_obsolete,linked_compound,is_cofactor,deltag,deltagerr,pka,pkb,abstract_compound,comprised_of,aliases,smiles,notes,abbreviation_lower
0,cpd00001,h2o,H2O,H2O,18.0,Primary Database,XLYOFNOQVPJJNP-UHFFFAOYSA-N,0,1,0,,0,-37.54,0.18,1:1:15.70,1:1:-1.80,,,"{'Name': ['H20', 'H2O', 'H3O+', 'HO-', 'Hydrox...",O,GC|EQ|EQU,h2o
1,cpd00002,atp,ATP,C10H13N5O13P3,504.0,Primary Database,ZKHQWZAMYRWXGA-KQYNXXCUSA-K,-3,1,0,,0,-548.85,0.36,1:14:12.60;1:22:3.29;1:26:0.90;1:29:7.42;1:30:...,1:6:-7.46;1:9:-1.06;1:14:-3.85;1:15:4.93,,,"{'Name': ['ATP', 'Adenosine 5'-triphosphate', ...",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,GC|EQ|EQU,atp
2,cpd00003,nad,NAD,C21H26N7O14P2,662.0,Primary Database,BAWFJGJZGIEFAR-NNYOXOHSSA-M,-1,1,0,,0,-286.41,1.59,1:6:11.94;1:17:1.85;1:18:2.28;1:25:11.38;1:35:...,1:6:-4.22;1:35:-3.85;1:37:-1.05;1:41:4.93;1:43...,,,"{'Name': ['DPN', 'DPN+', 'DPN-ox', 'Diphosphop...",NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,GC|EQ|EQU,nad
3,cpd00004,nadh,NADH,C21H27N7O14P2,663.0,Primary Database,BOPGDPNILDQYTO-NNYOXOHSSA-L,-2,1,0,,0,-271.15,1.59,1:14:12.28;1:18:14.00;1:22:-7.46;1:26:-1.05;1:...,1:6:2.28;1:9:1.85;1:14:-3.85;1:15:4.93;1:18:-3...,,,"{'Name': ['DPNH', 'NAD-reduced', 'NADH', 'NADH...",NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,GC|EQ|EQU,nadh
4,cpd00005,nadph,NADPH,C21H26N7O17P3,742.0,Primary Database,ACFIXJIJDZMPPO-NNYOXOHSSA-J,-4,1,0,,0,-483.1,1.62,1:18:0.90;1:19:5.78;1:26:0.66;1:30:3.26;1:40:1...,1:11:-7.46;1:12:-1.06;1:22:4.87;1:40:-3.78,,,"{'Name': ['NADP(H)', 'NADP-red', 'NADP-reduced...",NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,GC|EQ|EQU,nadph
5,cpd00006,nadp,NADP,C21H25N7O17P3,741.0,Primary Database,XJLXINKUBYWONI-NNYOXOHSSA-K,-3,1,0,,0,-498.36,1.63,1:18:0.90;1:19:5.78;1:26:3.26;1:30:0.66;1:47:1...,1:11:-7.46;1:12:-1.06;1:22:4.87,,,"{'Name': ['NADP', 'NADP(+)', 'NADP+', 'NADP-ox...",NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,GC|EQ|EQU,nadp
6,cpd00007,o2,O2,O2,32.0,Primary Database,MYMOFIZGZYHOMD-UHFFFAOYSA-N,0,1,0,,0,3.92,0.71,,,,,"{'Name': ['O2', 'Oxygen', 'dioxygen', 'oxygen'...",O=O,GC|EQ|EQU,o2
7,cpd00008,adp,ADP,C10H13N5O10P2,425.0,Primary Database,XTWYTFMLZFPYCI-KQYNXXCUSA-L,-2,1,0,,0,-340.04,0.3,1:14:12.46;1:18:13.98;1:22:2.22;1:25:7.42;1:26...,1:6:-7.46;1:9:-1.05;1:14:-3.85;1:15:4.93;1:18:...,,,"{'Name': ['ADP', 'Adenosine 5'-diphosphate', '...",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,GC|EQ|EQU,adp
8,cpd00009,pi,Phosphate,HO4P,96.0,Primary Database,NBIIXXVUZAFLBC-UHFFFAOYSA-L,-2,1,0,,0,-252.51,0.18,1:2:12.90;1:3:1.80;1:4:6.95,,,,"{'Name': ['H2PO4-', 'HPO4-2', 'HPO42-', 'Ortho...",O=P([O-])([O-])O,GC|EQ|EQU,pi
9,cpd00010,coa,CoA,C21H32N7O16P3S,764.0,Primary Database,RGJOEKWQDUBAIZ-IBOSZNHHSA-J,-4,1,0,,0,-429.53,1.87,1:22:0.92;1:23:5.94;1:26:0.83;1:30:3.27;1:48:1...,1:8:-7.46;1:9:-1.06;1:17:4.89,,,"{'Name': ['CoA', 'CoA-SH', 'Coenzyme A', 'Coen...",CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,GC|EQ|EQU,coa


In [None]:
#All database keys in aliases
# Flatten the keys of all alias dictionaries in a single pass. Summing Python lists via
# Series.sum() copies the growing list once per row, which is quadratic in the row count.
np.unique([alias_key for alias_dict in df_metabolites_seed['aliases'] for alias_key in alias_dict])

array(['AlgaGEM', 'AraCyc', 'AraGEM', 'BiGG', 'BrachyCyc', 'ChlamyCyc',
       'CornCyc', 'DF_Athaliana', 'EcoCyc', 'JM_Creinhardtii',
       'JP_Creinhardtii_MSB', 'JP_Creinhardtii_NMeth', 'KEGG', 'MaizeCyc',
       'Maize_C4GEM', 'MetaCyc', 'Name', 'PlantCyc', 'PoplarCyc',
       'RiceCyc', 'SorghumCyc', 'SoyCyc', 'TS_Athaliana', 'iAF1260',
       'iAF692', 'iAG612', 'iAO358', 'iAbaylyiv4', 'iGT196', 'iIN800',
       'iIT341', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904',
       'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083',
       'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844'], dtype='<U21')

In [None]:
# Pull the list of BiGG aliases out of each alias dictionary;
# compounds without a 'BiGG' entry get None.
df_metabolites_seed['BiGG'] = df_metabolites_seed['aliases'].apply(
    lambda dict_aliases: dict_aliases.get('BiGG')
)
df_metabolites_seed.head(25)

Unnamed: 0,id,abbreviation,name,formula,mass,source,inchikey,charge,is_core,is_obsolete,linked_compound,is_cofactor,deltag,deltagerr,pka,pkb,abstract_compound,comprised_of,aliases,smiles,notes,abbreviation_lower,BiGG
0,cpd00001,h2o,H2O,H2O,18.0,Primary Database,XLYOFNOQVPJJNP-UHFFFAOYSA-N,0,1,0,,0,-37.54,0.18,1:1:15.70,1:1:-1.80,,,"{'Name': ['H20', 'H2O', 'H3O+', 'HO-', 'Hydrox...",O,GC|EQ|EQU,h2o,"[h2o, oh1]"
1,cpd00002,atp,ATP,C10H13N5O13P3,504.0,Primary Database,ZKHQWZAMYRWXGA-KQYNXXCUSA-K,-3,1,0,,0,-548.85,0.36,1:14:12.60;1:22:3.29;1:26:0.90;1:29:7.42;1:30:...,1:6:-7.46;1:9:-1.06;1:14:-3.85;1:15:4.93,,,"{'Name': ['ATP', 'Adenosine 5'-triphosphate', ...",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,GC|EQ|EQU,atp,[atp]
2,cpd00003,nad,NAD,C21H26N7O14P2,662.0,Primary Database,BAWFJGJZGIEFAR-NNYOXOHSSA-M,-1,1,0,,0,-286.41,1.59,1:6:11.94;1:17:1.85;1:18:2.28;1:25:11.38;1:35:...,1:6:-4.22;1:35:-3.85;1:37:-1.05;1:41:4.93;1:43...,,,"{'Name': ['DPN', 'DPN+', 'DPN-ox', 'Diphosphop...",NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,GC|EQ|EQU,nad,[nad]
3,cpd00004,nadh,NADH,C21H27N7O14P2,663.0,Primary Database,BOPGDPNILDQYTO-NNYOXOHSSA-L,-2,1,0,,0,-271.15,1.59,1:14:12.28;1:18:14.00;1:22:-7.46;1:26:-1.05;1:...,1:6:2.28;1:9:1.85;1:14:-3.85;1:15:4.93;1:18:-3...,,,"{'Name': ['DPNH', 'NAD-reduced', 'NADH', 'NADH...",NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,GC|EQ|EQU,nadh,[nadh]
4,cpd00005,nadph,NADPH,C21H26N7O17P3,742.0,Primary Database,ACFIXJIJDZMPPO-NNYOXOHSSA-J,-4,1,0,,0,-483.1,1.62,1:18:0.90;1:19:5.78;1:26:0.66;1:30:3.26;1:40:1...,1:11:-7.46;1:12:-1.06;1:22:4.87;1:40:-3.78,,,"{'Name': ['NADP(H)', 'NADP-red', 'NADP-reduced...",NC(=O)C1=CN([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(...,GC|EQ|EQU,nadph,[nadph]
5,cpd00006,nadp,NADP,C21H25N7O17P3,741.0,Primary Database,XJLXINKUBYWONI-NNYOXOHSSA-K,-3,1,0,,0,-498.36,1.63,1:18:0.90;1:19:5.78;1:26:3.26;1:30:0.66;1:47:1...,1:11:-7.46;1:12:-1.06;1:22:4.87,,,"{'Name': ['NADP', 'NADP(+)', 'NADP+', 'NADP-ox...",NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(...,GC|EQ|EQU,nadp,[nadp]
6,cpd00007,o2,O2,O2,32.0,Primary Database,MYMOFIZGZYHOMD-UHFFFAOYSA-N,0,1,0,,0,3.92,0.71,,,,,"{'Name': ['O2', 'Oxygen', 'dioxygen', 'oxygen'...",O=O,GC|EQ|EQU,o2,[o2]
7,cpd00008,adp,ADP,C10H13N5O10P2,425.0,Primary Database,XTWYTFMLZFPYCI-KQYNXXCUSA-L,-2,1,0,,0,-340.04,0.3,1:14:12.46;1:18:13.98;1:22:2.22;1:25:7.42;1:26...,1:6:-7.46;1:9:-1.05;1:14:-3.85;1:15:4.93;1:18:...,,,"{'Name': ['ADP', 'Adenosine 5'-diphosphate', '...",Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O...,GC|EQ|EQU,adp,[adp]
8,cpd00009,pi,Phosphate,HO4P,96.0,Primary Database,NBIIXXVUZAFLBC-UHFFFAOYSA-L,-2,1,0,,0,-252.51,0.18,1:2:12.90;1:3:1.80;1:4:6.95,,,,"{'Name': ['H2PO4-', 'HPO4-2', 'HPO42-', 'Ortho...",O=P([O-])([O-])O,GC|EQ|EQU,pi,[pi]
9,cpd00010,coa,CoA,C21H32N7O16P3S,764.0,Primary Database,RGJOEKWQDUBAIZ-IBOSZNHHSA-J,-4,1,0,,0,-429.53,1.87,1:22:0.92;1:23:5.94;1:26:0.83;1:30:3.27;1:48:1...,1:8:-7.46;1:9:-1.06;1:17:4.89,,,"{'Name': ['CoA', 'CoA-SH', 'Coenzyme A', 'Coen...",CC(C)(COP(=O)([O-])OP(=O)([O-])OC[C@H]1O[C@@H]...,GC|EQ|EQU,coa,[coa]


In [None]:
# True = compound has a 'BiGG' alias entry, False = it has none.
df_metabolites_seed['BiGG'].notna().value_counts() # => ModelSeed seems to have only 2729 compounds mapped with BiGG Ids

False    31263
True      2729
Name: BiGG, dtype: int64

In [None]:
# Count the distinct chemical formulas in the ModelSEED compounds table.
df_metabolites_seed['formula'].nunique()

16762

# Mapping AraCore - BIGG

## Based on BiGG and AraCore Metabolite Ids

In [None]:
# Lookup: lower-cased universal BiGG id -> 'seed.compound' value of the FIRST BiGG row with that id.
# Built once instead of re-scanning the whole BiGG table for every AraCore metabolite.
bigg_first_rows = df_metabolites_bigg.drop_duplicates(subset='universal_bigg_id_lower', keep='first')
seed_id_by_bigg_id = dict(zip(bigg_first_rows['universal_bigg_id_lower'], bigg_first_rows['seed.compound']))

aracore_universal_ids = df_metabolites_aracore['aracore_updated_universal_ids']

#Check if universal metabolite ids of the AraCore model have an identically named metabolite id in BiGG
df_metabolites_aracore['is_bigg_id'] = [met_id in seed_id_by_bigg_id for met_id in aracore_universal_ids]

#Fill column for universal bigg ids for which above is the case (None otherwise)
df_metabolites_aracore['universal_bigg_id'] = [
    met_id if met_id in seed_id_by_bigg_id else None for met_id in aracore_universal_ids
]

#Add ModelSEED compound ids from BiGG metabolite table => df_metabolites_bigg['seed.compound']
# (None when the id is not in BiGG or the first matching BiGG row has no ModelSEED link)
df_metabolites_aracore['bigg_seed_id'] = [seed_id_by_bigg_id.get(met_id) for met_id in aracore_universal_ids]

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,cpd27222
3,H_h,"H+, proton",H,{},h_h,h,True,h,cpd00067
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,cpd00007
6,H_l,"H+, proton",H,{},h_ul,h,True,h,cpd00067
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,cpd30035
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,cpd30034
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,cpd15876


In [None]:
# True = the AraCore universal id also exists as a lower-cased universal BiGG id.
df_metabolites_aracore['is_bigg_id'].value_counts() # => 165 mapping between aracore_updated_universal_ids and universal_bigg_id_lower

False    248
True     165
Name: is_bigg_id, dtype: int64

In [None]:
# Counts are over isna(): True = no ModelSEED id obtained via BiGG, False = ModelSEED id available.
df_metabolites_aracore['bigg_seed_id'].isna().value_counts() # => 156 additional ModelSeed compound id found by mapped BiGG

True     257
False    156
Name: bigg_seed_id, dtype: int64

# Mapping AraCore - ModelSeed

## Based on ModelSEED and AraCore Metabolite Ids

In [None]:
# Lookup: lower-cased ModelSEED abbreviation -> compound id of the FIRST ModelSEED row with that abbreviation.
# Built once instead of re-scanning the whole ModelSEED table for every AraCore metabolite.
seed_rows_by_abbreviation = df_metabolites_seed.drop_duplicates(subset='abbreviation_lower', keep='first')
seed_id_by_abbreviation = dict(zip(seed_rows_by_abbreviation['abbreviation_lower'], seed_rows_by_abbreviation['id']))

aracore_universal_ids = df_metabolites_aracore['aracore_updated_universal_ids']

#Check if universal metabolite ids of the AraCore model have an identically named abbreviation in ModelSEED
df_metabolites_aracore['is_seed_id'] = [met_id in seed_id_by_abbreviation for met_id in aracore_universal_ids]

#Fill column with the ModelSEED compound id for which above is the case (None otherwise)
df_metabolites_aracore['seed_id'] = [seed_id_by_abbreviation.get(met_id) for met_id in aracore_universal_ids]

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id,is_seed_id,seed_id
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,,False,
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,,False,
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,cpd27222,True,cpd00001
3,H_h,"H+, proton",H,{},h_h,h,True,h,cpd00067,True,cpd00067
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,,False,
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,cpd00007,True,cpd00007
6,H_l,"H+, proton",H,{},h_ul,h,True,h,cpd00067,True,cpd00067
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,cpd30035,False,
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,cpd30034,False,
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,cpd15876,True,cpd15876


## Based on unique chemical formulas

In [None]:
def _unique_seed_id_for_formula(formula):
    """Return the ModelSEED id whose formula equals `formula`, if exactly one exists.

    The placeholder formula 'X' is never looked up. Formulas matching no or
    several ModelSEED compounds give None, because a chemical formula is not a
    unique identifier (different metabolites may share one).
    """
    if formula == 'X':
        return None
    same_formula = df_metabolites_seed['formula'] == formula
    candidate_ids = df_metabolites_seed.loc[same_formula, 'id'].tolist()
    if len(candidate_ids) == 1:
        return candidate_ids[0]
    return None


#Map chemical formulas, keeping only unambiguous (single-hit) matches
df_metabolites_aracore['formula_seed_id'] = df_metabolites_aracore['aracore_formula'].apply(_unique_seed_id_for_formula)
df_metabolites_aracore.head(50)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id,is_seed_id,seed_id,formula_seed_id
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,,False,,
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,,False,,
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,cpd27222,True,cpd00001,
3,H_h,"H+, proton",H,{},h_h,h,True,h,cpd00067,True,cpd00067,cpd00067
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,,False,,
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,cpd00007,True,cpd00007,
6,H_l,"H+, proton",H,{},h_ul,h,True,h,cpd00067,True,cpd00067,cpd00067
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,cpd30035,False,,
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,cpd30034,False,,
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,cpd15876,True,cpd15876,


## Aggregate ModelSeed Ids mapped based on shared Ids and Chemical Formulas

In [None]:
#Fuse the two columns seed_id and formula_seed_id into a list of unique, non-empty ids
# Row values are read by column label: integer keys on a label-indexed row are
# deprecated as positional access in recent pandas.
# Note: this overwrites 'seed_id' (string/None -> list), so the cell must run exactly once
# per construction of the column.
df_metabolites_aracore['seed_id'] = df_metabolites_aracore[['seed_id', 'formula_seed_id']].apply(
    lambda row: list(np.unique([seed_id for seed_id in (row['seed_id'], row['formula_seed_id']) if seed_id])),
    axis=1,
)

In [None]:
#Check if there are metabolites that have more than one ModelSEED id after fusing
# (at this point 'seed_id' holds a list per metabolite, so len() counts the ids)
df_metabolites_aracore[df_metabolites_aracore['seed_id'].apply(len) > 1]

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id,is_seed_id,seed_id,formula_seed_id
13,ADP_h,Adenosine diphosphate,C10H12N5O10P2,{},adp_h,adp,True,adp,cpd00008,True,"[cpd00008, cpd22244]",cpd22244
46,ADP_c,Adenosine diphosphate,C10H12N5O10P2,{},adp_c,adp,True,adp,cpd00008,True,"[cpd00008, cpd22244]",cpd22244
108,ADP_m,Adenosine diphosphate,C10H12N5O10P2,{},adp_m,adp,True,adp,cpd00008,True,"[cpd00008, cpd22244]",cpd22244
112,Q_m,Ubiquinone,C49H74O4,{},q_m,q,True,q,,True,"[cpd11669, cpd15560]",cpd15560
196,NO2_c,Nitrite,NO2,{},no2_c,no2,True,no2,cpd00075,True,"[cpd00075, cpd27655]",cpd27655
197,NO2_h,Nitrite,NO2,{},no2_h,no2,True,no2,cpd00075,True,"[cpd00075, cpd27655]",cpd27655


In [None]:
#Resolve conflicts check back with ModelSEED database https://modelseed.org/biochem/compounds
# The resolutions are keyed by AraCore metabolite id rather than by row number, so they stay
# correct if the order of metabolites in the table changes.
seed_id_resolutions = {
    #ADP_h, ADP_c, ADP_m => cpd00008
    'ADP_h': 'cpd00008',
    'ADP_c': 'cpd00008',
    'ADP_m': 'cpd00008',
    #Q_m => cpd15560
    'Q_m': 'cpd15560',
    #NO2_c, NO2_h => cpd00075
    'NO2_c': 'cpd00075',
    'NO2_h': 'cpd00075',
}

# Resolved metabolites get a one-element list (the column holds lists at this stage);
# all other rows keep their current value. The explicit object Series stores each
# list as a single cell value.
df_metabolites_aracore['seed_id'] = pd.Series(
    [
        [seed_id_resolutions[aracore_id]] if aracore_id in seed_id_resolutions else current_seed_ids
        for aracore_id, current_seed_ids in zip(df_metabolites_aracore['aracore_ids'], df_metabolites_aracore['seed_id'])
    ],
    index=df_metabolites_aracore.index,
    dtype=object,
)



In [None]:
def _collapse_seed_id_list(seed_ids):
    """Turn a one-element id list into that id; any other list length gives None.

    Values that are already a plain id string or None are returned unchanged, so
    re-running this cell neither erases mapped ids (len('cpd00008') != 1) nor
    fails on len(None).
    """
    if seed_ids is None or isinstance(seed_ids, str):
        return seed_ids
    return seed_ids[0] if len(seed_ids) == 1 else None


#Convert ModelSeed id list of len 1 to simple string
df_metabolites_aracore['seed_id'] = df_metabolites_aracore['seed_id'].apply(_collapse_seed_id_list)

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id,is_seed_id,seed_id,formula_seed_id
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,,False,,
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,,False,,
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,cpd27222,True,cpd00001,
3,H_h,"H+, proton",H,{},h_h,h,True,h,cpd00067,True,cpd00067,cpd00067
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,,False,,
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,cpd00007,True,cpd00007,
6,H_l,"H+, proton",H,{},h_ul,h,True,h,cpd00067,True,cpd00067,cpd00067
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,cpd30035,False,,
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,cpd30034,False,,
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,cpd15876,True,cpd15876,


In [None]:
# True = a ModelSEED id was assigned (by shared id and/or unique formula), False = none.
df_metabolites_aracore['seed_id'].notna().value_counts() # => 197 mapping with ModelSeed Ids

False    216
True     197
Name: seed_id, dtype: int64

## Add BiGG ids that are available from the ModelSeed Table

In [None]:
#Add BiGG ids (BiGG) from ModelSEED table => df_metabolites_seed['BiGG']
# Lookup: ModelSEED compound id -> 'BiGG' alias list of the FIRST ModelSEED row with that id.
# A single dictionary lookup replaces the intermediate column of whole Series objects and the
# per-row scan of the ModelSEED table; unknown or missing ids give None instead of an IndexError.
seed_rows_by_id = df_metabolites_seed.drop_duplicates(subset='id', keep='first')
bigg_aliases_by_seed_id = dict(zip(seed_rows_by_id['id'], seed_rows_by_id['BiGG']))

# The explicit object Series stores each alias list as a single cell value.
df_metabolites_aracore['seed_BiGG_id'] = pd.Series(
    [
        bigg_aliases_by_seed_id.get(seed_id) if seed_id is not None else None
        for seed_id in df_metabolites_aracore['seed_id']
    ],
    index=df_metabolites_aracore.index,
    dtype=object,
)

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id,is_seed_id,seed_id,formula_seed_id,seed_BiGG_id
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,,False,,,
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,,False,,,
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,cpd27222,True,cpd00001,,"[h2o, oh1]"
3,H_h,"H+, proton",H,{},h_h,h,True,h,cpd00067,True,cpd00067,cpd00067,[h]
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,,False,,,
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,cpd00007,True,cpd00007,,[o2]
6,H_l,"H+, proton",H,{},h_ul,h,True,h,cpd00067,True,cpd00067,cpd00067,[h]
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,cpd30035,False,,,
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,cpd30034,False,,,
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,cpd15876,True,cpd15876,,[fdox]


In [None]:
# True = the mapped ModelSEED compound carries BiGG aliases, False = no ModelSEED id or no BiGG alias.
df_metabolites_aracore['seed_BiGG_id'].notna().value_counts() # => 187 additional BiGG Ids found by mapped ModelSeed compound id

False    226
True     187
Name: seed_BiGG_id, dtype: int64

# Aggregate ModelSEED compound ids from columns 'bigg_seed_id' and 'seed_id'


---



In [None]:
def _unique_seed_ids(row):
    """Return the sorted, de-duplicated ModelSEED ids found in one table row.

    Reads the 'bigg_seed_id' and 'seed_id' entries by label (integer keys on a
    label-indexed Series are deprecated positional access in recent pandas).
    Only non-empty strings are kept, so None and NaN placeholders are dropped.
    """
    candidates = (row['bigg_seed_id'], row['seed_id'])
    return sorted({cid for cid in candidates if isinstance(cid, str) and cid})


# Aggregate the ModelSEED compound ids from columns 'bigg_seed_id' and
# 'seed_id' into one list of unique ids per metabolite (empty list if none).
df_metabolites_aracore['seed_id_aggr'] = (
    df_metabolites_aracore[['bigg_seed_id', 'seed_id']].apply(_unique_seed_ids, axis=1)
)
df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id,is_seed_id,seed_id,formula_seed_id,seed_BiGG_id,seed_id_aggr
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,,False,,,,[]
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,,False,,,,[]
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,cpd27222,True,cpd00001,,"[h2o, oh1]","[cpd00001, cpd27222]"
3,H_h,"H+, proton",H,{},h_h,h,True,h,cpd00067,True,cpd00067,cpd00067,[h],[cpd00067]
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,,False,,,,[]
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,cpd00007,True,cpd00007,,[o2],[cpd00007]
6,H_l,"H+, proton",H,{},h_ul,h,True,h,cpd00067,True,cpd00067,cpd00067,[h],[cpd00067]
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,cpd30035,False,,,,[cpd30035]
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,cpd30034,False,,,,[cpd30034]
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,cpd15876,True,cpd15876,,[fdox],[cpd15876]


In [None]:
# Distribution of the number of ModelSEED ids per metabolite.
# Recorded run: 162 + 37 = 199 metabolites have at least one ModelSeed id;
# 37 of them have 2 ModelSeed ids -> potential conflicts to resolve by hand.
seed_ids_per_metabolite = df_metabolites_aracore['seed_id_aggr'].apply(len)
seed_ids_per_metabolite.value_counts()

0    214
1    162
2     37
Name: seed_id_aggr, dtype: int64

In [None]:
# Keep (as an independent copy) only the metabolites to which more than one
# ModelSEED compound id was mapped.
has_multiple_seed_ids = df_metabolites_aracore['seed_id_aggr'].apply(len) > 1
df_metabolites_aracore_seed_conflicted = df_metabolites_aracore.loc[has_multiple_seed_ids].copy()
df_metabolites_aracore_seed_conflicted

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,bigg_seed_id,is_seed_id,seed_id,formula_seed_id,seed_BiGG_id,seed_id_aggr
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,cpd27222,True,cpd00001,,"[h2o, oh1]","[cpd00001, cpd27222]"
14,Pi_h,Orthophosphate,HO4P,{},pi_h,pi,True,pi,cpd27787,True,cpd00009,,[pi],"[cpd00009, cpd27787]"
23,F6P_h,Fructose 6-phosphate,C6H11O9P,{},f6p_h,f6p,True,f6p,cpd19035,True,cpd00072,,"[f6p, f6p_B]","[cpd00072, cpd19035]"
28,R5P_h,Ribose 5-phosphate,C5H9O8P,{},r5p_h,r5p,True,r5p,cpd19028,True,cpd00101,,[r5p],"[cpd00101, cpd19028]"
30,G6P_h,Glucose 6-phosphate,C6H11O9P,{},g6p_h,g6p,True,g6p,cpd26836,True,cpd00079,,[g6p],"[cpd00079, cpd26836]"
31,G1P_h,Glucose 1-phosphate,C6H11O9P,{},g1p_h,g1p,True,g1p,cpd28817,True,cpd00089,,[g1p],"[cpd00089, cpd28817]"
33,PPi_h,"Diphosphate, Pyrophosphate",O7P2,{},ppi_h,ppi,True,ppi,cpd27828,True,cpd00012,,[ppi],"[cpd00012, cpd27828]"
45,G6P_c,Glucose 6-phosphate,C6H11O9P,{},g6p_c,g6p,True,g6p,cpd26836,True,cpd00079,,[g6p],"[cpd00079, cpd26836]"
50,Pi_c,Orthophosphate,HO4P,{},pi_c,pi,True,pi,cpd27787,True,cpd00009,,[pi],"[cpd00009, cpd27787]"
51,G1P_c,Glucose 1-phosphate,C6H11O9P,{},g1p_c,g1p,True,g1p,cpd28817,True,cpd00089,,[g1p],"[cpd00089, cpd28817]"


In [None]:
def _seed_column_values(compound_ids, column):
    """Look up `column` in df_metabolites_seed for each ModelSEED compound id.

    For every id the first matching row is used; the result is a list in the
    same order as `compound_ids`.
    """
    looked_up = []
    for compound_id in compound_ids:
        matches = df_metabolites_seed.loc[df_metabolites_seed['id'] == compound_id, column]
        looked_up.append(matches.tolist()[0])
    return looked_up


# Attach the ModelSEED abbreviations and formulas of every candidate id so the
# conflicts can be compared by eye and notes taken for resolving them later.
conflicting_seed_ids = df_metabolites_aracore_seed_conflicted['seed_id_aggr']
df_metabolites_aracore_seed_conflicted['seed_id_aggr_abbr'] = conflicting_seed_ids.apply(
    lambda ids: _seed_column_values(ids, 'abbreviation')
)
df_metabolites_aracore_seed_conflicted['seed_id_aggr_formula'] = conflicting_seed_ids.apply(
    lambda ids: _seed_column_values(ids, 'formula')
)

comparison_columns = [
    'aracore_ids',
    'aracore_name',
    'aracore_formula',
    'seed_id_aggr',
    'seed_id_aggr_abbr',
    'seed_id_aggr_formula',
]
df_metabolites_aracore_seed_conflicted[comparison_columns]

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,seed_id_aggr,seed_id_aggr_abbr,seed_id_aggr_formula
2,H2O_h,"H2O, water",H2O,"[cpd00001, cpd27222]","[h2o, hydroxyl-group]","[H2O, HO]"
14,Pi_h,Orthophosphate,HO4P,"[cpd00009, cpd27787]","[pi, phosphate-group]","[HO4P, HO4P]"
23,F6P_h,Fructose 6-phosphate,C6H11O9P,"[cpd00072, cpd19035]","[f6p, beta-D-Fructose 6-phosphate]","[C6H11O9P, C6H11O9P]"
28,R5P_h,Ribose 5-phosphate,C5H9O8P,"[cpd00101, cpd19028]","[r5p, alpha-D-Ribose 5-phosphate]","[C5H9O8P, C5H9O8P]"
30,G6P_h,Glucose 6-phosphate,C6H11O9P,"[cpd00079, cpd26836]","[g6p, D-glucose-6-phosphate]","[C6H11O9P, C6H11O9P]"
31,G1P_h,Glucose 1-phosphate,C6H11O9P,"[cpd00089, cpd28817]","[g1p, glucose-1-phosphate]","[C6H11O9P, C6H11O9P]"
33,PPi_h,"Diphosphate, Pyrophosphate",O7P2,"[cpd00012, cpd27828]","[ppi, pyrophosphate-group]","[HO7P2, HO7P2]"
45,G6P_c,Glucose 6-phosphate,C6H11O9P,"[cpd00079, cpd26836]","[g6p, D-glucose-6-phosphate]","[C6H11O9P, C6H11O9P]"
50,Pi_c,Orthophosphate,HO4P,"[cpd00009, cpd27787]","[pi, phosphate-group]","[HO4P, HO4P]"
51,G1P_c,Glucose 1-phosphate,C6H11O9P,"[cpd00089, cpd28817]","[g1p, glucose-1-phosphate]","[C6H11O9P, C6H11O9P]"


In [None]:
# Resolve the ModelSEED id conflicts in the original table df_metabolites_aracore.
#
# Manually curated decisions: chosen ModelSEED compound id -> AraCore
# metabolite ids (one entry per compartment) that should map to it.
seed_id_resolutions = {
    'cpd00001': ['H2O_c', 'H2O_h', 'H2O_p', 'H2O_m'],   # H2O
    'cpd00009': ['Pi_c', 'Pi_h', 'Pi_m'],               # Orthophosphate
    'cpd00072': ['F6P_c', 'F6P_h'],                     # Fructose 6-phosphate
    'cpd00101': ['R5P_h'],                              # Ribose 5-phosphate
    'cpd00079': ['G6P_c', 'G6P_h'],                     # Glucose 6-phosphate
    'cpd00089': ['G1P_c', 'G1P_h'],                     # Glucose 1-phosphate
    'cpd00026': ['UDPG_c'],
    'cpd00012': ['PPi_c', 'PPi_h'],                     # Diphosphate
    'cpd00010': ['CoA_c', 'CoA_h', 'CoA_m'],
    'cpd00013': ['NH4_c', 'NH4_h', 'NH4_m'],
    'cpd00018': ['AMP_c', 'AMP_h'],
    'cpd00239': ['H2S_c', 'H2S_h', 'H2S_m'],
    'cpd00064': ['Orn_h', 'Orn_m'],
    'cpd00047': ['For_h'],
    # 5,6,7,8-Tetrahydrofolate == cpd00087 -> 186 has one more H in sum formula
    'cpd00087': ['THF_m', 'THF_c', 'THF_h'],
    'cpd00048': ['SO4_h', 'SO4_m', 'SO4_c'],            # Sulfate O4S
}

#ppi has different sum formulas in the Aracore model?!?
#Coenzyme A has different sum formulas in the Aracore model?!?
#5,6,7,8-Tetrahydrofolate has different sum formulas in the Aracore model?!?

# Fail early on a mistyped metabolite id: assigning through .loc with an
# unknown label would silently append a new row instead of raising.
known_aracore_ids = set(df_metabolites_aracore['aracore_ids'])
unknown_aracore_ids = sorted(
    aracore_id
    for aracore_ids in seed_id_resolutions.values()
    for aracore_id in aracore_ids
    if aracore_id not in known_aracore_ids
)
if unknown_aracore_ids:
    raise KeyError(f"Unknown AraCore metabolite ids in seed_id_resolutions: {unknown_aracore_ids}")

# Temporarily index by aracore_ids to make the per-metabolite edits easier;
# the finally-clause restores the column even if an assignment fails, so the
# cell can be re-run.
df_metabolites_aracore.set_index("aracore_ids", inplace=True)
try:
    for resolved_seed_id, aracore_ids in seed_id_resolutions.items():
        for aracore_id in aracore_ids:
            # The extra list level makes pandas store the one-element list in the cell.
            df_metabolites_aracore.loc[aracore_id, 'seed_id_aggr'] = [[resolved_seed_id]]
finally:
    df_metabolites_aracore.reset_index(inplace=True)

In [None]:
# Check that no metabolite is left with more than one ModelSEED id
# (recorded run: False for all 413 rows, i.e. all conflicts resolved).
still_seed_conflicted = df_metabolites_aracore['seed_id_aggr'].apply(len) > 1
still_seed_conflicted.value_counts()

False    413
Name: seed_id_aggr, dtype: int64

In [None]:
def _single_seed_id(ids):
    """Collapse a list of at most one ModelSEED id to that id (or None if empty).

    Values that are already collapsed (a string, None or NaN) are returned
    unchanged, so re-running this cell does not cut an id such as 'cpd00001'
    down to its first character.
    """
    if ids is None or isinstance(ids, (str, float)):
        return ids
    return ids[0] if len(ids) else None


# Convert each list of length <= 1 into the ModelSEED compound id string (None if empty)
df_metabolites_aracore['seed_id_aggr'] = df_metabolites_aracore['seed_id_aggr'].apply(_single_seed_id)

# Clean the dataframe: drop the helper columns that were merged into
# 'seed_id_aggr'. errors='ignore' keeps the cell re-runnable once they are gone.
df_metabolites_aracore.drop(
    ['is_seed_id', 'seed_id', 'bigg_seed_id', 'formula_seed_id'],
    axis=1,
    inplace=True,
    errors='ignore',
)

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,seed_BiGG_id,seed_id_aggr
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,,
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,,
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,"[h2o, oh1]",cpd00001
3,H_h,"H+, proton",H,{},h_h,h,True,h,[h],cpd00067
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,,
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,[o2],cpd00007
6,H_l,"H+, proton",H,{},h_ul,h,True,h,[h],cpd00067
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,,cpd30035
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,,cpd30034
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,[fdox],cpd15876


# Aggregate BiGG ids in columns 'universal_bigg_id' and 'seed_BiGG_id'


In [None]:
def _unique_bigg_ids(row):
    """Return the sorted, de-duplicated BiGG ids found in one table row.

    Combines the single id in 'universal_bigg_id' with the list of ids in
    'seed_BiGG_id' (when that entry is a list). Entries are read by label
    (integer keys on a label-indexed Series are deprecated positional access
    in recent pandas). Only non-empty strings are kept, so None and NaN
    placeholders are dropped.
    """
    candidates = [row['universal_bigg_id']]
    seed_bigg_ids = row['seed_BiGG_id']
    if isinstance(seed_bigg_ids, (list, tuple)):
        candidates.extend(seed_bigg_ids)
    return sorted({bigg_id for bigg_id in candidates if isinstance(bigg_id, str) and bigg_id})


# Aggregate the BiGG ids from columns 'universal_bigg_id' and 'seed_BiGG_id'
# into one list of unique ids per metabolite (empty list if none).
df_metabolites_aracore['bigg_id_aggr'] = (
    df_metabolites_aracore[['universal_bigg_id', 'seed_BiGG_id']].apply(_unique_bigg_ids, axis=1)
)

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,seed_BiGG_id,seed_id_aggr,bigg_id_aggr
0,hnu_h,Photon,X,{},hnu_h,hnu,False,,,,[]
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,True,pq,,,[pq]
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,"[h2o, oh1]",cpd00001,"[h2o, oh1]"
3,H_h,"H+, proton",H,{},h_h,h,True,h,[h],cpd00067,[h]
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,True,pqh2,,,[pqh2]
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,True,o2,[o2],cpd00007,[o2]
6,H_l,"H+, proton",H,{},h_ul,h,True,h,[h],cpd00067,[h]
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,True,pcox,,cpd30035,[pcox]
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,True,pcrd,,cpd30034,[pcrd]
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,True,fdox,[fdox],cpd15876,[fdox]


In [None]:
# Keep (as an independent copy) only the metabolites to which more than one
# BiGG id was mapped.
has_multiple_bigg_ids = df_metabolites_aracore['bigg_id_aggr'].apply(len) > 1
df_metabolites_aracore_bigg_conflicted = df_metabolites_aracore.loc[has_multiple_bigg_ids].copy()
df_metabolites_aracore_bigg_conflicted

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,is_bigg_id,universal_bigg_id,seed_BiGG_id,seed_id_aggr,bigg_id_aggr
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,True,h2o,"[h2o, oh1]",cpd00001,"[h2o, oh1]"
23,F6P_h,Fructose 6-phosphate,C6H11O9P,{},f6p_h,f6p,True,f6p,"[f6p, f6p_B]",cpd00072,"[f6p, f6p_B]"
55,H2O_c,"H2O, water",H2O,{},h2o_c,h2o,True,h2o,"[h2o, oh1]",cpd00001,"[h2o, oh1]"
56,F6P_c,Fructose 6-phosphate,C6H11O9P,{},f6p_c,f6p,True,f6p,"[f6p, f6p_B]",cpd00072,"[f6p, f6p_B]"
76,HCO3_c,Bicarbonate,CHO3,{},hco3_c,hco3,True,hco3,"[h2co3, hco3]",cpd00242,"[h2co3, hco3]"
101,H2O_m,"H2O, water",H2O,{},h2o_m,h2o,True,h2o,"[h2o, oh1]",cpd00001,"[h2o, oh1]"
112,Q_m,Ubiquinone,C49H74O4,{},q_m,q,True,q,"[q8, q8h2]",cpd15560,"[q, q8, q8h2]"
126,H2O_p,"H2O, water",H2O,{},h2o_x,h2o,True,h2o,"[h2o, oh1]",cpd00001,"[h2o, oh1]"
136,NH4_m,Ammonia,H4N,{},nh4_m,nh4,True,nh4,"[nh3, nh4]",cpd00013,"[nh3, nh4]"
178,HCO3_h,Bicarbonate,CHO3,{},hco3_h,hco3,True,hco3,"[h2co3, hco3]",cpd00242,"[h2co3, hco3]"


In [None]:
# Resolve the BiGG id conflicts in the original table df_metabolites_aracore.
#
# Decisions taken after comparing the conflict table above by eye:
# chosen BiGG id -> AraCore metabolite ids (one entry per compartment).
bigg_id_resolutions = {
    'h2o': ['H2O_c', 'H2O_h', 'H2O_p', 'H2O_m'],
    'f6p': ['F6P_h', 'F6P_c'],
    'hco3': ['HCO3_c', 'HCO3_h'],
    'nh4': ['NH4_m', 'NH4_h', 'NH4_c'],
    'acp': ['ACP_h'],
    'orn': ['Orn_h', 'Orn_m'],
    'q8': ['Q_m'],
}

#ppi has different sum formulas in the Aracore model?!?
#Coenzyme A has different sum formulas in the Aracore model?!?
#5,6,7,8-Tetrahydrofolate has different sum formulas in the Aracore model?!?

# Fail early on a mistyped metabolite id: assigning through .loc with an
# unknown label would silently append a new row instead of raising.
known_aracore_ids = set(df_metabolites_aracore['aracore_ids'])
unknown_aracore_ids = sorted(
    aracore_id
    for aracore_ids in bigg_id_resolutions.values()
    for aracore_id in aracore_ids
    if aracore_id not in known_aracore_ids
)
if unknown_aracore_ids:
    raise KeyError(f"Unknown AraCore metabolite ids in bigg_id_resolutions: {unknown_aracore_ids}")

# Temporarily index by aracore_ids to make the per-metabolite edits easier;
# the finally-clause restores the column even if an assignment fails, so the
# cell can be re-run.
df_metabolites_aracore.set_index("aracore_ids", inplace=True)
try:
    for resolved_bigg_id, aracore_ids in bigg_id_resolutions.items():
        for aracore_id in aracore_ids:
            # The extra list level makes pandas store the one-element list in the cell.
            df_metabolites_aracore.loc[aracore_id, 'bigg_id_aggr'] = [[resolved_bigg_id]]
finally:
    df_metabolites_aracore.reset_index(inplace=True)

In [None]:
# Check that no metabolite is left with more than one BiGG id
# (recorded run: False for all 413 rows, i.e. all conflicts resolved).
still_bigg_conflicted = df_metabolites_aracore['bigg_id_aggr'].apply(len) > 1
still_bigg_conflicted.value_counts()

False    413
Name: bigg_id_aggr, dtype: int64

In [None]:
def _single_bigg_id(ids):
    """Collapse a list of at most one BiGG id to that id (or None if empty).

    Values that are already collapsed (a string, None or NaN) are returned
    unchanged, so re-running this cell does not cut an id such as 'h2o' down
    to its first character.
    """
    if ids is None or isinstance(ids, (str, float)):
        return ids
    return ids[0] if len(ids) else None


# Convert each list of length <= 1 into the BiGG id string (None if empty)
df_metabolites_aracore['bigg_id_aggr'] = df_metabolites_aracore['bigg_id_aggr'].apply(_single_bigg_id)

# Clean the dataframe: drop the helper columns that were merged into
# 'bigg_id_aggr'. errors='ignore' keeps the cell re-runnable once they are gone.
df_metabolites_aracore.drop(
    ['is_bigg_id', 'seed_BiGG_id', 'universal_bigg_id'],
    axis=1,
    inplace=True,
    errors='ignore',
)

df_metabolites_aracore.head(25)

Unnamed: 0,aracore_ids,aracore_name,aracore_formula,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,seed_id_aggr,bigg_id_aggr
0,hnu_h,Photon,X,{},hnu_h,hnu,,
1,PQ_h,Oxidized plastoquinone,C13H16O2,{},pq_h,pq,,pq
2,H2O_h,"H2O, water",H2O,{},h2o_h,h2o,cpd00001,h2o
3,H_h,"H+, proton",H,{},h_h,h,cpd00067,h
4,PQH2_h,Reduced plastoquinone,C13H18O2,{},pqh2_h,pqh2,,pqh2
5,O2_h,"O2, oxygen",O2,{},o2_h,o2,cpd00007,o2
6,H_l,"H+, proton",H,{},h_ul,h,cpd00067,h
7,PCox_h,Oxidized plastocyanin,X,{},pcox_h,pcox,cpd30035,pcox
8,PCrd_h,Reduced plastocyanin,X,{},pcrd_h,pcrd,cpd30034,pcrd
9,Fdox_h,Oxidized ferredoxin,S8FeX,{},fdox_h,fdox,cpd15876,fdox


In [None]:
# Tally how many AraCore metabolites ended up with a BiGG id
# (recorded run: 195 mapped, 218 unmapped).
bigg_id_present = df_metabolites_aracore['bigg_id_aggr'].notna()
bigg_id_present.value_counts()

False    218
True     195
Name: bigg_id_aggr, dtype: int64

In [None]:
# Write the final mapping table to CSV so the remaining metabolites can be
# mapped by hand. The path is relative to the current working directory.
mapping_table_path = 'drive/MyDrive/2021-05-27-metabolite-mapping-table.csv'
df_metabolites_aracore.to_csv(mapping_table_path)