In [4]:
################################################################################################################################
################################################################################################################################
############                                -*-  RAPESEED METABOLIC NETWORK -*-                                    #############
################################################################################################################################
################################################################################################################################



################################################################################################################################
############                       -*- Apply Community standards for AraCore Model-*-                              #############
################################################################################################################################



################################################################################################################################
############                                  -*- Parameters of the pipeline -*-                                   #############
################################################################################################################################


datadir = "2018-23-05-mb-genC3.sbml"


In [5]:
################################################################################################################################
############                                         -*- Pipeline -*-                                              #############
################################################################################################################################



####################################################################
################## I/ Parsing the file
####################################################################


##################
########### Goal : Etract the information we need to annotate the model : metabolites, reactions and genes
##################


import cobra
import pandas as pd


def parsing(fileName):
    model = cobra.io.read_sbml_model(fileName)
    print("Reactions")
    print("---------")
    for x in model.reactions:
        print("%s : %s" % (x.id, x.reaction))

    print("Metabolites")
    print("-----------")
    for x in model.metabolites:
        print("%9s : %s" % (x.id, x.formula))

    print("Genes")
    print("-----")
    for x in model.genes:
        associated_ids = (i.id for i in x.reactions)
        print("%s is associated with reactions: %s" %
        (x.id, "{" + ", ".join(associated_ids) + "}"))

parsing(datadir)


Reactions
---------
PSII_h : 2.0 H2O_h + 4.0 H_h + 2.0 PQ_h + 4.0 hnu_h --> 4.0 H_l + O2_h + 2.0 PQH2_h
Cytb6f1_h : PCox_h + PQH2_h --> 2.0 H_l + PCrd_h + PQstar_h
Cytb6f2_h : 2.0 H_h + PCox_h + PQstar_h --> 2.0 H_l + PCrd_h + PQ_h
PGR5PGRL11_h : Fdrd_h + PGR5_PGRL1ox_h --> Fdox_h + PGR5_PGRL1rd_h
PGR5PGRL12_h : 4.0 H_h + 2.0 PGR5_PGRL1rd_h + PQ_h --> 2.0 PGR5_PGRL1ox_h + PQH2_h
NDH1_h : Fdrd_h + 2.0 H_h + NDHox_h --> Fdox_h + 2.0 H_l + NDHrd_h
NDH2_h : 4.0 H_h + 2.0 NDHrd_h + PQ_h --> 2.0 NDHox_h + PQH2_h
PSI_h : Fdox_h + PCrd_h + hnu_h --> Fdrd_h + PCox_h
Fd_DASH_NADPR_h : 2.0 Fdrd_h + H_h + NADP_h --> 2.0 Fdox_h + NADPH_h
ATPase_h : ADP_h + 4.0 H_l + Pi_h <=> ATP_h + H2O_h + 3.0 H_h
RBC_h : CO2_h + H2O_h + RuBP_h --> 2.0 H_h + 2.0 PGA_h
PGAK_h : ATP_h + PGA_h <=> ADP_h + DPGA_h
GAPDH1_h : DPGA_h + H_h + NADPH_h --> GAP_h + NADP_h + Pi_h
TPI_h : GAP_h <=> DHAP_h
FBPA_h : DHAP_h + GAP_h <=> FBP_h
FBPase_h : FBP_h + H2O_h --> F6P_h + Pi_h
FTK_h : F6P_h + GAP_h <=> E4P_h + X5P_h
SBPA_h 

    ATP_m : C10H12N5O13P3
      Q_m : C49H74O4
    Fum_m : C4H2O4
    QH2_m : C49H76O4
    Mal_m : C4H4O5
      H_i : H
 Cytcox_m : C42H42N8O8S2FeR4
 Cytcrd_m : C42H42N8O8S2FeR4
     O2_m : O2
   PGCA_h : C2H2O6P
    GCA_h : C2H3O3
    GCA_p : C2H3O3
     O2_p : O2
    GLX_p : C2HO3
   H2O2_p : H2O2
    H2O_p : H2O
    Glu_p : C5H8NO4
    Gly_p : C2H5NO2
     KG_p : C5H4O5
    Gly_m : C2H5NO2
    LPL_m : C8H14NOS2R
  amDHP_m : C9H20N2OS2R
    THF_m : C19H21N7O6
    DHP_m : C8H16NOS2R
M_DASH_THF_m : C20H21N7O6
    NH4_m : H4N
    Ser_m : C3H7NO3
    Ser_p : C3H7NO3
    HPR_p : C3H3O4
   NADH_p : C21H27N7O14P2
      H_p : H
   GCEA_p : C3H5O4
    NAD_p : C21H26N7O14P2
    Mal_p : C4H4O5
    OAA_p : C4H2O5
   GCEA_h : C3H5O4
    GLP_h : C6H9O9P
    6PG_h : C6H10O10P
   DAHP_h : C7H10O10P
    DHQ_h : C7H9O6
    DHS_h : C7H7O5
     SA_h : C7H9O5
    S3P_h : C7H8O8P
   EPSP_h : C10H9O10P
    CHR_h : C10H8O6
   PRPP_h : C5H8O14P3
    AMP_h : C10H12N5O7P
    Mal_h : C4H4O5
    OAA_h : C4H2O5
 

AT1G48860 is associated with reactions: {EPSPS_h}
AT2G45300 is associated with reactions: {EPSPS_h}
AT1G48850 is associated with reactions: {CHRS_h}
AT2G35390 is associated with reactions: {R5PDPK_h}
AT1G10700 is associated with reactions: {R5PDPK_h}
AT1G32380 is associated with reactions: {R5PDPK_h}
AT2G44530 is associated with reactions: {R5PDPK_h}
AT4G15530 is associated with reactions: {PyrPiDK_h}
AT5G58330 is associated with reactions: {MalDH3_h}
AT1G79750 is associated with reactions: {MalDH4_h}
AT3G58740 is associated with reactions: {CitS_c}
AT2G42790 is associated with reactions: {CitS_c}
AT1G32480 is associated with reactions: {iCitDHNAD_c}
AT1G65930 is associated with reactions: {iCitDHNADP_c, iCitDHNADP_h}
AT5G14590 is associated with reactions: {iCitDHNADP_h, iCitDHNADP_m}
AT3G21720 is associated with reactions: {iCitL_p}
AT1G09430 is associated with reactions: {ATPCitL_c}
AT5G49460 is associated with reactions: {ATPCitL_c}
AT1G60810 is associated with reactions: {ATPCitL_

In [6]:
###################################################################
################## II/ Annotate the initial model
###################################################################


###################
############ Goal : Introducing new annotations to the initial model => use of MIRIAM standards for metabolites
###################


############
### 1st step : Create a mapping table for metabolites
############


fileName = datadir
model = cobra.io.read_sbml_model(fileName)
df_mapping_table_reactions = pd.DataFrame(
    {
        "IDs" : [x.id for x in model.reactions],
        "Name" : [x.name for x in model.reactions]
    })
df_mapping_table_reactions # mapping table 

Unnamed: 0,IDs,Name
0,PSII_h,photosystem II
1,Cytb6f1_h,cytochrom b6f complex
2,Cytb6f2_h,cytochrom b6f complex
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...
...,...,...
567,Tr_KG_Mal_mc,Tr_KG_Mal_mc
568,Tr_Asp_mc,Tr_Asp_mc
569,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc
570,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc


In [7]:
df_mapping_table_reactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572 entries, 0 to 571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   IDs     572 non-null    object
 1   Name    572 non-null    object
dtypes: object(2)
memory usage: 9.1+ KB


In [8]:
df_mapping_table_reactions_no_index = df_mapping_table_reactions.set_index('IDs')
df_mapping_table_reactions_no_index

Unnamed: 0_level_0,Name
IDs,Unnamed: 1_level_1
PSII_h,photosystem II
Cytb6f1_h,cytochrom b6f complex
Cytb6f2_h,cytochrom b6f complex
PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...
PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...
...,...
Tr_KG_Mal_mc,Tr_KG_Mal_mc
Tr_Asp_mc,Tr_Asp_mc
Tr_Asp_Glu_mc,Tr_Asp_Glu_mc
Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc


In [11]:
# Save the mapping table as a CSV file with IDs as index
df_mapping_table_reactions_no_index.to_csv("2021-05-20-CA-reaction-mapping/output/2021-05-17-CA-reaction-mapping-table1.csv")

In [9]:
mapping_table_reactions = pd.read_csv('2021-05-20-CA-reaction-mapping/output/2021-05-17-CA-reaction-mapping-table1.csv', index_col = 0)
mapping_table_reactions["BIGG Model IDs"] = ""
mapping_table_reactions["ModelSEED IDs"] = ""
mapping_table_reactions["KEGG IDs"] = ""
mapping_table_reactions.head()

Unnamed: 0_level_0,Name,BIGG Model IDs,ModelSEED IDs,KEGG IDs
IDs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PSII_h,photosystem II,,,
Cytb6f1_h,cytochrom b6f complex,,,
Cytb6f2_h,cytochrom b6f complex,,,
PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,,,
PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,,,


In [10]:
mapping_table_reactions.shape

(572, 4)

In [11]:
# Obtaining IDs with BIGG Model database

# Import the BIGG Models reactions file
bigg_models_reactions = pd.read_csv('2021-05-20-CA-reaction-mapping/data-databases/bigg_models_reactions.txt', sep = "\t")
bigg_models_reactions.head()

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids
0,DM_4crsol_c,Sink needed to allow p-Cresol to leave system,4crsol_c <->,iEcDH1_1363; iJO1366; iE2348C_1286; iECB_1328;...,RHEA: http://identifiers.org/rhea/35071; RHEA:...,DM_4CRSOL; DM_4crsol_c
1,DM_aacald_c,Sink needed to allow aminoacetaldehyde to leav...,aacald_c <->,iECB_1328; iBWG_1329; iE2348C_1286; iECBD_1354...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AACALD; DM_aacald_c; R_DM_AACALD
2,DM_amob_c,Sink needed to allow S-Adenosyl-4-methylthio-2...,amob_c <->,iECNA114_1301; iECDH10B_1368; iECP_1309; iECIA...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AMOB; DM_amob_c; sink_amob
3,BIOMASS_Ec_iJO1366_core_53p95M,E. coli biomass objective function (iJO1366) -...,0.000223 10fthf_c + 2.6e-05 2fe2s_c + 0.000223...,iZ_1308; iWFL_1372; iUTI89_1310; iUMNK88_1353;...,MetaNetX (MNX) Equation: http://identifiers.or...,Ec_biomass_iJO1366_core_53p95M
4,EX_12ppd__S_e,"(S)-Propane-1,2-diol exchange",12ppd__S_e <->,iECH74115_1262; iECP_1309; iECS88_1305; iECED1...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_12ppd_DASH_S_LPAREN_e_RPAREN_; EX_12ppd_S_L...


In [12]:
bigg_models_reactions.shape

(28301, 6)

In [13]:
# Selection of the columns of interest
bigg_id = bigg_models_reactions.iloc[:, :1]
bigg_id

Unnamed: 0,bigg_id
0,DM_4crsol_c
1,DM_aacald_c
2,DM_amob_c
3,BIOMASS_Ec_iJO1366_core_53p95M
4,EX_12ppd__S_e
...,...
28296,EX_LPS30__L_e
28297,GTLOA38
28298,GTGAL13RMN
28299,DM_LPS9_46_27_ST_p


In [14]:
df_merge = df_mapping_table_reactions.merge(bigg_id, how = 'left', left_on = 'IDs', right_on = 'bigg_id')
df_merge

Unnamed: 0,IDs,Name,bigg_id
0,PSII_h,photosystem II,
1,Cytb6f1_h,cytochrom b6f complex,
2,Cytb6f2_h,cytochrom b6f complex,
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,
...,...,...,...
567,Tr_KG_Mal_mc,Tr_KG_Mal_mc,
568,Tr_Asp_mc,Tr_Asp_mc,
569,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,
570,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,


In [15]:
# Load the ModelSEED reaction file
df_ModelSEED_reactions = pd.read_csv("2021-05-20-CA-reaction-mapping/data-databases/reactionsModelSEED.txt", sep= "\t")
df_ModelSEED_reactions.head()

Unnamed: 0,id,abbreviation,name,code,stoichiometry,is_transport,equation,definition,reversibility,direction,...,aliases,ec_numbers,deltag,deltagerr,compound_ids,status,is_obsolete,linked_reaction,notes,source
0,rxn00001,R00004,diphosphate phosphohydrolase,(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd0...,"-1:cpd00001:0:0:""H2O"";-1:cpd00012:0:0:""PPi"";2:...",0,(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd0...,(1) H2O[0] + (1) PPi[0] <=> (2) Phosphate[0] +...,>,=,...,AraCyc: INORGPYROPHOSPHAT-RXN|BiGG: IPP1; PPA;...,3.6.1.1,-3.46,0.05,cpd00001;cpd00009;cpd00012;cpd00067,OK,0,rxn27946;rxn27947;rxn27948;rxn32487;rxn38157;r...,GCC|HB|EQC|EQU,Primary Database
1,rxn00002,R00005,urea-1-carboxylate amidohydrolase,(1) cpd00001[0] + (1) cpd00742[0] <=> (2) cpd0...,"-1:cpd00001:0:0:""H2O"";-3:cpd00067:0:0:""H+"";-1:...",0,(1) cpd00001[0] + (3) cpd00067[0] + (1) cpd007...,(1) H2O[0] + (3) H+[0] + (1) Allophanate[0] =>...,>,>,...,AraCyc: ALLOPHANATE-HYDROLASE-RXN|BiGG: ALPHNH...,3.5.1.54,-20.14,1.86,cpd00001;cpd00011;cpd00013;cpd00067;cpd00742,OK,0,rxn30346;rxn35525,GCC|EQC|EQU,Primary Database
2,rxn00003,R00006,pyruvate:pyruvate acetaldehydetransferase (dec...,(1) cpd00011[0] + (1) cpd00668[0] <=> (2) cpd0...,"-1:cpd00011:0:0:""CO2"";-1:cpd00668:0:0:""ALCTT"";...",0,(1) cpd00011[0] + (1) cpd00668[0] <= (2) cpd00...,(1) CO2[0] + (1) ALCTT[0] <= (2) Pyruvate[0] +...,<,<,...,AlgaGEM: R_R00006_c|AraGEM: R_R00006_c|BiGG: I...,2.2.1.6,8.27,0.9,cpd00011;cpd00020;cpd00067;cpd00668,OK,0,rxn30144;rxn33164,GCC|EQC|EQU,Primary Database
3,rxn00004,R00008,4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lya...,(1) cpd02570[0] <=> (2) cpd00020[0],"-1:cpd02570:0:0:""Parapyruvate"";2:cpd00020:0:0:...",0,(1) cpd02570[0] <=> (2) cpd00020[0],(1) Parapyruvate[0] <=> (2) Pyruvate[0],=,=,...,KEGG: R00008|Name: 4-Hydroxy-4-methyl-2-oxoglu...,4.1.3.17,4.49,0.57,cpd00020;cpd02570,OK,0,,GCC|EQC|EQU,Primary Database
4,rxn00006,R00009,hydrogen-peroxide:hydrogen-peroxide oxidoreduc...,(2) cpd00025[0] <=> (2) cpd00001[0] + (1) cpd0...,"-2:cpd00025:0:0:""H2O2"";2:cpd00001:0:0:""H2O"";1:...",0,(2) cpd00025[0] => (2) cpd00001[0] + (1) cpd00...,(2) H2O2[0] => (2) H2O[0] + (1) O2[0],>,>,...,AraCyc: CATAL-RXN|BiGG: CAT; CATp; CTA1; CTT1|...,1.11.1.21|1.11.1.6,-46.06,1.64,cpd00001;cpd00007;cpd00025,OK,0,rxn19264;rxn22404;rxn27744;rxn31381,GCC|EQC|EQU,Primary Database


In [16]:
# Select the columns of interest
modelSEED_id = df_ModelSEED_reactions.iloc[:, :1]
modelSEED_id

Unnamed: 0,id
0,rxn00001
1,rxn00002
2,rxn00003
3,rxn00004
4,rxn00006
...,...
43769,rxn48571
43770,rxn48572
43771,rxn48573
43772,rxn48574


In [17]:
df_merge_modelSEED = df_mapping_table_reactions.merge(modelSEED_id, how = 'left', left_on = 'IDs', right_on = 'id')
df_merge_modelSEED

Unnamed: 0,IDs,Name,id
0,PSII_h,photosystem II,
1,Cytb6f1_h,cytochrom b6f complex,
2,Cytb6f2_h,cytochrom b6f complex,
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,
...,...,...,...
567,Tr_KG_Mal_mc,Tr_KG_Mal_mc,
568,Tr_Asp_mc,Tr_Asp_mc,
569,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,
570,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,


In [20]:
# Merge with KEGG database
# => doesn't seem to work : difficult for me to see where the files in the database are
# However, maybe we could obtain them thanks to Rhea database => contains files including compounds and reactions from 
# KEGG database
rhea_compounds = pd.read_csv('../tests/rhea-compounds.txt', sep = '\t')
rhea_compounds.head()

Unnamed: 0,ENTRY CHEBI:7
0,NAME (+)-car-3-ene
1,FORMULA C10H16
2,REACTION 32539 32540 32541 32542
3,ENZYME 4.2.3.107
4,///


In [21]:
rhea_compounds.shape

(78051, 1)

In [22]:
rhea_compounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78051 entries, 0 to 78050
Data columns (total 1 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ENTRY       CHEBI:7  78051 non-null  object
dtypes: object(1)
memory usage: 609.9+ KB
