In [1]:
################################################################################################################################
################################################################################################################################
############                                -*-  RAPESEED METABOLIC NETWORK -*-                                    #############
################################################################################################################################
################################################################################################################################



################################################################################################################################
############                       -*- Apply Community standards for AraCore Model-*-                              #############
################################################################################################################################



################################################################################################################################
############                                  -*- Parameters of the pipeline -*-                                   #############
################################################################################################################################


datadir = "../models/2018-23-05-mb-genC3.sbml"


In [2]:
################################################################################################################################
############                                         -*- Pipeline -*-                                              #############
################################################################################################################################



####################################################################
################## I/ Parsing the file
####################################################################


##################
########### Goal : Etract the information we need to annotate the model : metabolites, reactions and genes
##################


import cobra
import pandas as pd


def parsing(fileName):
    model = cobra.io.read_sbml_model(fileName)
    print("Reactions")
    print("---------")
    for x in model.reactions:
        print("%s : %s" % (x.id, x.reaction))

    print("Metabolites")
    print("-----------")
    for x in model.metabolites:
        print("%9s : %s" % (x.id, x.formula))

    print("Genes")
    print("-----")
    for x in model.genes:
        associated_ids = (i.id for i in x.reactions)
        print("%s is associated with reactions: %s" %
        (x.id, "{" + ", ".join(associated_ids) + "}"))

parsing(datadir)


Reactions
---------
PSII_h : 2.0 H2O_h + 4.0 H_h + 2.0 PQ_h + 4.0 hnu_h --> 4.0 H_l + O2_h + 2.0 PQH2_h
Cytb6f1_h : PCox_h + PQH2_h --> 2.0 H_l + PCrd_h + PQstar_h
Cytb6f2_h : 2.0 H_h + PCox_h + PQstar_h --> 2.0 H_l + PCrd_h + PQ_h
PGR5PGRL11_h : Fdrd_h + PGR5_PGRL1ox_h --> Fdox_h + PGR5_PGRL1rd_h
PGR5PGRL12_h : 4.0 H_h + 2.0 PGR5_PGRL1rd_h + PQ_h --> 2.0 PGR5_PGRL1ox_h + PQH2_h
NDH1_h : Fdrd_h + 2.0 H_h + NDHox_h --> Fdox_h + 2.0 H_l + NDHrd_h
NDH2_h : 4.0 H_h + 2.0 NDHrd_h + PQ_h --> 2.0 NDHox_h + PQH2_h
PSI_h : Fdox_h + PCrd_h + hnu_h --> Fdrd_h + PCox_h
Fd_DASH_NADPR_h : 2.0 Fdrd_h + H_h + NADP_h --> 2.0 Fdox_h + NADPH_h
ATPase_h : ADP_h + 4.0 H_l + Pi_h <=> ATP_h + H2O_h + 3.0 H_h
RBC_h : CO2_h + H2O_h + RuBP_h --> 2.0 H_h + 2.0 PGA_h
PGAK_h : ATP_h + PGA_h <=> ADP_h + DPGA_h
GAPDH1_h : DPGA_h + H_h + NADPH_h --> GAP_h + NADP_h + Pi_h
TPI_h : GAP_h <=> DHAP_h
FBPA_h : DHAP_h + GAP_h <=> FBP_h
FBPase_h : FBP_h + H2O_h --> F6P_h + Pi_h
FTK_h : F6P_h + GAP_h <=> E4P_h + X5P_h
SBPA_h 

    DAP_h : C7H14N2O4
   mDAP_h : C7H14N2O4
    Lys_h : C6H15N2O2
H_DASH_Ser_h : C4H9NO3
PH_DASH_Ser_h : C4H8NO6P
    CTH_h : C7H14N2O4S
H_DASH_Cys_h : C4H9NO2S
M_DASH_THF_c : C20H22N7O6
5M_DASH_THF_c : C20H24N7O6
H_DASH_Cys_c : C4H9NO2S
    Met_c : C5H11NO2S
   aMet_c : C15H22N6O5S
aH_DASH_Cys_c : C14H20N6O5S
    PRE_h : C10H8O6
    AGN_h : C10H12NO5
    Phe_h : C9H11NO2
    Arg_m : C6H15N4O2
    Orn_m : C5H13N2O2
   urea_m : CH5N2O
Glu_DASH_SeA_m : C5H9NO3
   GluP_c : C5H8NO7P
   GluP_h : C5H8NO7P
   GluP_m : C5H8NO7P
Glu_DASH_SeA_c : C5H9NO3
Glu_DASH_SeA_h : C5H9NO3
    P5C_c : C5H6NO2
    P5C_h : C5H6NO2
    P5C_m : C5H6NO2
    Pro_c : C5H9NO2
    Pro_m : C5H9NO2
P_DASH_HPR_h : C3H2O7P
   PSer_h : C3H6NO6P
    ANT_h : C7H6NO2
PR_DASH_ANT_h : C12H13NO9P
CPD_DASH_Ru5P_h : C12H13NO9P
Ind_DASH_GP_h : C11H12NO6P
    Ind_h : C8H7N
    Trp_h : C11H12N2O2
    Tyr_h : C9H11NO3
    Val_h : C5H11NO2
    PRA_h : C5H11NO7P
    GAR_h : C7H14N2O8P
   FGAR_h : C8H13N2O9P
   FGAM_h : C8H14N3O8P
   

AT3G06200 is associated with reactions: {GMPK_c}
AT2G41880 is associated with reactions: {GMPK_c}
AT3G600180 is associated with reactions: {NMPK_c}
AT5G26667 is associated with reactions: {NMPK_c}
AT1G14240 is associated with reactions: {CTPP_c}
AT1G14250 is associated with reactions: {CTPP_c}
AT3G04080 is associated with reactions: {CTPP_c}
AT1G14230 is associated with reactions: {CTPP_c}
AT4G19180 is associated with reactions: {CTPP_c}
AT2G02970 is associated with reactions: {CTPP_c}
AT3G23580 is associated with reactions: {ADPR_c, UDPR_c, GDPR_c, CDPR_c}
AT2G21790 is associated with reactions: {ADPR_c, UDPR_c, GDPR_c, CDPR_c}
AT3G27060 is associated with reactions: {ADPR_c, UDPR_c, GDPR_c, CDPR_c}
AT4G13720 is associated with reactions: {dUTPP_c}
AT3G46940 is associated with reactions: {dUTPP_c}
AT5G59440 is associated with reactions: {dTMPK_c}
AT4G23895 is associated with reactions: {UDPK_c, dCDPK_c, dADPK_c, dTDPK_c, GDPK_c, dGDPK_c, dUDPK_c}
AT5G63310 is associated with reactions

In [3]:
###################################################################
################## II/ Annotate the initial model
###################################################################



###################
############ Goal : Introducing new annotations to the initial model => Get as many BIGG/ModelSEED Ids based for the reactions
############ The plan is to extract the kegg_ids from the annotation of the AraCore model and to extract the kegg_ids from the 
############ BIGG reaction table. After that the kegg_id columns in the two resulting dataframes for the arcore model and bigg 
############ can be used to perform the mapping. (Similar approach for ModelSeed).
###################



# (1) Get the KEGG ids that are given in the annotation of our model 
# (2) Import the bigg_models_reactions.txt as a pandas table in python
# (3) the pandas table will have a column 'database_links', which provides in text format serveral urls to external databases.
# (4) We need to parse the column 'database_links' to get the KEGG url (it will probably start with 
# http://identifiers.org/kegg/ and continue with the KEGG reactions id. 
# -> We may want to use df['kegg_urls'] = df['database_links'].apply(lambda x: myfun(x)), my_fun(x) needs to convert the string
# of urls into a list of url, and than extract the url that starts with  http://identifiers.org/kegg/
# (5) So after we grabbed the correct url, we need to extract the KEGG reactions id from the url and put them into a new 
# column
# df['kegg_ids'] = df['kegg_urls'].split(....)



############
### 1st step : Get the KEGG ids that are given in the annotation of our model
############


fileName = datadir
model = cobra.io.read_sbml_model(fileName)
df_mapping_table_reactions = pd.DataFrame(
    {
        "IDs" : [x.id for x in model.reactions],
        "Name" : [x.name for x in model.reactions],
        "Annotations" : [x.annotation for x in model.reactions]
    })
df_mapping_table_reactions # mapping table 

Unnamed: 0,IDs,Name,Annotations
0,PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
1,Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
2,Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':..."
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':..."
...,...,...,...
567,Tr_KG_Mal_mc,Tr_KG_Mal_mc,{'go': 'GO:0006810'}
568,Tr_Asp_mc,Tr_Asp_mc,{'go': 'GO:0006810'}
569,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,{'go': 'GO:0006810'}
570,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,{'go': 'GO:0006810'}


In [4]:
df_mapping_table_reactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572 entries, 0 to 571
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   IDs          572 non-null    object
 1   Name         572 non-null    object
 2   Annotations  572 non-null    object
dtypes: object(3)
memory usage: 13.5+ KB


In [5]:
df_mapping_table_reactions_no_index = df_mapping_table_reactions.set_index('IDs')
df_mapping_table_reactions_no_index

Unnamed: 0_level_0,Name,Annotations
IDs,Unnamed: 1_level_1,Unnamed: 2_level_1
PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':..."
PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':..."
...,...,...
Tr_KG_Mal_mc,Tr_KG_Mal_mc,{'go': 'GO:0006810'}
Tr_Asp_mc,Tr_Asp_mc,{'go': 'GO:0006810'}
Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,{'go': 'GO:0006810'}
Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,{'go': 'GO:0006810'}


In [6]:
# Save the mapping table as a CSV file with IDs as index
df_mapping_table_reactions_no_index.to_csv("../data/processed/2021-05-29-CA-reactions-mapping-table1-output.csv")

In [6]:
df_mapping_table_reactions.shape

(572, 3)

In [7]:
df_mapping_table_reactions.loc[:,"Annotations"]

0      {'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...
1      {'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...
2      {'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...
3      {'doi': '10.1016/j.tplants.2011.10.004', 'go':...
4      {'doi': '10.1016/j.tplants.2011.10.004', 'go':...
                             ...                        
567                                 {'go': 'GO:0006810'}
568                                 {'go': 'GO:0006810'}
569                                 {'go': 'GO:0006810'}
570                                 {'go': 'GO:0006810'}
571                                 {'go': 'GO:0006810'}
Name: Annotations, Length: 572, dtype: object

In [8]:
# Function to get kegg_ids from dictionary in annotation column
def apply_split_annotations(dict_anno):
    if 'kegg.reaction' in dict_anno.keys():
        kegg_id = dict_anno['kegg.reaction']
    else:
        kegg_id = '' # False, np.nan, because, if we have another dataframe, the type of the data would be different
    return kegg_id
#apply_split_annotations(df_mapping_table_reactions)

''

In [9]:
df_mapping_table_reactions["kegg_id"] = df_mapping_table_reactions["Annotations"].apply(lambda dict_anno:apply_split_annotations(dict_anno))
# I write where I want to apply the apply() function and on which dataframe, then I write the lambda function = function defined on one line
# x is the variable on which I want to apply the lambda function
# The apply() function applies the function to every row of the dataframe
# => more efficient than for loops or while ones when we have big dataframes
# After that, we correct the function we have defined before : apply_split_annotation() here : for loop for example
# Lambda function : we define a function which takes dict_anno as parameter; then, we write the function we defined just before,
# including the parameter of interest (here, dict_anno)
# Written equivalent : df_mapping_table_reactions["kegg_id"] = df_mapping_table_reactions["Annotations"].apply(apply_split_annotations)
# => Here, we pass the function itself in parameter
df_mapping_table_reactions

Unnamed: 0,IDs,Name,Annotations,kegg_id
0,PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R09503
1,Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R03817
2,Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R03817
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",R03817
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",R03817
...,...,...,...,...
567,Tr_KG_Mal_mc,Tr_KG_Mal_mc,{'go': 'GO:0006810'},
568,Tr_Asp_mc,Tr_Asp_mc,{'go': 'GO:0006810'},
569,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,{'go': 'GO:0006810'},
570,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,{'go': 'GO:0006810'},


In [10]:
############
### 2nd step : Import the bigg_models_reactions.txt as a pandas table in python
############

bigg_models_reactions = pd.read_csv('../data/external/bigg_models_reactions.txt', sep = "\t")
bigg_models_reactions.head()

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids
0,DM_4crsol_c,Sink needed to allow p-Cresol to leave system,4crsol_c <->,iEcDH1_1363; iJO1366; iE2348C_1286; iECB_1328;...,RHEA: http://identifiers.org/rhea/35071; RHEA:...,DM_4CRSOL; DM_4crsol_c
1,DM_aacald_c,Sink needed to allow aminoacetaldehyde to leav...,aacald_c <->,iECB_1328; iBWG_1329; iE2348C_1286; iECBD_1354...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AACALD; DM_aacald_c; R_DM_AACALD
2,DM_amob_c,Sink needed to allow S-Adenosyl-4-methylthio-2...,amob_c <->,iECNA114_1301; iECDH10B_1368; iECP_1309; iECIA...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AMOB; DM_amob_c; sink_amob
3,BIOMASS_Ec_iJO1366_core_53p95M,E. coli biomass objective function (iJO1366) -...,0.000223 10fthf_c + 2.6e-05 2fe2s_c + 0.000223...,iZ_1308; iWFL_1372; iUTI89_1310; iUMNK88_1353;...,MetaNetX (MNX) Equation: http://identifiers.or...,Ec_biomass_iJO1366_core_53p95M
4,EX_12ppd__S_e,"(S)-Propane-1,2-diol exchange",12ppd__S_e <->,iECH74115_1262; iECP_1309; iECS88_1305; iECED1...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_12ppd_DASH_S_LPAREN_e_RPAREN_; EX_12ppd_S_L...


In [11]:
bigg_models_reactions.shape

(28301, 6)

In [12]:
#############
# 3rd step : The pandas table will have a column 'database_links', which provides in text format serveral urls to external databases.
#############

# Selection of the columns of interest
bigg_database_links = bigg_models_reactions.iloc[:, 4] # WARNING ! count the number of columns from 0 !!
bigg_database_links

0        RHEA: http://identifiers.org/rhea/35071; RHEA:...
1        MetaNetX (MNX) Equation: http://identifiers.or...
2        MetaNetX (MNX) Equation: http://identifiers.or...
3        MetaNetX (MNX) Equation: http://identifiers.or...
4        MetaNetX (MNX) Equation: http://identifiers.or...
                               ...                        
28296                                                  NaN
28297                                                  NaN
28298                                                  NaN
28299                                                  NaN
28300                                                  NaN
Name: database_links, Length: 28301, dtype: object

In [13]:
bigg_database_links.iloc[0]

'RHEA: http://identifiers.org/rhea/35071; RHEA: http://identifiers.org/rhea/35072; RHEA: http://identifiers.org/rhea/35073; RHEA: http://identifiers.org/rhea/35074; BioCyc: http://identifiers.org/biocyc/META:TRANS-RXN0-500; MetaNetX (MNX) Equation: http://identifiers.org/metanetx.reaction/MNXR96888'

In [14]:
#############
# 4th step : We need to parse the column 'database_links' to get the KEGG url
#############

# it will probably start with http://identifiers.org/kegg/) and continue with the KEGG reactions id. 
# -> We may want to use df['kegg_urls'] = df['database_links'].apply(lambda x: myfun(x)), my_fun(x) needs to convert the string
# of urls into a list of url, and then extract the url that starts with  http://identifiers.org/kegg/

def parsing_database_links(line):
    if type(line) is str:
        if "http://identifiers.org/kegg" in line:
            line_splited = line.rstrip(';').split(' ') # line_splited is the list containing 
            #print(line_splited)
            for string in line_splited:
                if 'kegg' in string:
                    kegg_url = string
                    kegg_url = kegg_url.rstrip(';')
                    print(kegg_url)
                else:
                    pass
        else:
            kegg_url = None
    else:
        kegg_url = None
    return kegg_url
#parsing_database_links(bigg_database_links)

In [15]:
bigg_models_reactions["kegg_url"] = bigg_models_reactions["database_links"].apply(lambda line:parsing_database_links(line))

http://identifiers.org/kegg.reaction/R03538
http://identifiers.org/kegg.reaction/R05135
http://identifiers.org/kegg.reaction/R04544
http://identifiers.org/kegg.reaction/R07764
http://identifiers.org/kegg.reaction/R04537
http://identifiers.org/kegg.reaction/R07127
http://identifiers.org/kegg.reaction/R01877
http://identifiers.org/kegg.reaction/R02148
http://identifiers.org/kegg.reaction/R04964
http://identifiers.org/kegg.reaction/R04543
http://identifiers.org/kegg.reaction/R05086
http://identifiers.org/kegg.reaction/R01530
http://identifiers.org/kegg.reaction/R01648
http://identifiers.org/kegg.reaction/R01177
http://identifiers.org/kegg.reaction/R03778
http://identifiers.org/kegg.reaction/R05222
http://identifiers.org/kegg.reaction/R00259
http://identifiers.org/kegg.reaction/R00226
http://identifiers.org/kegg.reaction/R04477
http://identifiers.org/kegg.reaction/R01811
http://identifiers.org/kegg.reaction/R01175
http://identifiers.org/kegg.reaction/R03857
http://identifiers.org/kegg.reac

http://identifiers.org/kegg.reaction/R07503
http://identifiers.org/kegg.reaction/R07236
http://identifiers.org/kegg.reaction/R07498
http://identifiers.org/kegg.reaction/R07758
http://identifiers.org/kegg.reaction/R07761
http://identifiers.org/kegg.reaction/R04672
http://identifiers.org/kegg.reaction/R04672
http://identifiers.org/kegg.reaction/R01172
http://identifiers.org/kegg.reaction/R03544
http://identifiers.org/kegg.reaction/R03545
http://identifiers.org/kegg.reaction/R01976
http://identifiers.org/kegg.reaction/R01976
http://identifiers.org/kegg.reaction/R01829
http://identifiers.org/kegg.reaction/R00206
http://identifiers.org/kegg.reaction/R00024
http://identifiers.org/kegg.reaction/R06962
http://identifiers.org/kegg.reaction/R03132
http://identifiers.org/kegg.reaction/R01288
http://identifiers.org/kegg.reaction/R00999
http://identifiers.org/kegg.reaction/R02433
http://identifiers.org/kegg.reaction/R04859
http://identifiers.org/kegg.reaction/R00586
http://identifiers.org/kegg.reac

http://identifiers.org/kegg.reaction/R03648
http://identifiers.org/kegg.reaction/R00355
http://identifiers.org/kegg.reaction/R05577
http://identifiers.org/kegg.reaction/R00126
http://identifiers.org/kegg.reaction/R00085
http://identifiers.org/kegg.reaction/R00086
http://identifiers.org/kegg.reaction/R07497
http://identifiers.org/kegg.reaction/R00009
http://identifiers.org/kegg.reaction/R01021
http://identifiers.org/kegg.reaction/R02396
http://identifiers.org/kegg.reaction/R00571
http://identifiers.org/kegg.reaction/R01001
http://identifiers.org/kegg.reaction/R01663
http://identifiers.org/kegg.reaction/R01826
http://identifiers.org/kegg.reaction/R01011
http://identifiers.org/kegg.reaction/R00130
http://identifiers.org/kegg.reaction/R01662
http://identifiers.org/kegg.reaction/R01121
http://identifiers.org/kegg.reaction/R04170
http://identifiers.org/kegg.reaction/R01329
http://identifiers.org/kegg.reaction/R06142
http://identifiers.org/kegg.reaction/R01280
http://identifiers.org/kegg.reac

http://identifiers.org/kegg.reaction/R01549
http://identifiers.org/kegg.reaction/R02371
http://identifiers.org/kegg.reaction/R02327
http://identifiers.org/kegg.reaction/R01880
http://identifiers.org/kegg.reaction/R02096
http://identifiers.org/kegg.reaction/R02372
http://identifiers.org/kegg.reaction/R02332
http://identifiers.org/kegg.reaction/R02016
http://identifiers.org/kegg.reaction/R00967
http://identifiers.org/kegg.reaction/R00228
http://identifiers.org/kegg.reaction/R00742
http://identifiers.org/kegg.reaction/R00317
http://identifiers.org/kegg.reaction/R01213
http://identifiers.org/kegg.reaction/R00704
http://identifiers.org/kegg.reaction/R00703
http://identifiers.org/kegg.reaction/R00212
http://identifiers.org/kegg.reaction/R00230
http://identifiers.org/kegg.reaction/R00230
http://identifiers.org/kegg.reaction/R00224
http://identifiers.org/kegg.reaction/R01196
http://identifiers.org/kegg.reaction/R00019
http://identifiers.org/kegg.reaction/R00032
http://identifiers.org/kegg.reac

In [16]:
bigg_models_reactions

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids,kegg_url
0,DM_4crsol_c,Sink needed to allow p-Cresol to leave system,4crsol_c <->,iEcDH1_1363; iJO1366; iE2348C_1286; iECB_1328;...,RHEA: http://identifiers.org/rhea/35071; RHEA:...,DM_4CRSOL; DM_4crsol_c,
1,DM_aacald_c,Sink needed to allow aminoacetaldehyde to leav...,aacald_c <->,iECB_1328; iBWG_1329; iE2348C_1286; iECBD_1354...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AACALD; DM_aacald_c; R_DM_AACALD,
2,DM_amob_c,Sink needed to allow S-Adenosyl-4-methylthio-2...,amob_c <->,iECNA114_1301; iECDH10B_1368; iECP_1309; iECIA...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AMOB; DM_amob_c; sink_amob,
3,BIOMASS_Ec_iJO1366_core_53p95M,E. coli biomass objective function (iJO1366) -...,0.000223 10fthf_c + 2.6e-05 2fe2s_c + 0.000223...,iZ_1308; iWFL_1372; iUTI89_1310; iUMNK88_1353;...,MetaNetX (MNX) Equation: http://identifiers.or...,Ec_biomass_iJO1366_core_53p95M,
4,EX_12ppd__S_e,"(S)-Propane-1,2-diol exchange",12ppd__S_e <->,iECH74115_1262; iECP_1309; iECS88_1305; iECED1...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_12ppd_DASH_S_LPAREN_e_RPAREN_; EX_12ppd_S_L...,
...,...,...,...,...,...,...,...
28296,EX_LPS30__L_e,Long O-antigen group O:30 export,LPS30__L_e <->,iYS1720,,EX_LPS30_L,
28297,GTLOA38,Lumped glycosyltransferase reactions group O:3...,uacgam_c + udcdpgalnac_c + udpg_c + 2.0 udpgal...,iYS1720,,GTLOA38,
28298,GTGAL13RMN,Rhamnosyltransferase gal(a1-3)rmn,dtdprmn_c + udcdpgal_c <-> dtdp_c + h_c + udcd...,iYS1720,,GTGAL13RMN,
28299,DM_LPS9_46_27_ST_p,Lipopolysaccharide with a short O-antigenconsi...,LPS9_46_27_ST_p <->,iYS1720,,EX_LPS9_46_27_ST,


In [57]:
bigg_models_reactions.to_csv("../data/processed/2021-05-29-CA-reactions-mapping-table2-kegg-url-output.csv")

In [17]:
#############
# 5th step : So after we grabbed the correct url, we need to extract the KEGG reactions id from the url and put them into a 
# new column
#############

# df['kegg_ids'] = df['kegg_urls'].split(....)
bigg_models_reactions['kegg_ids'] = bigg_models_reactions['kegg_url'].str.split('/')
bigg_models_reactions

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids,kegg_url,kegg_ids
0,DM_4crsol_c,Sink needed to allow p-Cresol to leave system,4crsol_c <->,iEcDH1_1363; iJO1366; iE2348C_1286; iECB_1328;...,RHEA: http://identifiers.org/rhea/35071; RHEA:...,DM_4CRSOL; DM_4crsol_c,,
1,DM_aacald_c,Sink needed to allow aminoacetaldehyde to leav...,aacald_c <->,iECB_1328; iBWG_1329; iE2348C_1286; iECBD_1354...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AACALD; DM_aacald_c; R_DM_AACALD,,
2,DM_amob_c,Sink needed to allow S-Adenosyl-4-methylthio-2...,amob_c <->,iECNA114_1301; iECDH10B_1368; iECP_1309; iECIA...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AMOB; DM_amob_c; sink_amob,,
3,BIOMASS_Ec_iJO1366_core_53p95M,E. coli biomass objective function (iJO1366) -...,0.000223 10fthf_c + 2.6e-05 2fe2s_c + 0.000223...,iZ_1308; iWFL_1372; iUTI89_1310; iUMNK88_1353;...,MetaNetX (MNX) Equation: http://identifiers.or...,Ec_biomass_iJO1366_core_53p95M,,
4,EX_12ppd__S_e,"(S)-Propane-1,2-diol exchange",12ppd__S_e <->,iECH74115_1262; iECP_1309; iECS88_1305; iECED1...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_12ppd_DASH_S_LPAREN_e_RPAREN_; EX_12ppd_S_L...,,
...,...,...,...,...,...,...,...,...
28296,EX_LPS30__L_e,Long O-antigen group O:30 export,LPS30__L_e <->,iYS1720,,EX_LPS30_L,,
28297,GTLOA38,Lumped glycosyltransferase reactions group O:3...,uacgam_c + udcdpgalnac_c + udpg_c + 2.0 udpgal...,iYS1720,,GTLOA38,,
28298,GTGAL13RMN,Rhamnosyltransferase gal(a1-3)rmn,dtdprmn_c + udcdpgal_c <-> dtdp_c + h_c + udcd...,iYS1720,,GTGAL13RMN,,
28299,DM_LPS9_46_27_ST_p,Lipopolysaccharide with a short O-antigenconsi...,LPS9_46_27_ST_p <->,iYS1720,,EX_LPS9_46_27_ST,,


In [18]:
# Filter the column "kegg_ids" of the dataframe to select the ids which are not null
bigg_models_reactions_kegg_ids = bigg_models_reactions[bigg_models_reactions["kegg_ids"].notna()]
bigg_models_reactions_kegg_ids

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids,kegg_url,kegg_ids
175,23PDE2pp,"2',3'-cyclic-nucleotide phosphodiesterase (UMP...",23cump_p + h2o_p <-> 3ump_p + h_p,iEC1364_W; iEC1368_DH5a; iEC1372_W3110; iEC135...,RHEA: http://identifiers.org/rhea/27878; RHEA:...,23PDE2pp,http://identifiers.org/kegg.reaction/R03538,"[http:, , identifiers.org, kegg.reaction, R03538]"
176,23PDE9pp,"2',3'-cyclic-nucleotide phosphodiesterase (GMP...",23cgmp_p + h2o_p <-> 3gmp_p + h_p,iEcHS_1320; iEcE24377_1341; iECS88_1305; iECOK...,RHEA: http://identifiers.org/rhea/27858; RHEA:...,23PDE9pp,http://identifiers.org/kegg.reaction/R05135,"[http:, , identifiers.org, kegg.reaction, R05135]"
198,3HAD160,3-hydroxyacyl-[acyl-carrier-protein] dehydrata...,3hpalmACP_c <-> h2o_c + tpalm2eACP_c,iEC1364_W; iEC1368_DH5a; iEC1372_W3110; iSynCJ...,EC Number: http://identifiers.org/ec-code/2.3....,3HAD160; _3HAD160,http://identifiers.org/kegg.reaction/R04544,"[http:, , identifiers.org, kegg.reaction, R04544]"
200,3HAD180,3-hydroxyacyl-[acyl-carrier-protein] dehydrata...,3hoctaACP_c <-> h2o_c + toctd2eACP_c,iSFV_1184; iSbBS512_1146; iECUMN_1333; iECSP_1...,EC Number: http://identifiers.org/ec-code/2.3....,3HAD180; _3HAD180,http://identifiers.org/kegg.reaction/R07764,"[http:, , identifiers.org, kegg.reaction, R07764]"
201,3HAD80,3-hydroxyacyl-[acyl-carrier-protein] dehydrata...,3hoctACP_c <-> h2o_c + toct2eACP_c,iAF1260; iECDH1ME8569_1439; iPC815; iJN746; iA...,EC Number: http://identifiers.org/ec-code/2.3....,3HAD80; _3HAD80,http://identifiers.org/kegg.reaction/R04537,"[http:, , identifiers.org, kegg.reaction, R04537]"
...,...,...,...,...,...,...,...,...
21049,r0627,Galactosylglycerol galactohydrolase Galactose ...,HC01444_e + h2o_e <-> gal_e + glyc_e,iCHOv1_DG44; Recon3D; iCHOv1,EC Number: http://identifiers.org/ec-code/3.2....,r0627,http://identifiers.org/kegg.reaction/R01104,"[http:, , identifiers.org, kegg.reaction, R01104]"
21057,r0673,"6-Lactoyl-5,6,7,8-tetrahydropterin:NADP+ 2-oxi...",6pthp_c + h_c + nadph_c <-> HC01254_c + nadp_c,iCHOv1; Recon3D,EC Number: http://identifiers.org/ec-code/1.1....,r0673,http://identifiers.org/kegg.reaction/R04285,"[http:, , identifiers.org, kegg.reaction, R04285]"
21058,r0708,"2-Amino-4-hydroxy-6-(erythro-1,2,3-trihydroxyp...",ahdt_c + h2o_c + 4.0 h_c <-> HC01710_c,iCHOv1,EC Number: http://identifiers.org/ec-code/3.5....,r0708,http://identifiers.org/kegg.reaction/R04639,"[http:, , identifiers.org, kegg.reaction, R04639]"
21072,r0789,S-Adenosyl-L-methionine:phosphodimethylethanol...,HC01842_c + amet_c <-> ahcys_c + cholp_c,Recon3D; iCHOv1_DG44; iCHOv1,EC Number: http://identifiers.org/ec-code/2.1....,r0789,http://identifiers.org/kegg.reaction/R06869,"[http:, , identifiers.org, kegg.reaction, R06869]"


In [19]:
# The split() seems to work
# We will take the last element of each line, e.g. the kegg id
def last_element_kegg_id(l):
    if l is not None:
        kegg_id = l[-1]
    else:
        kegg_id = None
    return kegg_id

In [20]:
bigg_models_reactions["kegg_ids_test"] = bigg_models_reactions["kegg_ids"].apply(lambda l:last_element_kegg_id(l))
bigg_models_reactions

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids,kegg_url,kegg_ids,kegg_ids_test
0,DM_4crsol_c,Sink needed to allow p-Cresol to leave system,4crsol_c <->,iEcDH1_1363; iJO1366; iE2348C_1286; iECB_1328;...,RHEA: http://identifiers.org/rhea/35071; RHEA:...,DM_4CRSOL; DM_4crsol_c,,,
1,DM_aacald_c,Sink needed to allow aminoacetaldehyde to leav...,aacald_c <->,iECB_1328; iBWG_1329; iE2348C_1286; iECBD_1354...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AACALD; DM_aacald_c; R_DM_AACALD,,,
2,DM_amob_c,Sink needed to allow S-Adenosyl-4-methylthio-2...,amob_c <->,iECNA114_1301; iECDH10B_1368; iECP_1309; iECIA...,MetaNetX (MNX) Equation: http://identifiers.or...,DM_AMOB; DM_amob_c; sink_amob,,,
3,BIOMASS_Ec_iJO1366_core_53p95M,E. coli biomass objective function (iJO1366) -...,0.000223 10fthf_c + 2.6e-05 2fe2s_c + 0.000223...,iZ_1308; iWFL_1372; iUTI89_1310; iUMNK88_1353;...,MetaNetX (MNX) Equation: http://identifiers.or...,Ec_biomass_iJO1366_core_53p95M,,,
4,EX_12ppd__S_e,"(S)-Propane-1,2-diol exchange",12ppd__S_e <->,iECH74115_1262; iECP_1309; iECS88_1305; iECED1...,MetaNetX (MNX) Equation: http://identifiers.or...,EX_12ppd_DASH_S_LPAREN_e_RPAREN_; EX_12ppd_S_L...,,,
...,...,...,...,...,...,...,...,...,...
28296,EX_LPS30__L_e,Long O-antigen group O:30 export,LPS30__L_e <->,iYS1720,,EX_LPS30_L,,,
28297,GTLOA38,Lumped glycosyltransferase reactions group O:3...,uacgam_c + udcdpgalnac_c + udpg_c + 2.0 udpgal...,iYS1720,,GTLOA38,,,
28298,GTGAL13RMN,Rhamnosyltransferase gal(a1-3)rmn,dtdprmn_c + udcdpgal_c <-> dtdp_c + h_c + udcd...,iYS1720,,GTGAL13RMN,,,
28299,DM_LPS9_46_27_ST_p,Lipopolysaccharide with a short O-antigenconsi...,LPS9_46_27_ST_p <->,iYS1720,,EX_LPS9_46_27_ST,,,


In [21]:
bigg_models_reactions_kegg_ids_test = bigg_models_reactions[bigg_models_reactions["kegg_ids_test"].notna()]
bigg_models_reactions_kegg_ids_test

Unnamed: 0,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids,kegg_url,kegg_ids,kegg_ids_test
175,23PDE2pp,"2',3'-cyclic-nucleotide phosphodiesterase (UMP...",23cump_p + h2o_p <-> 3ump_p + h_p,iEC1364_W; iEC1368_DH5a; iEC1372_W3110; iEC135...,RHEA: http://identifiers.org/rhea/27878; RHEA:...,23PDE2pp,http://identifiers.org/kegg.reaction/R03538,"[http:, , identifiers.org, kegg.reaction, R03538]",R03538
176,23PDE9pp,"2',3'-cyclic-nucleotide phosphodiesterase (GMP...",23cgmp_p + h2o_p <-> 3gmp_p + h_p,iEcHS_1320; iEcE24377_1341; iECS88_1305; iECOK...,RHEA: http://identifiers.org/rhea/27858; RHEA:...,23PDE9pp,http://identifiers.org/kegg.reaction/R05135,"[http:, , identifiers.org, kegg.reaction, R05135]",R05135
198,3HAD160,3-hydroxyacyl-[acyl-carrier-protein] dehydrata...,3hpalmACP_c <-> h2o_c + tpalm2eACP_c,iEC1364_W; iEC1368_DH5a; iEC1372_W3110; iSynCJ...,EC Number: http://identifiers.org/ec-code/2.3....,3HAD160; _3HAD160,http://identifiers.org/kegg.reaction/R04544,"[http:, , identifiers.org, kegg.reaction, R04544]",R04544
200,3HAD180,3-hydroxyacyl-[acyl-carrier-protein] dehydrata...,3hoctaACP_c <-> h2o_c + toctd2eACP_c,iSFV_1184; iSbBS512_1146; iECUMN_1333; iECSP_1...,EC Number: http://identifiers.org/ec-code/2.3....,3HAD180; _3HAD180,http://identifiers.org/kegg.reaction/R07764,"[http:, , identifiers.org, kegg.reaction, R07764]",R07764
201,3HAD80,3-hydroxyacyl-[acyl-carrier-protein] dehydrata...,3hoctACP_c <-> h2o_c + toct2eACP_c,iAF1260; iECDH1ME8569_1439; iPC815; iJN746; iA...,EC Number: http://identifiers.org/ec-code/2.3....,3HAD80; _3HAD80,http://identifiers.org/kegg.reaction/R04537,"[http:, , identifiers.org, kegg.reaction, R04537]",R04537
...,...,...,...,...,...,...,...,...,...
21049,r0627,Galactosylglycerol galactohydrolase Galactose ...,HC01444_e + h2o_e <-> gal_e + glyc_e,iCHOv1_DG44; Recon3D; iCHOv1,EC Number: http://identifiers.org/ec-code/3.2....,r0627,http://identifiers.org/kegg.reaction/R01104,"[http:, , identifiers.org, kegg.reaction, R01104]",R01104
21057,r0673,"6-Lactoyl-5,6,7,8-tetrahydropterin:NADP+ 2-oxi...",6pthp_c + h_c + nadph_c <-> HC01254_c + nadp_c,iCHOv1; Recon3D,EC Number: http://identifiers.org/ec-code/1.1....,r0673,http://identifiers.org/kegg.reaction/R04285,"[http:, , identifiers.org, kegg.reaction, R04285]",R04285
21058,r0708,"2-Amino-4-hydroxy-6-(erythro-1,2,3-trihydroxyp...",ahdt_c + h2o_c + 4.0 h_c <-> HC01710_c,iCHOv1,EC Number: http://identifiers.org/ec-code/3.5....,r0708,http://identifiers.org/kegg.reaction/R04639,"[http:, , identifiers.org, kegg.reaction, R04639]",R04639
21072,r0789,S-Adenosyl-L-methionine:phosphodimethylethanol...,HC01842_c + amet_c <-> ahcys_c + cholp_c,Recon3D; iCHOv1_DG44; iCHOv1,EC Number: http://identifiers.org/ec-code/2.1....,r0789,http://identifiers.org/kegg.reaction/R06869,"[http:, , identifiers.org, kegg.reaction, R06869]",R06869


In [None]:
###################
# ModelSEED : Similar approach and method used previously for BIGG Model
###################

# (1) Get the KEGG ids that are given in the annotation of our model 
# (2) Import the bigg_models_reactions.txt as a pandas table in python
# (3) the pandas table will have a column 'database_links', which provides in text format serveral urls to external databases.
# (4) We need to parse the column 'database_links' to get the KEGG url (it will probably start with 
# http://identifiers.org/kegg/ and continue with the KEGG reactions id. 
# -> We may want to use df['kegg_urls'] = df['database_links'].apply(lambda x: myfun(x)), my_fun(x) needs to convert the string
# of urls into a list of url, and than extract the url that starts with  http://identifiers.org/kegg/
# (5) So after we grabbed the correct url, we need to extract the KEGG reactions id from the url and put them into a new 
# column
# df['kegg_ids'] = df['kegg_urls'].split(....)



In [110]:
# Load the ModelSEED reaction file
df_ModelSEED_reactions = pd.read_csv("2021-05-20-CA-reaction-mapping/data-databases/reactionsModelSEED.txt", sep= "\t")
df_ModelSEED_reactions.head()

Unnamed: 0,id,abbreviation,name,code,stoichiometry,is_transport,equation,definition,reversibility,direction,...,aliases,ec_numbers,deltag,deltagerr,compound_ids,status,is_obsolete,linked_reaction,notes,source
0,rxn00001,R00004,diphosphate phosphohydrolase,(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd0...,"-1:cpd00001:0:0:""H2O"";-1:cpd00012:0:0:""PPi"";2:...",0,(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd0...,(1) H2O[0] + (1) PPi[0] <=> (2) Phosphate[0] +...,>,=,...,AraCyc: INORGPYROPHOSPHAT-RXN|BiGG: IPP1; PPA;...,3.6.1.1,-3.46,0.05,cpd00001;cpd00009;cpd00012;cpd00067,OK,0,rxn27946;rxn27947;rxn27948;rxn32487;rxn38157;r...,GCC|HB|EQC|EQU,Primary Database
1,rxn00002,R00005,urea-1-carboxylate amidohydrolase,(1) cpd00001[0] + (1) cpd00742[0] <=> (2) cpd0...,"-1:cpd00001:0:0:""H2O"";-3:cpd00067:0:0:""H+"";-1:...",0,(1) cpd00001[0] + (3) cpd00067[0] + (1) cpd007...,(1) H2O[0] + (3) H+[0] + (1) Allophanate[0] =>...,>,>,...,AraCyc: ALLOPHANATE-HYDROLASE-RXN|BiGG: ALPHNH...,3.5.1.54,-20.14,1.86,cpd00001;cpd00011;cpd00013;cpd00067;cpd00742,OK,0,rxn30346;rxn35525,GCC|EQC|EQU,Primary Database
2,rxn00003,R00006,pyruvate:pyruvate acetaldehydetransferase (dec...,(1) cpd00011[0] + (1) cpd00668[0] <=> (2) cpd0...,"-1:cpd00011:0:0:""CO2"";-1:cpd00668:0:0:""ALCTT"";...",0,(1) cpd00011[0] + (1) cpd00668[0] <= (2) cpd00...,(1) CO2[0] + (1) ALCTT[0] <= (2) Pyruvate[0] +...,<,<,...,AlgaGEM: R_R00006_c|AraGEM: R_R00006_c|BiGG: I...,2.2.1.6,8.27,0.9,cpd00011;cpd00020;cpd00067;cpd00668,OK,0,rxn30144;rxn33164,GCC|EQC|EQU,Primary Database
3,rxn00004,R00008,4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lya...,(1) cpd02570[0] <=> (2) cpd00020[0],"-1:cpd02570:0:0:""Parapyruvate"";2:cpd00020:0:0:...",0,(1) cpd02570[0] <=> (2) cpd00020[0],(1) Parapyruvate[0] <=> (2) Pyruvate[0],=,=,...,KEGG: R00008|Name: 4-Hydroxy-4-methyl-2-oxoglu...,4.1.3.17,4.49,0.57,cpd00020;cpd02570,OK,0,,GCC|EQC|EQU,Primary Database
4,rxn00006,R00009,hydrogen-peroxide:hydrogen-peroxide oxidoreduc...,(2) cpd00025[0] <=> (2) cpd00001[0] + (1) cpd0...,"-2:cpd00025:0:0:""H2O2"";2:cpd00001:0:0:""H2O"";1:...",0,(2) cpd00025[0] => (2) cpd00001[0] + (1) cpd00...,(2) H2O2[0] => (2) H2O[0] + (1) O2[0],>,>,...,AraCyc: CATAL-RXN|BiGG: CAT; CATp; CTA1; CTT1|...,1.11.1.21|1.11.1.6,-46.06,1.64,cpd00001;cpd00007;cpd00025,OK,0,rxn19264;rxn22404;rxn27744;rxn31381,GCC|EQC|EQU,Primary Database


In [111]:
# Select the columns of interest
modelSEED_id = df_ModelSEED_reactions.iloc[:, :1]
modelSEED_id

Unnamed: 0,id
0,rxn00001
1,rxn00002
2,rxn00003
3,rxn00004
4,rxn00006
...,...
43769,rxn48571
43770,rxn48572
43771,rxn48573
43772,rxn48574


In [112]:
df_merge_modelSEED = df_mapping_table_reactions.merge(modelSEED_id, how = 'left', left_on = 'IDs', right_on = 'id')
df_merge_modelSEED

Unnamed: 0,IDs,Name,Annotations,id
0,PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",
1,Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",
2,Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",
...,...,...,...,...
567,Tr_KG_Mal_mc,Tr_KG_Mal_mc,{'go': 'GO:0006810'},
568,Tr_Asp_mc,Tr_Asp_mc,{'go': 'GO:0006810'},
569,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,{'go': 'GO:0006810'},
570,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,{'go': 'GO:0006810'},


In [116]:
# (1) need to find a way to get the kegg ids that are already in the annotation of the model itself : add a column named
# "Annotations" in the CSV file, then parse this column to get the kegg ids if they have one
# (2) Add the model KEGG ids to our mapping table => apply() function
# (3) Map the KEGG model ids to the kegg ids in the bigg table for the reaction => merge() function
mapping_table_reactions = pd.read_csv('2021-05-20-CA-reaction-mapping/output/2021-05-25-CA-reaction-mapping-table2.csv', index_col = 0)
mapping_table_reactions.head()

Unnamed: 0_level_0,Name,Annotations
IDs,Unnamed: 1_level_1,Unnamed: 2_level_1
PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c..."
PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':..."
PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':..."


In [120]:
df_mapping_table_reactions_bigg_merge = df_mapping_table_reactions.merge(bigg_models_reactions, how = 'left', left_on = 'kegg_id', right_on = 'bigg_id')
df_mapping_table_reactions_bigg_merge

Unnamed: 0,IDs,Name,Annotations,kegg_id,bigg_id,name,reaction_string,model_list,database_links,old_bigg_ids
0,PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R09503,,,,,,
1,Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R03817,,,,,,
2,Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R03817,,,,,,
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",R03817,,,,,,
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",R03817,,,,,,
...,...,...,...,...,...,...,...,...,...,...
567,Tr_KG_Mal_mc,Tr_KG_Mal_mc,{'go': 'GO:0006810'},,,,,,,
568,Tr_Asp_mc,Tr_Asp_mc,{'go': 'GO:0006810'},,,,,,,
569,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,{'go': 'GO:0006810'},,,,,,,
570,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,{'go': 'GO:0006810'},,,,,,,


In [121]:
df_mapping_table_reactions_bigg_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 572 entries, 0 to 571
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   IDs              572 non-null    object
 1   Name             572 non-null    object
 2   Annotations      572 non-null    object
 3   kegg_id          572 non-null    object
 4   bigg_id          0 non-null      object
 5   name             0 non-null      object
 6   reaction_string  0 non-null      object
 7   model_list       0 non-null      object
 8   database_links   0 non-null      object
 9   old_bigg_ids     0 non-null      object
dtypes: object(10)
memory usage: 49.2+ KB


In [122]:
df_modelSEED_reactions = pd.read_csv('2021-05-20-CA-reaction-mapping/data-databases/reactionsModelSEED.txt', sep = '\t')
df_modelSEED_reactions.head()

Unnamed: 0,id,abbreviation,name,code,stoichiometry,is_transport,equation,definition,reversibility,direction,...,aliases,ec_numbers,deltag,deltagerr,compound_ids,status,is_obsolete,linked_reaction,notes,source
0,rxn00001,R00004,diphosphate phosphohydrolase,(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd0...,"-1:cpd00001:0:0:""H2O"";-1:cpd00012:0:0:""PPi"";2:...",0,(1) cpd00001[0] + (1) cpd00012[0] <=> (2) cpd0...,(1) H2O[0] + (1) PPi[0] <=> (2) Phosphate[0] +...,>,=,...,AraCyc: INORGPYROPHOSPHAT-RXN|BiGG: IPP1; PPA;...,3.6.1.1,-3.46,0.05,cpd00001;cpd00009;cpd00012;cpd00067,OK,0,rxn27946;rxn27947;rxn27948;rxn32487;rxn38157;r...,GCC|HB|EQC|EQU,Primary Database
1,rxn00002,R00005,urea-1-carboxylate amidohydrolase,(1) cpd00001[0] + (1) cpd00742[0] <=> (2) cpd0...,"-1:cpd00001:0:0:""H2O"";-3:cpd00067:0:0:""H+"";-1:...",0,(1) cpd00001[0] + (3) cpd00067[0] + (1) cpd007...,(1) H2O[0] + (3) H+[0] + (1) Allophanate[0] =>...,>,>,...,AraCyc: ALLOPHANATE-HYDROLASE-RXN|BiGG: ALPHNH...,3.5.1.54,-20.14,1.86,cpd00001;cpd00011;cpd00013;cpd00067;cpd00742,OK,0,rxn30346;rxn35525,GCC|EQC|EQU,Primary Database
2,rxn00003,R00006,pyruvate:pyruvate acetaldehydetransferase (dec...,(1) cpd00011[0] + (1) cpd00668[0] <=> (2) cpd0...,"-1:cpd00011:0:0:""CO2"";-1:cpd00668:0:0:""ALCTT"";...",0,(1) cpd00011[0] + (1) cpd00668[0] <= (2) cpd00...,(1) CO2[0] + (1) ALCTT[0] <= (2) Pyruvate[0] +...,<,<,...,AlgaGEM: R_R00006_c|AraGEM: R_R00006_c|BiGG: I...,2.2.1.6,8.27,0.9,cpd00011;cpd00020;cpd00067;cpd00668,OK,0,rxn30144;rxn33164,GCC|EQC|EQU,Primary Database
3,rxn00004,R00008,4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lya...,(1) cpd02570[0] <=> (2) cpd00020[0],"-1:cpd02570:0:0:""Parapyruvate"";2:cpd00020:0:0:...",0,(1) cpd02570[0] <=> (2) cpd00020[0],(1) Parapyruvate[0] <=> (2) Pyruvate[0],=,=,...,KEGG: R00008|Name: 4-Hydroxy-4-methyl-2-oxoglu...,4.1.3.17,4.49,0.57,cpd00020;cpd02570,OK,0,,GCC|EQC|EQU,Primary Database
4,rxn00006,R00009,hydrogen-peroxide:hydrogen-peroxide oxidoreduc...,(2) cpd00025[0] <=> (2) cpd00001[0] + (1) cpd0...,"-2:cpd00025:0:0:""H2O2"";2:cpd00001:0:0:""H2O"";1:...",0,(2) cpd00025[0] => (2) cpd00001[0] + (1) cpd00...,(2) H2O2[0] => (2) H2O[0] + (1) O2[0],>,>,...,AraCyc: CATAL-RXN|BiGG: CAT; CATp; CTA1; CTT1|...,1.11.1.21|1.11.1.6,-46.06,1.64,cpd00001;cpd00007;cpd00025,OK,0,rxn19264;rxn22404;rxn27744;rxn31381,GCC|EQC|EQU,Primary Database


In [123]:
df_mapping_table_reactions_modelSEED_merge = df_mapping_table_reactions.merge(df_modelSEED_reactions, how = 'left', left_on = 'kegg_id', right_on = 'abbreviation')
df_mapping_table_reactions_modelSEED_merge

Unnamed: 0,IDs,Name,Annotations,kegg_id,id,abbreviation,name,code,stoichiometry,is_transport,...,aliases,ec_numbers,deltag,deltagerr,compound_ids,status,is_obsolete,linked_reaction,notes,source
0,PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R09503,rxn16345,R09503,H2O:plastoquinone reductase (light-dependent),(2) cpd00001[0] + (4) cpd11632[0] + (2) cpd120...,"-2:cpd00001:0:0:""H2O"";-4:cpd11632:0:0:""hn"";-2:...",0.0,...,JP_Creinhardtii_NMeth: R_PSII|KEGG: R09503|Nam...,1.10.3.9,10000000.0,10000000.0,cpd00001;cpd00007;cpd11632;cpd12011;cpd16486,OK,0.0,rxn34264,GCP|EQP,Primary Database
1,Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R03817,rxn11995,R03817,plastoquinol:oxidized-plastocyanin oxidoreductase,(1) cpd01475[0] + (2) cpd12265[0] <=> (2) cpd1...,"-1:cpd01475:0:0:""Plastoquinol-1"";-2:cpd12265:0...",0.0,...,KEGG: R03817|Name: plastoquinol:oxidized-plast...,1.10.9.1,10000000.0,10000000.0,cpd00067;cpd01475;cpd12239;cpd12265;cpd16487,OK,0.0,,HB|GCP|EQP,Primary Database
2,Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",R03817,rxn11995,R03817,plastoquinol:oxidized-plastocyanin oxidoreductase,(1) cpd01475[0] + (2) cpd12265[0] <=> (2) cpd1...,"-1:cpd01475:0:0:""Plastoquinol-1"";-2:cpd12265:0...",0.0,...,KEGG: R03817|Name: plastoquinol:oxidized-plast...,1.10.9.1,10000000.0,10000000.0,cpd00067;cpd01475;cpd12239;cpd12265;cpd16487,OK,0.0,,HB|GCP|EQP,Primary Database
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",R03817,rxn11995,R03817,plastoquinol:oxidized-plastocyanin oxidoreductase,(1) cpd01475[0] + (2) cpd12265[0] <=> (2) cpd1...,"-1:cpd01475:0:0:""Plastoquinol-1"";-2:cpd12265:0...",0.0,...,KEGG: R03817|Name: plastoquinol:oxidized-plast...,1.10.9.1,10000000.0,10000000.0,cpd00067;cpd01475;cpd12239;cpd12265;cpd16487,OK,0.0,,HB|GCP|EQP,Primary Database
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",R03817,rxn11995,R03817,plastoquinol:oxidized-plastocyanin oxidoreductase,(1) cpd01475[0] + (2) cpd12265[0] <=> (2) cpd1...,"-1:cpd01475:0:0:""Plastoquinol-1"";-2:cpd12265:0...",0.0,...,KEGG: R03817|Name: plastoquinol:oxidized-plast...,1.10.9.1,10000000.0,10000000.0,cpd00067;cpd01475;cpd12239;cpd12265;cpd16487,OK,0.0,,HB|GCP|EQP,Primary Database
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,Tr_KG_Mal_mc,Tr_KG_Mal_mc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,
596,Tr_Asp_mc,Tr_Asp_mc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,
597,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,
598,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,


In [124]:
df_mapping_table_reactions_modelSEED_merge.info() # 256 reactions with no KEGG ID or with no correspondence between KEGG ID and
# abbreviation

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 599
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   IDs                600 non-null    object 
 1   Name               600 non-null    object 
 2   Annotations        600 non-null    object 
 3   kegg_id            600 non-null    object 
 4   id                 344 non-null    object 
 5   abbreviation       344 non-null    object 
 6   name               344 non-null    object 
 7   code               344 non-null    object 
 8   stoichiometry      344 non-null    object 
 9   is_transport       344 non-null    float64
 10  equation           344 non-null    object 
 11  definition         344 non-null    object 
 12  reversibility      344 non-null    object 
 13  direction          344 non-null    object 
 14  abstract_reaction  0 non-null      float64
 15  pathways           310 non-null    object 
 16  aliases            344 non

In [125]:
# We would like to search the reactions which have not an abbreviation linked to the KEGG ID 
df_mapping_table_NaN = df_mapping_table_reactions_modelSEED_merge[df_mapping_table_reactions_modelSEED_merge["abbreviation"].isna()]
df_mapping_table_NaN

Unnamed: 0,IDs,Name,Annotations,kegg_id,id,abbreviation,name,code,stoichiometry,is_transport,...,aliases,ec_numbers,deltag,deltagerr,compound_ids,status,is_obsolete,linked_reaction,notes,source
57,CeS_c1,cellulose synthase,"{'ec-code': '2.4.1.12', 'go': 'GO:0030244', 'k...",R02889,,,,,,,...,,,,,,,,,,
58,CeS_c2,cellulose synthase,"{'ec-code': '2.4.1.12', 'go': 'GO:0030244', 'k...",R02889,,,,,,,...,,,,,,,,,,
59,CeS_c3,cellulose synthase,"{'ec-code': '2.4.1.12', 'go': 'GO:0030244', 'k...",R02889,,,,,,,...,,,,,,,,,,
218,GABATA1_m,GABA transaminase,"{'ec-code': '2.6.1.96', 'go': 'GO:0006538', 'p...",,,,,,,,...,,,,,,,,,,
265,CTHS_h,CTH synthase,"{'ec-code': '2.5.1.48', 'go': 'GO:0009086'}",,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,Tr_KG_Mal_mc,Tr_KG_Mal_mc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,
596,Tr_Asp_mc,Tr_Asp_mc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,
597,Tr_Asp_Glu_mc,Tr_Asp_Glu_mc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,
598,Tr_Pyr_Mal_hc,R_Tr_Pyr_Mal_hc,{'go': 'GO:0006810'},,,,,,,,...,,,,,,,,,,


In [147]:
df_keggID_merge.info() # 253 reactions with a KEGG ID => OK

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256 entries, 0 to 255
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   IDs                256 non-null    object 
 1   Name               256 non-null    object 
 2   Annotations        256 non-null    object 
 3   kegg_id            256 non-null    object 
 4   id                 0 non-null      object 
 5   abbreviation       0 non-null      object 
 6   name               0 non-null      object 
 7   code               0 non-null      object 
 8   stoichiometry      0 non-null      object 
 9   is_transport       0 non-null      float64
 10  equation           0 non-null      object 
 11  definition         0 non-null      object 
 12  reversibility      0 non-null      object 
 13  direction          0 non-null      object 
 14  abstract_reaction  0 non-null      float64
 15  pathways           0 non-null      object 
 16  aliases            0 non-n

In [148]:
df_keggID_merge.to_csv('kegg_id.csv')