In [None]:
################################################################################################################################
################################################################################################################################
############                                -*-  RAPESEED METABOLIC NETWORK -*-                                    #############
################################################################################################################################
################################################################################################################################



################################################################################################################################
############                       -*- Apply Community standards for AraCore Model-*-                              #############
################################################################################################################################



################################################################################################################################
############                                  -*- Parameters of the pipeline -*-                                   #############
################################################################################################################################


# Path of the SBML model file read by every cell of the pipeline.
# NOTE(review): despite its name, `datadir` holds a file path, not a directory.
datadir = "../models/2018-23-05-mb-genC3.sbml"

In [None]:
################################################################################################################################
################################################################################################################################
############                                -*-  RAPESEED METABOLIC NETWORK -*-                                    #############
################################################################################################################################
################################################################################################################################



################################################################################################################################
############                       -*- Apply Community standards for AraCore Model-*-                              #############
################################################################################################################################



################################################################################################################################
############                                         -*- Pipeline -*-                                              #############
################################################################################################################################



####################################################################
################## I/ Parsing the file
####################################################################


##################
########### Goal : Extract the information we need to annotate the model: metabolites, reactions and genes
##################


import cobra
import pandas as pd


def parsing(fileName):
    """Load an SBML model and print an overview of its content.

    Three sections are written to standard output: every reaction with its
    equation, every metabolite with its formula, and every gene with the
    reactions it is associated with.

    Parameters
    ----------
    fileName : str
        Path of the SBML file, handed as-is to ``cobra.io.read_sbml_model``.

    Returns
    -------
    None
        The function only prints; the loaded model is not returned.
    """
    sbml_model = cobra.io.read_sbml_model(fileName)

    print("Reactions")
    print("---------")
    for reaction in sbml_model.reactions:
        print(f"{reaction.id!s} : {reaction.reaction!s}")

    print("Metabolites")
    print("-----------")
    for metabolite in sbml_model.metabolites:
        # Identifiers are right-aligned in a 9-character wide column.
        print(f"{metabolite.id!s:>9} : {metabolite.formula!s}")

    print("Genes")
    print("-----")
    for gene in sbml_model.genes:
        reaction_ids = ", ".join(linked.id for linked in gene.reactions)
        # Doubled braces print literal "{" and "}" around the id list.
        print(f"{gene.id!s} is associated with reactions: {{{reaction_ids}}}")

parsing(datadir)



###################################################################
################## II/ Annotate the initial model
###################################################################


###################
############ Goal : Introducing new annotations to the initial model => use of MIRIAM standards for metabolites
###################


############
### 1st step : Create a mapping table for metabolites
############


# Load the model from `datadir`: `fileName` is only assigned in a later cell,
# so using it here raised a NameError on a fresh kernel (Restart & Run All).
model = cobra.io.read_sbml_model(datadir)

# First look at the metabolites: one row per metabolite with its identifier,
# its name and its chemical formula.
df = pd.DataFrame(
    {
        "IDs" : [x.id for x in model.metabolites],
        "Name" : [x.name for x in model.metabolites],
        "Chemical formula" : [x.formula for x in model.metabolites]
    })
df

In [None]:
# Run the cobra SBML validator on the model file and display what it returns,
# to see whether there are problems with our file.
cobra.io.sbml.validate_sbml_model(datadir)

In [None]:
# Build the metabolite mapping table: one row per metabolite of the model,
# with its identifier, its name and its chemical formula.
fileName = datadir
model = cobra.io.read_sbml_model(fileName)
df_mapping_table = pd.DataFrame(
    {
        column: [getattr(metabolite, attribute) for metabolite in model.metabolites]
        for column, attribute in (("IDs", "id"), ("Name", "name"), ("Formula", "formula"))
    }
)
df_mapping_table  # mapping table

In [None]:
# Formulas written as the placeholder "X" are stored as missing values in a new column.
# WARNING: NaN is a float value, hence float("nan") rather than a string.
df_mapping_table["formula_no_X"] = df_mapping_table["Formula"].replace({"X": float("nan")})
# df_mapping_table["formula_no_X"] is a Series (one column); df_mapping_table is the whole data frame
df_mapping_table

In [None]:
# Dimensions of the mapping table: (number of rows, number of columns)
df_mapping_table.shape

In [None]:
# Obtain the IDs from the BIGG Models database

# Load the BIGG Models metabolites file (tab-separated) and preview its first rows
bigg_models_metabolites = pd.read_csv('../data/external/bigg_models_metabolites.txt', sep = "\t")
bigg_models_metabolites.head()

In [None]:
# Dimensions of the BIGG metabolites table: (number of rows, number of columns)
bigg_models_metabolites.shape

In [None]:
# To obtain universal BIGG IDs according to MIRIAM standards, we adapt our notation to the one used in
# the table bigg_models_metabolites, then merge the two tables.

# Keep only the two identifier columns of bigg_models_metabolites. They are selected by name rather
# than by position (iloc[:, :2]) so that the selection no longer depends on the column order of the
# downloaded file; "bigg_id" and "universal_bigg_id" are the names the later merges and filters use.
universal_bigg_id = bigg_models_metabolites[["bigg_id", "universal_bigg_id"]]
universal_bigg_id

In [None]:
# Lower-case the model identifiers; the lower-cased column is the key of the merge with the BIGG ids.
# The vectorised .str accessor replaces apply(str.lower): it gives the same result for strings and
# leaves a missing value as NaN instead of raising a TypeError.
df_mapping_table["ID_lower"] = df_mapping_table["IDs"].str.lower()
df_mapping_table

In [None]:
# First merge: attach the BIGG identifiers (universal bigg id) to the mapping table.
# merge() needs two data frames. The left join keeps every row of the mapping table; the BIGG
# columns are NaN when the lower-cased ID has no matching bigg_id.
df_merge = df_mapping_table.merge(
    universal_bigg_id,
    left_on="ID_lower",
    right_on="bigg_id",
    how="left",
)
df_merge

In [None]:
# Summary of the data frame obtained after the merge (columns, non-null counts, dtypes)
df_merge.info()

In [None]:
# Obtain the IDs from the ModelSEED database

# Load the ModelSEED compounds table (tab-separated file) and preview its first rows
compoundsModelSEED = pd.read_csv("../data/external/compoundsModelSEED.txt", sep = "\t")
compoundsModelSEED.head()

In [None]:
# Dimensions of the ModelSEED compounds table: (number of rows, number of columns)
compoundsModelSEED.shape

In [None]:
# Summary of the ModelSEED compounds table (columns, non-null counts, dtypes)
compoundsModelSEED.info()

In [None]:
def remove_container(ID_lower):
    """Return ``ID_lower`` without its last two characters.

    The two trailing characters are treated as the container (compartment)
    suffix of the identifier. The function handles one value at a time: it is
    meant to be passed to ``Series.apply``, which calls it once per row, so no
    explicit loop is needed here.

    NOTE(review): assumes every identifier ends with a 2-character suffix such
    as "_c" -- confirm against the model identifiers.
    """
    suffix_length = 2  # number of trailing characters that make up the container suffix
    return ID_lower[:-suffix_length]
# Apply remove_container to every lower-cased ID and store the result in a new "IDs_lower_no_c" column
df_merge["IDs_lower_no_c"] = df_merge["ID_lower"].apply(remove_container)
df_merge

In [None]:
# Attach the ModelSEED compounds (ids and abbreviations): left join of the container-free IDs
# on the ModelSEED abbreviations, keeping every row of df_merge.
df_merge_ModelSEED = df_merge.merge(
    compoundsModelSEED[["id", "abbreviation"]],
    left_on="IDs_lower_no_c",
    right_on="abbreviation",
    how="left",
)
df_merge_ModelSEED

In [None]:
# Summary of the data frame after the ModelSEED merge (columns, non-null counts, dtypes)
df_merge_ModelSEED.info()

In [None]:
# Keep the rows that have an ID in BOTH databases: BIGG (universal_bigg_id) and ModelSEED (abbreviation).
# notna() is True where a value is present, i.e. not missing.
df_merge_noNAN = df_merge_ModelSEED.loc[
    df_merge_ModelSEED["abbreviation"].notna() & df_merge_ModelSEED["universal_bigg_id"].notna()
]
df_merge_noNAN

In [None]:
# Summary of the rows that have an ID in both databases
df_merge_noNAN.info() 

In [None]:
# Keep the rows that have an ID in exactly ONE of the two databases (exclusive or):
# the two "value is present" masks differ only where a single database provides a match.
df_merge_ID = df_merge_ModelSEED.loc[
    df_merge_ModelSEED["universal_bigg_id"].notna() != df_merge_ModelSEED["abbreviation"].notna()
]
df_merge_ID

In [None]:
# Prefix the matched BIGG ids with "M_". Missing ids stay NaN.
# The column is updated in place, so the prefix is only added where it is not already present:
# without this guard, running the cell a second time produced "M_M_..." identifiers.
# The matched ids come from the lower-cased merge key, so an upper-case "M_" can only be our own prefix.
df_merge_ModelSEED["bigg_id"] = df_merge_ModelSEED["bigg_id"].where(
    df_merge_ModelSEED["bigg_id"].str.startswith("M_", na=False),
    "M_" + df_merge_ModelSEED["bigg_id"],
)
df_merge_ModelSEED

In [None]:
# Index the merged table by the model identifiers (column "IDs").
# The ModelSEED columns are renamed in a later cell, after this first version has been saved.
df_final = df_merge_ModelSEED.set_index("IDs")
df_final

In [None]:
# Save the first version of the mapping table as a CSV file
df_final.to_csv("../data/processed/2021-05-20-CA-metabolite-mapping-table1-output.csv")

In [None]:
# Give the ModelSEED columns explicit names: "id" -> ModelSEED_id, "abbreviation" -> ModelSEED_abbreviation
df_final_renamed = df_final.rename(
    columns={
        "id": "ModelSEED_id",
        "abbreviation": "ModelSEED_abbreviation",
    }
)
df_final_renamed

In [None]:
def cutID_lower(ID_lower):
    """Drop the final two characters of ``ID_lower`` and return the rest.

    The slicing is the same as in ``remove_container`` (defined in an earlier
    cell): the two trailing characters are taken to be the container suffix.
    """
    return ID_lower[:-2]
# Apply cutID_lower to every lower-cased ID and store the result in a new "BIGG_IDs_no_c" column
df_final_renamed["BIGG_IDs_no_c"] = df_final_renamed["ID_lower"].apply(cutID_lower)
df_final_renamed

In [None]:
# BIGG IDs: the 1st merge was done on the bigg_id column. Do we get more results with the universal BIGG IDs?
# => 2nd merge, this time matching the container-free IDs against universal_bigg_id.
# NOTE(review): if universal_bigg_id holds the same value on several rows, this left join repeats
# the matching rows of the mapping table -- confirm before interpreting the row counts.
df_merge_universal_bigg_id = df_final_renamed.merge(
    universal_bigg_id,
    left_on="BIGG_IDs_no_c",
    right_on="universal_bigg_id",
    how="left",
)
df_merge_universal_bigg_id
# We get a few more correspondences between the BIGG IDs and the universal BIGG IDs, but not many (40 more apparently)
# => We will try to merge on the column "formula", i.e. according to the chemical formulas

In [None]:
# Summary of the data frame after the merge on the universal BIGG ids
df_merge_universal_bigg_id.info()

In [None]:
# Plan for merging our metabolites with the ModelSEED compounds on the chemical formula (column "formula"):
# 1) Open and inspect the .sbml file.
# 2) Check whether the .sbml file contains chemical formulas.
# 3) If it does (this is the case here), store the formulas from the .sbml file in a dedicated column
#    of the data frame (columns "Formula" and "formula_no_X").
# 4) Remove the rows of the mapping table (df_mapping_table) that have no formula (NaN): this is the new
#    data frame created below. If the rows with missing formulas were kept, the merge would also be
#    performed on these "NaN" values, and a lot of data would be duplicated.
# 5) Merge the filtered data frame with compoundsModelSEED: the ModelSEED compounds database is used
#    because it contains chemical formulas, which is not the case for the BIGG database.
df_mapping_table_noNaN = df_mapping_table.loc[df_mapping_table["formula_no_X"].notna()]
df_mapping_table_noNaN

In [None]:
# Summary of the mapping table restricted to the rows that have a formula
df_mapping_table_noNaN.info()

In [None]:
# Merge df_mapping_table_noNaN with compoundsModelSEED on the chemical formula:
# left join of our "formula_no_X" column on the ModelSEED "formula" column.
df_merge_formula = df_mapping_table_noNaN.merge(
    compoundsModelSEED,
    left_on="formula_no_X",
    right_on="formula",
    how="left",
)
df_merge_formula
# In the results, and according to the compounds described in the ModelSEED database, Plastoquinone-1 seems to be the polymer 1
# => repetition of CH2-CH=CH-CH2 | CH2-CH=CH-CH3 once (n from 1 to 9)
# Now, we can have all the ModelSEED IDs

In [None]:
# Prefix the ModelSEED columns so that their origin is explicit
df_merge_formula = df_merge_formula.rename(
    columns={
        "id": "ModelSEED_id",
        "abbreviation": "ModelSEED_abbreviation",
        "name": "ModelSEED_name",
    }
)
df_merge_formula

In [None]:
# Summary of the formula-based merge.
# Fewer non-null ModelSEED_id values than IDs / formula_no_X values => 46 metabolites to search in the databases by hand
df_merge_formula.info()

In [None]:
# Now, we would like to combine the two data frames: the one without NaN (containing the IDs and the
# formulas for the two databases) and the one containing all the NaN.
# isna() is the opposite of notna(). Called on the whole data frame it does not select rows: it returns
# a boolean mask with the same shape as the mapping table, True where a value is missing.
# The mask is only displayed here. It is no longer stored in df_mapping_table_isNaN, because the next
# cell gives that name to the selected rows, and reusing it for a boolean mask was misleading.
df_mapping_table.isna()

In [None]:
# Using the isna() mask of the "formula_no_X" column as a filter selects the rows that have no formula
df_mapping_table_isNaN = df_mapping_table.loc[df_mapping_table["formula_no_X"].isna()]
df_mapping_table_isNaN
# Why are there no chemical formulas for plastocyanin, proton gradient and NADH (oxidised and reduced)?
# => 7 rows selected => 7 more metabolites to search by hand => 53 metabolites to search by hand in total (46 + 7)

In [None]:
# Concatenate the 2 data frames one below the other: the rows with a formula, merged with ModelSEED
# (df_merge_formula), followed by the rows without a formula (df_mapping_table_isNaN)
frames = [df_merge_formula, df_mapping_table_isNaN]
result = pd.concat(frames)
result

In [None]:
# Use the "IDs" column as the index, in place of the integer index shown on the left of the data frame
result_set_index = result.set_index('IDs')
result_set_index

In [None]:
# Save the data frame indexed by "IDs" as a CSV file.
# NOTE(review): to_csv presumably returns None when it is given a file path, so result_csv would not
# hold the CSV content -- confirm before relying on this variable.
result_csv = result_set_index.to_csv('../data/processed/2021-05-29-CA-metabolite-mapping-table2-output.csv')

In [None]:
# Next steps:
# 1) open the CSV file
# 2) search a few IDs in at least one of the two databases: BIGG or ModelSEED
# 3) report the universal bigg_id or the ID found in the ModelSEED database in the corresponding cell of
#    the CSV file (just for the comparison)
# 4) look at the IDs reported in the CSV file and find a way to merge the IDs of the starting data frame
#    with the reported ones (comparison of the starting IDs with the IDs found in the databases)
# => Merge the IDs of the starting mapping table (column named IDs) with the universal BIGG ID
# The file is comma-separated, which is the default separator of read_csv.
metabolite_mapping_table2 = pd.read_csv(
    "../data/processed/2021-05-29-CA-metabolite-mapping-table2-output.csv"
)
metabolite_mapping_table2.head()

In [None]:
# Lower-case the identifiers read back from the CSV file; the lower-cased column is the merge key.
# The vectorised .str accessor replaces apply(str.lower): it gives the same result for strings and
# leaves a missing value as NaN instead of raising a TypeError.
metabolite_mapping_table2["ID_lower"] = metabolite_mapping_table2["IDs"].str.lower()
metabolite_mapping_table2

In [None]:
# Attach the BIGG identifiers to the second mapping table: left join of the lower-cased IDs on bigg_id
metabolite_mapping_table2_merge = metabolite_mapping_table2.merge(
    universal_bigg_id,
    left_on="ID_lower",
    right_on="bigg_id",
    how="left",
)
metabolite_mapping_table2_merge

In [None]:
# Summary of the second mapping table after the merge with the BIGG ids
metabolite_mapping_table2_merge.info()

In [None]:
# We want the IDs that have NO match at all: neither a ModelSEED abbreviation nor a universal bigg id
# from the BIGG database.
# isna() is True where a value is missing; the two masks are combined with & (logical AND, not XOR),
# so a row is kept only when both identifiers are missing.
df_merge_ID2 = metabolite_mapping_table2_merge.loc[
    metabolite_mapping_table2_merge["universal_bigg_id"].isna()
    & metabolite_mapping_table2_merge["ModelSEED_abbreviation"].isna()
]
df_merge_ID2

In [None]:
# Summary of the rows without any match in the two databases
df_merge_ID2.info()
# In total: we would have 28 metabolites to search in the databases
# => How should the identifiers be formatted to comply with identifiers.org?

In [None]:
# Use the "IDs" column as the index of the final mapping table
metabolites_mapping_table_final = metabolite_mapping_table2_merge.set_index("IDs")
metabolites_mapping_table_final 

In [None]:
# Save the final mapping table (indexed by "IDs") as a CSV file
metabolites_mapping_table_final.to_csv("../data/processed/2021-05-29-CA-metabolite-mapping-table-final-output.csv")

In [None]:
# 22 metabolites found in the BIGG Model and ModelSEED databases out of 42
# => Why 42 instead of 28?
# => When opening the file and filtering the data in Excel or OpenOffice, filter on BOTH conditions
# described above, that is to say on the ModelSEED abbreviation AND on the universal BIGG id!

In [None]:
# Lighter version of the final mapping table: drop the formula helper columns, the lower-cased IDs
# and the ModelSEED descriptive columns listed below.
metabolites_mapping_table_final_version2 = metabolites_mapping_table_final.drop(
    columns=[
        "formula", "formula_no_X", "ID_lower", "mass", "source",
        "inchikey", "charge", "is_core", "is_obsolete", "linked_compound",
        "is_cofactor", "deltag", "deltagerr", "pka", "pkb",
        "abstract_compound", "comprised_of", "aliases", "smiles", "notes",
    ]
)
metabolites_mapping_table_final_version2

In [None]:
# Save the lighter version of the final mapping table as a CSV file
metabolites_mapping_table_final_version2.to_csv("../data/processed/2021-05-29-CA-metabolite-mapping-table-final2-output.csv")