# Creating reaction mapping table

## Install missing modules and load modules

In [None]:
import pandas as pd
import numpy as np
import cobra
import requests

## Load AraCore Model 

In [None]:
#Get file from github
fileName = 'https://raw.githubusercontent.com/ma-blaetke/CBM_C3_C4_Metabolism/master/data/2018-23-05-mb-genC3.sbml'
r = requests.get(fileName)

In [None]:
#Create model
model = cobra.io.read_sbml_model(r.text)

## Correct Compartment Naming in AraCore Model according to BiGG naming conventions 

In [None]:
model.compartments

In [None]:
bigg_compartments = {'c':	'cytosol',
'e':	'extracellular space',
'p':	'periplasm',
'm':	'mitochondria',
'x':	'peroxisome/glyoxysome',
'r':	'endoplasmic reticulum',
'v':	'vacuole',
'n':	'nucleus',
'g':	'golgi apparatus',
'u':	'thylakoid',
'l':	'lysosome',
'h':	'chloroplast',
'f':	'flagellum',
's':	'eyespot',
'im':	'intermembrane space of mitochondria',
'cx':	'carboxyzome',
'um':	'thylakoid membrane',
'cm':	'cytosolic membrane',
'i':	'inner mitochondrial compartment',
'mm':	'mitochondrial intermembrane',
'w':	'wildtype staph aureus',
'y':	'cytochrome complex'}

In [None]:
#http://bigg.ucsd.edu/compartments

#c	cytosol
#h	chloroplast
#m	mitochondria
#x	peroxisome/glyoxysome
#im	intermembrane space of mitochondria
#h	chloroplast
#ul thylakoid lumen <<< NEW


df_compartment_mapping = pd.Series(
  { 'c': 'c', #cytosol
  'h': 'h', #chloroplast
  'm': 'm', #mitochondria
  'p': 'x', #peroxisome/glyoxysome
  'i': 'im', #intermembrane space of mitochondria
  'l': 'ul', #thylakoid lumen <<< NEW, not in BiGG compartment list
   'e':'e', #extracellular space << NEW, not yet in model
  }
 )

## Create Reaction Table for AraCore Model 

In [None]:
#Create mapping table
df_reactions_aracore = pd.DataFrame(
    {
        "aracore_ids" : [r_obj.id for r_obj in model.reactions],
        "aracore_name" : [r_obj.name for r_obj in model.reactions],
        "aracore_annotations" : [r_obj.annotation for r_obj in model.reactions]
    })
 
df_reactions_aracore.set_index("aracore_ids",inplace=True) #Set index to acacore_ids to make the renaming of certain labels easiers

df_reactions_aracore.head(25)

In [None]:
#BiGG naming convention https://github.com/SBRG/bigg_models/wiki/BiGG-Models-ID-Specification-and-Guidelines
# - Reaction symbols all neeed to be uppercase
# - Only contain upper and lowercase letters, numbers, and underscores

#df.rename(index= {'name': 'new_name'})
#Rename reaction ids StS_h1, StS_h3, StS_h2
df_reactions_aracore.rename(index={'StS_h1':'StS1_h'}, inplace=True)
df_reactions_aracore.rename(index={'StS_h2':'StS2_h'}, inplace=True)
df_reactions_aracore.rename(index={'StS_h3':'StS3_h'}, inplace=True)

#Rename reaction ids CeS_c1, CeS_c2, CeS_c3
df_reactions_aracore.rename(index={'CeS_c1':'CeS1_c'}, inplace=True)
df_reactions_aracore.rename(index={'CeS_c2':'CeS2_c'}, inplace=True)
df_reactions_aracore.rename(index={'CeS_c3':'CeS3_c'}, inplace=True)

#Rename reaction id AGluK
df_reactions_aracore.rename(index={'AGluK':'AGluK_h'}, inplace=True)

#Rename all import reaction ids, add "_e" for extracellular space
for r_obj in model.reactions.query(lambda rxn: (rxn.id.startswith('Im_'))):
  df_reactions_aracore.rename(index={r_obj.id: f"{r_obj.id}_e"}, inplace=True)


#Rename all export reaction ids, add "_e" for extracellular space
for r_obj in model.reactions.query(lambda rxn: (rxn.id.startswith('Ex_'))):
    df_reactions_aracore.rename(index={r_obj.id: f"{r_obj.id}_e"}, inplace=True)

#Rename all transport reaction ids, add compartment symbol of first metabolite
for r_obj in model.reactions.query(lambda rxn: (rxn.id.startswith('Tr_'))):
  met_first_comp = list(r_obj.metabolites.keys())[0].compartment
  if r_obj.id.rsplit('_',1)[-1] in ['mc','hc']:
    r_id_new = f"{r_obj.id.rsplit('_',1)[0]}_{met_first_comp}"
  else:
    r_id_new = f"{r_obj.id}_{met_first_comp}"
  df_reactions_aracore.rename(index={r_obj.id: r_id_new}, inplace=True)


df_reactions_aracore.reset_index(inplace=True) #Reset index to numeric labels, to be able to use the apply function on 'aracore_updated_ids' which is not possible as index

#Update compartment symbols in metabolite ids and make metabolite ids lower case
df_reactions_aracore['aracore_updated_ids'] = df_reactions_aracore['aracore_ids'].apply(lambda r_id: f"{r_id.rsplit('_',1)[0]}_{df_compartment_mapping[r_id.rsplit('_',1)[-1]]}").str.upper()

#Create universal metabolite ids by removing compartment symbols
df_reactions_aracore['aracore_updated_universal_ids'] = df_reactions_aracore['aracore_updated_ids'].apply(lambda r_id: r_id.rsplit('_',1)[0])

df_reactions_aracore.head(25)

In [None]:
df_reactions_aracore[df_reactions_aracore['aracore_ids'].str.contains('DASH')]

In [None]:
# Extract Kegg Id from 'aracore_annotations' column (annotations are already provided as dictionaries => key "kegg.reactions")
df_reactions_aracore['kegg_id'] = df_reactions_aracore['aracore_annotations'].apply(lambda anno_dict: anno_dict['kegg.reaction'] if 'kegg.reaction' in anno_dict.keys() else None)

df_reactions_aracore.head(25)

In [None]:
# Extract Ec code from 'aracore_annotations' column (annotations are already provided as dictionaries => key "ec-code")
df_reactions_aracore['ec-code'] = df_reactions_aracore['aracore_annotations'].apply(lambda anno_dict: anno_dict['ec-code'] if 'ec-code' in anno_dict.keys() else None)

df_reactions_aracore.head(25)

## BiGG Reactions 

In [None]:
# Loag BIGG Reaction Table
bigg_reactions_url = 'http://bigg.ucsd.edu/static/namespace/bigg_models_reactions.txt'
df_reactions_bigg = pd.read_csv(bigg_reactions_url, sep='\t')
df_reactions_bigg.loc[170:200,:] # I just selected a range of reactions that will have a kegg identifier for sure, its only for demo

In [None]:
#Convert string of database links into dictionaries of database identifier/symbol (key) and database-specific metabolite/annotation id (value)
df_reactions_bigg['database_links'] = df_reactions_bigg['database_links'].apply(lambda str_links: {db_link.split(':',1)[-1].split('/')[-2]: db_link.split(':',1)[-1].split('/')[-1] for db_link in str_links.split(';')} if isinstance(str_links,str) else {})

In [None]:
#All database keys in database_links
np.unique(df_reactions_bigg['database_links'].apply(lambda x: list(x.keys())).sum())

In [None]:
#Extract kegg id from 'database_links' column (dictionary => key "kegg.reaction")
df_reactions_bigg['kegg.reaction'] = df_reactions_bigg['database_links'].apply(lambda dict_db_link:  dict_db_link['kegg.reaction'] if 'kegg.reaction' in dict_db_link.keys() else None)
df_reactions_bigg.loc[170:200]

In [None]:
#Extract ec-code from 'database_links' column (dictionary => key "ec-code")
df_reactions_bigg['ec-code'] = df_reactions_bigg['database_links'].apply(lambda dict_db_link:  dict_db_link['ec-code'] if 'ec-code' in dict_db_link.keys() else None)
df_reactions_bigg.loc[170:200]

In [None]:
#Extract seed.reaction from 'database_links' column (dictionary => key "seed.reaction")
df_reactions_bigg['seed.reaction'] = df_reactions_bigg['database_links'].apply(lambda dict_db_link:  dict_db_link['seed.reaction'] if 'seed.reaction' in dict_db_link.keys() else None)
df_reactions_bigg.loc[170:200]

## ModelSEED Reactions 

In [None]:
# Load ModelSeed Reaction Table

seed_reactions_url = 'https://raw.githubusercontent.com/ModelSEED/ModelSEEDDatabase/master/Biochemistry/reactions.tsv'
df_reactions_seed = pd.read_csv(seed_reactions_url, sep='\t')

df_reactions_seed.head(25)

In [None]:
#Convert string of alias pairs into dictionaries of keys and value - 
# 1) split by "|" to sepertae the different key - value pairs
# 2) split by ":" to seperate keys and values
df_reactions_seed['aliases'] = df_reactions_seed['aliases'].apply(lambda aliases_str: 
                                                                      {alias.split(':',1)[0]: alias.split(':',1)[-1] 
                                                                       for alias in aliases_str.split('|')} if isinstance(aliases_str, str) else {} )

# Some of the keys have values that are again a strings with multiple items that need to be splitted
# 3) split those  string by ';' and also remove leading and tailing white spaces 
df_reactions_seed['aliases'] = df_reactions_seed['aliases'].apply(lambda aliases_dict: 
                                                                      {alias_key:
                                                                       [alias_value.strip() for alias_value in alias_values.split(';')]
                                                                       for alias_key, alias_values in aliases_dict.items()}
                                                                      )

df_reactions_seed.head(25)

In [None]:
#All possible keys in ModelSEED aliases => seems like only KEGG, BiGG and AraCyc would make sense
#array(['AlgaGEM', 'AraCyc', 'AraGEM', 'BiGG', 'BrachyCyc', 'ChlamyCyc',
#       'CornCyc', 'DF_Athaliana', 'EcoCyc', 'JM_Creinhardtii',
#       'JP_Creinhardtii_MSB', 'JP_Creinhardtii_NMeth', 'KEGG', 'MaizeCyc',
#       'Maize_C4GEM', 'MetaCyc', 'Name', 'PlantCyc', 'PoplarCyc',
#       'RiceCyc', 'SorghumCyc', 'SoyCyc', 'TS_Athaliana', 'iAF1260',
#       'iAF692', 'iAG612', 'iAO358', 'iAbaylyiv4', 'iGT196', 'iIN800',
#       'iIT341', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904',
#       'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083',
#       'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844'], dtype='<U21')

In [None]:
#Get BiGG reaction ids
df_reactions_seed['BiGG'] = df_reactions_seed['aliases'].apply(lambda dict_aliases:  dict_aliases['BiGG'] if 'BiGG' in dict_aliases.keys() else None)
df_reactions_seed.head(25)

In [None]:
#Get BiGG reaction ids
df_reactions_seed['KEGG'] = df_reactions_seed['aliases'].apply(lambda dict_aliases:  dict_aliases['KEGG'] if 'KEGG' in dict_aliases.keys() else None)
df_reactions_seed.head(25)

In [None]:
#Get AraCyc reaction ids
df_reactions_seed['AraCyc'] = df_reactions_seed['aliases'].apply(lambda dict_aliases:  dict_aliases['AraCyc'] if 'AraCyc' in dict_aliases.keys() else None)
df_reactions_seed.head(25)

In [None]:
# 'ec_numbers' should also be convert into a list 
df_reactions_seed['ec_numbers'] = df_reactions_seed['ec_numbers'].astype(str)
df_reactions_seed['ec_numbers'] = df_reactions_seed['ec_numbers'].apply(lambda x: x.split('|') if x else None)
df_reactions_seed.head(25)

## Mapping AraCore and BiGG 

### Using KEGG Ids 

In [None]:
#Check if kegg_id of the aracore reactions is in any of the list of kegg ids mapped in the BiGG table
df_reactions_aracore['is_bigg_kegg_id'] = df_reactions_aracore['kegg_id'].apply(lambda kegg_id: ((df_reactions_bigg['kegg.reaction'] == kegg_id).sum() > 0) if isinstance(kegg_id, str) else False)

#get the BiGG ids where kegg ids are matching
df_reactions_aracore['kegg_bigg_ids'] = df_reactions_aracore[['kegg_id','is_bigg_kegg_id']].apply(lambda x: 
                                                                                            df_reactions_bigg[df_reactions_bigg['kegg.reaction'] == x[0]]['bigg_id'].tolist() 
                                                                                            if x[1] 
                                                                                            else [None], axis=1)
df_reactions_aracore.head(25)

In [None]:
df_reactions_aracore['is_bigg_kegg_id'].value_counts() #=> 249 reactions mapped

### Using Ec-code 

In [None]:
#Check if ec-code of the aracore reactions is in any of the list of ec-code mapped in the BiGG table
df_reactions_aracore['is_bigg_ec_code'] = df_reactions_aracore['ec-code'].apply(lambda kegg_id: ((df_reactions_bigg['ec-code'] == kegg_id).sum() > 0) if isinstance(kegg_id, str) else False)

#get the ec-code ids where kegg ids are matching
df_reactions_aracore['ec_bigg_ids'] = df_reactions_aracore[['ec-code','is_bigg_ec_code']].apply(lambda x: 
                                                                                            df_reactions_bigg[df_reactions_bigg['ec-code'] == x[0]]['bigg_id'].tolist() 
                                                                                            if x[1] 
                                                                                            else [None], axis=1)
df_reactions_aracore.head(50)

In [None]:
df_reactions_aracore['is_bigg_ec_code'].value_counts() #=> 249 reactions mapped 

## Aggregate all BiGG ids from mapping of KEGG ids and Ec-codes 

In [None]:
#Fuse list of mapped BiGG ids by kegg and ec code 
df_reactions_aracore['kegg_ec_bigg_id'] = df_reactions_aracore[['ec_bigg_ids','kegg_bigg_ids']].apply(lambda x: list(filter(None, x[0]+x[1])), axis=1)

#Create list of unique BiGG ids
df_reactions_aracore['kegg_ec_bigg_id'].apply(lambda x: list(np.unique(x)))

#Clean dataframe and remove cols
df_reactions_aracore.drop(['is_bigg_ec_code','is_bigg_kegg_id','ec_bigg_ids','kegg_bigg_ids'], axis=1, inplace=True)

df_reactions_aracore.head(50)

In [None]:
(df_reactions_aracore['kegg_ec_bigg_id'].apply(len) > 1).value_counts() #284 have mapped BiGG ids

## Add SEED Reaction ids that are known from the BiGG Table 

In [None]:
#Add all ModelSeed ids that have already been mapped to BiGG ids in the Bigg Table
df_reactions_aracore['bigg_seed_id'] = df_reactions_aracore['kegg_ec_bigg_id'].apply(lambda bigg_id_list: 
                                      [df_reactions_bigg[df_reactions_bigg['bigg_id'] == bigg_id]['seed.reaction'] for bigg_id in bigg_id_list] if bigg_id_list else [[None]] )
#Process packed lists of ModelSEED Ids
df_reactions_aracore['bigg_seed_id'] = df_reactions_aracore['bigg_seed_id'].apply(lambda x: [item_x  for sub_x in x for item_x in sub_x] )

#Create list of unique ModelSEED ids
df_reactions_aracore['bigg_seed_id'] = df_reactions_aracore['bigg_seed_id'].apply(lambda x: list(np.unique(list(filter(None,x)))))

## Mapping AraCore and ModelSeed 

### Using KEGG ids 

In [None]:
def check_for_kegg_modelseed(kegg_id):
  return df_reactions_seed[df_reactions_seed['KEGG'].apply(lambda x: kegg_id in x if x else False)]['id'].tolist() 

#Map KEGG ids in the aracore table and the ModelSEED table to get additional Seed ids
df_reactions_aracore['kegg_seed_ids'] = df_reactions_aracore['kegg_id'].apply(lambda kegg_id: check_for_kegg_modelseed(kegg_id) if isinstance(kegg_id, str) else [None])

df_reactions_aracore.head(25)

In [None]:
df_reactions_aracore['kegg_seed_ids'].apply(lambda x: isinstance(x,list)).value_counts() #319 reactions with ModelSeed Id

## Add BiGG Reaction ids that are known from the ModelSeed Table 

In [None]:
#Add all BiGG ids that have already been mapped to ModelSEED ids in the ModelSEED Table
df_reactions_aracore['seed_bigg_id'] = df_reactions_aracore['kegg_seed_ids'].apply(lambda kegg_seed_ids: 
                                            [df_reactions_seed[df_reactions_seed['id'] == seed_id]['BiGG'].tolist() 
                                            for seed_id in kegg_seed_ids ]
                                            if isinstance(kegg_seed_ids, list) else None ) 

#Process packed lists of BiGG Ids
df_reactions_aracore['seed_bigg_id'] = df_reactions_aracore['seed_bigg_id'].apply(lambda x: [item_x  for sub_x in x for item_x in sub_x] if x else None )

#Create list of unique BiGG Ids
df_reactions_aracore['seed_bigg_id'] = df_reactions_aracore['seed_bigg_id'].apply(lambda x: list(np.unique(list(filter(None,x)))) if x else [None])

In [None]:
df_reactions_aracore.head(50)

## Aggregate Seed and BiGG Ids 

In [38]:
#Fuse list of mapped BiGG ids and create list of unique ids
df_reactions_aracore['bigg_id_aggr'] = df_reactions_aracore[['seed_bigg_id','kegg_ec_bigg_id']].apply(lambda x: list(filter(None, x[0]+x[1])), axis=1)
df_reactions_aracore['bigg_id_aggr'].apply(lambda x: list(np.unique(x)))

#Fuse list of mapped Seeds ids and create list of unique ids
df_reactions_aracore['seed_id_aggr'] = df_reactions_aracore[['kegg_seed_ids','bigg_seed_id']].apply(lambda x: list(filter(None, x[0]+x[1])), axis=1)
df_reactions_aracore['seed_id_aggr'].apply(lambda x: list(np.unique(x)))

#Clean dataframe and drop cols
df_reactions_aracore.drop(['kegg_ec_bigg_id','bigg_seed_id','kegg_seed_ids','seed_bigg_id'], axis=1, inplace=True)

df_reactions_aracore.head(50)

Unnamed: 0,aracore_ids,aracore_name,aracore_annotations,aracore_updated_ids,aracore_updated_universal_ids,kegg_id,ec-code,bigg_id_aggr,seed_id_aggr
0,PSII_h,photosystem II,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",PSII_H,PSII,R09503,1.10.3.9,[],"[rxn16345, rxn34264]"
1,Cytb6f1_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",CYTB6F1_H,CYTB6F1,R03817,1.10.9.1,[],"[rxn11995, rxn39426]"
2,Cytb6f2_h,cytochrom b6f complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",CYTB6F2_H,CYTB6F2,R03817,1.10.9.1,[],"[rxn11995, rxn39426]"
3,PGR5PGRL11_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",PGR5PGRL11_H,PGR5PGRL11,R03817,,[],"[rxn11995, rxn39426]"
4,PGR5PGRL12_h,proton gradient regulation 5 (PGR5)/PGR5-like ...,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",PGR5PGRL12_H,PGR5PGRL12,R03817,,[],"[rxn11995, rxn39426]"
5,NDH1_h,NADH dehydrogenase-like (NDH) complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",NDH1_H,NDH1,R03817,,[],"[rxn11995, rxn39426]"
6,NDH2_h,NADH dehydrogenase-like (NDH) complex,"{'doi': '10.1016/j.tplants.2011.10.004', 'go':...",NDH2_H,NDH2,R03817,,[],"[rxn11995, rxn39426]"
7,PSI_h,photosystem I,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",PSI_H,PSI,R09542,1.97.1.12,[],[rxn16384]
8,Fd_DASH_NADPR_h,ferredoxin-NADP reductase,"{'ec-code': '1.18.1.2', 'go': 'GO:0019684', 'k...",FD_DASH_NADPR_H,FD_DASH_NADPR,R01195,1.18.1.2,"[FRDO, FNOR, FNOR]","[rxn14159, rxn05937, rxn14159]"
9,ATPase_h,ATPase,"{'doi': '10.1016/j.tplants.2011.10.004', 'ec-c...",ATPASE_H,ATPASE,R00086,3.6.3.14,"[ATPM, ATPOBJ, FATP, NTP1, U214, ATPS4rpp, ATP...","[rxn00062, rxn11300, rxn27586, rxn27587, rxn27..."


In [40]:
(df_reactions_aracore['bigg_id_aggr'].apply(len) != 0).value_counts() #298 reactions have at least one bigg id

True     298
False    274
Name: bigg_id_aggr, dtype: int64

In [41]:
(df_reactions_aracore['seed_id_aggr'].apply(len) != 0).value_counts() #320 reactions have at least one seed id

True     320
False    252
Name: seed_id_aggr, dtype: int64

In [39]:
#Export final mapping table for manual mapping
df_reactions_aracore.to_csv('../data/processed/2021-05-31-reactions-mapping-table.csv') 