# Cuantitative comparision of same condition in different studies

The goal of this notebook is to implement an automated comparision between conditions of different studies that:
- Checks for the consistency of predictions
- Accounts for study-level variability
- Infer core functions and genes/reactions that are condition-specific

In [45]:
import re
import cobra
import itertools
import pandas as pd
from IPython.lib.pretty import pretty

### 1.Load Used studies

In [2]:
species = 'escherichia_coli'
studies_file = 'sample_table.csv'

#### 1.1 Generate the study and the uptakes dictionaries

In [4]:
studies = ['Omics', 'Cra/Crp', 'Crp ARs', 'ICA']
entry_column = 'SRX'
condition_columns = ['Base Media', 'Carbon Source (g/L)', 'Supplement']
metadata_filepath = '/'.join(['data',species, studies_file])
#generate study dicts with SRX entries and calculate uptake with Growth Rate
metadata = pd.read_csv(metadata_filepath)

#Load the universal and the organism model
org_model_name = 'iML1515.xml'
uni_model_name = 'universal_model_cobrapy.json'
org_model_filepath = '/'.join(['models', org_model_name])
uni_model_filepath = '/'.join(['models', uni_model_name])
org_cobra_model = cobra.io.read_sbml_model(org_model_filepath)
uni_cobra_model = cobra.io.load_json_model(uni_model_filepath)
org_cobra_model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
ca2_e,EX_ca2_e,0.004565,0,0.00%
cl_e,EX_cl_e,0.004565,0,0.00%
cobalt2_e,EX_cobalt2_e,2.192e-05,0,0.00%
cu2_e,EX_cu2_e,0.0006218,0,0.00%
fe2_e,EX_fe2_e,0.01409,0,0.00%
glc__D_e,EX_glc__D_e,10.0,6,100.00%
k_e,EX_k_e,0.1712,0,0.00%
mg2_e,EX_mg2_e,0.007608,0,0.00%
mn2_e,EX_mn2_e,0.000606,0,0.00%
mobd_e,EX_mobd_e,6.139e-06,0,0.00%

Metabolite,Reaction,Flux,C-Number,C-Flux
4crsol_c,DM_4crsol_c,-0.0001956,7,0.01%
5drib_c,DM_5drib_c,-0.0001973,5,0.00%
amob_c,DM_amob_c,-1.754e-06,15,0.00%
co2_e,EX_co2_e,-24.0,1,99.99%
h2o_e,EX_h2o_e,-47.16,0,0.00%
h_e,EX_h_e,-8.058,0,0.00%
meoh_e,EX_meoh_e,-1.754e-06,1,0.00%


In [24]:
#get the metabolite dict from universal model
target_metabolites = ['glucose', 'glutamate', 'glycine', 'threonine', 'thiamine', 'acetate', 'fumarate', 'pyruvate']

exchanges = [{exchanges : met.name} 
             for met in [mm
                         for mm in uni_cobra_model.metabolites
                         if any([m in mm.name.lower() for m in target_metabolites])]
             for exchanges in [r.id for r in met.reactions if len(r.metabolites)==1 ]]

exchanges = { k : v
              for exchange in exchanges
              for k,v in exchange.items()
            }

#curate it by only taking the lowest Levenshtein distances
exchanges_curated = {}
for met in target_metabolites:
    candidate_dict = {}
    for k, v in exchanges.items():
        if met.lower() in v.lower():
            candidate_dict[len(v)-len(met)] = k

    best_candidate = candidate_dict[min(candidate_dict.keys())]
    exchanges_curated[met] = uni_cobra_model.reactions.get_by_id(best_candidate)
print(exchanges_curated)
#with this translate the study dicts

{'glucose': <Reaction EX_glc__D_e at 0x7f9fdaa7d1b0>, 'glutamate': <Reaction EX_glu__D_e at 0x7f9fd7f9b5b0>, 'glycine': <Reaction EX_gly_e at 0x7f9fdaa7d630>, 'threonine': <Reaction EX_thr__L_e at 0x7f9fdde7a230>, 'thiamine': <Reaction EX_thmpp_e at 0x7f9fddca8c70>, 'acetate': <Reaction EX_ac_e at 0x7f9fdac2a5f0>, 'fumarate': <Reaction EX_fum_e at 0x7f9fdaa7caf0>, 'pyruvate': <Reaction EX_pyr_e at 0x7f9fdde794b0>}


In [35]:
#generate df with all valid samples from selected studies
interest_columns = [ 'study',
                     'condition',
                     'Strain Description',
                     'Base Media', 
                     'Carbon Source (g/L)', 
                     'Nitrogen Source (g/L)',
                     'Electron Acceptor',
                     'Supplement',
                     'Growth Rate (1/hr)',
                      entry_column
                    ]

uptakes_dict = {}

studies_metadata = metadata[interest_columns]
valid_studies_metadata = studies_metadata.loc[studies_metadata['study'].isin(studies) &
                                              studies_metadata['Strain Description'].str.contains('MG1655') ] #only use data from strain MG1655

display(valid_studies_metadata)


Unnamed: 0,study,condition,Strain Description,Base Media,Carbon Source (g/L),Nitrogen Source (g/L),Electron Acceptor,Supplement,Growth Rate (1/hr),SRX
48,Omics,wt_glu,Escherichia coli K-12 MG1655,M9,glucose(4),NH4Cl(1),O2,glutamate (10mM),,SRX4985299
49,Omics,wt_glu,Escherichia coli K-12 MG1655,M9,glucose(4),NH4Cl(1),O2,glutamate (10mM),,SRX4985300
50,Omics,wt_gly,Escherichia coli K-12 MG1655,M9,glucose(4),NH4Cl(1),O2,glycine (10mM),,SRX661397
51,Omics,wt_gly,Escherichia coli K-12 MG1655,M9,glucose(4),NH4Cl(1),O2,glycine (10mM),,SRX661398
52,Omics,wt_thr,Escherichia coli K-12 MG1655,M9,glucose(4),NH4Cl(1),O2,threonine (10mM),,SRX661399
...,...,...,...,...,...,...,...,...,...,...
179,ICA,ura_pyr,Escherichia coli K-12 MG1655,M9,pyruvate(3.3),NH4Cl(1),O2,uracil (1 mM),0.27,SRX4993803
180,ICA,wt_glc,Escherichia coli K-12 MG1655,M9,glucose(2),NH4Cl(1),O2,,0.63,SRX5975758
181,ICA,wt_glc,Escherichia coli K-12 MG1655,M9,glucose(2),NH4Cl(1),O2,,0.63,SRX5975759
184,ICA,ade_glc,Escherichia coli K-12 MG1655,M9,glucose(2),NH4Cl(1),O2,adenine (100mg/L),0.78,SRX5975762


In [52]:
#generate the different study dicts:
for study in studies:
    study_dict = {}
    uptake_dict = {}
    study_df = valid_studies_metadata.loc[metadata['study']==study]
    study_df.fillna('', inplace=True)
    unique_features = [ study_df[feature].unique().tolist() for feature in condition_columns ]
    feature_combination = list(itertools.product(*unique_features))
    for (media, carbon_source, supplement) in feature_combination:
        condition_df = study_df.loc[ (study_df[condition_columns[0]]==media) &
                                     (study_df[condition_columns[1]]==carbon_source) &
                                     (study_df[condition_columns[2]]==supplement)
                                   ]
        if len(condition_df)>1:
            for index, row in condition_df.iterrows():
                media_tag = '-'.join([ re.sub(r"\s?\(.*\)", "", row[col].lower())  for col in condition_columns  ])
                media_tag = [exchanges_curated[met].id
                             if met in exchanges_curated.keys()
                             else met
                             for met in media_tag.split('-') ]
                study_dict['-'.join(media_tag)] = condition_df[entry_column].tolist()
    
                             
    print('_'.join([study, 'study']))
    print(pretty(study_dict))
    print('-------------------------------------------------------------------')

Omics_study
{'m9-EX_glc__D_e-EX_glu__D_e': ['SRX4985299', 'SRX4985300'],
 'm9-EX_glc__D_e-EX_gly_e': ['SRX661397', 'SRX661398'],
 'm9-EX_glc__D_e-EX_thr__L_e': ['SRX661399', 'SRX661400']}
-------------------------------------------------------------------
Cra/Crp_study
{'m9-EX_ac_e-': ['SRX865361', 'SRX865362', 'SRX865367', 'SRX865368'],
 'm9-fructose-': ['SRX865359', 'SRX865360', 'SRX865365', 'SRX865366'],
 'm9-EX_glc__D_e-': ['SRX865363', 'SRX865364']}
-------------------------------------------------------------------
Crp ARs_study
{'m9-fructose-': ['SRX837339', 'SRX837330', 'SRX837331', 'SRX837332'],
 'm9-glycerol-': ['SRX837344',
  'SRX837345',
  'SRX837322',
  'SRX837323',
  'SRX837324',
  'SRX837325',
  'SRX837326',
  'SRX837327',
  'SRX837328',
  'SRX837329',
  'SRX837336',
  'SRX837337',
  'SRX837338'],
 'm9-EX_glc__D_e-': ['SRX837333', 'SRX837334', 'SRX837335']}
-------------------------------------------------------------------
ICA_study
{'m9-EX_glc__D_e-': ['SRX4993780',
  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  study_df.fillna('', inplace=True)


#### *P.putida KT2440* studies

In [None]:
carbon_study = {   'm9-EX_cit_e:7.44' : ['SRX4552613',
                                         'SRX4552614',
                                         'SRX4552615'],
                   'm9-EX_fer_e:2.91' : ['SRX4552616',      #Experimentally measured in 0.1 glu
                                         'SRX4552617',
                                         'SRX4552618'],
                   'm9-EX_glc_e:7.44' : ['SRX4552619',      #Experimentally measured in 0.1 glu
                                         'SRX4552621'],
                   'm9-EX_ser__L_e:14.88' : ['SRX4552622',
                                             'SRX4552623',
                                             'SRX4552624'] }  



aromatics_study = { 'm9-EX_T4hcinnm_e:4.04' : ['SBRG_UNeb__coum__1',            #Experimentally measured in 0.1 glu
                                               'SBRG_UNeb__coum__2',
                                               'SBRG_UNeb__coum__3'], 
                    'm9-EX_T4hcinnm_e:2.02-EX_fer_e:1.4505' : ['SBRG_UNeb__coumferul__1', #Derived
                                                               'SBRG_UNeb__coumferul__2',
                                                               'SBRG_UNeb__coumferul__3'],  
                    'm9-EX_fer_e:2.91' : ['SBRG_UNeb__ferulate__1',            #Experimentally measured in 0.1 glu
                                          'SBRG_UNeb__ferulate__2',
                                          'SBRG_UNeb__ferulate__3'],  
                    'm9-EX_glc_e:7.44' : ['SBRG_UNeb__glc__1',                 #Experimentally measured in 0.1 glu
                                          'SBRG_UNeb__glc__2',
                                          'SBRG_UNeb__glc__3'] }

muconate_dict = {  'm9-EX_glc_e:7.44' : ['KT2440_glu_1',                 #Experimentally measured in 0.1 glu
                                        'KT2440_glu_2',
                                        'KT2440_glu_3'],
                   'm9-EX_glcn_e:7.44':['KT2440_glc_1',                 
                                        'KT2440_glc_2',
                                        'KT2440_glc_3'],
                   'm9-EX_fru_e:1.32' :['KT2440_f_1',                   #Multiplying the ratio of glucose and fructose
                                        'KT2440_f_2',                   #fluxes found in 0.3 glu by the glu flux in 0.1
                                        'KT2440_f_3'],                  #glu
                   'm9-EX_glc_e:3.72-EX_glcn_e:3.72' : ['KT2440_gg_1',
                                                        'KT2440_gg_2',
                                                        'KT2440_gg_3'],
                   'm9-EX_glc_e:3.72-EX_fru_e:0.66':['KT2440_fg_1',
                                                        'KT2440_fg_2',
                                                        'KT2440_fg_3'],
                   'm9-EX_glc_e:2.48-EX_glcn_e:2.48-EX_fru_e:0.44' : ['KT2440_gfg_1',
                                                                      'KT2440_gfg_2',
                                                                      'KT2440_gfg_3'] }

study_dict = { 'm9-EX_glc_e:7.44' : ['SRX7195897',        #Experimentally measured in 0.1 glu
                                     'SRX7195898',
                                     'SRX7195899'],
               'm9-EX_T4hcinnm_e:4.04' : ['SRX7195900',
                                          'SRX7195901',
                                          'SRX7195902']}  #Experimentally measured in 0.1 glu


#### *E. coli K12* studies

In [None]:
omics_study = { 'm9-glucose(4)-glutamate (10mm)': ['SRX4985299',
                                                   'SRX4985300'],
                'm9-glucose(4)-glycine (10mm)': ['SRX661397',
                                                 'SRX661398'],
                'm9-glucose(4)-threonine (10mm)': ['SRX661399',
                                                   'SRX661400']
              }


Cra_Crp_study = { 'm9-acetate(2)-': ['SRX865361',
                                     'SRX865362',    #Need to include deletion 
                                     'SRX865367',    #of Cra in last 2 replicates
                                     'SRX865368'],
                  'm9-fructose(2)-': ['SRX865359',
                                      'SRX865360',    #Need to include deletion   
                                      'SRX865365',    #of Cra in last 2 replicates
                                      'SRX865366'],
                  'm9-glucose(2)-': ['SRX865363',     #Need to include deletion
                                     'SRX865364'] }   #of Cra for 2 replicates


Crp_ARs_study = {'m9-fructose(2)-': [ 'SRX837339',
                                      'SRX837330',   #Include deletion of crp in last 3 repl
                                      'SRX837331',
                                      'SRX837332'],
                 'm9-glycerol(2)-': [ 'SRX837344',
                                      'SRX837345',
                                      'SRX837322',  #Ar1 deletion
                                      'SRX837323',  #Ar1 deletion
                                      'SRX837324',  #Ar1 and Ar2 deletion
                                      'SRX837325',  #Ar1 and Ar2 deletion
                                      'SRX837326',  #Ar1 and Ar2 deletion
                                      'SRX837327',  #Ar2 deletion
                                      'SRX837328',  #Ar2 deletion
                                      'SRX837329',  #Ar2 deletion
                                      'SRX837336',  #crp deletion
                                      'SRX837337',  #crp deletion
                                      'SRX837338'], #crp deletion
                 'm9-glucose(2)-': [ 'SRX837333',   #crp deletion
                                     'SRX837334',   #crp deletion
                                     'SRX837335']}  #crp deletion


ICA_study = {'m9-glucose(2)-': [ 'SRX4993780',
                                 'SRX4993781',
                                 'SRX4993782',
                                 'SRX4993783',
                                 'SRX4993794',    #anaerobic condition (KNO3 as e- acceptor)
                                 'SRX4993795',    #anaerobic condition (KNO3 as e- acceptor)
                                 'SRX5975758',
                                 'SRX5975759'],
             'm9-glucose(2)-cytidine (1mm)': ['SRX5975760',    #purR deletion
                                              'SRX5975761'],   #purR deletion
             'm9-glucose(2)-glutathione (1mm)': ['SRX4993788',
                                                 'SRX4993789'],
             'm9-glucose(2)-methionine (5mm)': ['SRX4993792',
                                                'SRX4993793'],
             'm9-glucose(2)-adenine (100mg/l)': ['SRX5975762',
                                                 'SRX5975763'],
             'm9-sorbitol(4)-l-arginine (5.75mm)': ['SRX4993784',
                                                    'SRX4993785'],
             'm9-d-ribose(4)-cytidine (1mm)': ['SRX4993786',
                                               'SRX4993787'],
             'm9-glucarate(4)-leucine (10mm)': ['SRX4993790',
                                                'SRX4993791'],
             'm9-n-acetylglucosamine(4)-phenylalanine (5mm)': ['SRX4993796',
                                                               'SRX4993797'],
             'm9-galactose(4)-thiamine (1um)': ['SRX4993798',
                                                'SRX4993799'],
             'm9-gluconate(4)-tyrosine (5mm)': ['SRX4993800',
                                                'SRX4993801'],
             'm9-pyruvate(3.3)-uracil (1 mm)': ['SRX4993802',
                                                'SRX4993803']}

#COmprobar si es un ALE -> si resulta serlo habría que quitarlo
Enzyme_Promiscuity_study = { 'm9-d-lyxose(2)-': [ 'SRX4073855',
                                                  'SRX4073856',
                                                  'SRX4073857',
                                                  'SRX4073858',
                                                  'SRX4073859'],
                             'm9-d-2-deoxyribose(2)-': ['SRX4073860',
                                                        'SRX4073861',
                                                        'SRX4073862'],
                             'm9-d-arabinose(2)-': [ 'SRX4073863',
                                                     'SRX4073864',
                                                     'SRX4073865',
                                                     'SRX4073867',
                                                     'SRX4073868',
                                                     'SRX4073869'],
                             'm9-m-tartrate(2)-': [ 'SRX4073870',
                                                    'SRX4073871',
                                                    'SRX4073874',
                                                    'SRX4073875',
                                                    'SRX4073872',
                                                    'SRX4073873',
                                                    'SRX4073876',
                                                    'SRX4073877']}