In [20]:
import os
import itertools
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [21]:
#Get the data from all species
species = ['pseudomonas_putida','escherichia_coli', 'synechocystis_sp_pcc_6803']
reaction_types = ['NBR', 'BAR']
relevant_files_for_study = ['carbon_source_enriched_Gene.xlsx']
presence_cutoff = 1
in_every_cutoff= {}

print('Getting data from : \n')
for specie in species:
    in_every_cutoff[specie] = {}
    results_path = '/'.join(['results', specie])
    for subdir, dirs, files in os.walk(results_path):
        for file in files:
            if any([relevant_file in os.path.join(subdir, file) for relevant_file in relevant_files_for_study]):
                filepath= os.path.join(subdir, file)
                study = filepath.split('/')[2]
                in_every_cutoff[specie][study] = {}
                dfs = pd.read_excel(os.path.join(subdir, file), sheet_name=None)
                print(filepath)
                for experiment in dfs:
                    sensitivity_data = dfs[experiment]
                    in_every_cutoff[specie][study][experiment] = set(sensitivity_data.loc[sensitivity_data["Presence"]==presence_cutoff, 'Gene'].tolist())

Getting data from : 

results/pseudomonas_putida/Carbon2_Study_2/pseudomonas_putida_carbon_source_enriched_Gene.xlsx
results/pseudomonas_putida/Aromatics_Study_2/pseudomonas_putida_carbon_source_enriched_Gene.xlsx
results/pseudomonas_putida/Muconate_Study_2/pseudomonas_putida_carbon_source_enriched_Gene.xlsx
results/pseudomonas_putida/Carbon_Study_2/pseudomonas_putida_carbon_source_enriched_Gene.xlsx
results/escherichia_coli/Crp_ARs_Study/ecoli_carbon_source_enriched_Gene.xlsx
results/escherichia_coli/Cra_Crp_Study/ecoli_carbon_source_enriched_Gene.xlsx
results/escherichia_coli/ICA_Study/ecoli_carbon_source_enriched_Gene.xlsx
results/escherichia_coli/Omics_Study/ecoli_carbon_source_enriched_Gene.xlsx
results/synechocystis_sp_pcc_6803/Iron_depletion_oxidative_stress_Study/synechocystis_sp_pcc_6803_carbon_source_enriched_Gene.xlsx
results/synechocystis_sp_pcc_6803/Day_night_cycles_Study/synechocystis_sp_pcc_6803_carbon_source_enriched_Gene.xlsx
results/synechocystis_sp_pcc_6803/Iron_limi

In [22]:
def jaccard_index(list_1, list_2):
    s1 = set(list_1)
    s2 = set(list_2)
    
    return float(len(s1.intersection(s2)) / len(s1.union(s2))) 

In [23]:
#Generate the combination to test
spp_list = []
study_list = []
pair_of_conditions = []
core_nbr_count_list = []
core_bar_count_list = []
ratio_list = []

for specie in in_every_cutoff:
    species_study = in_every_cutoff[specie]
    for study in species_study:
        study_conditions = set(['_'.join(cond.split('_')[:-1]) for cond in species_study[study]])   
        condition_combinations = itertools.combinations(study_conditions, 2)
        for comb in condition_combinations:
            ratio_dict = { cond : species_study[study][cond]
                           for cond in species_study[study]
                           for el in comb
                           if el in cond
                         }
            
            nbr_conditions = [condition for condition in ratio_dict if 'NBR' in condition]
            nbr_core = set.intersection(*[ratio_dict[condition] for condition in nbr_conditions])
            
            bar_conditions = [condition for condition in ratio_dict if 'BAR' in condition]
            bar_core =set.intersection(*[ratio_dict[condition] for condition in bar_conditions])
            
            spp_list.append(specie)
            study_list.append(study)
            pair_of_conditions.append('_'.join(comb))
            core_nbr_count_list.append(len(nbr_core))
            core_bar_count_list.append(len(bar_core))
            ratio_list.append(len(nbr_core)/len(bar_core))
            

#Fit the data in df
ratios_data = { 'Species' : spp_list,
                'Study' : study_list,
                'Pair_of_conditions' : pair_of_conditions,
                'NBRs' : core_nbr_count_list,
                'BARs' : core_bar_count_list,
                'NB_Ratio': ratio_list
              }


ratios_df = pd.DataFrame.from_dict(ratios_data)
ratios_df

Unnamed: 0,Species,Study,Pair_of_conditions,NBRs,BARs,NB_Ratio
0,pseudomonas_putida,Carbon2_Study_2,m9-EX_glc_e7.44_m9-EX_T4hcinnm_e4.04,395,771,0.512322
1,pseudomonas_putida,Aromatics_Study_2,m9-EX_glc_e7.44_m9-EX_T4hcinnm_e4.04,391,759,0.515152
2,pseudomonas_putida,Aromatics_Study_2,m9-EX_glc_e7.44_m9-EX_T4hcinnm_e2.02-EX_fer_e1...,384,721,0.532594
3,pseudomonas_putida,Aromatics_Study_2,m9-EX_glc_e7.44_m9-EX_fer_e2.91,394,736,0.535326
4,pseudomonas_putida,Aromatics_Study_2,m9-EX_T4hcinnm_e4.04_m9-EX_T4hcinnm_e2.02-EX_f...,416,739,0.562923
...,...,...,...,...,...,...
268,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-treatment_4h_36_bg11-E...,52,528,0.098485
269,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-treatment_4h_36_bg11-E...,88,551,0.159710
270,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-control_9h_30_bg11-EX_...,62,533,0.116323
271,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-control_9h_30_bg11-EX_...,110,559,0.196780


In [24]:
ratios_df.loc[ratios_df['Species']=='synechocystis_sp_pcc_6803']['NB_Ratio'].mean()

0.14049282325910942

In [25]:
ratios_df.loc[ratios_df['Species']=='escherichia_coli']['NB_Ratio'].mean()

0.2964346539410139

In [26]:
ratios_df.loc[ratios_df['Species']=='pseudomonas_putida']['NB_Ratio'].mean()

0.5426735017937494

In [27]:
hist_data = []
hist_labels = []

for species in ratios_df.Species.unique():
    hist_labels.append(species)
    hist_data.append(ratios_df.loc[ratios_df['Species']==species]['NB_Ratio'])
    
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, hist_labels, bin_size=.03, histnorm='probability')
fig.show()

In [34]:
#perform an histogram with the jaccard index of  nbr reactions
col_n = 1

fig = make_subplots(rows=2,
                    cols=len(in_every_cutoff),
                    vertical_spacing=0.05,
                    subplot_titles=tuple([specie for specie in in_every_cutoff]))

for specie in in_every_cutoff:
    nbr_list = []
    bar_list = []
    condition_list = []
    jaccard_nbr_list = []
    jaccard_bar_list = []
    species_study = in_every_cutoff[specie]
    for study in species_study:
        study_conditions = list(set(['_'.join(cond.split('_')[:-1]) for cond in species_study[study]]))
        condition_list += ['_'.join([c, study]) for c in study_conditions]
        nbr_list += [species_study[study]['_'.join([condition, 'NBR-Gene'])] for condition in study_conditions]
        bar_list += [species_study[study]['_'.join([condition, 'BAR-Gene'])] for condition in study_conditions]
    
    jaccard_nbr_list = [[jaccard_index(nbr_1 ,nbr_2) for nbr_1 in nbr_list]
                        for nbr_2 in nbr_list]

    jaccard_bar_list = [[jaccard_index(bar_1 ,bar_2) for bar_1 in bar_list]
                        for bar_2 in bar_list]

    
    fig.add_trace(
        px.imshow(
            jaccard_nbr_list,
            x=condition_list,
            y=condition_list,
        ).data[0],
        row=1, col=col_n
    )
    fig.add_trace(
        px.imshow(
            jaccard_bar_list,
            x=condition_list,
            y=condition_list
        ).data[0],
        row=2, col=col_n
    )
    col_n += 1

# edit figure
fig['layout']['yaxis']['title']='NBR SET'
fig['layout']['yaxis4']['title']='BAR SET'
fig['layout']['yaxis']['showticklabels']=False
fig['layout']['yaxis2']['showticklabels']=False
fig['layout']['yaxis3']['showticklabels']=False
fig['layout']['yaxis4']['showticklabels']=False
fig['layout']['yaxis5']['showticklabels']=False
fig['layout']['yaxis6']['showticklabels']=False
fig['layout']['xaxis']['showticklabels']=False
fig['layout']['xaxis2']['showticklabels']=False
fig['layout']['xaxis3']['showticklabels']=False
fig['layout']['xaxis4']['tickangle']=-60
fig['layout']['xaxis5']['tickangle']=-60
fig['layout']['xaxis6']['tickangle']=-60
fig['layout']['height'] = 1400
fig['layout']['width'] = 1800
fig['layout']['coloraxis']['colorbar']['len']=0.5
fig['layout']['coloraxis']['colorscale']='viridis'

fig.show()
fig.write_image('jaccard_index_of_reaction_sets_between_species.svg')

In [29]:
hist_data = []
hist_labels = []

for study in ratios_df.Study.unique():
    study_data = ratios_df.loc[ratios_df['Study']==study]['NB_Ratio']
    if len(study_data) > 1:
        hist_labels.append(study)
        hist_data.append(study_data)

# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, hist_labels, bin_size=.03, histnorm='probability')
fig.show()

In [22]:
#Remove the deletion conditions
similar_conditions_ratios_df = ratios_df.loc[~ratios_df['Pair_of_conditions'].str.contains('deletion')]
similar_conditions_ratios_df

Unnamed: 0,Species,Study,Pair_of_conditions,NBRs,BARs,NB_Ratio
0,pseudomonas_putida,Carbon2_Study_2,m9-EX_T4hcinnm_e4.04_m9-EX_glc_e7.44,395,771,0.512322
1,pseudomonas_putida,Aromatics_Study_2,m9-EX_T4hcinnm_e4.04_m9-EX_glc_e7.44,391,759,0.515152
2,pseudomonas_putida,Aromatics_Study_2,m9-EX_T4hcinnm_e4.04_m9-EX_fer_e2.91,410,745,0.550336
3,pseudomonas_putida,Aromatics_Study_2,m9-EX_T4hcinnm_e4.04_m9-EX_T4hcinnm_e2.02-EX_f...,416,739,0.562923
4,pseudomonas_putida,Aromatics_Study_2,m9-EX_glc_e7.44_m9-EX_fer_e2.91,394,736,0.535326
...,...,...,...,...,...,...
271,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-treatment_8h_42_bg11-E...,75,547,0.137112
272,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-treatment_8h_42_bg11-E...,61,538,0.113383
273,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-control_1h_30_bg11-EX_...,70,539,0.129870
274,synechocystis_sp_pcc_6803,Thermal_Study,bg11-EX_photon_e478.046-control_1h_30_bg11-EX_...,60,532,0.112782


In [23]:
similar_conditions_ratios_df.loc[similar_conditions_ratios_df['Species']=='synechocystis_sp_pcc_6803']['NB_Ratio'].mean()

0.14089811414800948

In [24]:
similar_conditions_ratios_df.loc[similar_conditions_ratios_df['Species']=='escherichia_coli']['NB_Ratio'].mean()

0.2571932880032422

In [25]:
similar_conditions_ratios_df.loc[similar_conditions_ratios_df['Species']=='pseudomonas_putida']['NB_Ratio'].mean()

0.5426735017937494

In [26]:
hist_data = []
hist_labels = []

for species in similar_conditions_ratios_df.Species.unique():
    hist_labels.append(species)
    hist_data.append(similar_conditions_ratios_df.loc[similar_conditions_ratios_df['Species']==species]['NB_Ratio'])
    
# Create distplot with custom bin_size
fig = ff.create_distplot(hist_data, hist_labels, bin_size=.03, histnorm='probability')

fig.update_layout(
    title=dict(text='NB Ratio Distribution', x=0.5),
    paper_bgcolor='rgba(255,255,255,0)',
    plot_bgcolor='rgba(245,245,245,1)',
    xaxis=dict(title='NB RATIO', tickfont=dict(size=16)),
    yaxis=dict(title='PROBABILITY', gridcolor='grey', zerolinecolor='grey', tickfont=dict(size=16)),
    yaxis2=dict(title='POINTS', gridcolor='rgba(255,255,255,0)')
)
fig.show()
fig.write_image('NB_ratio_distribution_ecoli_putida.svg')

## Functional Composition Analysis

We are going to extract from the data the top subsystems in which each subset is enriched within the same species for all different studies and conditions. For this purpose, we first **compute a ranking consisting on the counts of each subsystem in the top 3 of enriched subsystems** for each reaction set, species and condition. Among the ranking we are going to **select top 3 of ranking for each category combination**. Finally, we **plot the percentage each subsystem represents within the corresponding reaction set** in order to check wether if it remains **stable among conditions** (base activation) and if some subsystems are **consistently more abundant** in a specific reaction set.

In [2]:
print('Getting data from : \n')
species = ['pseudomonas_putida','escherichia_coli', 'synechocystis_sp_pcc_6803']
reaction_types = ['NBR', 'BAR']
relevant_files_for_study = ['NBR_condition-specific_enrichment_analysis.xlsx', 'BAR_condition-specific_enrichment_analysis.xlsx']
top_n = 3
top_subsystems = {}

for specie in species:
    top_subsystems[specie] = {'NBR' : [], 'BAR': []}
    results_path = '/'.join(['results', specie])
    for subdir, dirs, files in os.walk(results_path):
        for file in files:
            if any([relevant_file in os.path.join(subdir, file) for relevant_file in relevant_files_for_study]):
                filepath= os.path.join(subdir, file)
                study = filepath.split('/')[2]
                reaction_type = file.split('_')[0]
                dfs = pd.read_excel(os.path.join(subdir, file), sheet_name=None)
                print(filepath)
                for experiment in dfs:
                    df = dfs[experiment]
                    df = df.sort_values(by='p-Value')
                    #merge all S_Alternate_Carbon in putida in one category to be comparable to the one in ecoli
                    #do the same for S_Aromatic_Compounds_Degradation
                    df.loc[df['Subsystem'].str.startswith('S_Alternate_Carbon'), 'Subsystem'] = 'S_Alternate_Carbon'
                    df.loc[df['Subsystem'].str.startswith('S_Aromatic_Compounds_Degradation'), 'Subsystem'] = 'S_Aromatic_Compounds_Degradation'
                    top_subsystems[specie][reaction_type] += df['Subsystem'][:top_n].tolist()
                    
unique_in_top = set([top for reaction_sets in top_subsystems.values() for top_sets in reaction_sets.values() for top in top_sets])

top_data = { 'Specie' : [],
             'Subsystem' : [],
             'Reaction_Type': [],
             'Count' : [] 
           }

for specie in top_subsystems:
    for reaction_type in top_subsystems[specie]:
        for subsystem in unique_in_top:
            top_data['Specie'].append(specie)
            top_data['Subsystem'].append(subsystem)
            top_data['Reaction_Type'].append(reaction_type)
            top_data['Count'].append(top_subsystems[specie][reaction_type].count(subsystem))

top_df=pd.DataFrame.from_dict(top_data)
top_df = top_df[(top_df != 0).all(1)]
top_df

Getting data from : 

results/pseudomonas_putida/Carbon2_Study_2/BAR_condition-specific_enrichment_analysis.xlsx
results/pseudomonas_putida/Carbon2_Study_2/NBR_condition-specific_enrichment_analysis.xlsx
results/pseudomonas_putida/Aromatics_Study_2/BAR_condition-specific_enrichment_analysis.xlsx
results/pseudomonas_putida/Aromatics_Study_2/NBR_condition-specific_enrichment_analysis.xlsx
results/pseudomonas_putida/Muconate_Study_2/BAR_condition-specific_enrichment_analysis.xlsx
results/pseudomonas_putida/Muconate_Study_2/NBR_condition-specific_enrichment_analysis.xlsx
results/pseudomonas_putida/Carbon_Study_2/BAR_condition-specific_enrichment_analysis.xlsx
results/pseudomonas_putida/Carbon_Study_2/NBR_condition-specific_enrichment_analysis.xlsx
results/escherichia_coli/Crp_ARs_Study/BAR_condition-specific_enrichment_analysis.xlsx
results/escherichia_coli/Crp_ARs_Study/NBR_condition-specific_enrichment_analysis.xlsx
results/escherichia_coli/Cra_Crp_Study/BAR_condition-specific_enrichment

Unnamed: 0,Specie,Subsystem,Reaction_Type,Count
3,pseudomonas_putida,S_Alternate_Carbon,NBR,6
4,pseudomonas_putida,S_tRNA_Charging,NBR,1
7,pseudomonas_putida,S_Transport__ABC_system,NBR,16
16,pseudomonas_putida,Unassigned,NBR,5
29,pseudomonas_putida,S_Aromatic_Compounds_Degradation,NBR,1
34,pseudomonas_putida,S_Transport__Inner_Membrane,NBR,16
38,pseudomonas_putida,S_Transport__Outer_Membrane,NBR,3
55,pseudomonas_putida,S_Purine_Metabolism,BAR,16
58,pseudomonas_putida,S_Pyrimidine_Metabolism,BAR,6
60,pseudomonas_putida,S_Phenylalanine_Tyrosine_Tryptophan_Biosynthesis,BAR,12


In [3]:
top_df.loc[(top_df['Specie']=='pseudomonas_putida')&(top_df['Reaction_Type']=='NBR')].sort_values(by='Count')

Unnamed: 0,Specie,Subsystem,Reaction_Type,Count
4,pseudomonas_putida,S_tRNA_Charging,NBR,1
29,pseudomonas_putida,S_Aromatic_Compounds_Degradation,NBR,1
38,pseudomonas_putida,S_Transport__Outer_Membrane,NBR,3
16,pseudomonas_putida,Unassigned,NBR,5
3,pseudomonas_putida,S_Alternate_Carbon,NBR,6
7,pseudomonas_putida,S_Transport__ABC_system,NBR,16
34,pseudomonas_putida,S_Transport__Inner_Membrane,NBR,16


In [4]:
top_df.loc[(top_df['Specie']=='pseudomonas_putida')&(top_df['Reaction_Type']=='BAR')].sort_values(by='Count')

Unnamed: 0,Specie,Subsystem,Reaction_Type,Count
81,pseudomonas_putida,S_Lysine_Metabolism,BAR,4
58,pseudomonas_putida,S_Pyrimidine_Metabolism,BAR,6
76,pseudomonas_putida,S_Oxidative_Phosphorylation,BAR,10
60,pseudomonas_putida,S_Phenylalanine_Tyrosine_Tryptophan_Biosynthesis,BAR,12
55,pseudomonas_putida,S_Purine_Metabolism,BAR,16


In [5]:
#Get the data from all species
species = ['pseudomonas_putida','escherichia_coli', 'synechocystis_sp_pcc_6803']
reaction_types = ['NBR', 'BAR']
relevant_files_for_study = ['functional_composition.xlsx']
reactions_per_type = 3

df_list = []

print('Getting data from : \n')
for specie in species:
    results_path = '/'.join(['results', specie])
    for subdir, dirs, files in os.walk(results_path):
        for file in files:
            if any([relevant_file in os.path.join(subdir, file) for relevant_file in relevant_files_for_study]):
                filepath= os.path.join(subdir, file)
                study = filepath.split('/')[2]
                dfs = pd.read_excel(os.path.join(subdir, file), sheet_name=None)
                print(filepath)
                for experiment in dfs:
                    if 'Subsystem' in experiment:
                        target_subsystems = []
                        df = dfs[experiment]
                        #merge all S_Alternate_Carbon in putida in one category to be comparable to the one in ecoli
                        #do the same for S_Aromatic_Compounds_Degradation
                        df.loc[df['Subsystem_function'].str.startswith('S_Alternate_Carbon'), 'Subsystem_function'] = 'S_Alternate_Carbon'
                        df.loc[df['Subsystem_function'].str.startswith('S_Aromatic_Compounds_Degradation'), 'Subsystem_function'] = 'S_Aromatic_Compounds_Degradation'
                        df_grouped = df.groupby(['Subsystem_function']).sum()
                        df = df_grouped.reset_index().fillna(0)
                        for rt in reaction_types:
                            target_subsystems += top_df.loc[(top_df['Specie']==specie)
                                                            & (top_df['Reaction_Type']==rt)
                                                           ].sort_values(by='Count', ascending=False)['Subsystem'].tolist()[:reactions_per_type]
                        
                        target_subsystems = list(set(target_subsystems))
                        subset_df = df.loc[df['Subsystem_function'].isin(target_subsystems)]
                        subset_df['Species'] = [specie]*len(subset_df)
                        subset_df['Condition'] = ['-'.join(experiment.split('-')[1:-1])]*len(subset_df)
                        df_list.append(subset_df)

composition_df = pd.concat(df_list)
display(composition_df)

Getting data from : 

results/pseudomonas_putida/Carbon2_Study_2/pseudomonas_putida_carbon_source_functional_composition.xlsx
results/pseudomonas_putida/Aromatics_Study_2/pseudomonas_putida_carbon_source_functional_composition.xlsx
results/pseudomonas_putida/Muconate_Study_2/pseudomonas_putida_carbon_source_functional_composition.xlsx
results/pseudomonas_putida/Carbon_Study_2/pseudomonas_putida_carbon_source_functional_composition.xlsx
results/escherichia_coli/Crp_ARs_Study/ecoli_carbon_source_functional_composition.xlsx
results/escherichia_coli/Cra_Crp_Study/ecoli_carbon_source_functional_composition.xlsx
results/escherichia_coli/ICA_Study/ecoli_carbon_source_functional_composition.xlsx
results/escherichia_coli/Omics_Study_2/escherichia_coli_carbon_source_functional_composition.xlsx
results/escherichia_coli/Omics_Study/ecoli_carbon_source_functional_composition.xlsx
results/synechocystis_sp_pcc_6803/Iron_depletion_oxidative_stress_Study/synechocystis_sp_pcc_6803_carbon_source_function

Unnamed: 0,Subsystem_function,BAR_Set_percentage,NBR_Set_percentage,Species,Condition
5,S_Alternate_Carbon,6.739130,12.709030,pseudomonas_putida,EX_glc_e7.44
50,S_Oxidative_Phosphorylation,5.000000,0.000000,pseudomonas_putida,EX_glc_e7.44
53,S_Phenylalanine_Tyrosine_Tryptophan_Biosynthesis,2.608696,0.334448,pseudomonas_putida,EX_glc_e7.44
56,S_Purine_Metabolism,5.434783,2.173913,pseudomonas_putida,EX_glc_e7.44
62,S_Transport__ABC_system,5.978261,15.050167,pseudomonas_putida,EX_glc_e7.44
...,...,...,...,...,...
11,Fructose and mannose metabolism,0.354610,8.653846,synechocystis_sp_pcc_6803,EX_photon_e478.046-treatment_10h_45
27,Oxidative phosphorylation,10.815603,3.846154,synechocystis_sp_pcc_6803,EX_photon_e478.046-treatment_10h_45
33,Photosynthesis,13.829787,0.000000,synechocystis_sp_pcc_6803,EX_photon_e478.046-treatment_10h_45
35,Purine metabolism,5.673759,7.692308,synechocystis_sp_pcc_6803,EX_photon_e478.046-treatment_10h_45


In [6]:
spp_cond = set([s+c for s,c in zip(composition_df['Species'].tolist(), composition_df['Condition'].tolist())])
for cond in composition_df['Condition'].unique():
    print(cond, composition_df['Condition'].tolist().count(cond))

EX_glc_e7.44 24
EX_T4hcinnm_e4.04 12
EX_T4hcinnm_e2.02-EX_fer_e1.4505 6
EX_fer_e2.91 12
EX_glcn_e7.44 6
EX_fru_e1.32 6
EX_glc_e3.72-EX_glcn_e3.72 6
EX_glc_e3.72-EX_fru_e0.66 6
EX_glc_e2.48-EX_glcn_e2.48-EX_fru_e0.44 6
EX_cit_e7.44 6
EX_ser__L_e14.88 6
EX_fru_e10.0 10
EX_fru_e10.0-deletion_crp 5
EX_glyc_e5.0 5
EX_glyc_e5.0-deletion_Ar1 5
EX_glyc_e5.0-deletion_Ar1_Ar2 5
EX_glyc_e5.0-deletion_Ar2 5
EX_glyc_e5.0-deletion_crp 5
EX_glc__D_e10.0-deletion_crp 5
EX_ac_e3.33 5
EX_ac_e3.33-deletion_Cra 5
EX_fru_e10.0-deletion_Cra 5
EX_glc__D_e10.0-deletion_Cra 5
EX_glc__D_e10.0 5
EX_glc__D_e10.0-EX_o2_e0.0 5
EX_glc__D_e10.0-EX_gthrd_e16.67 5
EX_glc__D_e10.0-EX_met__L_e8.33 5
EX_glc__D_e10.0-EX_ade_e8.33 5
EX_sbt__D_e10.0-EX_arg__L_e10.0 5
EX_rib__D_e8.33-EX_cytd_e15.0 5
EX_glcr_e10.0-EX_leu__L_e10.0 5
EX_acgam_e13.33-EX_phe__L_e15.0 5
EX_gal_e10.0-EX_thm_e20.0 5
EX_glcn_e10.0-EX_tyr__L_e15.0 5
EX_pyr_e5.0-EX_ura_e6.67 5
EX_glc__D_e10.0-EX_glu__L_e8.33 10
EX_glc__D_e10.0-EX_gly_e3.33 10
EX_glc__D_

In [12]:
fig = make_subplots(rows=2,
                    cols=len(in_every_cutoff),
                    vertical_spacing=0.05,
                    horizontal_spacing=0.05,
                    subplot_titles=tuple(['<b>%s</b>' % specie for specie in composition_df['Species'].unique()]))

col_n = 1
fig_n = 1

for specie in composition_df['Species'].unique():
    row_n = 1
    specie_df = composition_df.loc[composition_df['Species']==specie]
    for reaction in reaction_types:
        target_column = '_'.join([reaction, 'Set_percentage'])
        target_columns = ['Subsystem_function', 'Condition', target_column]
        df = specie_df[target_columns]
        #generate statistics for those conditions having more than one replicate
        std = df.groupby(['Subsystem_function', 'Condition']).std()
        grouped_df = df.groupby(['Subsystem_function', 'Condition']).mean()
        grouped_df['SD'] = std
        plot_df = grouped_df.reset_index().fillna(0)
        plot_df.sort_values(by='Subsystem_function', inplace=True)
        unique_ss = plot_df.Subsystem_function.unique().tolist()
        plot_df['subsystem_key'] = [unique_ss.index(ss) for ss in plot_df['Subsystem_function'].tolist() ]
        
        for subsystem in plot_df.Subsystem_function.unique():
            fig.add_trace(go.Scatter(x=plot_df.query("Subsystem_function == '%s'" % subsystem)["Condition"],
                                     y=plot_df.query("Subsystem_function == '%s'" % subsystem)[target_column],
                                     name = subsystem, mode='markers',
                                     error_y=dict(
                                         type='data', # value of error bar given in data coordinates
                                         array=plot_df.query("Subsystem_function == '%s'" % subsystem)['SD'],
                                         visible=True),
                                     marker={"size": 10, "color":px.colors.qualitative.D3[unique_ss.index(subsystem)]},
                                     legendgroup = str(fig_n)
                                    ),
                          row=row_n, col=col_n)
        
        row_n += 1
        fig_n += 1
        
    col_n += 1


# edit figure
fig['layout']['plot_bgcolor'] = 'rgba(245,245,245,1)'
fig['layout']['yaxis']['title']='<b>NBR SET</b>'
fig['layout']['yaxis']['title']['font']['size']=20
fig['layout']['yaxis']['range']=[-5, 35]
fig['layout']['yaxis4']['title']='<b>BAR SET</b>'
fig['layout']['yaxis4']['title']['font']['size']=20
fig['layout']['yaxis3']['showticklabels']=False
fig['layout']['yaxis3']['range']=[-5, 35]
fig['layout']['yaxis2']['showticklabels']=False
fig['layout']['yaxis2']['range']=[-5, 35]
fig['layout']['yaxis4']['showticklabels']=True
fig['layout']['yaxis4']['range']=[-5, 35]
fig['layout']['yaxis5']['showticklabels']=False
fig['layout']['yaxis5']['range']=[-5, 35]
fig['layout']['yaxis6']['showticklabels']=False
fig['layout']['yaxis6']['range']=[-5, 35]
fig['layout']['xaxis']['showticklabels']=False
fig['layout']['xaxis2']['showticklabels']=False
fig['layout']['xaxis3']['showticklabels']=False
fig['layout']['xaxis3']['tickfont']['size']=14
fig['layout']['xaxis4']['tickangle']=-60
fig['layout']['xaxis5']['tickangle']=-60
fig['layout']['xaxis6']['tickangle']=-60
fig['layout']['xaxis4']['tickfont']['size']=14
fig['layout']['height'] = 1000
fig['layout']['width'] = 2700
fig['layout']['coloraxis']['colorbar']['len']=0.5
fig['layout']['coloraxis']['colorscale']='viridis'

    
fig.show()
fig.write_image('subsystem_composition_conditions_species_final.svg')