In [1]:
import pandas as pd
import cobra
import driven
import numpy as np
from driven.data_sets import ExpressionProfile

In [2]:
fname = '../data/nobori_2018/pnas.1800529115.sd02.xlsx'
conditions = ['MM','vivo','flg22','D36E','AvrRpt2','AvrRps4']
all_conditions_frames = []
condition_to_p = {}
# load the up and down sheet for each comparison, then merge
# them into a single dataframe and rename the columns to meet the
# conventions in the driven package.
for condition in conditions:
    down = pd.read_excel(fname,sheet_name=condition+"_down")
    down = down.rename({condition+'_down':'gene'},axis='columns')
    up = pd.read_excel(fname,sheet_name=condition+"_up")
    up = up.rename({condition+'_up':'gene'},axis='columns')
    both = pd.concat([down,up])
    fchange_col = both.columns
    expression_column = [c for c in both.columns if "fold change" in c][0]
    p_column = [c for c in both.columns if "q-value" in c][0]
    condition = expression_column.split(': ')[1]
    
    both = both.rename({p_column:\
        "p-value-"+condition,expression_column:condition},axis='columns')
    
    # split each gene name by '-' to get the KEGG gene ID only,
    # then reassign it to the index
    new_index = [s[0] for s in both['gene'].str.split('-')]
    both.index = new_index
    both = both.drop('gene',axis=1)
    all_conditions_frames.append(both)
    
    # save the exact condition name and the p value column associated with it
    condition_to_p[condition] = "p-value-"+condition
xomics_data = pd.concat(all_conditions_frames,axis=1)

# for p-value columns that are NaN after the merge, reassign as 1 to indicate
# not significant
p_cols = [c for c in xomics_data.columns if "p-value" in c]
xomics_data[p_cols] = xomics_data[p_cols].fillna(value=1.0)

# for expression, do the equivalent (set foldchange to 0)
diff_cols = [c for c in xomics_data.columns if "p-value" not in c]
xomics_data[diff_cols] = xomics_data[diff_cols].fillna(value=0.0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [3]:
# Load the count data
fname = "../data/nobori_2018/GSE103441_count.txt"
count_frame = pd.read_csv(fname,sep='\t',index_col=0)
# remove sample names from the end of column names
# and merge samples group-wise into a mean
#'_'.join(count_frame.columns[0].split('_')[:-1])
groups = set(['_'.join(c.split('_')[:-1]) for c in count_frame.columns])
mean_count_frame = pd.DataFrame(index=count_frame.index)
for group in groups:
    samps = [col for col in count_frame.columns if group in col]
    mean = count_frame[samps].mean(axis=1)
    mean_count_frame[group] = mean
mean_count_frame

Unnamed: 0,Pto-AvrRpt2_pad4,Pto-AvrRpt2_stp,Pto_sid2,Pto-AvrRpt2_KB,Pto_col_SA,Pto_stp,Pto-AvrRpt2_col,Pto-D36E_col,Pto_deps,Pto-AvrRps4_col,...,Pto-D36E_KB,Pto_pad4,Pto_KB,Pto-AvrRpt2_npr1,Pto-AvrRpt2_sid2,Pto-AvrRpt2_sid2pmr4,Pto_pad4sid2,Pto-AvrRpt2_deps,Pto-AvrRpt2_b2b3,Pto-AvrRpt2_pad4sid2
PSPTOA0002,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.000000,0.0,0.0,...,0.000000,0.000,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,0.000000
PSPTOA0004,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.000000,0.0,0.0,...,0.000000,0.000,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,0.000000
PSPTOA0005,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.000000,0.0,0.0,...,0.000000,0.000,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,0.000000
PSPTOA0007,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.000000,0.0,0.0,...,0.000000,0.000,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,0.000000
PSPTOA0008,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.000000,0.0,0.0,...,0.000000,0.000,0.0,0.000000,0.0,0.0,0.00,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PSPTO_B0073-PilT domain-containing protein,268.142857,238.0,273.0,130.000000,117.000000,227.0,227.2500,0.333333,369.5,0.5,...,0.333333,243.375,219.0,265.666667,249.0,224.0,270.00,276.666667,383.5,320.000000
PSPTO_B0074-hypothetical protein,128.714286,91.5,131.4,69.000000,50.666667,86.0,95.4375,0.333333,195.5,0.0,...,0.000000,101.250,137.0,121.666667,121.4,114.5,88.75,130.333333,157.5,140.000000
"PSPTO_B0075-trbB protein, putative",19.285714,11.5,15.0,10.333333,4.666667,6.0,9.1875,0.000000,15.0,0.0,...,0.000000,18.375,21.0,12.333333,14.8,12.5,21.25,9.333333,12.5,22.333333
PSPTO_B0076-trbC protein,11.142857,25.5,17.2,15.666667,1.000000,11.0,15.5625,0.000000,23.5,0.0,...,0.000000,24.750,27.5,14.000000,19.8,18.5,30.50,17.666667,16.5,7.333333


In [4]:
# extract the groups of interest from count frame
flg_v_mock_counts = mean_count_frame[['Pto_col_mock','Pto_col_flg22']]
flg_v_mm_counts = mean_count_frame[["Pto_col_flg22", "Pto_MM_A80"]]
# replace A in the gene names with _ as in the other
# spreadsheets provided by the author
#the following line is not necessary because PSPTO_0001 and PSPTOA0001 are different genes, changed to 
#replace "A" with "_A" like plasmid B
flg_v_mock_counts.index = flg_v_mock_counts.index.str.replace('A','_A')
flg_v_mm_counts.index = flg_v_mm_counts.index.str.replace('A', "_A")
# add p-values
flg_v_mock_p = xomics_data["p-value-flg22_Pto - Mock_Pto"]
flg_v_mock_counts['p-value'] = flg_v_mock_p
flg_v_mock_counts = flg_v_mock_counts.fillna(1)

# rename the gene identifiers by splitting at "-" to only include the actual identifiers,
# not the descriptions.
identifiers = [gene.split('-')[0] for gene in flg_v_mock_counts.index.tolist()]

flg_v_mock_exp_prof = ExpressionProfile(identifiers=identifiers,
                                        conditions=['Pto_col_mock','Pto_col_flg22'],
                                        expression=flg_v_mock_counts[['Pto_col_mock','Pto_col_flg22']].values,
                                        p_values=np.reshape(flg_v_mock_counts['p-value'].values,(-1, 1)))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# load the reconstruction
model = cobra.io.read_sbml_model('../results/reconstructions/pst7_complete_media.xml')

In [6]:
import numpy as np
np.log2(flg_v_mock_counts[flg_v_mock_counts['Pto_col_flg22'] >= 1]['Pto_col_flg22']
      ).hist(bins = 100, alpha = 0.5)
np.log2(flg_v_mock_counts[flg_v_mock_counts['Pto_col_mock'] >= 1]['Pto_col_mock']
      ).hist(bins = 100, alpha = 0.5)
# based on plot below, 4 (2 on log2 scale) seems like a reasonable cutoff that excludes the
# somewhat flat tail of the distribution from 0-2.

<matplotlib.axes._subplots.AxesSubplot at 0x11e5dfd68>

In [7]:
#threshold for the GIMME algorithm
#assign the variables to new ones
mock_model, mock_solution = driven.gimme(model,flg_v_mock_exp_prof,condition = 'Pto_col_mock',
                                                       cutoff=4,fraction_of_optimum=0.05)
flg_model, flg_solution = driven.gimme(model,flg_v_mock_exp_prof,condition = 'Pto_col_flg22',
                                                       cutoff=4,fraction_of_optimum=0.05)
#open up all the exchange reactions, but gimme does not have OF of biomass,
#look into the gimme solution more, figure out how the driven model is modified and what is supposed to return
#gunna take some inkering, model that is further constrained than hte original by wither loss of reaction
#or changing the bounds

In [8]:
flg_reactions = []
for reaction in flg_solution.fluxes.keys():
    if abs(flg_solution.fluxes[reaction]) > 0:
        flg_reactions.append(reaction)

print(len(flg_reactions))

mock_reactions = []
for reaction in mock_solution.fluxes.keys():
    if abs(mock_solution.fluxes[reaction]) > 0:
        mock_reactions.append(reaction)

print(len(mock_reactions))

496
491


In [9]:
shared = len(set(flg_reactions) & set(mock_reactions))
print("Number of reactions shared between the two context-specific models: " + str(shared))
total = len(set(flg_reactions) | set(mock_reactions))
print("Number of reactions active in either context-specific model: " + str(total))

Number of reactions shared between the two context-specific models: 454
Number of reactions active in either context-specific model: 533


In [10]:
mock_after_gimme = model.copy()
flg_after_gimme = model.copy()

In [11]:
# generate a truly "constrained" model by closing all reactions that didn't have flux in the gimme solution
def constrain_from_gimme_solution(model,gimme_solution):
    open_reactions = []
    for reaction in gimme_solution.fluxes.keys():
        if abs(gimme_solution.fluxes[reaction]) > 0:
            open_reactions.append(reaction)
    for reaction in model.reactions:
        if reaction.id not in open_reactions:
            reaction.bounds = (0,0)

constrain_from_gimme_solution(mock_after_gimme, mock_solution)
constrain_from_gimme_solution(flg_after_gimme, flg_solution)

In [12]:
# Perform genome-wide single gene essentiality simulations
mock_gene_essentiality = cobra.flux_analysis.single_gene_deletion(mock_after_gimme,
                                         gene_list = mock_after_gimme.genes)
flg_gene_essentiality = cobra.flux_analysis.single_gene_deletion(flg_after_gimme,
                                         gene_list = flg_after_gimme.genes)

In [13]:
# filter to binary growth/no growth (True/False, respectively)
mock_gene_essentiality['growth'] = mock_gene_essentiality['growth'] > 1E-6
flg_gene_essentiality['growth'] = flg_gene_essentiality['growth'] > 1E-6

In [14]:
# get discordant predictions
discordant_mock = mock_gene_essentiality[mock_gene_essentiality['growth'].values != 
                                         flg_gene_essentiality['growth'].values]
discordant_flg = flg_gene_essentiality[flg_gene_essentiality['growth'].values != 
                                         mock_gene_essentiality['growth'].values]

In [15]:
# save the constrained models
cobra.io.write_sbml_model(mock_after_gimme, 
                          '../results/reconstructions/gimme_constrained/mock_gimme_cutoff4.xml')
cobra.io.write_sbml_model(flg_after_gimme, 
                          '../results/reconstructions/gimme_constrained/flg_gimme_cutoff4.xml')