In [1]:
import pandas as pd
import cobra
import driven
import numpy as np
from driven.data_sets import ExpressionProfile

In [2]:
fname = '../data/nobori_2018/pnas.1800529115.sd02.xlsx'
conditions = ['MM','vivo','flg22','D36E','AvrRpt2','AvrRps4']
all_conditions_frames = []
condition_to_p = {}
# load the up and down sheet for each comparison, then merge
# them into a single dataframe and rename the columns to meet the
# conventions in the driven package.
for condition in conditions:
    down = pd.read_excel(fname,sheet_name=condition+"_down")
    down = down.rename({condition+'_down':'gene'},axis='columns')
    up = pd.read_excel(fname,sheet_name=condition+"_up")
    up = up.rename({condition+'_up':'gene'},axis='columns')
    both = pd.concat([down,up])
    fchange_col = both.columns
    expression_column = [c for c in both.columns if "fold change" in c][0]
    p_column = [c for c in both.columns if "q-value" in c][0]
    condition = expression_column.split(': ')[1]
    
    both = both.rename({p_column:\
        "p-value-"+condition,expression_column:condition},axis='columns')
    
    # split each gene name by '-' to get the KEGG gene ID only,
    # then reassign it to the index
    new_index = [s[0] for s in both['gene'].str.split('-')]
    both.index = new_index
    both = both.drop('gene',axis=1)
    all_conditions_frames.append(both)
    
    # save the exact condition name and the p value column associated with it
    condition_to_p[condition] = "p-value-"+condition
xomics_data = pd.concat(all_conditions_frames,axis=1)

# for p-value columns that are NaN after the merge, reassign as 1 to indicate
# not significant
p_cols = [c for c in xomics_data.columns if "p-value" in c]
xomics_data[p_cols] = xomics_data[p_cols].fillna(value=1.0)

# for expression, do the equivalent (set foldchange to 0)
diff_cols = [c for c in xomics_data.columns if "p-value" not in c]
xomics_data[diff_cols] = xomics_data[diff_cols].fillna(value=0.0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [3]:
# Load the count data
fname = "../data/nobori_2018/GSE103441_count.txt"
count_frame = pd.read_csv(fname,sep='\t',index_col=0)
# remove sample names from the end of column names
# and merge samples group-wise into a mean
#'_'.join(count_frame.columns[0].split('_')[:-1])
groups = set(['_'.join(c.split('_')[:-1]) for c in count_frame.columns])
mean_count_frame = pd.DataFrame(index=count_frame.index)
for group in groups:
    samps = [col for col in count_frame.columns if group in col]
    mean = count_frame[samps].mean(axis=1)
    mean_count_frame[group] = mean
mean_count_frame

Unnamed: 0,Pto_col,Pto-AvrRps4_KB,Pto-AvrRpt2_b2b3,Pto-AvrRpt2_stp,Pto-AvrRps4_col,Pto_col_SA,Pto_stp,Pto-AvrRpt2_KB,Pto_sid2,Pto_sid2pmr4,...,Pto-D36E_KB,Pto_col_chitin,Pto_de,Pto_MM_A80,Pto-AvrRpt2_rr,Pto_npr1,Pto-AvrRpt2_de,Pto_KB,Pto_deps,Pto-D36E_col
PSPTOA0002,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.00,0.00,0.0,0.0,0.000000,0.0,0.0,0.000000
PSPTOA0004,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.00,0.00,0.0,0.0,0.000000,0.0,0.0,0.000000
PSPTOA0005,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.00,0.00,0.0,0.0,0.000000,0.0,0.0,0.000000
PSPTOA0007,0.103448,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.00,0.00,0.0,0.0,0.000000,0.0,0.0,0.000000
PSPTOA0008,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.00,0.00,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PSPTO_B0073-PilT domain-containing protein,198.827586,3.5,383.5,238.0,0.5,117.000000,227.0,130.000000,273.0,388.0,...,0.333333,108.666667,291.50,297.25,166.0,269.5,238.500000,219.0,369.5,0.333333
PSPTO_B0074-hypothetical protein,68.241379,0.0,157.5,91.5,0.0,50.666667,86.0,69.000000,131.4,161.0,...,0.000000,52.333333,165.00,208.25,70.5,144.5,116.500000,137.0,195.5,0.333333
"PSPTO_B0075-trbB protein, putative",8.034483,0.0,12.5,11.5,0.0,4.666667,6.0,10.333333,15.0,9.5,...,0.000000,9.666667,10.75,40.75,11.0,14.0,8.666667,21.0,15.0,0.000000
PSPTO_B0076-trbC protein,15.724138,0.0,16.5,25.5,0.0,1.000000,11.0,15.666667,17.2,17.5,...,0.000000,4.000000,20.25,35.25,6.5,12.5,16.500000,27.5,23.5,0.000000


In [4]:
# extract the groups of interest from count frame
flg_v_mock_counts = mean_count_frame[['Pto_col_mock','Pto_col_flg22']]
flg_v_mm_counts = mean_count_frame[["Pto_col_flg22", "Pto_MM_A80"]]
# replace A in the gene names with _ as in the other
# spreadsheets provided by the author
#the following line is not necessary because PSPTO_0001 and PSPTOA0001 are different genes, changed to 
#replace "A" with "_A" like plasmid B
flg_v_mock_counts.index = flg_v_mock_counts.index.str.replace('A','_A')
flg_v_mm_counts.index = flg_v_mm_counts.index.str.replace('A', "_A")
# add p-values
flg_v_mock_p = xomics_data["p-value-flg22_Pto - Mock_Pto"]
flg_v_mock_counts['p-value'] = flg_v_mock_p
flg_v_mock_counts = flg_v_mock_counts.fillna(1)
#flg_v_mm_counts_p_val = xomics_data["p-value-"]

flg_v_mock_exp_prof = ExpressionProfile(identifiers=flg_v_mock_counts.index.tolist(),
                                        conditions=['Pto_col_mock','Pto_col_flg22'],
                                        expression=flg_v_mock_counts[['Pto_col_mock','Pto_col_flg22']].values,
                                        p_values=np.reshape(flg_v_mock_counts['p-value'].values,(-1, 1)))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# load the reconstruction
model = cobra.io.read_sbml_model('../results/reconstructions/pst7_complete_media.xml')

In [6]:
model

0,1
Name,PST
Memory address,0x01141f28d0
Number of metabolites,955
Number of reactions,959
Number of groups,0
Objective expression,1.0*bio1 - 1.0*bio1_reverse_b18f7
Compartments,"c, e"


In [7]:
#threshold for the GIMME algorithm
#assign the variables to new ones
constrained_model, constrained_solution = driven.gimme(model,flg_v_mock_exp_prof,cutoff=20000,fraction_of_optimum=1.0)
#open up all the exchange reactions, but gimme does not have OF of biomass,
#look into the gimme solution more, figure out how the driven model is modified and what is supposed to return
#gunna take some inkering, model that is further constrained than hte original by wither loss of reaction
#or changing the bounds

In [8]:
constrained_solution['bio1']

118.52448094946378

In [9]:
# any closed reaction in the constrained model? Based on the GIMME code,
# there won't be because they use the model in a context (so the original model
# gets returned)
for reaction in constrained_model.reactions:
    if reaction.lower_bound == reaction.upper_bound:
        print(reaction)

In [10]:
open_reactions = []
for reaction in constrained_solution.fluxes.keys():
    if abs(constrained_solution.fluxes[reaction]) > 0:
        open_reactions.append(reaction)

print(len(open_reactions))

479


In [11]:
constrained_after_gimme = model.copy()

In [12]:
# generate a truly "constrained" model by closing all reactions that didn't have flux in the gimme solution
for reaction in constrained_after_gimme.reactions:
    if reaction.id not in open_reactions:
        reaction.bounds = (0,0)

In [13]:
constrained_after_gimme.optimize()

Unnamed: 0,fluxes,reduced_costs
rxn00001_c,0.000000,-9.481958e-02
rxn00002_c,0.000000,0.000000e+00
rxn00011_c,2.664227,3.377860e-17
rxn00012_c,0.000000,0.000000e+00
rxn00020_c,0.000000,1.271495e-17
...,...,...
rxn01626_c,262.945651,4.930381e-32
rxn01763_c,0.000000,-4.385312e-18
rxn05390_c,10.096429,0.000000e+00
rxn05359_c,10.096429,-3.469447e-18


In [14]:
# Below are data explorations not related to GIMME
flg_v_mock = xomics_data[["p-value-flg22_Pto - Mock_Pto","flg22_Pto - Mock_Pto"]]

In [15]:
flg_v_mock.loc[flg_v_mock['p-value-flg22_Pto - Mock_Pto'] < 0.01]

Unnamed: 0,p-value-flg22_Pto - Mock_Pto,flg22_Pto - Mock_Pto
PSPTO_0003,1.156187e-09,-1.057579
PSPTO_0005,1.863133e-05,-1.044168
PSPTO_0016,1.006172e-09,1.846849
PSPTO_0021,2.931064e-11,1.490298
PSPTO_0023,1.572790e-05,-1.374920
...,...,...
PSPTO_5631,1.740638e-15,3.142781
PSPTO_5635,4.762937e-18,2.153327
PSPTO_5637,4.778934e-07,-1.418637
PSPTO_5638,7.054630e-13,-2.343354


In [16]:
# How many differentially expressed genes are in the model, diff = significantly different genes
#big_diff = very significantly expressed genes
gene_ids = [gene.id for gene in model.genes]
diff = flg_v_mock.loc[flg_v_mock['p-value-flg22_Pto - Mock_Pto'] < 0.01]
diff_genes = diff.index
big_diff = diff.loc[abs(diff['flg22_Pto - Mock_Pto']) > 2]
big_diff_genes = big_diff.index
print("differentially expressed genes condition:",len(diff_genes))
print("Bigly-differentially expressed genes condition:",len(big_diff_genes))
print("differentially expressed genes in model:",len(set(gene_ids) & set(diff_genes)))
print("Bigly-differentially expressed genes in model:",len(set(gene_ids) & set(big_diff_genes)))

differentially expressed genes condition: 1292
Bigly-differentially expressed genes condition: 382
differentially expressed genes in model: 144
Bigly-differentially expressed genes in model: 41


In [17]:
#find the associated reactions to the differentially expressed genes in the diff genes lists
big_diff_in_model = set(gene_ids) & set(big_diff_genes)
associated_reactions = []
for gene in big_diff_in_model:
    for reaction in model.genes.get_by_id(gene).reactions:
        associated_reactions.append(reaction)

In [18]:
#generate the reactions associated as well as reaction ids
for reaction in associated_reactions:
    print(reaction.build_reaction_string())
    print (reaction.id)

cpd00938_c <=> cpd00067_c + cpd00856_c
rxn02369_c
cpd00130_c <=> cpd00001_c + cpd00106_c
rxn00799_c
cpd00001_c + cpd00344_c <=> cpd00013_c + cpd00770_c
rxn01640_c
cpd00067_e + cpd00162_e <=> cpd00067_c + cpd00162_c
rxn05551_c
cpd00002_c + cpd00010_c + cpd00024_c --> cpd00011_c + cpd00078_c
cor0003_c
cor00010_c + cpd00002_c + cpd00010_c --> cor00006_c
cor0007_c
cpd00013_e <=> cpd00013_c
rxn05466_c
cor00003_c --> cor00004_c + cpd00001_c
cor0005_c
cpd00003_c + cpd01795_c <=> cpd00004_c + cpd00067_c + cpd02441_c
rxn02842_c
cpd00001_c + cpd00007_c + cpd00183_c --> cpd00025_c + cpd00033_c + cpd00055_c
rxn00433_c
cpd00070_c + cpd00078_c + cpd11493_c --> cor00003_c
cor0004_c
cpd00001_c + cpd00007_c + cpd00183_c --> cpd00025_c + cpd00033_c + cpd00055_c
rxn00433_c
cpd00001_c + cpd00002_c + cpd00058_c --> cpd00008_c + cpd00009_c + cpd00058_e + cpd00067_c
rxn05528_c
cpd00001_c + cpd00002_c + cpd00058_e --> cpd00008_c + cpd00009_c + cpd00058_c + cpd00067_c
rxn10481_c
cpd00001_c + cpd00002_c + cpd04