In [2]:
import pandas as pd
import cobra
import driven
import numpy as np
from driven.data_sets import ExpressionProfile

In [3]:
fname = '../data/nobori_2018/pnas.1800529115.sd02.xlsx'
conditions = ['MM','vivo','flg22','D36E','AvrRpt2','AvrRps4']
all_conditions_frames = []
condition_to_p = {}
# load the up and down sheet for each comparison, then merge
# them into a single dataframe and rename the columns to meet the
# conventions in the driven package.
for condition in conditions:
    down = pd.read_excel(fname,sheet_name=condition+"_down")
    down = down.rename({condition+'_down':'gene'},axis='columns')
    up = pd.read_excel(fname,sheet_name=condition+"_up")
    up = up.rename({condition+'_up':'gene'},axis='columns')
    both = pd.concat([down,up])
    fchange_col = both.columns
    expression_column = [c for c in both.columns if "fold change" in c][0]
    p_column = [c for c in both.columns if "q-value" in c][0]
    condition = expression_column.split(': ')[1]
    
    both = both.rename({p_column:\
        "p-value-"+condition,expression_column:condition},axis='columns')
    
    # split each gene name by '-' to get the KEGG gene ID only,
    # then reassign it to the index
    new_index = [s[0] for s in both['gene'].str.split('-')]
    both.index = new_index
    both = both.drop('gene',axis=1)
    all_conditions_frames.append(both)
    
    # save the exact condition name and the p value column associated with it
    condition_to_p[condition] = "p-value-"+condition
xomics_data = pd.concat(all_conditions_frames,axis=1)

# for p-value columns that are NaN after the merge, reassign as 1 to indicate
# not significant
p_cols = [c for c in xomics_data.columns if "p-value" in c]
xomics_data[p_cols] = xomics_data[p_cols].fillna(value=1.0)

# for expression, do the equivalent (set foldchange to 0)
diff_cols = [c for c in xomics_data.columns if "p-value" not in c]
xomics_data[diff_cols] = xomics_data[diff_cols].fillna(value=0.0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




In [4]:
# Load the count data
fname = "../data/nobori_2018/GSE103441_count.txt"
count_frame = pd.read_csv(fname,sep='\t',index_col=0)
# remove sample names from the end of column names
# and merge samples group-wise into a mean
#'_'.join(count_frame.columns[0].split('_')[:-1])
groups = set(['_'.join(c.split('_')[:-1]) for c in count_frame.columns])
mean_count_frame = pd.DataFrame(index=count_frame.index)
for group in groups:
    samps = [col for col in count_frame.columns if group in col]
    mean = count_frame[samps].mean(axis=1)
    mean_count_frame[group] = mean
mean_count_frame

Unnamed: 0,Pto_npr1,Pto_pad4sid2,Pto-AvrRpt2_sid2pmr4,Pto-AvrRpt2_de,Pto_col_mock,Pto_sid2pmr4,Pto-AvrRpt2_stp,Pto-AvrRpt2_rr,Pto-D36E_KB,Pto_KB,...,Pto_col_SA,Pto_col_chitin,Pto_de,Pto_MM_A80,Pto-AvrRpt2_KB,Pto_stp,Pto_deps,Pto-AvrRpt2_npr1,Pto-AvrRpt2_deps,Pto_pad4
PSPTOA0002,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0004,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0005,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0007,0.0,0.00,0.0,0.000000,0.6,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0008,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0009,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0010,0.0,0.00,0.0,0.000000,0.2,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0011,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0012,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000
PSPTOA0013,0.0,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.000000,0.00,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000


In [6]:
# extract the groups of interest from count frame
flg_v_mock_counts = mean_count_frame[['Pto_col_mock','Pto_col_flg22']]
flg_v_mm_counts = mean_count_frame[["Pto_col_flg22", "Pto_MM_A80"]]
# replace A in the gene names with _ as in the other
# spreadsheets provided by the author
#the following line is not necessary because PSPTO_0001 and PSPTOA0001 are different genes, changed to 
#replace "A" with "_A" like plasmid B
flg_v_mock_counts.index = flg_v_mock_counts.index.str.replace('A','_A')
flg_v_mm_counts.index = flg_v_mm_counts.index.str.replace('A', "_A")
# add p-values
flg_v_mock_p = xomics_data["p-value-flg22_Pto - Mock_Pto"]
flg_v_mock_counts['p-value'] = flg_v_mock_p
flg_v_mock_counts = flg_v_mock_counts.fillna(1)
#flg_v_mm_counts_p_val = xomics_data["p-value-"]

flg_v_mock_exp_prof = ExpressionProfile(identifiers=flg_v_mock_counts.index.tolist(),
                                        conditions=['Pto_col_mock','Pto_col_flg22'],
                                        expression=flg_v_mock_counts[['Pto_col_mock','Pto_col_flg22']].values,
                                        p_values=np.reshape(flg_v_mock_counts['p-value'].values,(-1, 1)))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
# load the reconstruction
model = cobra.io.read_sbml_model('../results/test_for_memote.xml')

In [8]:
[m.id for m in model.metabolites if m.id.startswith('cPY')]

[]

In [9]:
#threshold for the GIMME algorithm
#assign the variables to new ones
constrained_model, constrained_solution = driven.gimme(model,flg_v_mock_exp_prof,cutoff=5,fraction_of_optimum=0.05)
#open up all the exchange reactions, but gimme does not have OF of biomass,
#look into the gimme solution more, figure out how the driven model is modified and what is supposed to return
#gunna take some inkering, model that is further constrained than hte original by wither loss of reaction
#or changing the bounds

In [None]:
# try loading flg vs. mock expression first
flg_v_mock = xomics_data[["p-value-flg22_Pto - Mock_Pto","flg22_Pto - Mock_Pto"]]

In [22]:
flg_v_mock.loc[flg_v_mock['p-value-flg22_Pto - Mock_Pto'] < 0.01]

Unnamed: 0,p-value-flg22_Pto - Mock_Pto,flg22_Pto - Mock_Pto
PSPTO_0003,1.156187e-09,-1.057579
PSPTO_0005,1.863133e-05,-1.044168
PSPTO_0016,1.006172e-09,1.846849
PSPTO_0021,2.931064e-11,1.490298
PSPTO_0023,1.572790e-05,-1.374920
PSPTO_0025,9.026485e-07,1.700713
PSPTO_0043,2.475167e-14,1.639614
PSPTO_0044,8.589775e-07,-1.913224
PSPTO_0049,1.880575e-11,1.278185
PSPTO_0061,5.814111e-06,-1.159054


In [27]:
# How many differentially expressed genes are in the model, diff = significantly different genes
#big_diff = very significantly expressed genes
gene_ids = [gene.id for gene in model.genes]
diff = flg_v_mock.loc[flg_v_mock['p-value-flg22_Pto - Mock_Pto'] < 0.01]
diff_genes = diff.index
big_diff = diff.loc[abs(diff['flg22_Pto - Mock_Pto']) > 2]
big_diff_genes = big_diff.index
print("differentially expressed genes condition:",len(diff_genes))
print("Bigly-differentially expressed genes condition:",len(big_diff_genes))
print("differentially expressed genes in model:",len(set(gene_ids) & set(diff_genes)))
print("Bigly-differentially expressed genes in model:",len(set(gene_ids) & set(big_diff_genes)))

differentially expressed genes condition: 1292
Bigly-differentially expressed genes condition: 382
differentially expressed genes in model: 135
Bigly-differentially expressed genes in model: 32


In [13]:
#find the associated reactions to the differentially expressed genes in the diff genes lists
big_diff_in_model = set(gene_ids) & set(big_diff_genes)
associated_reactions = []
for gene in big_diff_in_model:
    for reaction in model.genes.get_by_id(gene).reactions:
        associated_reactions.append(reaction)

In [19]:
#generate the reactions associated as well as reaction ids
for reaction in associated_reactions:
    print(reaction.build_reaction_string())
    print (reaction.id)

cpd00130_c <=> cpd00001_c + cpd00106_c
rxn00799_c0
cpd00022_c + cpd03124_c <=> cpd00010_c + cpd03121_c
rxn03248_c0
cpd00001_c + cpd00344_c <=> cpd00013_c + cpd00770_c
rxn01640_c0
cpd00007_c + cpd00197_c --> 2.0 cpd00067_c + cpd00856_c
rxn01192_c0
cpd00162_c --> cpd00013_c + cpd00071_c
rxn00539_c0
cpd00067_e + cpd00281_e <=> cpd00067_c + cpd00281_c
rxn05564_c0
cpd00001_c + cpd00002_c + cpd00107_e --> cpd00008_c + cpd00009_c + cpd00067_c + cpd00107_c
rxn05161_c0
cpd00001_c + cpd00002_c + cpd00156_e --> cpd00008_c + cpd00009_c + cpd00067_c + cpd00156_c
rxn05168_c0
cpd00001_c + cpd00002_c + cpd11578_e --> cpd00008_c + cpd00009_c + cpd00067_c + cpd11578_c
rxn05585_c0
cpd00024_c + cpd00281_c <=> cpd00023_c + cpd00199_c
rxn01204_c0
cpd00003_c + cpd00314_c <=> cpd00004_c + cpd00067_c + cpd00082_c
rxn00629_c0
cpd00001_c + cpd00002_c + cpd00531_c --> cpd00008_c + cpd00009_c + cpd00067_c + cpd00531_e
rxn10343_c0
cpd00001_c + cpd00002_c + cpd00058_c --> cpd00008_c + cpd00009_c + cpd00058_e + cpd00

In [15]:
associated_reactions[0]

0,1
Reaction identifier,rxn00799_c0
Name,(S)-malate hydro-lyase (fumarate-forming)
Memory address,0x011efebd30
Stoichiometry,cpd00130_c <=> cpd00001_c + cpd00106_c  L-Malate <=> H2O + Fumarate
GPR,PSPTO_4339 or PSPTO_1731 or PSPTO_4461
Lower bound,-1000.0
Upper bound,1000.0


In [20]:
flg_v_mock.loc['PSPTO_4339']

p-value-flg22_Pto - Mock_Pto    1.0
flg22_Pto - Mock_Pto            0.0
Name: PSPTO_4339, dtype: float64

In [None]:
#conditions = [key for key in condition_to_p.keys()]
#expression = xomics_data[]
flg_v_mock_exp_prof = ExpressionProfile(identifiers=flg_v_mock.index.tolist(),\
                                        conditions=['flg22_Pto - Mock_Pto'],\
                                        expression=np.matrix(flg_v_mock['flg22_Pto - Mock_Pto'].tolist()).transpose(),\
                                        p_values=np.matrix(flg_v_mock['p-value-flg22_Pto - Mock_Pto'].tolist()).transpose())

In [None]:
conditions = [key for key in condition_to_p.keys()]
xomics_profile = ExpressionProfile.from_data_frame(xomics_data[conditions])

In [None]:
xomics_profile.p_values

In [18]:
a = None
if not a:
    print("yep")

yep
