In [1]:
import pandas as pd
import cobra
import driven
import numpy as np
from driven import ExpressionProfile

In [2]:
fname = '../data/nobori_2018/pnas.1800529115.sd02.xlsx'
conditions = ['MM','vivo','flg22','D36E','AvrRpt2','AvrRps4']
all_conditions_frames = []
condition_to_p = {}
# load the up and down sheet for each comparison, then merge
# them into a single dataframe and rename the columns to meet the
# conventions in the driven package.
for condition in conditions:
    down = pd.read_excel(fname,sheet_name=condition+"_down")
    down = down.rename({condition+'_down':'gene'},axis='columns')
    up = pd.read_excel(fname,sheet_name=condition+"_up")
    up = up.rename({condition+'_up':'gene'},axis='columns')
    both = pd.concat([down,up])
    fchange_col = both.columns
    expression_column = [c for c in both.columns if "fold change" in c][0]
    p_column = [c for c in both.columns if "q-value" in c][0]
    condition = expression_column.split(': ')[1]
    
    both = both.rename({p_column:\
        "p-value-"+condition,expression_column:condition},axis='columns')
    
    # split each gene name by '-' to get the KEGG gene ID only,
    # then reassign it to the index
    new_index = [s[0] for s in both['gene'].str.split('-')]
    both.index = new_index
    both = both.drop('gene',axis=1)
    all_conditions_frames.append(both)
    
    # save the exact condition name and the p value column associated with it
    condition_to_p[condition] = "p-value-"+condition
xomics_data = pd.concat(all_conditions_frames,axis=1)

# for p-value columns that are NaN after the merge, reassign as 1 to indicate
# not significant
p_cols = [c for c in xomics_data.columns if "p-value" in c]
xomics_data[p_cols] = xomics_data[p_cols].fillna(value=1.0)

# for expression, do the equivalent (set foldchange to 0)
diff_cols = [c for c in xomics_data.columns if "p-value" not in c]
xomics_data[diff_cols] = xomics_data[diff_cols].fillna(value=0.0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [3]:
# Load the count data
fname = "../data/nobori_2018/GSE103441_count.txt"
count_frame = pd.read_csv(fname,sep='\t',index_col=0)
# remove sample names from the end of column names
# and merge samples group-wise into a mean
#'_'.join(count_frame.columns[0].split('_')[:-1])
groups = set(['_'.join(c.split('_')[:-1]) for c in count_frame.columns])
mean_count_frame = pd.DataFrame(index=count_frame.index)
for group in groups:
    samps = [col for col in count_frame.columns if group in col]
    mean = count_frame[samps].mean(axis=1)
    mean_count_frame[group] = mean
mean_count_frame

Unnamed: 0,Pto_stp,Pto_sid2pmr4,Pto_col,Pto-AvrRpt2_b2b3,Pto_deps,Pto-AvrRpt2_stp,Pto_de,Pto-AvrRpt2_de,Pto_npr1,Pto_pad4sid2,...,Pto_b2b3,Pto-AvrRpt2_pad4,Pto-AvrRpt2_KB,Pto-AvrRpt2_rr,Pto_pad4,Pto-AvrRpt2_sid2,Pto-AvrRpt2_sid2pmr4,Pto_col_mock,Pto_col_flg22,Pto-AvrRpt2_deps
PSPTOA0002,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000
PSPTOA0004,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000
PSPTOA0005,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000
PSPTOA0007,0.0,0.0,0.103448,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.6,0.0,0.000000
PSPTOA0008,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000
PSPTOA0009,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000
PSPTOA0010,0.0,0.0,0.034483,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.2,0.0,0.000000
PSPTOA0011,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000
PSPTOA0012,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000
PSPTOA0013,0.0,0.0,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.00,...,0.0,0.000000,0.000000,0.0,0.000,0.0,0.0,0.0,0.0,0.000000


In [4]:
# etract the groups of interest from count frame
flg_v_mock_counts = mean_count_frame[['Pto_col_mock','Pto_col_flg22']]

# replace A in the gene names with _ as in the other
# spreadsheets provided by the author
flg_v_mock_counts.index = flg_v_mock_counts.index.str.replace('A','_')

# add p-values
flg_v_mock_p = xomics_data["p-value-flg22_Pto - Mock_Pto"]
flg_v_mock_counts['p-value'] = flg_v_mock_p
flg_v_mock_counts = flg_v_mock_counts.fillna(1)


flg_v_mock_exp_prof = ExpressionProfile(identifiers=flg_v_mock_counts.index.tolist(),
                                        conditions=['Pto_col_mock','Pto_col_flg22'],
                                        expression=flg_v_mock_counts[['Pto_col_mock','Pto_col_flg22']].values,
                                        p_values=np.reshape(flg_v_mock_counts['p-value'].values,(-1, 1)))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
# load the reconstruction
model = cobra.io.load_json_model('../results/PST_pao1_temp_biomass_v2.json')

In [15]:
[m.id for m in model.metabolites if m.id.startswith('cPY')]

['cPY00140_c',
 'cPY00138_c',
 'cPY00124_c',
 'cPY00129_c',
 'cPY00132_c',
 'cPY00135_c',
 'cPY00016_e',
 'cPY00116_e']

In [None]:
driven.gimme(model,flg_v_mock_exp_prof,cutoff=5,fraction_of_optimum=0.05)

In [None]:
# try loading flg vs. mock expression first
flg_v_mock = xomics_data[["p-value-flg22_Pto - Mock_Pto","flg22_Pto - Mock_Pto"]]

In [None]:
# Load the current Pst model
pst = cobra.io.load_json_model('../results/pst_feeder.json')

In [None]:
flg_v_mock.loc[flg_v_mock['p-value-flg22_Pto - Mock_Pto'] < 0.05]

In [None]:
# How many differentially expressed genes are in the model
gene_ids = [gene.id for gene in pst.genes]
diff = flg_v_mock.loc[flg_v_mock['p-value-flg22_Pto - Mock_Pto'] < 0.05]
diff_genes = diff.index
big_diff = diff.loc[abs(diff['flg22_Pto - Mock_Pto']) > 2]
big_diff_genes = big_diff.index
print("differentially expressed genes condition:",len(diff_genes))
print("Bigly-differentially expressed genes condition:",len(big_diff_genes))
print("differentially expressed genes in model:",len(set(gene_ids) & set(diff_genes)))
print("Bigly-differentially expressed genes in model:",len(set(gene_ids) & set(big_diff_genes)))

In [None]:
big_diff_in_model = set(gene_ids) & set(big_diff_genes)
associated_reactions = []
for gene in big_diff_in_model:
    for reaction in pst.genes.get_by_id(gene).reactions:
        associated_reactions.append(reaction)

In [None]:
for reaction in associated_reactions:
    print(reaction.build_reaction_string())

In [None]:
associated_reactions[0]

In [None]:
flg_v_mock.loc['PSPTO_0774']

In [None]:
#conditions = [key for key in condition_to_p.keys()]
#expression = xomics_data[]
flg_v_mock_exp_prof = ExpressionProfile(identifiers=flg_v_mock.index.tolist(),\
                                        conditions=['flg22_Pto - Mock_Pto'],\
                                        expression=np.matrix(flg_v_mock['flg22_Pto - Mock_Pto'].tolist()).transpose(),\
                                        p_values=np.matrix(flg_v_mock['p-value-flg22_Pto - Mock_Pto'].tolist()).transpose())

In [None]:
conditions = [key for key in condition_to_p.keys()]
xomics_profile = ExpressionProfile.from_data_frame(xomics_data[conditions])

In [None]:
xomics_profile.p_values

In [None]:
a = None
if not a:
    print("yep")