In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%pylab inline

In [None]:
import os
import sys
import pathlib
import pickle
import warnings

sys.path.append('..')

In [None]:
import pandas as pd
from loguru import logger
import seaborn as sns

In [None]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.noise import construct_intensity_df, add_random_peaks, plot_intensity_matrix, convert_to_data_source
from pals.evaluation import run_noise_experiment, calc_av_p_scores, construct_single_box_df
from pals.feature_extraction import DataSource

from pals.PALS import PALS
from pals.ORA import ORA
from pals.GSEA import GSEA
from pals.common import *

## Synthetic data test

Try generating some synthetic data without noise

In [None]:
# --- Synthetic sample layout -------------------------------------------------
# Four control and four case replicates, named like mzXML input files.
control_fnames = ['Control_%d.mzXML' % i for i in range(1, 5)]
case_fnames = ['Case_%d.mzXML' % i for i in range(1, 5)]

# Name -> integer for each simulated pathway
# (presumably the pathway size -- confirm against construct_intensity_df).
pathway_names = dict(two=2, four=4, six=6, ten=10, twenty=20, forty=40, eighty=80)

# --- Experiment parameters ---------------------------------------------------
num_iterations = 500        # passed to run_noise_experiment
bg_pw = 100                 # passed as background_pathways
gsea_resamples = 1000       # passed to GSEA as random_sets
min_replace = 5000          # passed to convert_to_data_source
plage_weight = 5            # passed to PALS as plage_weight
hg_weight = 1               # passed to PALS as hg_weight
prob_missing_peaks = 0.2    # passed to convert_to_data_source

# --- Execution mode ----------------------------------------------------------
# Alternative setup: pbar = True together with parallel = False.
pbar = False
parallel = True

# --- Output location ---------------------------------------------------------
base_dir = os.path.join('test_data', 'synthetic')
create_if_not_exist(base_dir)

In [None]:
# Controls first, then cases.
sample_fnames = control_fnames + case_fnames

# Build the simulated intensity matrix (non-random, with background pathways).
int_df, updated_pathway_names = construct_intensity_df(
    sample_fnames,
    pathway_names,
    random=False,
    background_pathways=bg_pw,
)
print(int_df.shape, updated_pathway_names, sep='\n')

# Plot the matrix and write the figure next to the other synthetic outputs.
out_file = os.path.join(base_dir, 'simulated_intensity.eps')
plot_intensity_matrix(int_df, out_file=out_file)

Convert simulated data above to a PALS DataSource object and run pathway analysis

In [None]:
# Convert the simulated intensity matrix into the data source object consumed
# by PALS, ORA and GSEA in the following cells.
ds = convert_to_data_source(int_df, updated_pathway_names, case_fnames, control_fnames, prob_missing_peaks, min_replace)

In [None]:
# Run PALS on the data source with the configured weights; the last expression
# displays the pathway table sorted by pathway name.
pals = PALS(ds, plage_weight=plage_weight, hg_weight=hg_weight)
pals_df = pals.get_pathway_df()
pals_df.sort_values('pw_name')

In [None]:
# Run ORA on the same data source, requesting multiple-testing correction;
# the last expression displays the pathway table sorted by pathway name.
ora = ORA(ds)
ora_df = ora.get_pathway_df(correct_multiple_tests=True)
ora_df.sort_values('pw_name')

In [None]:
# Run GSEA on the same data source using the configured number of random sets
# and the notebook-wide progress-bar setting; display the table sorted by name.
gsea = GSEA(ds, random_sets=gsea_resamples, pbar=pbar)
gsea_df = gsea.get_pathway_df()
gsea_df.sort_values('pw_name')

Now take the synthetic data matrix above, add some noisy peaks (250%) and run the pathway analysis methods again.

In [None]:
# Noise settings handed to add_random_peaks below.
percent = 250
noise_mean = 0
noise_std = 5
# The result is assigned back to int_df, so re-running this cell feeds the
# already-augmented matrix back into add_random_peaks. Re-run the cell that
# builds int_df first if a fresh start is needed.
# NOTE(review): pathway_names is passed here, while convert_to_data_source is
# given updated_pathway_names -- confirm this is intended.
int_df = add_random_peaks(sample_fnames, pathway_names, int_df, percent, noise_mean, noise_std)
print(int_df.shape)
plot_intensity_matrix(int_df)

In [None]:
# Rebuild the data source from the current int_df (rebinds ds, replacing the
# earlier data source object).
ds = convert_to_data_source(int_df, updated_pathway_names, case_fnames, control_fnames, prob_missing_peaks, min_replace)

In [None]:
# Re-run PALS on the rebuilt data source; display the table sorted by name.
pals = PALS(ds, plage_weight=plage_weight, hg_weight=hg_weight)
pals_df = pals.get_pathway_df()
pals_df.sort_values('pw_name')

In [None]:
# Re-run ORA (with multiple-testing correction) on the rebuilt data source;
# display the table sorted by name.
ora = ORA(ds)
ora_df = ora.get_pathway_df(correct_multiple_tests=True)
ora_df.sort_values('pw_name')

In [None]:
# Re-run GSEA on the rebuilt data source. Use the notebook-wide `pbar` setting
# (as the earlier GSEA cell does) instead of a hard-coded True, so the
# progress-bar behaviour is controlled from the configuration cell only.
gsea = GSEA(ds, random_sets=gsea_resamples, pbar=pbar)
gsea_df = gsea.get_pathway_df()
gsea_df.sort_values('pw_name')

### Experiments

### Experiment 1: adding noise

In this experiment, we add increasing percentages of noisy peaks to the data and compare how the different methods perform.

In [None]:
# File under base_dir that the Experiment 1 results are written to.
out_file = os.path.join(base_dir, 'experiment_1.p')

In [None]:
# Experiment 1 keeps the missing-peak probability and noise spread fixed and
# varies only the percentage of added noisy peaks.
prob_missing_peaks = 0.20
noise_std = 5

noise_percents = (0, 25, 50, 100, 250, 500, 1000)
reqd_scenarios = [
    {'noise_std': noise_std, 'percent': pct, 'prob_missing_peaks': prob_missing_peaks}
    for pct in noise_percents
]

# Run every scenario for all three methods, then stack the per-method results.
pals_df, ora_df, gsea_df = run_noise_experiment(
    bg_pw, case_fnames, control_fnames, pathway_names, num_iterations,
    plage_weight, hg_weight, gsea_resamples, reqd_scenarios,
    pbar=pbar, parallel=parallel,
)
combined_df = pd.concat([pals_df, ora_df, gsea_df])

In [None]:
# Bundle the per-method and combined result frames and write them to out_file.
results = dict(
    pals_df=pals_df,
    ora_df=ora_df,
    gsea_df=gsea_df,
    combined_df=combined_df,
)
save_obj(results, out_file)

#### Load experiment results

In [None]:
# File under base_dir that the Experiment 1 results are read back from
# (same path as used when saving).
out_file = os.path.join(base_dir, 'experiment_1.p')

In [None]:
# Read the saved results back and rebind the four result frames.
results = load_obj(out_file)
pals_df, ora_df, gsea_df, combined_df = (
    results[key] for key in ('pals_df', 'ora_df', 'gsea_df', 'combined_df')
)

In [None]:
# Keep only rows whose pathway name does not contain "background"; copy so the
# filtered frames can be modified later without touching the originals.
filtered_pals_df = pals_df.loc[~pals_df['pathway'].str.contains("background")].copy()
filtered_ora_df = ora_df.loc[~ora_df['pathway'].str.contains("background")].copy()
filtered_gsea_df = gsea_df.loc[~gsea_df['pathway'].str.contains("background")].copy()
filtered_combined_df = combined_df.loc[~combined_df['pathway'].str.contains("background")].copy()

In [None]:
# Summary statistics of the filtered PALS results, grouped by 'percent'.
filtered_pals_df.groupby('percent').describe()

In [None]:
# Summary statistics of the filtered ORA results, grouped by 'percent'.
filtered_ora_df.groupby('percent').describe()

In [None]:
# Summary statistics of the filtered GSEA results, grouped by 'percent'.
filtered_gsea_df.groupby('percent').describe()

Define seaborn styles. This will affect all plots from here on.

In [None]:
sns.set(style="ticks")
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names; use whichever one this
# installation provides so the cell works on both old and new versions.
poster_style = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in style.available else 'seaborn-poster'
style.use(poster_style)  # sets the size of the charts
sns.set_context('poster')  # everything is larger

In [None]:
# Box plot of p-values per noise level, one box per method.
ax = sns.boxplot(
    data=filtered_combined_df,
    x="percent",
    y="comb_p_value",
    hue='method',
    hue_order=['ORA', 'GSEA', 'PALS'],
    palette="rainbow",
)
ax.set(xlabel='Noise Level (%)', ylabel='p-value')

# Dashed red reference line at p = 0.05, drawn from x = -0.5 to x = 6.5.
ax.plot([-0.5, 6.5], [0.05, 0.05], 'r--')
ax.set_title('Increasing Noise Level')
ax.legend(loc='upper left')
plt.tight_layout()
sns.despine()

out_file = os.path.join(base_dir, 'adding_noise.eps')
plt.savefig(out_file, dpi=300)

In [None]:
# Mean p-value per noise level for each method, stacked into one long table.
#
# The summary is stored under its own name (`mean_pvalue_df`) rather than
# rebinding `combined_df`: `combined_df` holds the raw experiment results and
# is read by the background-filter cell, which would fail on re-run if it were
# replaced by this summary table (no 'pathway' column).
#
# NOTE(review): this cell averages 'p_value' whereas the other plots in this
# experiment use 'comb_p_value' -- confirm which column is intended.
mean_frames = []
for method_name, method_df in (
    ('PALS', filtered_pals_df),
    ('ORA', filtered_ora_df),
    ('GSEA', filtered_gsea_df),
):
    # One row per noise level; column named 'mean' to match the plot's y axis.
    method_mean = method_df.groupby('percent')['p_value'].mean().rename('mean').to_frame()
    method_mean['method'] = method_name
    mean_frames.append(method_mean)
mean_pvalue_df = pd.concat(mean_frames)

ax = sns.lineplot(
    data=mean_pvalue_df.reset_index(),  # bring 'percent' back as a column
    x='percent',
    y='mean',
    hue='method',
    hue_order=['ORA', 'GSEA', 'PALS'],
    palette='rainbow',
)
ax.set(xlabel='Noise Level (%)', ylabel='Mean p-value')
plt.title('Mean p-values for all noise levels')

plt.tight_layout()
sns.despine()

out_file = os.path.join(base_dir, 'mean_pvalues.eps')
plt.savefig(out_file, dpi=300)

#### Plot for individual pathways

Make lineplot

In [None]:
sns.set(style="whitegrid")
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names; use whichever one this
# installation provides so the cell works on both old and new versions.
poster_style = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in style.available else 'seaborn-poster'
style.use(poster_style)  # sets the size of the charts
sns.set_context('poster')  # everything is larger

In [None]:
# Display order of the simulated pathways on the x axis.
PATHWAY_ORDER = ['two', 'four', 'six', 'ten', 'twenty', 'forty', 'eighty']


def _plot_mean_pvalue_panel(results_df, ax, title, legend=False):
    """Draw one panel of mean combined p-value per pathway, one line per noise level.

    Parameters
    ----------
    results_df : DataFrame with 'pathway', 'percent' and 'comb_p_value' columns.
    ax : matplotlib Axes to draw on.
    title : panel title (the method name).
    legend : forwarded to ``sns.lineplot`` (``False`` or ``'brief'``).

    Returns
    -------
    The Axes that was drawn on.
    """
    mean_df = (
        results_df
        .groupby(['pathway', 'percent'])['comb_p_value']
        .mean()
        .rename('mean')
        .reset_index()
    )
    mean_df['pathway'] = pd.Categorical(mean_df['pathway'], PATHWAY_ORDER)
    mean_df = mean_df.sort_values(['pathway', 'percent'])
    # Wrap the numeric level in $...$ so seaborn treats the hue as categorical:
    # https://github.com/mwaskom/seaborn/issues/1653
    mean_df['percent'] = ['$%s$' % x for x in mean_df['percent']]
    sns.lineplot(data=mean_df, x='pathway', y='mean', hue='percent', palette="rainbow",
                 marker='o', markersize=10, ax=ax, legend=legend)
    ax.set(xlabel='Pathways', ylabel='Mean p-value')
    ax.yaxis.set_ticks(np.arange(0, 1.05, 0.05))
    ax.set_title(title)
    return ax


fig, axes = plt.subplots(1, 3, sharey=True)

panels = [('ORA', filtered_ora_df), ('GSEA', filtered_gsea_df), ('PALS', filtered_pals_df)]
for idx, (title, results_df) in enumerate(panels):
    # Only the last panel carries the (shared) legend.
    is_last = idx == len(panels) - 1
    ax = _plot_mean_pvalue_panel(results_df, axes[idx], title, legend='brief' if is_last else False)
sns.despine()

# Place the legend on the outer right-hand side of the last axes.
ax.legend(bbox_to_anchor=(1.05, 0), loc='lower left', borderaxespad=0.)

for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation=90)

plt.suptitle('Mean p-values for Each Pathway')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
# Use a dedicated file name: 'mean_pvalues.eps' is already written by the
# all-noise-levels line plot and would otherwise be overwritten here.
out_file = os.path.join(base_dir, 'mean_pvalues_per_pathway.eps')
plt.savefig(out_file, dpi=300)

Make boxplots

In [None]:
# Order pathways from largest to smallest for the per-pathway box plots:
# make 'pathway' an ordered-by-list categorical, then sort rows by it.
pathway_order = ['eighty', 'forty', 'twenty', 'ten', 'six', 'four', 'two']

filtered_pals_df = (
    filtered_pals_df
    .assign(pathway=pd.Categorical(filtered_pals_df['pathway'], pathway_order))
    .sort_values('pathway')
)
filtered_ora_df = (
    filtered_ora_df
    .assign(pathway=pd.Categorical(filtered_ora_df['pathway'], pathway_order))
    .sort_values('pathway')
)
filtered_gsea_df = (
    filtered_gsea_df
    .assign(pathway=pd.Categorical(filtered_gsea_df['pathway'], pathway_order))
    .sort_values('pathway')
)

In [None]:
# Per-pathway p-value distribution for ORA, one box per noise level.
ax = sns.boxplot(
    data=filtered_ora_df,
    x='pathway',
    y='comb_p_value',
    hue='percent',
    palette="rainbow",
)
ax.set(xlabel='Pathways', ylabel='p-value')
# Dashed red reference line at p = 0.05.
ax.plot([-0.5, 6.5], [0.05, 0.05], 'r--')
ax.set_title('Individual Pathway Rankings (ORA)')

out_file = os.path.join(base_dir, 'pathway_ranking_ora.eps')
plt.savefig(out_file, dpi=300)

In [None]:
# Per-pathway p-value distribution for GSEA, one box per noise level.
ax = sns.boxplot(
    data=filtered_gsea_df,
    x='pathway',
    y='comb_p_value',
    hue='percent',
    palette="rainbow",
)
ax.set(xlabel='Pathways', ylabel='p-value')
# Dashed red reference line at p = 0.05.
ax.plot([-0.5, 6.5], [0.05, 0.05], 'r--')
ax.set_title('Individual Pathway Rankings (GSEA)')

out_file = os.path.join(base_dir, 'pathway_ranking_gsea.eps')
plt.savefig(out_file, dpi=300)

In [None]:
# Per-pathway p-value distribution for PALS, one box per noise level.
ax = sns.boxplot(
    data=filtered_pals_df,
    x='pathway',
    y='comb_p_value',
    hue='percent',
    palette="rainbow",
)
ax.set(xlabel='Pathways', ylabel='p-value')
# Dashed red reference line at p = 0.05.
ax.plot([-0.5, 6.5], [0.05, 0.05], 'r--')
ax.set_title('Individual Pathway Rankings (PALS)')

out_file = os.path.join(base_dir, 'pathway_ranking_pals.eps')
plt.savefig(out_file, dpi=300)

### Experiment 2: introducing missing peaks

In this experiment, we add a fixed amount (100%) of noisy peaks, then increase the probability of missing peaks in the data.

In [None]:
# File under base_dir that the Experiment 2 results are written to.
out_file = os.path.join(base_dir, 'experiment_2.p')

In [None]:
# Experiment 2 keeps the noise level and spread fixed and varies only the
# missing-peak probability.
percent = 100
noise_std = 5

missing_probs = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
reqd_scenarios = [
    {'noise_std': noise_std, 'percent': percent, 'prob_missing_peaks': prob}
    for prob in missing_probs
]

# Run every scenario for all three methods, then stack the per-method results.
pals_df, ora_df, gsea_df = run_noise_experiment(
    bg_pw, case_fnames, control_fnames, pathway_names, num_iterations,
    plage_weight, hg_weight, gsea_resamples, reqd_scenarios,
    pbar=pbar, parallel=parallel,
)
combined_df = pd.concat([pals_df, ora_df, gsea_df])

In [None]:
# Bundle the per-method and combined result frames and write them to out_file.
results = dict(
    pals_df=pals_df,
    ora_df=ora_df,
    gsea_df=gsea_df,
    combined_df=combined_df,
)
save_obj(results, out_file)

#### Load experiment results

In [None]:
# File under base_dir that the Experiment 2 results are read back from
# (same path as used when saving).
out_file = os.path.join(base_dir, 'experiment_2.p')

In [None]:
# Read the saved results back and rebind the four result frames.
results = load_obj(out_file)
pals_df, ora_df, gsea_df, combined_df = (
    results[key] for key in ('pals_df', 'ora_df', 'gsea_df', 'combined_df')
)

In [None]:
# Keep only rows whose pathway name does not contain "background"; copy so the
# filtered frames are independent of the originals.
filtered_pals_df = pals_df.loc[~pals_df['pathway'].str.contains("background")].copy()
filtered_ora_df = ora_df.loc[~ora_df['pathway'].str.contains("background")].copy()
filtered_gsea_df = gsea_df.loc[~gsea_df['pathway'].str.contains("background")].copy()
filtered_combined_df = combined_df.loc[~combined_df['pathway'].str.contains("background")].copy()

In [None]:
# Summary statistics of the filtered PALS results, grouped by 'prob_missing'.
filtered_pals_df.groupby('prob_missing').describe()

In [None]:
# Summary statistics of the filtered ORA results, grouped by 'prob_missing'.
filtered_ora_df.groupby('prob_missing').describe()

In [None]:
# Summary statistics of the filtered GSEA results, grouped by 'prob_missing'.
filtered_gsea_df.groupby('prob_missing').describe()

In [None]:
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases removed the old names; use whichever one this
# installation provides so the cell works on both old and new versions.
poster_style = 'seaborn-v0_8-poster' if 'seaborn-v0_8-poster' in style.available else 'seaborn-poster'
style.use(poster_style)  # sets the size of the charts
sns.set(style="whitegrid")
style.use('ggplot')
sns.set_context('poster')  # everything is larger

# Box plot of p-values per missing-peak probability, one box per method.
ax = sns.boxplot(x="prob_missing", y="comb_p_value", hue='method', hue_order=['ORA', 'GSEA', 'PALS'], data=filtered_combined_df, palette=("rainbow"))
ax.set(xlabel='Missing Peaks', ylabel='p-value')
# Dashed red reference line at p = 0.05, drawn from x = -0.5 to x = 8.5.
plt.plot([-0.5, 8.5], [0.05, 0.05], 'r--')
plt.title('Increasing Missing Peaks')
plt.legend(loc='upper left')
plt.tight_layout()

out_file = os.path.join(base_dir, 'missing_peaks.eps')
plt.savefig(out_file, dpi=300)

### Experiment 3: Nothing is changing, it's all random

In [None]:
# Experiment 3: a single run with random=True, with NumPy-style
# "divide by zero" / "invalid value encountered" warnings silenced for the
# duration of the block.
with warnings.catch_warnings():
    for pattern in (r'divide by zero', r'invalid value encountered'):
        warnings.filterwarnings('ignore', pattern)

    results = calc_av_p_scores(
        case_fnames, control_fnames, pathway_names, 1,
        random=True,
        plage_weight=plage_weight,
        hg_weight=hg_weight,
        gsea_resamples=gsea_resamples,
        background_pathways=bg_pw,
        pbar=pbar,
    )

    # One result frame per method, built with identical positional settings.
    pals_df, ora_df, gsea_df = (
        construct_single_box_df(results, 100, 0.2, 1, method)
        for method in ('PALS', 'ORA', 'GSEA')
    )

In [None]:
# Keep only rows whose pathway name does not contain "background".
filtered_pals_df = pals_df.loc[~pals_df['pathway'].str.contains("background")].copy()
filtered_ora_df = ora_df.loc[~ora_df['pathway'].str.contains("background")].copy()
filtered_gsea_df = gsea_df.loc[~gsea_df['pathway'].str.contains("background")].copy()

In [None]:
# Display the non-background ORA results of the random experiment.
filtered_ora_df

In [None]:
# Display the non-background PALS results of the random experiment.
filtered_pals_df

In [None]:
# Display the non-background GSEA results of the random experiment.
filtered_gsea_df