In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local `pals` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
from loguru import logger

In [4]:
import os
import sys
sys.path.append('..')

In [5]:
from pals.pimp_tools import get_pimp_API_token_from_env, PIMP_HOST, download_from_pimp
from pals.evaluation import run_resample_experiment, _select_significant_entries, _compute_prec_rec_f1
from pals.common import *
from pals.feature_extraction import DataSource
from pals.PALS import PALS
from pals.ORA import ORA



# HAT Data Analysis

This notebook generates resampled data and runs the different methods that are compared in the manuscript. The results are analysed in `PALS_HAT_experiment_evaluation.ipynb`.

### Load data

The PiMP API token is assumed to be stored in the environment variable *PIMP_API_TOKEN*.

In [6]:
# Read the PiMP API token from the environment instead of hard-coding it in the notebook.
token = get_pimp_API_token_from_env()

In [7]:
# PiMP analysis holding the plasma samples. The recorded log shows that
# download_from_pimp first tries a cached copy in a local temp file.
analysis_id_plasma = 636
int_df_plasma, annotation_df_plasma, experimental_design_plasma = download_from_pimp(
    token,
    PIMP_HOST,
    analysis_id_plasma,
    'kegg',
)

2020-02-04 14:32:23.426 | DEBUG    | pals.pimp_tools:download_from_pimp:119 - Trying to load data from temp file: /tmp/pimp_analysis_636.p


In [8]:
# PiMP analysis holding the CSF samples. The recorded log shows that
# download_from_pimp first tries a cached copy in a local temp file.
analysis_id_csf = 635
int_df_csf, annotation_df_csf, experimental_design_csf = download_from_pimp(
    token,
    PIMP_HOST,
    analysis_id_csf,
    'kegg',
)

2020-02-04 14:32:23.499 | DEBUG    | pals.pimp_tools:download_from_pimp:119 - Trying to load data from temp file: /tmp/pimp_analysis_635.p


### Create Data Sources

In [9]:
# Build one DataSource per sample type from the tables downloaded above.
# Both use the same pathway database (logged as "PiMP_KEGG").
database_name = DATABASE_PIMP_KEGG
ds_plasma = DataSource(int_df_plasma, annotation_df_plasma, experimental_design_plasma, database_name)
ds_csf = DataSource(int_df_csf, annotation_df_csf, experimental_design_csf, database_name)

2020-02-04 14:32:23.543 | DEBUG    | pals.feature_extraction:__init__:42 - Using PiMP_KEGG as database
2020-02-04 14:32:23.544 | DEBUG    | pals.feature_extraction:get_database:105 - Loading /home/joewandy/git/PALS/pals/data/PiMP_KEGG.json.zip
2020-02-04 14:32:23.604 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-02-04 14:32:23.611 | DEBUG    | pals.feature_extraction:__init__:69 - Creating dataset to pathway mapping
2020-02-04 14:32:24.142 | DEBUG    | pals.feature_extraction:__init__:97 - Computing unique id counts
2020-02-04 14:32:24.153 | DEBUG    | pals.feature_extraction:__init__:42 - Using PiMP_KEGG as database
2020-02-04 14:32:24.154 | DEBUG    | pals.feature_extraction:get_database:105 - Loading /home/joewandy/git/PALS/pals/data/PiMP_KEGG.json.zip
2020-02-04 14:32:24.180 | DEBUG    | pals.feature_extraction:__init__:55 - Mapping pathway to unique ids
2020-02-04 14:32:24.186 | DEBUG    | pals.feature_extraction:__init__:69 - Creating datas

In [10]:
# Sanity check: dimensions of the plasma measurement table.
ds_plasma.get_measurements().shape

(15584, 60)

In [11]:
# Sanity check: dimensions of the CSF measurement table.
ds_csf.get_measurements().shape

(8154, 57)

# Run PALS Experiments

In [12]:
# Settings shared by the plasma and CSF runs; all are passed to run_resample_experiment.
# NOTE(review): no random seed is set in this notebook, so the resampled results are
# presumably not reproducible run-to-run -- confirm whether pals seeds internally.

# Probabilities of a peak being missing. In the recorded logs n_sample shrinks
# accordingly (e.g. 0.2 -> n_sample=12467 out of 15584 plasma rows).
prob_missing_peaks = np.array([0.2, 0.4, 0.6, 0.8])
num_iterations = 500  # presumably resampling iterations per probability -- confirm in pals.evaluation
plage_weight = 1  # NOTE(review): together with hg_weight = 0, presumably only PLAGE contributes -- confirm
hg_weight = 0
gsea_resamples = 1000  # presumably the number of GSEA resamples/permutations -- confirm
gsea_ranking_method = GSEA_RANKING_SNR
parallel = True  # forwarded as parallel=... to run_resample_experiment

In [13]:
# Maps experiment name -> output of run_resample_experiment; filled in by the
# run cells below and written to disk by the final cell.
results = {}

#### Disable debug logging

In [14]:
# Switch logging to INFO so the long-running experiments below do not emit DEBUG messages.
set_log_level_info()

#### Run on Plasma samples

In [15]:
# Plasma experiment: Stage1 samples compared against Control.
data_source = ds_plasma
experiment_name = 'plasma'
case = 'Stage1'
control = 'Control'
# Derive the p-value column name from the labels above so the two cannot drift
# apart; this evaluates to 'PiMP_KEGG Stage1/Control comb_p'.
significant_column = 'PiMP_KEGG {}/{} comb_p'.format(case, control)

In [16]:
# Run the resampling experiment with the parameters set in the previous cell
# and file the outcome under the experiment's name.
res = run_resample_experiment(
    experiment_name,
    data_source,
    case,
    control,
    prob_missing_peaks,
    significant_column,
    num_iterations,
    plage_weight,
    hg_weight,
    gsea_resamples,
    gsea_ranking_method,
    parallel=parallel,
)
results[experiment_name] = res

2020-02-04 14:32:24.772 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.20 n_sample=12467 PALS experiment=plasma case=Stage1 control=Control
2020-02-04 14:37:29.569 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.40 n_sample=9350 PALS experiment=plasma case=Stage1 control=Control
2020-02-04 14:41:30.489 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.60 n_sample=6233 PALS experiment=plasma case=Stage1 control=Control
2020-02-04 14:44:26.588 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.80 n_sample=3116 PALS experiment=plasma case=Stage1 control=Control


#### Run on CSF samples

In [17]:
# CSF experiment: Stage_2 samples compared against Control.
data_source = ds_csf
experiment_name = 'csf'
case = 'Stage_2'
control = 'Control'
# Derive the p-value column name from the labels above so the two cannot drift
# apart; this evaluates to 'PiMP_KEGG Stage_2/Control comb_p'.
significant_column = 'PiMP_KEGG {}/{} comb_p'.format(case, control)

In [18]:
# Run the resampling experiment with the parameters set in the previous cell
# and file the outcome under the experiment's name.
res = run_resample_experiment(
    experiment_name,
    data_source,
    case,
    control,
    prob_missing_peaks,
    significant_column,
    num_iterations,
    plage_weight,
    hg_weight,
    gsea_resamples,
    gsea_ranking_method,
    parallel=parallel,
)
results[experiment_name] = res

2020-02-04 14:46:14.617 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.20 n_sample=6523 PALS experiment=csf case=Stage_2 control=Control
2020-02-04 14:50:15.303 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.40 n_sample=4892 PALS experiment=csf case=Stage_2 control=Control
2020-02-04 14:53:33.147 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.60 n_sample=3261 PALS experiment=csf case=Stage_2 control=Control
2020-02-04 14:56:01.870 | INFO     | pals.evaluation:run_resample_experiment:298 - prob_missing_peaks=0.80 n_sample=1630 PALS experiment=csf case=Stage_2 control=Control


#### Save Results

In [19]:
# Write the results of both experiments to disk. The path is relative to the
# notebook's working directory.
save_obj(results, os.path.join('test_data', 'HAT', 'HAT_results.p'))