## Apply enrichment method

This notebook plugs in other gene set enrichment methods to demonstrate that our method, SOPHIE, can be inserted into different pipelines and work with other methods

In [1]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import pickle

from rpy2.robjects import pandas2ri
pandas2ri.activate()

from ponyo import utils
from generic_expression_patterns_modules import ranking

np.random.seed(123)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# Read in config variables
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))

config_filename = os.path.abspath(
    os.path.join(base_dir, "configs", "config_human_general.tsv")
)

params = utils.read_config(config_filename)

In [3]:
# Load params
local_dir = params["local_dir"]
project_id = params['project_id']
statistic = params['gsea_statistic']
hallmark_DB_filename = params["pathway_DB_filename"]
num_runs = params["num_simulated"]
dataset_name = params['dataset_name']

# TO DO:
# What are your choices of methods to use?
enrichment_method = "ROAST"

In [4]:
# Load DE stats directory
DE_stats_dir = os.path.join(local_dir, "DE_stats")

# Template experiment gene expression
template_expression_filename = os.path.join(base_dir, dataset_name, params["processed_template_filename"])

# Template experiment DE stats
template_DE_stats_filename = os.path.join(
    DE_stats_dir,
    f"DE_stats_template_data_{project_id}_real.txt"
)

# Metadata file with sample grouping to define comparison
metadata_filename = os.path.join(
    base_dir,
    dataset_name,
    "data",
    "metadata",
    f"{project_id}_groups.tsv"
)

## Enrichment methods
* [ROAST](https://pubmed.ncbi.nlm.nih.gov/20610611/) is available in limma
* [CAMERA](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3458527/) is available in limma
* [GSVA](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3618321/) its own bioconductor package
* [ORA]() is available in PathwayStudios or David

TO DO: Write about each method

In [5]:
# Define function
# ORA works on list of DE <<-- how to download and install???

# ROAST, CAMERA <<- what input???

# Process data using voom

In [6]:
# Create "<local_dir>/GSEA_stats/" subdirectory
os.makedirs(os.path.join(local_dir, "EA_stats"), exist_ok=True)

In [7]:
# Load pathway data
hallmark_DB_filename = params["pathway_DB_filename"]

**Apply enrichment to template experiment**

See supplementary tables: https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbz158/5722384

In [8]:
%%R -i base_dir -i local_dir -i project_id -i template_expression_filename -i hallmark_DB_filename -i metadata_filename -i enrichment_method -o template_enriched_pathways

source(paste0(base_dir, '/generic_expression_patterns_modules/other_enrichment_methods.R'))

out_filename <- paste(local_dir, 
                      "EA_stats/",
                      enrichment_method,
                      "_stats_template_data_",
                      project_id,
                      "_real.txt", 
                      sep = "")

if (enrichment_method == "GSVA"){
    
    template_enriched_pathways <- find_enriched_pathways_GSVA(template_expression_filename, hallmark_DB_filename)
}
else if (enrichment_method == "ROAST"){
    
    template_enriched_pathways <- find_enriched_pathways_ROAST(template_expression_filename, metadata_filename, hallmark_DB_filename)
}
write.table(as.data.frame(template_enriched_pathways), file = out_filename, row.names = F, sep = "\t")


Attaching package: ‘edgeR’



    DGEList




[1] "Checking sample ordering..."
[1] TRUE
 [1] 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2
Levels: 1 2


  res = PandasDataFrame.from_items(items)


In [9]:
# Quick check
print(template_enriched_pathways.shape)
template_enriched_pathways

(50, 8)


Unnamed: 0,NGenes,PropDown,PropUp,Direction,PValue,FDR,PValue.Mixed,FDR.Mixed
0,197,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
1,197,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
2,196,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
3,195,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
4,195,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
5,193,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
6,193,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
7,193,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
8,192,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999
9,191,1.0,0.0,Down,0.000999,0.000999,0.000999,0.000999


**Apply enrichment to simulated experiments**

In [10]:
## TO DO: EA stats not outputting in correct location for some reason.

In [15]:
%%R -i project_id -i local_dir -i hallmark_DB_filename -i metadata_filename -i num_runs -i base_dir -i enrichment_method

source(paste0(base_dir, '/generic_expression_patterns_modules/other_enrichment_methods.R'))

for (i in 0:(num_runs-1)){
    print(i)
    simulated_expression_filename <- paste(local_dir, 
                                           "pseudo_experiment/selected_simulated_data_",
                                           project_id,
                                           "_", 
                                           i,
                                           "_processed.txt",
                                           sep = "")

    out_filename <- paste(local_dir,
                          "EA_stats/",
                          enrichment_method,
                          "_stats_simulated_data_",
                          project_id,
                          "_",
                          i,
                          ".txt", 
                          sep = "")
    print(out_filename)
    
    if (enrichment_method == "GSVA"){
        enriched_pathways <- find_enriched_pathways_GSVA(simulated_expression_filename, hallmark_DB_filename) 
        write.table(as.data.frame(enriched_pathways), file = out_filename, row.names = F, sep = "\t")
        print("in GSVA")
    }
    else if (enrichment_method == "ROAST"){
        enriched_pathways <- find_enriched_pathways_ROAST(simulated_expression_filename, metadata_filename, hallmark_DB_filename) 
        write.table(as.data.frame(enriched_pathways), file = out_filename, row.names = F, sep = "\t")
        print("in ROAST")
    }
}

[1] 0
[1] "/home/alexandra/Documents/Data/Generic_expression_patterns/EA_stats/ROAST_stats_simulated_data_SRP012656_0.txt"
[1] "Checking sample ordering..."
[1] TRUE
 [1] 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2
Levels: 1 2
[1] "in ROAST"
[1] 1
[1] "/home/alexandra/Documents/Data/Generic_expression_patterns/EA_stats/ROAST_stats_simulated_data_SRP012656_1.txt"
[1] "Checking sample ordering..."
[1] TRUE
 [1] 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2
Levels: 1 2
[1] "in ROAST"
[1] 2
[1] "/home/alexandra/Documents/Data/Generic_expression_patterns/EA_stats/ROAST_stats_simulated_data_SRP012656_2.txt"
[1] "Checking sample ordering..."
[1] TRUE
 [1] 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2
Levels: 1 2
[1] "in ROAST"
[1] 3
[1] "/home/alexandra/Documents/Data/Generic_expression_patterns/EA_stats/ROAST_stats_simulated_data_SRP012656_3.txt"
[1] "Checking sample ordering..."
[1] TRUE
 [1] 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2 1 1 2 2
Levels: 1 2
[1] "in ROAST"
[1] 4
[1] "/home

### TO DO:
Validate results. Looks like all pathways have the same statistic for ROAST

## Format enrichment output

Each method yields a different output format so we will need to format the data before we can rank and summarize it

In [12]:
%%R -i hallmark_DB_filename -o hallmark_DB_names
library("GSA")

hallmark_DB <- GSA.read.gmt(hallmark_DB_filename)

hallmark_DB_names <- as.data.frame(hallmark_DB$geneset.names)





12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [13]:
ranking.format_enrichment_output(
    local_dir, 
    project_id, 
    enrichment_method, 
    hallmark_DB_names,
    num_runs
)

## Rank pathways

In [14]:
analysis_type = "GSEA"
col_to_rank_pathways = "ES"

template_GSEA_stats_filename = os.path.join(
    local_dir,
    "EA_stats",
    f"{enrichment_method}_stats_template_data_{project_id}_real.txt"    
)
template_GSEA_stats, simulated_GSEA_summary_stats = ranking.process_and_rank_genes_pathways(
    template_GSEA_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_pathways,
)

KeyError: 'ES'

## Pathway summary table

In [None]:
# Create intermediate file: "<local_dir>/gene_summary_table_<col_to_rank_pathways>.tsv"
summary_pathway_ranks = ranking.generate_summary_table(
    template_GSEA_stats_filename,
    template_GSEA_stats,
    simulated_GSEA_summary_stats,
    col_to_rank_pathways,
    local_dir,
    'pathway',
    params
)

summary_pathway_ranks.sort_values(by="Z score", ascending=False).head()

In [None]:
# Create `pathway_summary_filename`
summary_pathway_ranks.to_csv(pathway_summary_filename, sep='\t')