## Apply enrichment method

This notebook plugs in other gene set enrichment methods to demonstrate that our method, SOPHIE, can be inserted into different pipelines and work with other enrichment methods.

In [20]:
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import pickle

from rpy2.robjects import pandas2ri
pandas2ri.activate()

from ponyo import utils
from generic_expression_patterns_modules import ranking

np.random.seed(123)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
# Resolve the directory one level above the notebook's working directory,
# then parse the human general-analysis config stored under `configs/`.
parent_of_cwd = os.path.join(os.getcwd(), "../")
base_dir = os.path.abspath(parent_of_cwd)

config_path_parts = (base_dir, "configs", "config_human_general.tsv")
config_filename = os.path.abspath(os.path.join(*config_path_parts))

params = utils.read_config(config_filename)

In [3]:
# Pull the settings this notebook uses out of the parsed config,
# in the same order as the names on the left-hand side.
local_dir, project_id, statistic, hallmark_DB_filename, num_runs = (
    params[config_key]
    for config_key in (
        "local_dir",
        "project_id",
        "gsea_statistic",
        "pathway_DB_filename",
        "num_simulated",
    )
)

# TO DO:
# What are your choices of methods to use?
enrichment_method = "GSVA"

In [4]:
# Directory under local_dir that holds the DE stats files
DE_stats_dir = os.path.join(local_dir, "DE_stats")

# Gene expression file for the template experiment (path comes from the config)
template_expression_filename = params["processed_template_filename"]

# DE stats file for the template experiment, named after the project id
template_DE_stats_basename = f"DE_stats_template_data_{project_id}_real.txt"
template_DE_stats_filename = os.path.join(DE_stats_dir, template_DE_stats_basename)

## Enrichment methods
* [ROAST](https://pubmed.ncbi.nlm.nih.gov/20610611/) is available in limma
* [CAMERA](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3458527/) is available in limma
* [GSVA](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3618321/) is available as its own Bioconductor package
* ORA (over-representation analysis) is available in Pathway Studio or DAVID

TO DO: Write about each method

In [5]:
# Define function
# ORA works on list of DE <<-- how to download and install???

# ROAST, CAMERA <<- what input???

# Process data using voom

In [6]:
# Create "<local_dir>/EA_stats/" subdirectory, where the R cells below write
# the enrichment analysis output.
# exist_ok=True: do not raise if the directory is already there.
os.makedirs(os.path.join(local_dir, "EA_stats"), exist_ok=True)

In [7]:
# Load pathway data
# NOTE(review): this repeats the assignment already made in the "Load params"
# cell (same config key), so it is redundant but harmless.
hallmark_DB_filename = params["pathway_DB_filename"]

**Apply enrichment to template experiment**

See supplementary tables: https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbz158/5722384

In [8]:
%%R -i base_dir -i local_dir -i project_id -i template_expression_filename -i hallmark_DB_filename -i enrichment_method -o template_enriched_pathways

# Load the R helpers for the alternative enrichment methods.
# find_enriched_pathways_GSVA (called below) is expected to be defined there.
source(paste0(base_dir, '/generic_expression_patterns_modules/other_enrichment_methods.R'))

# Destination: <local_dir>/EA_stats/<method>_stats_template_data_<project_id>_real.txt
# file.path() inserts the separator itself, so the path is valid whether or
# not local_dir ends with a slash.
out_filename <- file.path(
    local_dir,
    "EA_stats",
    paste0(enrichment_method, "_stats_template_data_", project_id, "_real.txt")
)

if (enrichment_method == "GSVA") {
    # Resolve the template expression file against <base_dir>/human_general_analysis
    template_expression_path <- file.path(
        base_dir,
        "human_general_analysis",
        template_expression_filename
    )
    template_enriched_pathways <- find_enriched_pathways_GSVA(
        template_expression_path,
        hallmark_DB_filename
    )
} else {
    # Fail with a clear message instead of an "object not found" error from
    # write.table() when no branch has assigned template_enriched_pathways.
    stop(
        "Unsupported enrichment_method: '", enrichment_method,
        "'. This cell currently only implements 'GSVA'."
    )
}

write.table(
    as.data.frame(template_enriched_pathways),
    file = out_filename,
    row.names = FALSE,
    sep = "\t"
)




[1] "here"
Estimating GSVA scores for 50 gene sets.
Computing observed enrichment scores
Estimating ECDFs with Poisson kernels
Using parallel with 6 cores
  |                                                                              |                                                                      |   0%


  res = PandasDataFrame.from_items(items)


In [9]:
# Inspect the enrichment result exported from the R cell above:
# print its dimensions, then display the table itself
print(template_enriched_pathways.shape)
template_enriched_pathways

(50, 24)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24
0,-0.177589,-0.152554,-0.01247,-0.014787,0.325992,0.327002,0.050864,0.055933,0.156162,0.181439,...,-0.30628,-0.325204,0.127233,0.086722,-0.115175,-0.102359,-0.064773,-0.032049,-0.110016,-0.117527
1,0.580593,0.572238,-0.468968,-0.473175,-0.423948,-0.461453,-0.307227,-0.31755,0.182646,0.181667,...,0.488696,0.519054,-0.043653,-0.048665,-0.324918,-0.367858,0.256912,0.250587,-0.087133,-0.075696
2,0.199356,0.175913,-0.086424,-0.106702,-0.318971,-0.358907,0.048769,0.03386,-0.347229,-0.385512,...,-0.136656,-0.127084,0.319045,0.324769,0.079276,0.140143,0.319306,0.37036,0.051098,0.04568
3,0.129557,0.198386,-0.396201,-0.302683,0.089756,0.132144,-0.321932,-0.326543,0.188094,0.162738,...,0.26014,0.221021,0.01741,0.000288,0.279824,0.263493,-0.15122,-0.082793,-0.358318,-0.305144
4,-0.001787,0.046338,-0.2435,-0.250038,0.239842,0.256769,-0.180053,-0.114921,0.230885,0.279681,...,0.004008,-0.005443,0.273646,0.220974,-0.082674,-0.069519,-0.016431,-0.040709,-0.363079,-0.35606
5,0.085163,0.111781,-0.402276,-0.412405,0.117308,0.114318,0.149284,0.050673,0.222217,0.259361,...,-0.202745,-0.052204,0.300968,0.209854,-0.19671,-0.252021,0.181836,0.158318,-0.319183,-0.20044
6,0.371533,0.370646,-0.35122,-0.33207,-0.203986,-0.20365,-0.266735,-0.271368,0.272343,0.242142,...,-0.134075,-0.178138,0.269571,0.327266,-0.159751,-0.189887,0.257271,0.234359,-0.080849,-0.075691
7,-0.01715,0.056125,-0.146066,-0.145133,0.254396,0.273755,-0.101156,-0.117243,0.045502,-0.042034,...,-0.166344,-0.099867,0.086079,0.14155,-0.179895,-0.129958,0.100631,0.135138,0.058844,0.063873
8,0.023373,0.100393,-0.122864,-0.134463,0.256775,0.267582,-0.083461,-0.083977,0.147714,0.208907,...,-0.433618,-0.434164,0.234038,0.20082,-0.102424,-0.087456,0.160721,0.186635,0.005924,-0.046535
9,0.315451,0.30425,-0.300034,-0.262166,0.106882,0.094665,-0.237313,-0.317681,0.210623,0.222678,...,0.047641,0.06426,0.162469,0.115962,-0.02951,-0.034318,0.040793,0.004175,-0.278758,-0.221244


**Apply enrichment to simulated experiments**

In [None]:
## EA stats not outputting in correct location for some reason.

In [27]:
%%R -i project_id -i local_dir -i hallmark_DB_filename -i num_runs -i base_dir -i enrichment_method

# Load the R helpers for the alternative enrichment methods.
# find_enriched_pathways_GSVA (called below) is expected to be defined there.
source(paste0(base_dir, '/generic_expression_patterns_modules/other_enrichment_methods.R'))

# Fail loudly rather than silently writing nothing: later cells read one
# output file per simulated experiment.
if (enrichment_method != "GSVA") {
    stop(
        "Unsupported enrichment_method: '", enrichment_method,
        "'. This cell currently only implements 'GSVA'."
    )
}

# Simulated experiments are numbered 0 .. num_runs-1.
# seq_len() yields an empty sequence when num_runs is 0, whereas
# 0:(num_runs-1) would iterate over 0 and -1. Subtracting 1L keeps the index
# an integer so it is pasted into file names without decimal/scientific formatting.
for (i in seq_len(num_runs) - 1L) {
    print(i)

    # Input: <local_dir>/pseudo_experiment/selected_simulated_data_<project_id>_<i>_processed.txt
    simulated_expression_filename <- file.path(
        local_dir,
        "pseudo_experiment",
        paste0("selected_simulated_data_", project_id, "_", i, "_processed.txt")
    )

    # Output: <local_dir>/EA_stats/<method>_stats_simulated_data_<project_id>_<i>.txt
    # file.path() supplies the separators, so local_dir may or may not end with a slash.
    out_filename <- file.path(
        local_dir,
        "EA_stats",
        paste0(enrichment_method, "_stats_simulated_data_", project_id, "_", i, ".txt")
    )
    print(out_filename)

    enriched_pathways <- find_enriched_pathways_GSVA(
        simulated_expression_filename,
        hallmark_DB_filename
    )
    write.table(
        as.data.frame(enriched_pathways),
        file = out_filename,
        row.names = FALSE,
        sep = "\t"
    )
}

[1] 0
[1] "/home/alexandra/Documents/Data/Generic_expression_patterns/EA_stats/GSVA_stats_simulated_data_SRP061689_0.txt"
Estimating GSVA scores for 50 gene sets.
Computing observed enrichment scores
Estimating ECDFs with Poisson kernels
Using parallel with 6 cores
  |                                                                              |                                                                      |   0%


## Format enrichment output

Each method yields a different output format, so we need to reformat the data before we can rank and summarize it.

In [19]:
%%R -i hallmark_DB_filename -o hallmark_DB_names
# GSA provides GSA.read.gmt, used to parse the pathway database file
library("GSA")

# Read the pathway database from the .gmt file
# (the saved output of this cell shows GSA.read.gmt printing a running counter)
hallmark_DB <- GSA.read.gmt(hallmark_DB_filename)

# Keep only the gene set names, as a one-column data frame; the cell's
# `-o hallmark_DB_names` flag exports it back to Python
hallmark_DB_names <- as.data.frame(hallmark_DB$geneset.names)

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [22]:
# Reformat the enrichment output for this method and project.
# NOTE(review): the helper's implementation is not shown in this notebook. The
# saved traceback for this cell shows it reading
# <local_dir>/EA_stats/<method>_stats_simulated_data_<project_id>_<i>.txt, so the
# simulated-experiment cell must have finished writing those files first.
ranking.format_enrichment_output(
    local_dir, 
    project_id, 
    enrichment_method, 
    hallmark_DB_names,
    num_runs
)

FileNotFoundError: File b'/home/alexandra/Documents/Data/Generic_expression_patterns/EA_stats/GSVA_stats_simulated_data_SRP061689_0.txt' does not exist

## Rank pathways

In [None]:
# Settings handed to the ranking helper: the analysis type and the column
# used to rank pathways.
# NOTE(review): analysis_type is "GSEA" while this notebook writes its results
# under "EA_stats" with an enrichment_method prefix -- confirm that
# process_and_rank_genes_pathways looks for the simulated files there.
analysis_type = "GSEA"
col_to_rank_pathways = "ES"

# Enrichment stats for the template experiment, as written by the R cell above
template_stats_basename = f"{enrichment_method}_stats_template_data_{project_id}_real.txt"
template_GSEA_stats_filename = os.path.join(local_dir, "EA_stats", template_stats_basename)

# The helper returns a pair: template stats and simulated summary stats
ranking_results = ranking.process_and_rank_genes_pathways(
    template_GSEA_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_pathways,
)
template_GSEA_stats, simulated_GSEA_summary_stats = ranking_results

## Pathway summary table

In [None]:
# Build the pathway summary table from the template and simulated stats.
# Creates intermediate file: "<local_dir>/gene_summary_table_<col_to_rank_pathways>.tsv"
# NOTE(review): the file name above says "gene" although 'pathway' is passed
# below -- confirm the actual name against ranking.generate_summary_table.
summary_pathway_ranks = ranking.generate_summary_table(
    template_GSEA_stats_filename,
    template_GSEA_stats,
    simulated_GSEA_summary_stats,
    col_to_rank_pathways,
    local_dir,
    "pathway",
    params,
)

# Show the first rows after sorting by "Z score", largest first
pathways_by_z_score = summary_pathway_ranks.sort_values(by="Z score", ascending=False)
pathways_by_z_score.head()

In [None]:
# Create `pathway_summary_filename` and save the pathway summary table to it.
# The name has to be assigned here: nothing earlier in the notebook defines it,
# so writing to it without this assignment raises NameError.
# NOTE(review): the file name and location below are a chosen default --
# adjust if the project has a convention for summary outputs.
pathway_summary_filename = os.path.join(
    local_dir,
    f"generic_pathway_summary_{enrichment_method}_{project_id}.tsv"
)

summary_pathway_ranks.to_csv(pathway_summary_filename, sep='\t')