## Apply enrichment method

This notebook plugs in other gene set enrichment methods to demonstrate that our method, SOPHIE, can be inserted into different pipelines and work with other methods.

In [14]:
# Load notebook extensions: `autoreload` for re-importing edited modules and
# `rpy2.ipython`, which provides the `%%R` cell magic used by the R cells below.
%load_ext autoreload
%load_ext rpy2.ipython
%autoreload 2

import os
import sys
import pandas as pd
import numpy as np
import pickle

from rpy2.robjects import pandas2ri
# Turn on rpy2's pandas conversion so objects exchanged with R via `-i` / `-o`
# can be handled as pandas objects on the Python side.
pandas2ri.activate()

from ponyo import utils
from generic_expression_patterns_modules import ranking

# Seed NumPy's global random number generator
np.random.seed(123)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [15]:
# Resolve the parent of the current working directory, build the path to the
# human general config file beneath it, and parse that file into `params`.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
config_filename = os.path.abspath(os.path.join(base_dir, "configs", "config_human_general.tsv"))
params = utils.read_config(config_filename)

In [16]:
# Load params
local_dir = params["local_dir"]
project_id = params["project_id"]
statistic = params["gsea_statistic"]
hallmark_DB_filename = params["pathway_DB_filename"]
num_runs = params["num_simulated"]
dataset_name = params["dataset_name"]

# Select enrichment method.
# Options: "GSEA", "GSVA", "ROAST", "CAMERA", "ORA"
# (the R cells in this notebook have branches for "GSVA", "ROAST", "CAMERA" and "ORA").
enrichment_method = "ORA"

# Column used to rank pathways for each enrichment method. Deriving the column
# from the method keeps the two settings from drifting apart when the method
# is changed.
rank_column_by_method = {
    "GSEA": "padj",
    "GSVA": "ES",
    "ROAST": "FDR",
    "CAMERA": "FDR",
    "ORA": "padj",
}

if enrichment_method not in rank_column_by_method:
    raise ValueError(
        f"Unsupported enrichment_method {enrichment_method!r}; "
        f"expected one of {sorted(rank_column_by_method)}"
    )

col_to_rank_pathways = rank_column_by_method[enrichment_method]

In [17]:
# Directory holding the differential expression statistics
DE_stats_dir = os.path.join(local_dir, "DE_stats")

# Gene expression data for the template experiment
template_expression_filename = os.path.join(
    base_dir,
    dataset_name,
    params["processed_template_filename"],
)

# DE statistics computed for the template experiment
template_DE_stats_filename = os.path.join(DE_stats_dir, f"DE_stats_template_data_{project_id}_real.txt")

# Sample grouping metadata that defines the comparison
metadata_filename = os.path.join(base_dir, dataset_name, "data", "metadata", f"{project_id}_groups.tsv")

## Enrichment methods
* [ROAST](https://pubmed.ncbi.nlm.nih.gov/20610611/) is available in limma
* [CAMERA](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3458527/) is available in limma
* [GSVA](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3618321/) is available as its own Bioconductor package
* ORA (over-representation analysis) is available in Pathway Studio or DAVID

TO DO: Write about each method

In [18]:
# Create "<local_dir>/GSA_stats/" subdirectory (no error if it already exists).
# The R cells below write their enrichment results into this directory.
os.makedirs(os.path.join(local_dir, "GSA_stats"), exist_ok=True)

In [19]:
# Load pathway data: path to the pathway database file from the config.
# NOTE(review): this repeats the assignment already made in the "Load params" cell.
hallmark_DB_filename = params["pathway_DB_filename"]

**Apply enrichment to template experiment**

See supplementary tables: https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbz158/5722384

In [22]:
%%R -i base_dir -i local_dir -i project_id -i template_expression_filename -i hallmark_DB_filename -i metadata_filename -i enrichment_method -o template_enriched_pathways

# Run the selected enrichment method on the template experiment and write the
# result to "<local_dir>/GSA_stats/<method>_stats_template_data_<project_id>_real.txt".
# The result is also exported to Python as `template_enriched_pathways`.

source(file.path(base_dir, "generic_expression_patterns_modules", "other_enrichment_methods.R"))

# file.path() inserts the separators itself, so the path is correct whether or
# not local_dir ends with a trailing slash.
out_filename <- file.path(
    local_dir,
    "GSA_stats",
    paste0(enrichment_method, "_stats_template_data_", project_id, "_real.txt")
)

# Keep `else` on the same line as the closing brace so the chain also parses
# when the code is evaluated at top level.
if (enrichment_method == "GSVA") {
    # GSVA is the only method called without the sample grouping metadata
    template_enriched_pathways <- find_enriched_pathways_GSVA(
        template_expression_filename,
        hallmark_DB_filename
    )
} else if (enrichment_method == "ROAST") {
    template_enriched_pathways <- find_enriched_pathways_ROAST(
        template_expression_filename,
        metadata_filename,
        hallmark_DB_filename
    )
} else if (enrichment_method == "CAMERA") {
    template_enriched_pathways <- find_enriched_pathways_CAMERA(
        template_expression_filename,
        metadata_filename,
        hallmark_DB_filename
    )
} else if (enrichment_method == "ORA") {
    template_enriched_pathways <- find_enriched_pathways_ORA(
        template_expression_filename,
        metadata_filename,
        hallmark_DB_filename
    )
} else {
    # Fail with a clear message instead of an "object not found" error below
    stop(paste0(
        "Unsupported enrichment_method '", enrichment_method,
        "'; expected one of: GSVA, ROAST, CAMERA, ORA"
    ))
}

write.table(
    as.data.frame(template_enriched_pathways),
    file = out_filename,
    row.names = FALSE,
    sep = "\t"
)




[1] "Checking sample ordering..."
[1] TRUE


In [21]:
# Quick check of the object exported by the R cell above (left disabled).
# NOTE(review): the recorded output of this cell is a NameError for
# `template_enriched_pathways`, i.e. the name was not defined in Python when
# the cell last ran; re-enable once the export from R succeeds.
#print(template_enriched_pathways.shape)
#template_enriched_pathways

NameError: name 'template_enriched_pathways' is not defined

**Apply enrichment to simulated experiments**

In [None]:
## TO DO: Check issues
## EA stats not outputting in correct location for some reason for GSVA.
## All stats are the same using ROAST
## ORA returning 0 pathways as significantly over-represented; note the error message comes from -o template_enriched_pathways since it's null

In [9]:
%%R -i project_id -i local_dir -i hallmark_DB_filename -i metadata_filename -i num_runs -i base_dir -i enrichment_method

# Run the selected enrichment method on every simulated experiment
# (runs 0 .. num_runs-1) and write one result file per run to
# "<local_dir>/GSA_stats/<method>_stats_simulated_data_<project_id>_<i>.txt".

source(file.path(base_dir, "generic_expression_patterns_modules", "other_enrichment_methods.R"))

# Validate once up front so an unsupported method fails loudly instead of
# looping over every run without writing any output.
supported_methods <- c("GSVA", "ROAST", "CAMERA", "ORA")
if (!(enrichment_method %in% supported_methods)) {
    stop(paste0(
        "Unsupported enrichment_method '", enrichment_method,
        "'; expected one of: ", paste(supported_methods, collapse = ", ")
    ))
}

# seq_len() gives an empty sequence when num_runs is 0, whereas
# 0:(num_runs-1) would iterate over 0 and -1. Subtracting 1L keeps the
# index an integer so it is formatted without scientific notation.
for (i in seq_len(num_runs) - 1L) {
    # file.path() inserts the separators itself, so the paths are correct
    # whether or not local_dir ends with a trailing slash.
    simulated_expression_filename <- file.path(
        local_dir,
        "pseudo_experiment",
        paste0("selected_simulated_data_", project_id, "_", i, "_processed.txt")
    )

    out_filename <- file.path(
        local_dir,
        "GSA_stats",
        paste0(enrichment_method, "_stats_simulated_data_", project_id, "_", i, ".txt")
    )

    if (enrichment_method == "GSVA") {
        # GSVA is the only method called without the sample grouping metadata
        enriched_pathways <- find_enriched_pathways_GSVA(
            simulated_expression_filename,
            hallmark_DB_filename
        )
    } else if (enrichment_method == "ROAST") {
        enriched_pathways <- find_enriched_pathways_ROAST(
            simulated_expression_filename,
            metadata_filename,
            hallmark_DB_filename
        )
    } else if (enrichment_method == "CAMERA") {
        enriched_pathways <- find_enriched_pathways_CAMERA(
            simulated_expression_filename,
            metadata_filename,
            hallmark_DB_filename
        )
    } else if (enrichment_method == "ORA") {
        enriched_pathways <- find_enriched_pathways_ORA(
            simulated_expression_filename,
            metadata_filename,
            hallmark_DB_filename
        )
    }

    # Writing and progress reporting are identical for every method
    write.table(
        as.data.frame(enriched_pathways),
        file = out_filename,
        row.names = FALSE,
        sep = "\t"
    )
    print(paste("in", enrichment_method))
}

-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)



-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)




-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESe


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)



-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)


-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)





[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "in ORA"
[1] "Checking sample ordering..."
[1] TRUE
[1] "

### TO DO:
Validate results. Looks like all pathways have the same statistic for ROAST

## Format enrichment output

Each method yields a different output format, so we will need to format the data before we can rank and summarize it.

In [10]:
%%R -i hallmark_DB_filename -o hallmark_DB_names
# Read the pathway database file and export its gene set names to Python
# as the data frame `hallmark_DB_names`.
library("GSA")

# Parse the pathway database file with the GSA package's gmt reader
hallmark_DB <- GSA.read.gmt(hallmark_DB_filename)

# Keep only the gene set names. The column name of the resulting data frame is
# derived from this expression, so renaming `hallmark_DB` would change it.
hallmark_DB_names <- as.data.frame(hallmark_DB$geneset.names)





12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849501
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [11]:
# Format the enrichment result files for the selected method so they can be
# ranked and summarized in the cells below.
# NOTE(review): presumably this rewrites the files under "<local_dir>/GSA_stats/"
# for the template and the `num_runs` simulated experiments — confirm in `ranking`.
ranking.format_enrichment_output(
    local_dir, 
    project_id, 
    enrichment_method, 
    hallmark_DB_names,
    num_runs
)

## Rank pathways

In [13]:
# Rank pathways in the template experiment and across the simulated experiments
# using the `col_to_rank_pathways` column.
analysis_type = "GSA"

# Template enrichment results written by the R cell above
template_GSEA_stats_filename = os.path.join(
    local_dir,
    "GSA_stats",
    f"{enrichment_method}_stats_template_data_{project_id}_real.txt"    
)
# NOTE(review): the recorded output of this cell is `KeyError: 'padj'`, so the
# ranking column was missing from the data when it last ran with ORA — confirm
# which columns the ORA output provides.
template_GSEA_stats, simulated_GSEA_summary_stats = ranking.process_and_rank_genes_pathways(
    template_GSEA_stats_filename,
    local_dir,
    num_runs,
    project_id,
    analysis_type,
    col_to_rank_pathways,
    enrichment_method
)

KeyError: 'padj'

## Pathway summary table

In [None]:
# Build the pathway summary table from the template and simulated ranking results.
# Create intermediate file: "<local_dir>/gene_summary_table_<col_to_rank_pathways>.tsv"
# NOTE(review): the file name above says "gene" although 'pathway' is passed
# below — confirm the actual output name in `ranking.generate_summary_table`.
summary_pathway_ranks = ranking.generate_summary_table(
    template_GSEA_stats_filename,
    template_GSEA_stats,
    simulated_GSEA_summary_stats,
    col_to_rank_pathways,
    local_dir,
    'pathway',
    params
)

# Show the five pathways with the largest "Z score"
summary_pathway_ranks.sort_values(by="Z score", ascending=False).head()

In [None]:
# Create `pathway_summary_filename`
# The name was never defined earlier in the notebook, so this cell raised a
# NameError on a fresh kernel. The file name includes the enrichment method so
# results from different methods do not overwrite each other.
# NOTE(review): output location chosen next to the other dataset files — confirm.
pathway_summary_filename = os.path.join(
    base_dir,
    dataset_name,
    f"generic_pathway_summary_{project_id}_{enrichment_method}.tsv"
)

summary_pathway_ranks.to_csv(pathway_summary_filename, sep='\t')