# Gene Overrepresentation Analysis

### Setup

#### Libraries

In [1]:
library(dplyr)
library(ggVennDiagram)
library(ggplot2)
library(ggsci)
library(fgsea)
library(ComplexHeatmap)
library(matrixStats)
library(ggpubr)
library(msigdbr)
library(patchwork)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: grid

ComplexHeatmap version 2.8.0
Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
Github page: https://github.com/jokergoo/ComplexHeatmap
Documentation: http://jokergoo.github.io/ComplexHeatmap-reference

If you use it in published research, please cite:
Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
  genomic data. Bioinformatics 2016.

The new InteractiveComplexHeatmap package can directly export static 
complex heatmaps into an interactive Shiny app with zero effort. Have a try!

This message can be suppressed by:
  suppressPackageStartupMessages(library(ComplexHeatmap))



Attaching package: ‘matrixStats’


The following object is masked from ‘package:dplyr’:

    count




#### Parameters

In [2]:
# Analysis type: `analysis` names the result set, `voi` is the model
# coefficient whose results are analysed.
# Alternative configuration (kept for reference):
#analysis<-"tumorStageContinuous"
#voi<-"stage"
analysis<-"condition"
voi<-"conditiontumor"

# Event information
out_obj_dir<-"../input/trex_objects"
flags_dir<-"../output/psi_flags"
# Event types kept throughout the notebook
events_used<-c("SE","A3","A5","MX","RI","AF","AL")
# Display order of the cancer codes
cancer_order_set<-c("SKCM","UVM","SARC","LAML","TGCT","UCEC","CESC","PRAD","READ","BLCA",
                "COAD","PAAD","CHOL","KIRP","KIRC","KICH","ACC","PCPG","STAD","LIHC",
                "MESO","LUSC","LUAD","BRCA","THCA","ESCA","HNSC")

# Input reference files 
gmapfile<-"../input/references/gencode.v37.primary_assembly.annotation.geneIDmap.tsv"
cell_file<-"../input/references/CellMarker_Augmented_2021.parsed.tsv"

# Cutoffs 
# Adjusted p-value below which an event is called significant
p.signif.event<-0.05
# Adjusted p-value below which a pathway is called significant
p.signif.pathway<-0.2
# Minimum absolute log2 fold change for an event to enter the per-coefficient test
min.lfc.gsea<-0.05

# Internal
fig_obj_dir<-paste0("../output/figure_objects/gora_",analysis)
# Coefficients tested in the per-coefficient analysis
vars<-c(voi,"impurity")
res.file<-paste0(out_obj_dir,"/tcga.",analysis,".res.lfcShrink.RDS")
# recursive = TRUE also creates missing parent directories; without it the call
# fails silently (warnings are suppressed) and later saveRDS() calls break.
dir.create(fig_obj_dir, showWarnings = FALSE, recursive = TRUE)
set.seed(7)

#### Functions

In [3]:
# GORA analysis function
# Over-representation test (fgsea::fora) of the genes in `d` against the gene
# sets in the global `hm_list`.
#   d     : table with a `gene_id` column; each gene is counted once
#   event : event type used to pick the background genes from the global
#           `gene_universe`
#   ...   : ignored
# Returns the table produced by fora().
fora_test<-function(d,event,...){
    genes<-unique(d$gene_id)
    # Pull the column out directly: this is a plain vector for both data.frame
    # and tibble inputs, whereas `x[rows,"gene_id"]` stays a one-column table
    # for tibbles. which() discards rows whose event type is NA.
    in_event<-which(gene_universe$event_type==event)
    gene_univ<-unique(gene_universe$gene_id[in_event])
    res<-fora(pathways = hm_list,
              genes = genes,
              universe = gene_univ,
              minSize = 15, 
              maxSize = 250) 
    return(res)
}

# Wrapper to test each event type 
# Restricts `res.shared` to one event type, runs fora_test() on its genes and
# returns the enrichment table with a significance call, a shortened pathway
# name and the percentage of each gene set that overlaps.
gora_test_event<-function(res.shared,event,...){

    # Rows of the requested event type, gene ids without the version suffix
    events_of_type <- filter(ungroup(res.shared),event_type==event)
    events_of_type <- mutate(events_of_type,gene_id = sub("\\..*","",gene_id))

    # Fold event/gene pairs into a list-column and test every nested table
    nested <- tidyr::nest(events_of_type,data = c(event_id,gene_id))
    nested <- mutate(nested,
                     gora_res = lapply(data,function(d,...){fora_test(d,event)}))

    # Expand the test results back into rows and add the reporting columns
    enrich <- nested %>%
                select(-data) %>%
                tidyr::unnest(gora_res) %>%
                ungroup() %>%
                mutate(significant=padj<p.signif.pathway,
                       pathway=sub("HALLMARK_","",pathway),
                       poverlap=round(overlap/size,2)*100)

    enrich
}

# Enrichment of events that are significant in several cancers.
#   ev.sum : per-event summary with `ncancers` and `ncancers_signif` columns
#   n      : minimum number of cancers in which an event must be significant
#   h      : not used inside this function
#   ...    : ignored
# Returns NULL (after a message) when no event is shared or fewer than two
# result rows remain. Otherwise returns, for every pathway significant in at
# least one event type, the lowest-padj row(s) per event type and pathway, with
# `pathway` and `event_type` converted to factors with ordered levels.
gora_shared_events<-function(ev.sum,n=2,h=6,...){
    shared_events<-ev.sum %>%
                   filter(ncancers_signif>=n)  %>%
                   select(-ncancers_signif,-ncancers) %>%
                   ungroup() %>%
                   distinct()

    if(nrow(shared_events)==0){
        message("No events are shared in ",n," cancers")
        return(NULL)
    }

    # One enrichment table per event type, stacked
    per_event<-lapply(events_used,function(evn,...){
                   gora_test_event(res.shared = shared_events,event = evn)
               })
    best_hits<-do.call(rbind,per_event) %>%
               group_by(pathway) %>%
               mutate(nsig=sum(significant)) %>%
               filter(nsig>=1) %>%
               group_by(event_type,pathway) %>%
               slice_min(padj)

    if(nrow(best_hits)<2){
        message("No significant pathways found")
        return(NULL)
    }

    # Pathway levels: most event types with a significant hit first
    pathway_rank<-best_hits %>%
                  select(-overlapGenes) %>%
                  group_by(pathway) %>%
                  summarize(n=sum(significant),s=sum(overlap)) %>%
                  arrange(desc(n))
    # Event-type levels: fewest significant pathways first
    event_rank<-best_hits %>%
                select(-overlapGenes) %>%
                group_by(event_type) %>%
                summarize(nsig=sum(significant)) %>%
                arrange(nsig)

    best_hits %>%
        mutate(pathway=factor(pathway,levels=pathway_rank$pathway),
               event_type=factor(event_type,levels=event_rank$event_type))
}

### Load data

In [4]:
# Read the results object at `res.file`, then keep the event types under study
trex.res.lfsh<-readRDS(res.file)
trex.res.lfsh<-filter(trex.res.lfsh,event_type %in% events_used)

In [5]:
# Read every flag table in `flags_dir` for each cancer present in the results
# and stack them into one table with `event_type` and `cancer` columns.
flags<-lapply(unique(trex.res.lfsh$cancer),function(can){
            # `pattern` is a regular expression, not a glob. A trailing "*" would
            # make the last letter optional (so "KIRP*" also picks up KIRC files,
            # "ESCA*" picks up CESC files); match the cancer code itself.
            list.files(flags_dir,pattern=can,full.names=TRUE) %>%
                lapply(.,function(fl,...){
                    # Event type = last "_"-separated token before "_psi" in the file name
                    data.table::fread(fl,data.table=FALSE) %>%
                        mutate(event_type=sub("_psi.*","",basename(fl)) %>% sub(".*_","",.),
                               cancer=can) 
                }) %>%
            do.call(rbind,.)
        }) %>% 
        do.call(rbind,.) %>%
        filter(event_type %in% events_used) 

In [6]:
# Attach the PSI flags to every result row. The keys are spelled out so that an
# additional shared column can never silently change the join.
trex.res.fg<-left_join(trex.res.lfsh,flags,
                       by=c("event_type","cancer","event_id"))
# Results for the variable of interest, restricted to events flagged as valid.
# A missing adjusted p-value counts as not significant (FALSE rather than NA).
trex.res.sig<-trex.res.fg %>%
              filter(exp_var==voi) %>%
              filter(event_flag=="valid") %>%
              mutate(significant=!is.na(padj) & padj<p.signif.event)

[1m[22mJoining, by = c("event_type", "cancer", "event_id")


In [7]:
# List the columns available in the filtered results table
names(trex.res.sig)

In [8]:
# Events called significant in at least one cancer
sevs<-trex.res.sig %>%
      filter(significant) %>%
      distinct(event_id)

# Per event: number of cancers with a result (`ncancers`) and number of cancers
# where it is significant (`ncancers_signif`).
# which() drops NA significance calls; indexing with a logical NA would add an
# NA entry that unique() then counts as one more cancer.
# .groups = "drop_last" keeps the result grouped by event_type and gene_id.
ev.sum<-trex.res.sig %>%
        filter(event_id %in% sevs$event_id) %>%
        group_by(event_type,gene_id,event_id) %>%
        summarize(ncancers=length(unique(cancer)),
                  ncancers_signif=length(unique(cancer[which(significant)])),
                  .groups="drop_last") %>%
        arrange(desc(ncancers)) %>%
        mutate(event_type=factor(event_type,levels=events_used))

[1m[22m`summarise()` has grouped output by 'event_type', 'gene_id'. You can override
using the `.groups` argument.


In [9]:
# Preview the first rows of the per-event summary
head(ev.sum,n=6)

event_type,gene_id,event_id,ncancers,ncancers_signif
<fct>,<chr>,<chr>,<int>,<int>
MX,ENSG00000001497.18,ENSG00000001497.18;MX:chrX:65531438-65532561:65532630-65534480:65531438-65533610:65533735-65534480:-,20,3
MX,ENSG00000001631.16,ENSG00000001631.16;MX:chr7:92242137-92244902:92245171-92245790:92242137-92245427:92245595-92245790:-,20,1
MX,ENSG00000003436.16,ENSG00000003436.16;MX:chr2:187503770-187513616:187513646-187554200:187503770-187520542:187520613-187554200:-,20,2
MX,ENSG00000003436.16,ENSG00000003436.16;MX:chr2:187503770-187513616:187513646-187554200:187503770-187529364:187529485-187554200:-,20,3
MX,ENSG00000003436.16,ENSG00000003436.16;MX:chr2:187503770-187520542:187520613-187554200:187503770-187529364:187529485-187554200:-,20,3
MX,ENSG00000003509.16,ENSG00000003509.16;MX:chr2:37231760-37232106:37232266-37237757:37231760-37236096:37236176-37237757:+,20,5


## Gene overrepresentation analysis

#### Defining gene sets

In [10]:
# Hallmark pathways
hm_gene_sets <- msigdbr(species = "human",category="H") 
# One row per (category, gene set, Ensembl gene)
hm_df <- as.data.frame(distinct(hm_gene_sets,gs_cat,gs_name,ensembl_gene))
# Named list: gene set name -> Ensembl gene ids
hm_list <- split(hm_df$ensembl_gene,hm_df$gs_name)

In [11]:
# Cell type signatures
# Symbol -> gene id map used to translate the marker symbols
genemap<-data.table::fread(gmapfile,data.table=FALSE)
# Marker file: column 1 is "<cell type>:<tissue>", column 2 a comma-separated
# list of gene symbols. The result has one row per (gene set, gene) for the
# bone marrow and muscle tissues only, labelled as category "C".
cell_genes<-read.table(cell_file,sep = "\t") %>%
            rename("gs_name"="V1",
                  "SYMBOL"="V2") %>%
            # strsplit() is vectorised: one character vector of symbols per row
            mutate(SYMBOL=strsplit(SYMBOL,",")) %>%
            group_by(gs_name) %>%
            tidyr::unnest(cols = c(SYMBOL)) %>%
            left_join(.,genemap,by="SYMBOL") %>%
            select(-GENENAME,-SYMBOL) %>%
            rename("ensembl_gene"="GENEID") %>%
            mutate(tissue=sub(".*:","",gs_name)) %>%
            filter(tissue %in% c('Bone Marrow','Skeletal Muscle','Muscle')) %>%
            select(-tissue) %>%
            # Abbreviate the tissue suffix only. The patterns are anchored so a
            # cell-type name that itself contains a tissue word is left intact.
            mutate(gs_name=sub("(^|:)Bone Marrow$","\\1BM",gs_name) %>%
                           sub("(^|:)Skeletal Muscle$","\\1SM",.) %>%
                           sub("(^|:)Muscle$","\\1M",.),
                   gs_cat="C") 

[1m[22mJoining, by = "SYMBOL"


In [12]:
# Preview the first two rows of the cell-type gene sets
head(cell_genes,n=2)

gs_name,ensembl_gene,gs_cat
<chr>,<chr>,<chr>
Dendritic cell:BM,ENSG00000019582,C
Dendritic cell:BM,ENSG00000167851,C


In [13]:
# Background genes per event type: every gene (version suffix removed) that has
# a result for the variable of interest
gene_universe <- trex.res.lfsh %>%
                   filter(exp_var==voi) %>%
                   mutate(gene_id = sub("\\..*","",gene_id)) %>%
                   distinct(event_type,gene_id) %>%
                   arrange(event_type)

In [14]:
# Cancers present in the results, in the predefined display order
cancer_order<-cancer_order_set[is.element(cancer_order_set,trex.res.lfsh$cancer)]

### Analysis of tumour events shared across cancers

In [15]:
# Sharing levels to test: every observed count of significant cancers from 2 to 14
sizes<-sort(unique(with(ev.sum,ncancers_signif[ncancers_signif>=2 & ncancers_signif<=14])))
for(nc in sizes){
    message("GORA of events shared in ",nc," cancer types...")
    gres<-gora_shared_events(ev.sum,n=nc,h=6)
    # A NULL result ends the scan over the remaining, larger sharing levels
    if(is.null(gres)){
        message("Maxed out")
        break
    }
    message("Storing results object...")
    outfile<-paste0(fig_obj_dir,"/res.",analysis,".shared.",nc,".cancers.RDS")
    saveRDS(gres,file = outfile)
}

GORA of events shared in 2 cancer types...

Storing results object...

GORA of events shared in 3 cancer types...

Storing results object...

GORA of events shared in 4 cancer types...

Storing results object...

GORA of events shared in 5 cancer types...

Storing results object...

GORA of events shared in 6 cancer types...

Storing results object...

GORA of events shared in 7 cancer types...

Storing results object...

GORA of events shared in 8 cancer types...

Storing results object...

GORA of events shared in 9 cancer types...

Storing results object...

GORA of events shared in 10 cancer types...

Storing results object...

GORA of events shared in 11 cancer types...

Storing results object...

GORA of events shared in 12 cancer types...

Storing results object...

GORA of events shared in 13 cancer types...

Storing results object...

GORA of events shared in 14 cancer types...

Storing results object...



### Analysis per coefficient and event type

Enrichment is tested for every covariate and event type, using only events that pass the thresholds defined in the input parameters and are flagged as valid PSIs.

In [16]:
# Stack the cell-type gene sets on top of the hallmark gene sets.
# `hm_df` is overwritten, so running this cell again appends `cell_genes` once more.
hm_df <- rbind(cell_genes,hm_df)

In [17]:
# Enrichment per gene-set category, coefficient and event type.
# Results are stored in `gora_res` under the key "<category>_<variable>_<event>".
gora_res<-list()
for(cat in c("H","C")){

    # fora_test() reads the global `hm_list`, so it is rebuilt for each category
    cat_df <- hm_df %>% filter(gs_cat==cat)
    hm_list <- split(x = cat_df$ensembl_gene, f = cat_df$gs_name)
    message("Testing enrichment of ",cat," gene sets...")
    
    for(var in vars){
        
        # Filter variable data
        message("Using ",var," coefficients...")
        trex.res.fg.var<-trex.res.fg %>%
                          filter(exp_var==var)
    
        for(event in events_used){

            # Filter event data
            message("Analyzing ",event," events...")
            data.vev <- trex.res.fg.var %>% 
                          filter(event_type==event) %>%
                          mutate(gene_id = sub("\\..*","",gene_id))

            # Events entering the test: valid flag, significant, large enough effect
            events.ora <- data.vev %>%
                          filter(event_flag=="valid",
                                  padj<p.signif.event,
                                  abs(log2FoldChange)>=min.lfc.gsea) 

            # One test per cancer; fora_test() selects the background genes
            # for `event` itself, so no universe is prepared here
            ora <- events.ora %>% 
                    dplyr::distinct(cancer,event_id,gene_id) %>%
                    dplyr::group_by(cancer) %>%
                    tidyr::nest(data = c(event_id,gene_id)) %>%
                    mutate(gora_res = lapply(data,function(d,...){fora_test(d,event = event)})) 

            ora <- ora %>%
                    select(-data) %>%
                    tidyr::unnest(cols = gora_res) %>% 
                    mutate(significant = padj<p.signif.pathway)

            # Pathways significant in at least one cancer, fewest cancers first
            pord <- ora %>%
                      select(-overlapGenes) %>%
                      mutate(pathway=sub("HALLMARK_","",pathway)) %>%
                      group_by(pathway)%>%
                      summarize(n=sum(significant),
                                s=sum(overlap)) %>%
                      filter(n>=1) %>%
                      arrange(n) 
            # Pathways outside `pord` become NA in the factor and are dropped
            # by the final filter
            gora_res.plt <- ora %>%
                            mutate(pathway=factor(sub("HALLMARK_","",pathway),levels=pord$pathway),
                                   cancer=factor(cancer,levels=cancer_order),
                                   poverlap=round(overlap/size,2)*100,
                                   event_type=event) %>% 
                            filter(pathway%in%pord$pathway)
            gora_res[[paste(cat,var,event,sep="_")]]<-gora_res.plt
        }
    }
}
# Keep only the combinations that produced at least one row
gora_res<-gora_res[vapply(gora_res,nrow,integer(1))!=0]

Testing enrichment of H gene sets...

Using conditiontumor coefficients...

Analyzing SE events...

Analyzing A3 events...

Analyzing A5 events...

Analyzing MX events...

Analyzing RI events...

Analyzing AF events...

Analyzing AL events...

Using impurity coefficients...

Analyzing SE events...

Analyzing A3 events...

Analyzing A5 events...

Analyzing MX events...

Analyzing RI events...

Analyzing AF events...

Analyzing AL events...

Testing enrichment of C gene sets...

Using conditiontumor coefficients...

Analyzing SE events...

Analyzing A3 events...

Analyzing A5 events...

Analyzing MX events...

Analyzing RI events...

Analyzing AF events...

Analyzing AL events...

Using impurity coefficients...

Analyzing SE events...

Analyzing A3 events...

Analyzing A5 events...

Analyzing MX events...

Analyzing RI events...

Analyzing AF events...

Analyzing AL events...



In [18]:
# Write every stored result table to its own RDS file, named after its list key
for(vev in names(gora_res)){
    gora_res.plt<-gora_res[[vev]]
    saveRDS(object = gora_res.plt,
            file = paste0(fig_obj_dir,"/res.",analysis,".",vev,".RDS"))
}