# Co-Expression Module Annotation

**Created**: 16 October 2021

## Environment

In [44]:
# Install the enrichment packages only when they are not already available.
# requireNamespace() checks for a package without attaching it.
if (!requireNamespace("clusterProfiler", quietly = TRUE)) {
    # rvcheck is pinned to 0.1.8 before clusterProfiler is installed.
    # NOTE(review): presumably newer rvcheck releases are incompatible with the
    # clusterProfiler build for this Bioconductor version -- confirm.
    # Calling through the namespace fails loudly when devtools is missing,
    # whereas require() only returns FALSE and lets the next call fail with
    # "could not find function".
    devtools::install_version("rvcheck", version = "0.1.8", repos = "http://cran.us.r-project.org")
    BiocManager::install("clusterProfiler")
}

if (!requireNamespace("ReactomePA", quietly = TRUE)) {
    BiocManager::install("ReactomePA")
}

if (!requireNamespace("org.Hs.eg.db", quietly = TRUE)) {
    BiocManager::install("org.Hs.eg.db")
}

'getOption("repos")' replaces Bioconductor standard repositories, see
'?repositories' for details

replacement repositories:
    CRAN: https://cran.r-project.org


Bioconductor version 3.12 (BiocManager 1.30.18), R 4.0.5 (2021-03-31)

Installing package(s) 'ReactomePA'

also installing the dependencies ‘reactome.db’, ‘graphite’


Old packages: 'blob', 'brew', 'broom', 'Cairo', 'car', 'caret', 'checkmate',
  'circlize', 'cli', 'clipr', 'clue', 'cluster', 'conquer', 'DBI', 'dbplyr',
  'DEoptimR', 'dequer', 'desc', 'doParallel', 'dplyr', 'e1071', 'energy',
  'exactRankTests', 'FNN', 'foreach', 'formatR', 'future', 'future.apply',
  'gdtools', 'gert', 'ggplot2', 'globals', 'gower', 'gplots', 'gtools',
  'haven', 'Hmisc', 'httr', 'igraph', 'ipred', 'km.ci', 'knitr', 'leiden',
  'lme4', 'maptools', 'MASS', 'Matrix', 'matrixStats', 'mclogit', 'mgcv',
  'mnormt', 'nlme', 'nloptr', 'openssl', 'parallelly', 'pbdZMQ', 'pcaPP',
  'polynom', 'pracma', 'processx', 'progressr', 'proxy', 'ps', 'psych'

In [47]:
# Attach the packages used throughout the notebook. tidyverse is attached
# first and the annotation packages afterwards; the dplyr verbs below are
# always called as dplyr::filter / dplyr::select / dplyr::mutate.
# NOTE(review): presumably the explicit dplyr:: prefix guards against masking
# by the packages attached later -- confirm.
library(tidyverse)
library(clusterProfiler)
library(ReactomePA)
library(org.Hs.eg.db)

# Keep strings as character vectors when data frames are created.
options(stringsAsFactors = FALSE)

# The relative path passed to source() below is resolved against this
# project directory.
setwd("~/eQTL_pQTL_Characterization/")

# Load the shared ggplot theme script (its contents are not visible here).
source("04_Expression/scripts/utils/ggplot_theme.R")

ReactomePA v1.34.0  For help: https://guangchuangyu.github.io/ReactomePA

If you use ReactomePA in published research, please cite:
Guangchuang Yu, Qing-Yu He. ReactomePA: an R/Bioconductor package for reactome pathway analysis and visualization. Molecular BioSystems 2016, 12(2):477-479



## Load Data

In [7]:
# Gene annotation table; later joined on `gene_id` to obtain `gene_name`.
# NOTE(review): read.table() is called with default arguments, so the column
# names depend on its header auto-detection -- confirm the file layout.
gene.info <- read.table(
    file = "/nfs/team282/data/gains_team282/Gene_info_864_20416.txt"
)

# Gene-to-module assignments (the `Gene` and `Module` columns are used below).
modules <- read.csv(
    file = "~/gains_team282/nikhil/expression/gene_expression/modules.csv"
)

# Module eigengenes; the first column of the file becomes the row names and
# the number of columns determines how many module names are generated.
eigengenes <- read.csv(
    file = "~/gains_team282/nikhil/expression/gene_expression/eigengenes.csv",
    row.names = 1
)

# Variance-explained table for the modules.
variance.explained <- read.csv(
    file = "~/gains_team282/nikhil/expression/gene_expression/variance.explained.csv"
)

## Setup

Set up Ensembl IDs and Entrez IDs for each module. Also identify the set of background genes.

In [19]:
# Build one annotation table per co-expression module.
#
# Module names are "Module_1" ... "Module_k", where k is the number of
# eigengene columns. seq_len() is used so that zero columns produce zero
# module names instead of the c(1, 0) sequence that 1:0 would give.
module.names <- paste0("Module_", seq_len(ncol(eigengenes)))

# Each list element is a data frame with columns Ensembl.ID, Gene.Name and
# ENTREZID for the genes assigned to that module.
module.list <- lapply(module.names, function(module.name) {

    # Genes of this module, annotated with their gene name. merge() is an
    # inner join here, so genes absent from gene.info are dropped.
    module.info <- modules %>%
        dplyr::filter(Module == module.name) %>%
        merge(., gene.info, by.x = "Gene", by.y = "gene_id") %>%
        dplyr::select(Ensembl.ID = Gene, Gene.Name = gene_name)

    # Translate Ensembl IDs to Entrez IDs; bitr()'s messages and warnings are
    # silenced.
    genes.map <- suppressMessages(suppressWarnings(
        bitr(module.info$Ensembl.ID, fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
    ))

    # Left join: genes without an Entrez mapping are kept with ENTREZID = NA.
    # An Ensembl ID with several Entrez IDs yields one row per mapping.
    merge(module.info, genes.map, by.x = "Ensembl.ID", by.y = "ENSEMBL", all.x = TRUE)
})
names(module.list) <- module.names

In [36]:
# Background gene sets for the enrichment tests.
# Ensembl background: every gene listed in the module assignment table.
universe.ensembl <- modules$Gene

# Entrez background: the distinct, non-missing Entrez IDs across all modules.
universe.entrez <- unique(unlist(
    lapply(module.list, function(module.info) {
        entrez.ids <- module.info$ENTREZID
        entrez.ids[!is.na(entrez.ids)]
    }),
    use.names = FALSE
))

In [70]:
# Stack the per-module tables into a single Ensembl ID / gene name / Entrez ID
# lookup table covering all modules.
full.gene.id.map <- do.call(rbind, module.list)

## GO Term Enrichment

I will test each module for enrichment of GO terms in the three GO ontologies: Cellular Component (CC), Molecular Function (MF), and Biological Process (BP).

In [55]:
# GO Cellular Component (CC) enrichment, run separately for every module.
# The per-module result tables are tagged with the module name (placed as the
# first column) and row-bound into one data frame.
go.cc.all <- do.call(rbind, lapply(names(module.list), function(module.name) {

    # Ensembl IDs of the module are tested against the Ensembl background.
    enrichment <- enrichGO(
        gene = module.list[[module.name]]$Ensembl.ID,
        OrgDb = org.Hs.eg.db,
        keyType = "ENSEMBL",
        ont = "CC",
        pAdjustMethod = "BH",
        pvalueCutoff = 0.01,
        qvalueCutoff = 0.05,
        universe = universe.ensembl,
        readable = TRUE
    )

    enrichment.table <- as.data.frame(enrichment)
    enrichment.table <- dplyr::mutate(enrichment.table, Module = module.name)
    dplyr::select(enrichment.table, Module, everything())
}))

No gene set have size > 10 ...

--> return NULL...

No gene set have size > 10 ...

--> return NULL...

No gene set have size > 10 ...

--> return NULL...



In [56]:
# GO Molecular Function (MF) enrichment, run separately for every module.
# The per-module result tables are tagged with the module name (placed as the
# first column) and row-bound into one data frame.
go.mf.all <- do.call(rbind, lapply(names(module.list), function(module.name) {

    # Ensembl IDs of the module are tested against the Ensembl background.
    enrichment <- enrichGO(
        gene = module.list[[module.name]]$Ensembl.ID,
        OrgDb = org.Hs.eg.db,
        keyType = "ENSEMBL",
        ont = "MF",
        pAdjustMethod = "BH",
        pvalueCutoff = 0.01,
        qvalueCutoff = 0.05,
        universe = universe.ensembl,
        readable = TRUE
    )

    enrichment.table <- as.data.frame(enrichment)
    enrichment.table <- dplyr::mutate(enrichment.table, Module = module.name)
    dplyr::select(enrichment.table, Module, everything())
}))

No gene set have size > 10 ...

--> return NULL...



In [57]:
# GO Biological Process (BP) enrichment, run separately for every module.
# The per-module result tables are tagged with the module name (placed as the
# first column) and row-bound into one data frame.
go.bp.all <- do.call(rbind, lapply(names(module.list), function(module.name) {

    # Ensembl IDs of the module are tested against the Ensembl background.
    enrichment <- enrichGO(
        gene = module.list[[module.name]]$Ensembl.ID,
        OrgDb = org.Hs.eg.db,
        keyType = "ENSEMBL",
        ont = "BP",
        pAdjustMethod = "BH",
        pvalueCutoff = 0.01,
        qvalueCutoff = 0.05,
        universe = universe.ensembl,
        readable = TRUE
    )

    enrichment.table <- as.data.frame(enrichment)
    enrichment.table <- dplyr::mutate(enrichment.table, Module = module.name)
    dplyr::select(enrichment.table, Module, everything())
}))

## KEGG Enrichment

In [59]:
# KEGG pathway enrichment, run separately for every module.
# KEGG is queried with Entrez IDs, so genes without an Entrez mapping are
# removed first and the Entrez background is used as the universe.
kegg.all <- do.call(rbind, lapply(names(module.list), function(module.name) {

    entrez.ids <- module.list[[module.name]]$ENTREZID
    entrez.ids <- entrez.ids[!is.na(entrez.ids)]

    enrichment <- enrichKEGG(
        gene = entrez.ids,
        organism = "hsa",
        keyType = "ncbi-geneid",
        pAdjustMethod = "BH",
        pvalueCutoff = 0.01,
        qvalueCutoff = 0.05,
        universe = universe.entrez
    )

    # Tag the result with its module and move that column to the front.
    enrichment.table <- as.data.frame(enrichment)
    enrichment.table <- dplyr::mutate(enrichment.table, Module = module.name)
    dplyr::select(enrichment.table, Module, everything())
}))

--> No gene can be mapped....

--> Expected input gene ID: 10327,5213,9365,2538,23205,51102

--> return NULL...

--> No gene can be mapped....

--> Expected input gene ID: 2182,2821,5213,8789,2180,7366

--> return NULL...



## Reactome Enrichment

In [60]:
# Reactome pathway enrichment, run separately for every module.
# enrichPathway() is given Entrez IDs, so genes without an Entrez mapping are
# removed first and the Entrez background is used as the universe.
reactome.all <- do.call(rbind, lapply(names(module.list), function(module.name) {

    entrez.ids <- module.list[[module.name]]$ENTREZID
    entrez.ids <- entrez.ids[!is.na(entrez.ids)]

    enrichment <- enrichPathway(
        gene = entrez.ids,
        organism = "human",
        pAdjustMethod = "BH",
        pvalueCutoff = 0.01,
        qvalueCutoff = 0.05,
        universe = universe.entrez,
        readable = TRUE
    )

    # Tag the result with its module and move that column to the front.
    enrichment.table <- as.data.frame(enrichment)
    enrichment.table <- dplyr::mutate(enrichment.table, Module = module.name)
    dplyr::select(enrichment.table, Module, everything())
}))

--> No gene can be mapped....

--> Expected input gene ID: 255488,130013,3921,51071,57119,55902

--> return NULL...

No gene set have size > 10 ...

--> return NULL...

No gene set have size > 10 ...

--> return NULL...

No gene set have size > 10 ...

--> return NULL...

--> No gene can be mapped....

--> Expected input gene ID: 5685,10048,54511,54498,54996,727

--> return NULL...



## Save Annotations

In [68]:
# Write one GO enrichment table to CSV.
#
# enrichment.table: data frame of enrichment results that carries a Module
#   column alongside ID, Description, GeneRatio, BgRatio, pvalue, p.adjust,
#   qvalue, geneID and Count.
# output.path: path of the CSV file to write.
#
# Columns are renamed to the export schema, rows with an adjusted p-value of
# 0.05 or more are dropped, and the file is written without row names.
write.go.annotations <- function(enrichment.table, output.path) {
    enrichment.table %>%
        dplyr::select(
            Module, ID, Description,
            Gene_Ratio = GeneRatio, Background_Ratio = BgRatio,
            P_Value = pvalue, Adjusted_P_Value = p.adjust, Q_Value = qvalue,
            Gene_ID = geneID, Count
        ) %>%
        dplyr::filter(Adjusted_P_Value < 0.05) %>%
        write.csv(., output.path, row.names = FALSE)
}

write.go.annotations(
    go.cc.all,
    "~/gains_team282/nikhil/expression/gene_expression_annotations/WGCNA_GO_Cellular_Component.csv"
)

write.go.annotations(
    go.bp.all,
    "~/gains_team282/nikhil/expression/gene_expression_annotations/WGCNA_GO_Biological_Process.csv"
)

write.go.annotations(
    go.mf.all,
    "~/gains_team282/nikhil/expression/gene_expression_annotations/WGCNA_GO_Molecular_Function.csv"
)

In [83]:
# Export the KEGG enrichment results with gene names in place of Entrez IDs.
#
# The geneID column holds "/"-separated Entrez IDs. Each ID is translated
# with the ID map of the module the row belongs to, using match() so that:
#   * the original gene order is preserved (merge() would re-sort by key),
#   * each Entrez ID yields exactly one name, keeping the list in step with
#     Count (merge() duplicates rows when an ID occurs more than once),
#   * names of genes assigned to other modules can never be picked up.
# An ID without a gene name is left as the Entrez ID rather than dropped.
kegg.gene.names <- vapply(seq_len(nrow(kegg.all)), function(i) {
    entrez.ids <- strsplit(kegg.all$geneID[i], "/", fixed = TRUE)[[1]]
    id.map <- module.list[[kegg.all$Module[i]]]

    gene.names <- id.map$Gene.Name[match(entrez.ids, id.map$ENTREZID)]
    missing.name <- is.na(gene.names)
    gene.names[missing.name] <- entrez.ids[missing.name]

    paste0(gene.names, collapse = "/")
}, character(1))

# Rename columns to the export schema, keep rows with an adjusted p-value
# below 0.05, and write the file without row names.
kegg.all %>%
    dplyr::mutate(geneID = kegg.gene.names) %>%
    dplyr::select(
        Module, ID, Description,
        Gene_Ratio = GeneRatio, Background_Ratio = BgRatio,
        P_Value = pvalue, Adjusted_P_Value = p.adjust, Q_Value = qvalue,
        Gene_ID = geneID, Count
    ) %>%
    dplyr::filter(Adjusted_P_Value < 0.05) %>%
    write.csv(
        .,
        "~/gains_team282/nikhil/expression/gene_expression_annotations/WGCNA_KEGG_Human.csv",
        row.names = FALSE
    )

In [85]:
# Export the Reactome enrichment results.
# Columns are renamed to the export schema, rows with an adjusted p-value of
# 0.05 or more are dropped, and the file is written without row names.
reactome.all %>%
    dplyr::select(
        Module, ID, Description,
        Gene_Ratio = GeneRatio, Background_Ratio = BgRatio,
        P_Value = pvalue, Adjusted_P_Value = p.adjust, Q_Value = qvalue,
        Gene_ID = geneID, Count
    ) %>%
    dplyr::filter(Adjusted_P_Value < 0.05) %>%
    write.csv(
        .,
        "~/gains_team282/nikhil/expression/gene_expression_annotations/WGCNA_Reactome.csv",
        row.names = FALSE
    )