From 67f2679822578d66bbe67166fb82f52e5903be98 Mon Sep 17 00:00:00 2001
From: Charlotte Soneson <charlottesoneson@gmail.com>
Date: Tue, 23 Apr 2024 22:33:51 +0200
Subject: [PATCH] Include pathways from MSigDB among supported feature
 collections

---
 NEWS.md                          |   1 +
 R/checkArgumentsDIANN.R          |   2 +-
 R/checkArgumentsFragPipe.R       |   2 +-
 R/checkArgumentsMaxQuant.R       |   2 +-
 R/checkArgumentsPDTMT.R          |   2 +-
 R/checkArgumentsSpectronaut.R    |   2 +-
 R/plotVolcano.R                  | 113 ++++++++++++++++---------------
 R/prepareFeatureCollections.R    |  63 ++++++++++++++---
 man/prepareFeatureCollections.Rd |  18 +++--
 9 files changed, 130 insertions(+), 75 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index bf5d913..8281966 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,6 +2,7 @@
 
 * Add details about DIA-NN command line to report
 * Add column with links to ComplexPortal query to link table
+* Include pathways from MSigDB among supported feature collections
 
 # einprot 0.9.3
 
diff --git a/R/checkArgumentsDIANN.R b/R/checkArgumentsDIANN.R
index 004714a..2d7468a 100644
--- a/R/checkArgumentsDIANN.R
+++ b/R/checkArgumentsDIANN.R
@@ -196,7 +196,7 @@
 
     ## Complexes
     .assertVector(x = includeFeatureCollections, type = "character",
-                  validValues = c("complexes", "GO"), allowNULL = TRUE)
+                  validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE)
     .assertVector(x = customComplexes, type = "list")
     if (length(customComplexes) > 0) {
         .assertVector(x = names(customComplexes), type = "character")
diff --git a/R/checkArgumentsFragPipe.R b/R/checkArgumentsFragPipe.R
index 3c91142..355bf1f 100755
--- a/R/checkArgumentsFragPipe.R
+++ b/R/checkArgumentsFragPipe.R
@@ -201,7 +201,7 @@
 
     ## Complexes
     .assertVector(x = includeFeatureCollections, type = "character",
-                  validValues = c("complexes", "GO"), allowNULL = TRUE)
+                  validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE)
     .assertVector(x = customComplexes, type = "list")
     if (length(customComplexes) > 0) {
         .assertVector(x = names(customComplexes), type = "character")
diff --git a/R/checkArgumentsMaxQuant.R b/R/checkArgumentsMaxQuant.R
index 735c186..2c20f4c 100644
--- a/R/checkArgumentsMaxQuant.R
+++ b/R/checkArgumentsMaxQuant.R
@@ -186,7 +186,7 @@
 
     ## Complexes
     .assertVector(x = includeFeatureCollections, type = "character",
-                  validValues = c("complexes", "GO"), allowNULL = TRUE)
+                  validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE)
     .assertVector(x = customComplexes, type = "list")
     if (length(customComplexes) > 0) {
         .assertVector(x = names(customComplexes), type = "character")
diff --git a/R/checkArgumentsPDTMT.R b/R/checkArgumentsPDTMT.R
index 0f4d8cd..1474a65 100644
--- a/R/checkArgumentsPDTMT.R
+++ b/R/checkArgumentsPDTMT.R
@@ -209,7 +209,7 @@
 
     ## Complexes
     .assertVector(x = includeFeatureCollections, type = "character",
-                  validValues = c("complexes", "GO"), allowNULL = TRUE)
+                  validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE)
     .assertVector(x = customComplexes, type = "list")
     if (length(customComplexes) > 0) {
         .assertVector(x = names(customComplexes), type = "character")
diff --git a/R/checkArgumentsSpectronaut.R b/R/checkArgumentsSpectronaut.R
index 2bfa4ef..2417cd6 100644
--- a/R/checkArgumentsSpectronaut.R
+++ b/R/checkArgumentsSpectronaut.R
@@ -191,7 +191,7 @@
 
     ## Complexes
     .assertVector(x = includeFeatureCollections, type = "character",
-                  validValues = c("complexes", "GO"), allowNULL = TRUE)
+                  validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE)
     .assertVector(x = customComplexes, type = "list")
     if (length(customComplexes) > 0) {
         .assertVector(x = names(customComplexes), type = "character")
diff --git a/R/plotVolcano.R b/R/plotVolcano.R
index 0a32159..8314459 100644
--- a/R/plotVolcano.R
+++ b/R/plotVolcano.R
@@ -705,69 +705,72 @@ plotVolcano <- function(sce, res, testType, xv = NULL, yv = NULL, xvma = NULL,
     }
 
     ## -------------------------------------------------------------------------
-    ## Create a volcano plot for each significantly enriched complex
+    ## Create a volcano plot for each significantly enriched complex/GO/pathway
     ## -------------------------------------------------------------------------
-    if ("complexes" %in% names(featureCollections)) {
-        ## Find significant complexes
-        idx <- which(
-            mcols(featureCollections$complexes)[, paste0(comparisonString,
-                                                         "_FDR")] <
-                complexFDRThr &
-                mcols(featureCollections$complexes)[, paste0(comparisonString,
-                                                             "_NGenes")] > 1
-        )
-        tmpcomplx <- mcols(featureCollections$complexes)[idx, , drop = FALSE]
-        tmpcomplx <- tmpcomplx[order(tmpcomplx[paste0(comparisonString,
-                                                      "_PValue")]), ,
-                               drop = FALSE]
-        cplxs <- rownames(tmpcomplx)
-        cplxs <- cplxs[seq_len(min(length(cplxs), maxNbrComplexesToPlot))]
+    for (ftype in c("complexes", "GO", "pathways")) {
+        if (ftype %in% names(featureCollections)) {
+            ## Find significant complexes
+            idx <- which(
+                mcols(featureCollections[[ftype]])[, paste0(comparisonString,
+                                                            "_FDR")] <
+                    complexFDRThr &
+                    mcols(featureCollections[[ftype]])[, paste0(comparisonString,
+                                                                "_NGenes")] > 1
+            )
+            tmpcomplx <- mcols(featureCollections[[ftype]])[idx, , drop = FALSE]
+            tmpcomplx <- tmpcomplx[order(tmpcomplx[paste0(comparisonString,
+                                                          "_PValue")]), ,
+                                   drop = FALSE]
+            cplxs <- rownames(tmpcomplx)
+            cplxs <- cplxs[seq_len(min(length(cplxs), maxNbrComplexesToPlot))]
 
-        if (length(cplxs) > 0 && !is.null(baseFileName)) {
-            grDevices::pdf(paste0(baseFileName, "_complexes.pdf"),
-                           width = 10.5, height = 7.5)
-            for (cplx in cplxs) {
-                prs <- featureCollections$complexes[[cplx]]
-                cplxpval <- signif(mcols(
-                    featureCollections$complexes)[cplx, paste0(comparisonString,
-                                                               "_PValue")],
-                    digits = 3)
-                cplxfdr <- signif(mcols(
-                    featureCollections$complexes)[cplx, paste0(comparisonString,
-                                                               "_FDR")],
-                    digits = 3)
-                if (length(intersect(prs, res$pid)) > 1) {
-                    gg <- ggbase +
-                        ggplot2::geom_point(
-                            fill = "lightgrey", color = "grey",
-                            pch = 21, size = 1.5) +
-                        ggplot2::geom_point(
-                            data = res %>%
-                                dplyr::filter(.data$pid %in% prs),
-                            fill = "red", color = "grey", pch = 21,
-                            size = 1.5) +
-                        ggrepel::geom_text_repel(
-                            data = res %>%
-                                dplyr::filter(.data$pid %in% prs),
-                            aes(label = .data$pid), max.overlaps = Inf,
-                            size = 4,
-                            min.segment.length = 0, force = 1) +
-                        ggplot2::labs(caption = paste0(cplx, ", PValue = ",
-                                                       cplxpval,
-                                                       ", FDR = ", cplxfdr))
-                    print(gg)
+            if (length(cplxs) > 0 && !is.null(baseFileName)) {
+                grDevices::pdf(paste0(baseFileName, "_", ftype, ".pdf"),
+                               width = 10.5, height = 7.5)
+                for (cplx in cplxs) {
+                    prs <- featureCollections[[ftype]][[cplx]]
+                    cplxpval <- signif(mcols(
+                        featureCollections[[ftype]])[cplx, paste0(comparisonString,
+                                                                  "_PValue")],
+                        digits = 3)
+                    cplxfdr <- signif(mcols(
+                        featureCollections[[ftype]])[cplx, paste0(comparisonString,
+                                                                  "_FDR")],
+                        digits = 3)
+                    if (length(intersect(prs, res$pid)) > 1) {
+                        gg <- ggbase +
+                            ggplot2::geom_point(
+                                fill = "lightgrey", color = "grey",
+                                pch = 21, size = 1.5) +
+                            ggplot2::geom_point(
+                                data = res %>%
+                                    dplyr::filter(.data$pid %in% prs),
+                                fill = "red", color = "grey", pch = 21,
+                                size = 1.5) +
+                            ggrepel::geom_text_repel(
+                                data = res %>%
+                                    dplyr::filter(.data$pid %in% prs),
+                                aes(label = .data$pid), max.overlaps = Inf,
+                                size = 4,
+                                min.segment.length = 0, force = 1) +
+                            ggplot2::labs(caption = paste0(cplx, ", PValue = ",
+                                                           cplxpval,
+                                                           ", FDR = ", cplxfdr))
+                        print(gg)
 
-                    ## Bar plot
-                    for (acp in setdiff(abundanceColPat, "")) {
-                        print(.complexBarPlot(
-                            res = res, prs = prs, sce = sce, cplx = cplx,
-                            colpat = acp, groupmap = groupmap))
+                        ## Bar plot
+                        for (acp in setdiff(abundanceColPat, "")) {
+                            print(.complexBarPlot(
+                                res = res, prs = prs, sce = sce, cplx = cplx,
+                                colpat = acp, groupmap = groupmap))
+                        }
                     }
                 }
+                grDevices::dev.off()
             }
-            grDevices::dev.off()
         }
     }
+
     return(list(gg = ggtest, ggint = ggint,
                 ggma = ggma, ggwf = ggwf, ggbar = ggbar,
                 pidLabelVolcano = pidLabelVolcano))
diff --git a/R/prepareFeatureCollections.R b/R/prepareFeatureCollections.R
index 0539634..bdda28c 100644
--- a/R/prepareFeatureCollections.R
+++ b/R/prepareFeatureCollections.R
@@ -27,11 +27,14 @@
 #' Prepare feature collections for testing with camera
 #'
 #' Prepare feature collections for testing with \code{limma::camera}. The
-#' function maps the feature IDs in the collections (complexes or GO terms) to
-#' the values in the specified \code{idCol} column of \code{rowData(sce)},
-#' and subsequently replaces them with the corresponding row names of the
-#' \code{SummarizedExperiment} object. Feature sets with too few features
-#' (after the matching) are removed.
+#' function maps the feature IDs in the collections (complexes, GO terms
+#' or pathways) to the values in the specified \code{idCol} column of
+#' \code{rowData(sce)}, and subsequently replaces them with the corresponding
+#' row names of the \code{SummarizedExperiment} object. Feature sets with
+#' too few features (after the matching) are removed.
+#' Complexes are obtained from the database provided via `complexDbPath`.
+#' GO terms and pathways (BIOCARTA, KEGG, PID, REACTOME and WIKIPATHWAYS) are
+#' retrieved from `MSigDB` via the `msigdbr` package.
 #'
 #' @param sce A \code{SummarizedExperiment} object (or a derivative).
 #' @param idCol Character scalar, indicating which column in
@@ -39,10 +42,11 @@
 #'     feature collections (gene symbols).
 #' @param includeFeatureCollections Character vector indicating the types
 #'     of feature collections to prepare. Should be a subset of
-#'     \code{c("complexes", "GO")} or \code{NULL}.
+#'     \code{c("complexes", "GO", "pathways")} or \code{NULL}.
 #' @param complexDbPath Character scalar providing the path to the database
 #'     of complexes, generated using \code{makeComplexDB()} and serialized
-#'     to a .rds file.
+#'     to a .rds file. If `NULL`, the complex database provided with
+#'     einprot will be used.
 #' @param speciesInfo List with at least two entries (\code{species} and
 #'     \code{speciesCommon}), providing the species information. Typically
 #'     generated using \code{getSpeciesInfo()}.
@@ -95,7 +99,8 @@ prepareFeatureCollections <- function(sce, idCol, includeFeatureCollections,
     .assertScalar(x = idCol, type = "character",
                   validValues = colnames(SummarizedExperiment::rowData(sce)))
     .assertVector(x = includeFeatureCollections, type = "character",
-                  validValues = c("complexes", "GO"), allowNULL = TRUE)
+                  validValues = c("complexes", "GO", "pathways"),
+                  allowNULL = TRUE)
     .assertScalar(x = complexDbPath, type = "character", allowNULL = TRUE)
     if (is.null(complexDbPath) && "complexes" %in% includeFeatureCollections) {
         complexDbPath <- system.file(EINPROT_COMPLEXES_FILE,
@@ -208,5 +213,47 @@ prepareFeatureCollections <- function(sce, idCol, includeFeatureCollections,
         featureCollections$GO <- goannots
     }
 
+    ## -------------------------------------------------------------------------
+    ## Pathways
+    ## -------------------------------------------------------------------------
+    if ("pathways" %in% includeFeatureCollections &&
+        speciesInfo$species %in% getSupportedSpecies()$species) {
+        pws <- msigdbr::msigdbr(species = speciesInfo$species,
+                                category = "C2", subcategory = "CP:BIOCARTA") %>%
+            dplyr::select("gs_name", "gene_symbol") %>%
+            dplyr::bind_rows(
+                msigdbr::msigdbr(species = speciesInfo$species,
+                                 category = "C2", subcategory = "CP:KEGG") %>%
+                    dplyr::select("gs_name", "gene_symbol")
+            ) %>%
+            dplyr::bind_rows(
+                msigdbr::msigdbr(species = speciesInfo$species,
+                                 category = "C2", subcategory = "CP:PID") %>%
+                    dplyr::select("gs_name", "gene_symbol")
+            ) %>%
+            dplyr::bind_rows(
+                msigdbr::msigdbr(species = speciesInfo$species,
+                                 category = "C2", subcategory = "CP:REACTOME") %>%
+                    dplyr::select("gs_name", "gene_symbol")
+            ) %>%
+            dplyr::bind_rows(
+                msigdbr::msigdbr(species = speciesInfo$species,
+                                 category = "C2", subcategory = "CP:WIKIPATHWAYS") %>%
+                    dplyr::select("gs_name", "gene_symbol")
+            )
+        pws <- methods::as(lapply(split(pws, f = pws$gs_name),
+                                  function(w) unique(w$gene_symbol)),
+                           "CharacterList")
+        S4Vectors::mcols(pws)$genes <- vapply(pws, function(w)
+            gsub(pat, "\\1; ", paste(w, collapse = ";")), ""
+        )
+        S4Vectors::mcols(pws)$nGenes <- lengths(pws)
+        pws <- .replaceIdsInList(chl = pws, dfConv = dfGene,
+                                 currentIdCol = "genes",
+                                 newIdCol = "rowName", pat = pat)
+        pws <- pws[lengths(pws) >= minSizeToKeep]
+        featureCollections$pathways <- pws
+    }
+
     featureCollections
 }
diff --git a/man/prepareFeatureCollections.Rd b/man/prepareFeatureCollections.Rd
index 963ded3..9fa4b6e 100644
--- a/man/prepareFeatureCollections.Rd
+++ b/man/prepareFeatureCollections.Rd
@@ -24,11 +24,12 @@ feature collections (gene symbols).}
 
 \item{includeFeatureCollections}{Character vector indicating the types
 of feature collections to prepare. Should be a subset of
-\code{c("complexes", "GO")} or \code{NULL}.}
+\code{c("complexes", "GO", "pathways")} or \code{NULL}.}
 
 \item{complexDbPath}{Character scalar providing the path to the database
 of complexes, generated using \code{makeComplexDB()} and serialized
-to a .rds file.}
+to a .rds file. If `NULL`, the complex database provided with
+einprot will be used.}
 
 \item{speciesInfo}{List with at least two entries (\code{species} and
 \code{speciesCommon}), providing the species information. Typically
@@ -50,11 +51,14 @@ A list of \code{CharacterList}s (one for each feature collection).
 }
 \description{
 Prepare feature collections for testing with \code{limma::camera}. The
-function maps the feature IDs in the collections (complexes or GO terms) to
-the values in the specified \code{idCol} column of \code{rowData(sce)},
-and subsequently replaces them with the corresponding row names of the
-\code{SummarizedExperiment} object. Feature sets with too few features
-(after the matching) are removed.
+function maps the feature IDs in the collections (complexes, GO terms
+or pathways) to the values in the specified \code{idCol} column of
+\code{rowData(sce)}, and subsequently replaces them with the corresponding
+row names of the \code{SummarizedExperiment} object. Feature sets with
+too few features (after the matching) are removed.
+Complexes are obtained from the database provided via `complexDbPath`.
+GO terms and pathways (BIOCARTA, KEGG, PID, REACTOME and WIKIPATHWAYS) are
+retrieved from `MSigDB` via the `msigdbr` package.
 }
 \examples{
 sce <- readRDS(system.file("extdata", "mq_example", "1356_sce.rds",