From 67f2679822578d66bbe67166fb82f52e5903be98 Mon Sep 17 00:00:00 2001 From: Charlotte Soneson Date: Tue, 23 Apr 2024 22:33:51 +0200 Subject: [PATCH] Include pathways from MSigDB among supported feature collections --- NEWS.md | 1 + R/checkArgumentsDIANN.R | 2 +- R/checkArgumentsFragPipe.R | 2 +- R/checkArgumentsMaxQuant.R | 2 +- R/checkArgumentsPDTMT.R | 2 +- R/checkArgumentsSpectronaut.R | 2 +- R/plotVolcano.R | 113 ++++++++++++++++--------------- R/prepareFeatureCollections.R | 63 ++++++++++++++--- man/prepareFeatureCollections.Rd | 18 +++-- 9 files changed, 130 insertions(+), 75 deletions(-) diff --git a/NEWS.md b/NEWS.md index bf5d913..8281966 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,7 @@ * Add details about DIA-NN command line to report * Add column with links to ComplexPortal query to link table +* Include pathways from MSigDB among supported feature collections # einprot 0.9.3 diff --git a/R/checkArgumentsDIANN.R b/R/checkArgumentsDIANN.R index 004714a..2d7468a 100644 --- a/R/checkArgumentsDIANN.R +++ b/R/checkArgumentsDIANN.R @@ -196,7 +196,7 @@ ## Complexes .assertVector(x = includeFeatureCollections, type = "character", - validValues = c("complexes", "GO"), allowNULL = TRUE) + validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE) .assertVector(x = customComplexes, type = "list") if (length(customComplexes) > 0) { .assertVector(x = names(customComplexes), type = "character") diff --git a/R/checkArgumentsFragPipe.R b/R/checkArgumentsFragPipe.R index 3c91142..355bf1f 100755 --- a/R/checkArgumentsFragPipe.R +++ b/R/checkArgumentsFragPipe.R @@ -201,7 +201,7 @@ ## Complexes .assertVector(x = includeFeatureCollections, type = "character", - validValues = c("complexes", "GO"), allowNULL = TRUE) + validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE) .assertVector(x = customComplexes, type = "list") if (length(customComplexes) > 0) { .assertVector(x = names(customComplexes), type = "character") diff --git a/R/checkArgumentsMaxQuant.R b/R/checkArgumentsMaxQuant.R index 735c186..2c20f4c 100644 --- a/R/checkArgumentsMaxQuant.R +++ b/R/checkArgumentsMaxQuant.R @@ -186,7 +186,7 @@ ## Complexes .assertVector(x = includeFeatureCollections, type = "character", - validValues = c("complexes", "GO"), allowNULL = TRUE) + validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE) .assertVector(x = customComplexes, type = "list") if (length(customComplexes) > 0) { .assertVector(x = names(customComplexes), type = "character") diff --git a/R/checkArgumentsPDTMT.R b/R/checkArgumentsPDTMT.R index 0f4d8cd..1474a65 100644 --- a/R/checkArgumentsPDTMT.R +++ b/R/checkArgumentsPDTMT.R @@ -209,7 +209,7 @@ ## Complexes .assertVector(x = includeFeatureCollections, type = "character", - validValues = c("complexes", "GO"), allowNULL = TRUE) + validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE) .assertVector(x = customComplexes, type = "list") if (length(customComplexes) > 0) { .assertVector(x = names(customComplexes), type = "character") diff --git a/R/checkArgumentsSpectronaut.R b/R/checkArgumentsSpectronaut.R index 2bfa4ef..2417cd6 100644 --- a/R/checkArgumentsSpectronaut.R +++ b/R/checkArgumentsSpectronaut.R @@ -191,7 +191,7 @@ ## Complexes .assertVector(x = includeFeatureCollections, type = "character", - validValues = c("complexes", "GO"), allowNULL = TRUE) + validValues = c("complexes", "GO", "pathways"), allowNULL = TRUE) .assertVector(x = customComplexes, type = "list") if (length(customComplexes) > 0) { .assertVector(x = names(customComplexes), type = "character") diff --git a/R/plotVolcano.R b/R/plotVolcano.R index 0a32159..8314459 100644 --- a/R/plotVolcano.R +++ b/R/plotVolcano.R @@ -705,69 +705,72 @@ plotVolcano <- function(sce, res, testType, xv = NULL, yv = NULL, xvma = NULL, } ## ------------------------------------------------------------------------- - ## Create a volcano plot for each significantly enriched complex + ## Create a volcano plot for each significantly enriched complex/GO/pathway ## ------------------------------------------------------------------------- - if ("complexes" %in% names(featureCollections)) { - ## Find significant complexes - idx <- which( - mcols(featureCollections$complexes)[, paste0(comparisonString, - "_FDR")] < - complexFDRThr & - mcols(featureCollections$complexes)[, paste0(comparisonString, - "_NGenes")] > 1 - ) - tmpcomplx <- mcols(featureCollections$complexes)[idx, , drop = FALSE] - tmpcomplx <- tmpcomplx[order(tmpcomplx[paste0(comparisonString, - "_PValue")]), , - drop = FALSE] - cplxs <- rownames(tmpcomplx) - cplxs <- cplxs[seq_len(min(length(cplxs), maxNbrComplexesToPlot))] + for (ftype in c("complexes", "GO", "pathways")) { + if (ftype %in% names(featureCollections)) { + ## Find significant complexes + idx <- which( + mcols(featureCollections[[ftype]])[, paste0(comparisonString, + "_FDR")] < + complexFDRThr & + mcols(featureCollections[[ftype]])[, paste0(comparisonString, + "_NGenes")] > 1 + ) + tmpcomplx <- mcols(featureCollections[[ftype]])[idx, , drop = FALSE] + tmpcomplx <- tmpcomplx[order(tmpcomplx[paste0(comparisonString, + "_PValue")]), , + drop = FALSE] + cplxs <- rownames(tmpcomplx) + cplxs <- cplxs[seq_len(min(length(cplxs), maxNbrComplexesToPlot))] - if (length(cplxs) > 0 && !is.null(baseFileName)) { - grDevices::pdf(paste0(baseFileName, "_complexes.pdf"), - width = 10.5, height = 7.5) - for (cplx in cplxs) { - prs <- featureCollections$complexes[[cplx]] - cplxpval <- signif(mcols( - featureCollections$complexes)[cplx, paste0(comparisonString, - "_PValue")], - digits = 3) - cplxfdr <- signif(mcols( - featureCollections$complexes)[cplx, paste0(comparisonString, - "_FDR")], - digits = 3) - if (length(intersect(prs, res$pid)) > 1) { - gg <- ggbase + - ggplot2::geom_point( - fill = "lightgrey", color = "grey", - pch = 21, size = 1.5) + - ggplot2::geom_point( - data = res %>% - dplyr::filter(.data$pid %in% prs), - fill = "red", color = "grey", pch = 21, - size = 1.5) + - ggrepel::geom_text_repel( - data = res %>% - dplyr::filter(.data$pid %in% prs), - aes(label = .data$pid), max.overlaps = Inf, - size = 4, - min.segment.length = 0, force = 1) + - ggplot2::labs(caption = paste0(cplx, ", PValue = ", - cplxpval, - ", FDR = ", cplxfdr)) - print(gg) + if (length(cplxs) > 0 && !is.null(baseFileName)) { + grDevices::pdf(paste0(baseFileName, "_", ftype, ".pdf"), + width = 10.5, height = 7.5) + for (cplx in cplxs) { + prs <- featureCollections[[ftype]][[cplx]] + cplxpval <- signif(mcols( + featureCollections[[ftype]])[cplx, paste0(comparisonString, + "_PValue")], + digits = 3) + cplxfdr <- signif(mcols( + featureCollections[[ftype]])[cplx, paste0(comparisonString, + "_FDR")], + digits = 3) + if (length(intersect(prs, res$pid)) > 1) { + gg <- ggbase + + ggplot2::geom_point( + fill = "lightgrey", color = "grey", + pch = 21, size = 1.5) + + ggplot2::geom_point( + data = res %>% + dplyr::filter(.data$pid %in% prs), + fill = "red", color = "grey", pch = 21, + size = 1.5) + + ggrepel::geom_text_repel( + data = res %>% + dplyr::filter(.data$pid %in% prs), + aes(label = .data$pid), max.overlaps = Inf, + size = 4, + min.segment.length = 0, force = 1) + + ggplot2::labs(caption = paste0(cplx, ", PValue = ", + cplxpval, + ", FDR = ", cplxfdr)) + print(gg) - ## Bar plot - for (acp in setdiff(abundanceColPat, "")) { - print(.complexBarPlot( - res = res, prs = prs, sce = sce, cplx = cplx, - colpat = acp, groupmap = groupmap)) + ## Bar plot + for (acp in setdiff(abundanceColPat, "")) { + print(.complexBarPlot( + res = res, prs = prs, sce = sce, cplx = cplx, + colpat = acp, groupmap = groupmap)) + } } } + grDevices::dev.off() } - grDevices::dev.off() } } + return(list(gg = ggtest, ggint = ggint, ggma = ggma, ggwf = ggwf, ggbar = ggbar, pidLabelVolcano = pidLabelVolcano)) diff --git a/R/prepareFeatureCollections.R b/R/prepareFeatureCollections.R index 0539634..bdda28c 100644 --- a/R/prepareFeatureCollections.R +++ b/R/prepareFeatureCollections.R @@ -27,11 +27,14 @@ #' Prepare feature collections for testing with camera #' #' Prepare feature collections for testing with \code{limma::camera}. The -#' function maps the feature IDs in the collections (complexes or GO terms) to -#' the values in the specified \code{idCol} column of \code{rowData(sce)}, -#' and subsequently replaces them with the corresponding row names of the -#' \code{SummarizedExperiment} object. Feature sets with too few features -#' (after the matching) are removed. +#' function maps the feature IDs in the collections (complexes, GO terms +#' or pathways) to the values in the specified \code{idCol} column of +#' \code{rowData(sce)}, and subsequently replaces them with the corresponding +#' row names of the \code{SummarizedExperiment} object. Feature sets with +#' too few features (after the matching) are removed. +#' Complexes are obtained from the database provided via `complexDbPath`. +#' GO terms and pathways (BIOCARTA, KEGG, PID, REACTOME and WIKIPATHWAYS) are +#' retrieved from `MSigDB` via the `msigdbr` package. #' #' @param sce A \code{SummarizedExperiment} object (or a derivative). #' @param idCol Character scalar, indicating which column in @@ -39,10 +42,11 @@ #' feature collections (gene symbols). #' @param includeFeatureCollections Character vector indicating the types #' of feature collections to prepare. Should be a subset of -#' \code{c("complexes", "GO")} or \code{NULL}. +#' \code{c("complexes", "GO", "pathways")} or \code{NULL}. #' @param complexDbPath Character scalar providing the path to the database #' of complexes, generated using \code{makeComplexDB()} and serialized -#' to a .rds file. +#' to a .rds file. If `NULL`, the complex database provided with +#' einprot will be used. #' @param speciesInfo List with at least two entries (\code{species} and #' \code{speciesCommon}), providing the species information. Typically #' generated using \code{getSpeciesInfo()}. @@ -95,7 +99,8 @@ prepareFeatureCollections <- function(sce, idCol, includeFeatureCollections, .assertScalar(x = idCol, type = "character", validValues = colnames(SummarizedExperiment::rowData(sce))) .assertVector(x = includeFeatureCollections, type = "character", - validValues = c("complexes", "GO"), allowNULL = TRUE) + validValues = c("complexes", "GO", "pathways"), + allowNULL = TRUE) .assertScalar(x = complexDbPath, type = "character", allowNULL = TRUE) if (is.null(complexDbPath) && "complexes" %in% includeFeatureCollections) { complexDbPath <- system.file(EINPROT_COMPLEXES_FILE, @@ -208,5 +213,47 @@ prepareFeatureCollections <- function(sce, idCol, includeFeatureCollections, featureCollections$GO <- goannots } + ## ------------------------------------------------------------------------- + ## Pathways + ## ------------------------------------------------------------------------- + if ("pathways" %in% includeFeatureCollections && + speciesInfo$species %in% getSupportedSpecies()$species) { + pws <- msigdbr::msigdbr(species = speciesInfo$species, + category = "C2", subcategory = "CP:BIOCARTA") %>% + dplyr::select("gs_name", "gene_symbol") %>% + dplyr::bind_rows( + msigdbr::msigdbr(species = speciesInfo$species, + category = "C2", subcategory = "CP:KEGG") %>% + dplyr::select("gs_name", "gene_symbol") + ) %>% + dplyr::bind_rows( + msigdbr::msigdbr(species = speciesInfo$species, + category = "C2", subcategory = "CP:PID") %>% + dplyr::select("gs_name", "gene_symbol") + ) %>% + dplyr::bind_rows( + msigdbr::msigdbr(species = speciesInfo$species, + category = "C2", subcategory = "CP:REACTOME") %>% + dplyr::select("gs_name", "gene_symbol") + ) %>% + dplyr::bind_rows( + msigdbr::msigdbr(species = speciesInfo$species, + category = "C2", subcategory = "CP:WIKIPATHWAYS") %>% + dplyr::select("gs_name", "gene_symbol") + ) + pws <- methods::as(lapply(split(pws, f = pws$gs_name), + function(w) unique(w$gene_symbol)), + "CharacterList") + S4Vectors::mcols(pws)$genes <- vapply(pws, function(w) + gsub(pat, "\\1; ", paste(w, collapse = ";")), "" + ) + S4Vectors::mcols(pws)$nGenes <- lengths(pws) + pws <- .replaceIdsInList(chl = pws, dfConv = dfGene, + currentIdCol = "genes", + newIdCol = "rowName", pat = pat) + pws <- pws[lengths(pws) >= minSizeToKeep] + featureCollections$pathways <- pws + } + featureCollections } diff --git a/man/prepareFeatureCollections.Rd b/man/prepareFeatureCollections.Rd index 963ded3..9fa4b6e 100644 --- a/man/prepareFeatureCollections.Rd +++ b/man/prepareFeatureCollections.Rd @@ -24,11 +24,12 @@ feature collections (gene symbols).} \item{includeFeatureCollections}{Character vector indicating the types of feature collections to prepare. Should be a subset of -\code{c("complexes", "GO")} or \code{NULL}.} +\code{c("complexes", "GO", "pathways")} or \code{NULL}.} \item{complexDbPath}{Character scalar providing the path to the database of complexes, generated using \code{makeComplexDB()} and serialized -to a .rds file.} +to a .rds file. If `NULL`, the complex database provided with +einprot will be used.} \item{speciesInfo}{List with at least two entries (\code{species} and \code{speciesCommon}), providing the species information. Typically @@ -50,11 +51,14 @@ A list of \code{CharacterList}s (one for each feature collection). } \description{ Prepare feature collections for testing with \code{limma::camera}. The -function maps the feature IDs in the collections (complexes or GO terms) to -the values in the specified \code{idCol} column of \code{rowData(sce)}, -and subsequently replaces them with the corresponding row names of the -\code{SummarizedExperiment} object. Feature sets with too few features -(after the matching) are removed. +function maps the feature IDs in the collections (complexes, GO terms +or pathways) to the values in the specified \code{idCol} column of +\code{rowData(sce)}, and subsequently replaces them with the corresponding +row names of the \code{SummarizedExperiment} object. Feature sets with +too few features (after the matching) are removed. +Complexes are obtained from the database provided via `complexDbPath`. +GO terms and pathways (BIOCARTA, KEGG, PID, REACTOME and WIKIPATHWAYS) are +retrieved from `MSigDB` via the `msigdbr` package. } \examples{ sce <- readRDS(system.file("extdata", "mq_example", "1356_sce.rds",