R/topDirs.R

#' Filter results of multiHiCcompare
#' 
#' @param hicexp A hicexp object which has been compared.
#' @param logfc_cutoff The logFC value you wish to filter
#'    by. Defaults to 1.
#' @param logcpm_cutoff The logCPM cutoff you wish to
#'    filter by. Defaults to 1.
#' @param p.adj_cutoff The adjusted p-value cutoff you wish to filter
#'    by. Defaults to 0.01.
#' @param D_cutoff The distance cutoff you wish to filter
#'    by. Interactions with a D < D_cutoff will be filtered.
#'    Defaults to 1. 
#' @param return_df The format for the data.frame returned
#'    by the function. Options are "bed" and "pairedbed" (Default).
#' @param pval_aggregate Method to aggregate region-specific p-values.
#' If a region differentially interacts with several other regions,
#' the p-values are aggregated using a 'max' method (Default, select maximum
#' p-value, most conservative), or the Fisher ('fisher'), Lancaster ('lancaster'),
#' or Sidak ('sidak') methods (see 'aggregate' package).
#' regions, it is assigned a single p-value aggregated from several 
#' @details This function is meant to filter the results of
#'     multiHiCcompare. The top differentially interacting 
#'     regions (DIRs) can be returned by using this function.
#'     When the return_df = "bed" option is set the resulting
#'     data.frame can be input into the plot_pvals or plot_counts
#'     functions to visualize the top DIRs. 
#' @return A data.table containing the filtered results.
#' @export
#' @import aggregation
#' @examples 
#' data('hicexp_diff')
#' topDirs(hicexp_diff)

topDirs <- function(hicexp, logfc_cutoff = 1, logcpm_cutoff = 1, p.adj_cutoff = 0.01,
                           D_cutoff = 1, return_df = "pairedbed", pval_aggregate = "max") { 
  # New in v.2.0 - remove alpha parameter
  # @param alpha The p-value cutoff for determining the count
  #     of number of times a region is significant. Used to calculate
  #     the number of times a region was detected as significantly interacting. 
  #     Defaults to 0.05.
  # alpha = 0.05, 
  # check that data has been compared
  if (nrow(results(hicexp)) < 1) {
    stop("Differences must be detected before making a manhattan plot.")
  }
  # check input
  return_df <- match.arg(return_df, c("bed", "pairedbed"), 
                      several.ok = FALSE)
  
  # make results object
  res <- results(hicexp)
  # filter (New in v2.0 - use global filtering)
  res <- res[logCPM >= logcpm_cutoff & abs(logFC) >= logfc_cutoff & p.adj <= p.adj_cutoff & D >= D_cutoff, ]
  
  # if pairedbed just need to filter
  if (return_df == "pairedbed") {
    # filter (Changed from v2.0 - use global filtering)
    # res <- res[logCPM >= logcpm_cutoff & abs(logFC) >= logfc_cutoff & p.adj <= p.adj_cutoff & D >= D_cutoff, ]
    # reformat
    res$chr1 <- paste0('chr', res$chr)
    res$chr2 <- paste0('chr', res$chr)
    res$end1 <- res$region1 + resolution(hicexp) - 1
    res$end2 <- res$region2 + resolution(hicexp) - 1
    res <- res[, c("chr1", "region1", "end1", 'chr2', 'region2', 'end2', 'D', 'logFC', 'logCPM', 'p.value', 'p.adj')]
    colnames(res)[c(2,5)] <- c('start1', 'start2')
    # make p-values in scientific notation
    res$p.value <- formatC(res$p.value, digits = 4, format = "E")
    res$p.adj <- formatC(res$p.adj, digits = 4, format = "E")
    # round logFc and logCPM
    res$logCPM <- round(res$logCPM, digits = 4)
    res$logFC <- round(res$logFC, digits = 4)
  }
  
  # if BED need to aggregate regions
  if (return_df == "bed") {
    # subset to only significant interactions (Changed from v2.0 - use global filtering)
    # res <- res[p.adj < alpha, ]
    # make vector of regions and p-values
    regions <- c(paste0(res$chr, ':', res$region1),
                 paste0(res$chr, ':', res$region2))
    p.values <- c(res$p.adj, res$p.adj)
    dist <- c(res$D, res$D)
    logfc <- c(res$logFC, res$logFC)
    logcpm <- c(res$logCPM, res$logCPM)
    
    
    # aggregate into fisher pvalue
    # fisher_aggregate <- aggregate(p.values, 
    #                               by = list(regions), 
    #                               FUN = function(x) {
    #                                 if (length(x) > 1) {
    #                                   metap::sumlog(x)$p
    #                                 } else {
    #                                   x
    #                                 }
    #                               })
    
    # Use addCLT instead of fisher method
    fisher_aggregate <- aggregate(p.values, 
                                  by = list(regions), 
                                  FUN = function(x) {
                                    if (length(x) > 1) {
                                      if (pval_aggregate == "max") {
                                        max(x, na.rm = TRUE)
                                      } else if (pval_aggregate == "fisher") {
                                        aggregation::fisher(x)
                                      } else if (pval_aggregate == "lancaster") {
                                        aggregation::lancaster(x)
                                      } else if (pval_aggregate == "sidak") {
                                        aggregation::sidak(x)
                                      }
                                    } else {
                                      x
                                    }
                                  })
    
    fisher_aggregate <- cbind(read.table(text = fisher_aggregate$Group.1, 
                                         sep = ":"), fisher_aggregate$x)
    fisher_aggregate$`fisher_aggregate$x`[fisher_aggregate$`fisher_aggregate$x` == 0] <- .Machine$double.xmin
    # # aggregate into stouffer pvalue
    # p.values[p.values == 1] <- 0.99999 # change pvalues from 1 so sumz works correctly
    # stouffer_liptak_aggregate <- aggregate(p.values, 
    #                                        by = list(regions), 
    #                                        FUN = function(x) {
    #                                          if (length(x) > 1) {
    #                                            suppressWarnings(metap::sumz(x)$p) 
    #                                          } else {
    #                                            x
    #                                          }
    #                                        })
    # 
    # stouffer_liptak_aggregate <- cbind(read.table(text = stouffer_liptak_aggregate$Group.1,
    #                                               sep = ":"), stouffer_liptak_aggregate$x)
    
    # aggregate counts
    count <- ifelse(p.values < p.adj_cutoff, 1, 0)
    count_aggregate <- aggregate(count, by = list(regions), sum)
    count_aggregate <- cbind(read.table(text = count_aggregate$Group.1, sep = ":"),
                             count_aggregate$x)
    
    # aggregate D
    dist_aggregate <- aggregate(dist, by = list(regions), mean)
    dist_aggregate <- cbind(read.table(text = dist_aggregate$Group.1, sep = ":"),
                             dist_aggregate$x)
    # aggregate logfc
    logfc_aggregate <- aggregate(logfc, by = list(regions), mean)
    logfc_aggregate <- cbind(read.table(text = logfc_aggregate$Group.1, sep = ":"),
                             logfc_aggregate$x)
    # aggregate logcpm
    logcpm_aggregate <- aggregate(logcpm, by = list(regions), mean)
    logcpm_aggregate <- cbind(read.table(text = logcpm_aggregate$Group.1, sep = ":"),
                             logcpm_aggregate$x)
    # Format results
    res <- dplyr::left_join(count_aggregate, dist_aggregate, by = c('V1' = 'V1', 'V2' = 'V2'))
    res <- dplyr::left_join(res, logfc_aggregate, by = c('V1' = 'V1', 'V2' = 'V2'))
    res <- dplyr::left_join(res, logcpm_aggregate, by = c('V1' = 'V1', 'V2' = 'V2'))
    # res <- dplyr::left_join(res, stouffer_liptak_aggregate, by = c('V1' = 'V1', 'V2' = 'V2'))
    res <- dplyr::left_join(res, fisher_aggregate, by = c('V1' = 'V1', 'V2' = 'V2'))
    res$V1 <- paste0('chr', res$V1)
    res$end <- res$V2 + resolution(hicexp) - 1
    colnames(res) <- c('chr', 'start', 'count', 'avgD', 'avgLogFC', 'avgLogCPM',  'avgP.adj', 'end')
    res <- res[, c('chr', 'start', 'end', 'count', 'avgD', 'avgLogFC', 'avgLogCPM',  'avgP.adj')]
    res <- data.table::as.data.table(res)
    res <- res[order(res$count, decreasing = TRUE), ]
    
    # filter (Changed from v2.0 - use global filtering)
    # res <- res[avgLogCPM >= logcpm_cutoff & abs(avgLogFC) >= logfc_cutoff & avgP.adj <= p.adj_cutoff & avgD >= D_cutoff, ]
    
    # format pvalues
    res$avgP.adj <- formatC(res$avgP.adj, digits = 4, format = "E")
    # round logFc and logCPM
    res$avgLogCPM <- round(res$avgLogCPM, digits = 4)
    res$avgLogFC <- round(res$avgLogFC, digits = 4)
  }
  
  return(res)
}