Counting summary (#376)

* summary plots simplified * sex qc added
gagneurlab · Oct 4, 2022 · 402c321 · 402c321
1 parent bba2c24
commit 402c321
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 44 deletions.
diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Summary.R b/drop/modules/aberrant-expression-pipeline/Counting/Summary.R
@@ -43,69 +43,59 @@ cnts_mtx <- counts(ods, normalized = F)
 #' 
 #' # Count Quality Control
 #' 
-#' Compare number of records vs. read counts  
-#' The `Obtained Read Count Ratio` plot does not include external counts
-#' because there are no raw reads to be counted.
-#' 
+#' Compares total mapped vs counted reads.  
+#' The `Mapped vs Counted Reads` plot does not include external counts.  
+#' Consider removing samples with too low or too high size factors.
+#'  
 bam_coverage <- fread(snakemake@input$bam_cov)
 bam_coverage[, sampleID := as.character(sampleID)]
+setnames(bam_coverage, 'record_count', 'total_count')
 coverage_dt <- merge(bam_coverage,
                      data.table(sampleID = colnames(ods),
                                 read_count = colSums(cnts_mtx),
                                 isExternal = ods@colData$isExternal),
                      by = "sampleID", sort = FALSE)
-# read count
-setorder(coverage_dt, read_count)
-coverage_dt[, count_rank := .I]
-# ratio
-coverage_dt[, counted_frac := read_count/record_count]
-setorder(coverage_dt, counted_frac)
-coverage_dt[, frac_rank := .I]
+# read counts
+coverage_dt[, count_rank := rank(read_count)]
 
 # size factors 
 ods <- estimateSizeFactors(ods)
-coverage_dt[, size_factors := sizeFactors(ods)]
-setorder(coverage_dt, size_factors)
-coverage_dt[, sf_rank := 1:.N]
+coverage_dt[, size_factors := round(sizeFactors(ods), 3)]
+coverage_dt[, sf_rank := rank(size_factors)]
+
+# Show this plot only if there are external samples, as the next plot contains the same info
+if(has_external == T){
+  p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count, col = isExternal)) +
+    geom_point(size = 3,show.legend = has_external) +
+    theme_cowplot() + background_grid() +
+    labs(title = "Obtained Read Counts", x="Sample Rank", y = "Counted Reads") +
+    ylim(c(0,NA)) +
+    scale_color_brewer(palette="Dark2")
+  p_depth
+}
 
-p_depth <- ggplot(coverage_dt, aes(x = count_rank, y = read_count, col= isExternal )) +
-  geom_point(size = 3,show.legend = has_external) +
-  theme_cowplot() +
-  background_grid() +
-  labs(title = "Read Counts", x="Sample Rank", y = "Reads Counted") +
-  ylim(c(0,NA)) +
-  scale_color_brewer(palette="Dark2")
 
-p_frac <- ggplot(coverage_dt, aes(x = frac_rank, y = counted_frac, col = isExternal)) +
-  geom_point(size = 3,show.legend = has_external) +
-  theme_cowplot() +
-  background_grid() +
-  labs(title = "Read Count Ratio", x = "Sample Rank", y = "Percent Reads Counted") +
-  ylim(c(0,NA)) +
-  scale_color_brewer(palette="Dark2")
-
-#+ QC, fig.height=6, fig.width=12
-plot_grid(p_depth, p_frac) 
+p_comp <- ggplot(coverage_dt[isExternal == F], aes(x = total_count, y = read_count)) +
+  geom_point(size = 3, show.legend = has_external, color = "#1B9E77") +
+  theme_cowplot() + background_grid() +
+  labs(title = "Total mapped vs. Counted Reads", x = "Mapped Reads", y = "Counted Reads") +
+  xlim(c(0,NA)) + ylim(c(0,NA))
+p_comp
+# ggExtra::ggMarginal(p_comp, type = "histogram") # could be a nice add-on
 
 p_sf <- ggplot(coverage_dt, aes(sf_rank, size_factors, col = isExternal)) +
   geom_point(size = 3,show.legend = has_external) +
   ylim(c(0,NA)) +
-  theme_cowplot() +
-  background_grid() +
+  theme_cowplot() + background_grid() +
   labs(title = 'Size Factors', x = 'Sample Rank', y = 'Size Factors') +
   scale_color_brewer(palette="Dark2")
 
-p_sf_cov <- ggplot(coverage_dt, aes(read_count, size_factors, col = isExternal)) +
-  geom_point(size = 3,show.legend = has_external) +
-  ylim(c(0,NA)) +
-  theme_cowplot() +
-  background_grid() +
-  labs(title = 'Size Factors vs. Read Counts',
-       x = 'Reads Counted', y = 'Size Factors') +
-  scale_color_brewer(palette="Dark2")
+p_sf
 
-#+ sizeFactors, fig.height=6, fig.width=12
-plot_grid(p_sf, p_sf_cov)
+setnames(coverage_dt, old = c('total_count', 'read_count', 'size_factors'),
+         new = c('Reads Mapped', 'Reads Counted', 'Size Factors'))
+DT::datatable(coverage_dt[, .(sampleID, `Reads Mapped`, `Reads Counted`, `Size Factors`)][order(`Reads Mapped`)],
+              caption = 'Reads summary statistics')
 
 #' # Filtering
 #' **local**: A pre-filtered summary of counts using only the local (from BAM) counts. Omitted if no external counts  
@@ -189,3 +179,64 @@ if(has_external){
 } else{
     DT::datatable(expressed_genes[order(Rank),-"Is External"],rownames = F)
 }
+
+#' **Considerations:**
+#' The verifying of the samples sex is performed by comparing the expression levels of 
+#' the genes XIST and UTY. In general, females should have high XIST and low UTY expression,
+#' and viceversa for males. For it to work, the sample annotation must have a column called 'SEX',
+#' with values male/female. If some other values exist, e.g., unknown, they will be matched. 
+#' The prediction is performed via a linear discriminant analysis on the log2 counts.
+
+# Get sex column and proceed if it exists
+sex_idx <- which('SEX' == toupper(colnames(colData(ods))))
+if(isEmpty(sex_idx)){
+  print('Sex column not found in sample annotation')
+} else{
+
+  # Verify if both XIST and UTY were counted
+  xist_id <- 'XIST'
+  uty_id <- 'UTY'
+
+  if(grepl('ENSG0', rownames(ods)[1])){
+    xist_id <- 'ENSG00000229807'
+    uty_id <- 'ENSG00000183878'
+  }
+  xist_idx <- grep(xist_id, rownames(ods))
+  uty_idx <- grep(uty_id, rownames(ods))
+
+  if(isEmpty(xist_idx) | isEmpty(uty_idx)){
+    print('Either XIST or UTY is not expressed')
+  }else{
+
+    sex_counts <- counts(ods)[c(xist_idx, uty_idx), ]
+    sex_dt <- data.table(sampleID = colnames(ods), 
+                         XIST = counts(ods)[xist_idx,], 
+                         UTY = counts(ods)[uty_idx,])
+    sex_dt <- merge(sex_dt, as.data.table(colData(ods))[,c(1, sex_idx), with = F], sort = F)
+    colnames(sex_dt) <- toupper(colnames(sex_dt))
+    sex_dt[, SEX := tolower(SEX)]
+    sex_dt[is.na(SEX), SEX := '']
+
+    # Train only in male/female in case there are other values
+    train_dt <- sex_dt[SEX %in% c('f', 'm', 'female', 'male')]
+
+    library("MASS")
+    lda <- lda(SEX ~ log2(XIST+1) + log2(UTY+1), data = train_dt)
+
+    sex_dt[, predicted_sex := predict(lda, sex_dt)$class]
+    sex_dt[, match_sex := SEX == predicted_sex]
+    table(sex_dt[, .(SEX, predicted_sex)])
+
+    g <- ggplot(sex_dt, aes(XIST+1, UTY+1)) + 
+      geom_point(aes(col = SEX, shape = predicted_sex, alpha = match_sex)) + 
+      scale_x_log10(limits = c(1,NA)) + scale_y_log10(limits = c(1,NA)) +
+      scale_alpha_manual(values = c(1, .1)) + 
+      theme_cowplot() + background_grid(major = 'xy', minor = 'xy') + 
+      annotation_logticks(sides = 'bl') + 
+      labs(color = 'Sex', shape = 'Predicted sex', alpha = 'Matches sex')
+    plot(g)
+
+    DT::datatable(sex_dt[match_sex == F], caption = 'Sex mismatches')
+
+  }
+}
diff --git a/drop/modules/aberrant-expression-pipeline/Counting/countReads.R b/drop/modules/aberrant-expression-pipeline/Counting/countReads.R
@@ -56,7 +56,7 @@ count_ranges <- readRDS(snakemake@input$count_ranges)
 seqlevelsStyle(count_ranges) <- seqlevelsStyle(bam_file)
 
 # show info
-message(paste("input:", snakemake@input$features))
+message(paste("input:", snakemake@input$sample_bam))
 message(paste("output:", snakemake@output$counts))
 message(paste('\tcount mode:', count_mode, sep = "\t"))
 message(paste('\tpaired end:', paired_end, sep = "\t"))

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R b/drop/modules/aberrant-expression-pipeline/Counting/mergeCounts.R
@@ -51,6 +51,7 @@ if( length(unique(row_names_objects)) > 1 ){
 # merge counts
 merged_assays <- do.call(cbind, counts_list)
 total_counts <- SummarizedExperiment(assays=list(counts=merged_assays))
+colnames(total_counts) <- gsub('.bam', '', colnames(total_counts))
 
 # assign ranges
 rowRanges(total_counts) <- count_ranges

diff --git a/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R b/drop/modules/aberrant-expression-pipeline/OUTRIDER/Summary.R
@@ -41,6 +41,7 @@ suppressPackageStartupMessages({
 dataset_title <- paste("Dataset:", paste(snakemake@wildcards$dataset, snakemake@wildcards$annotation, sep = '--'))
 
 ods <- readRDS(snakemake@input$ods)
+if(is.null(colData(ods)$isExternal)) colData(ods)$isExternal <- FALSE
 
 #' Number of samples: `r ncol(ods)`  
 #' Number of expressed genes: `r nrow(ods)`