Merge pull request #30 from vyepez88/code_cleaning

Code cleaning
gagneurlab · Apr 15, 2020 · ad1f791 · ad1f791
2 parents 89977ea + 8aa3137
commit ad1f791
Show file tree

Hide file tree

Showing 11 changed files with 146 additions and 57 deletions.
diff --git a/drop/configHelper.py b/drop/configHelper.py
@@ -108,6 +108,7 @@ def setDefaults(self, config, method=None):
         setKey(config, None, "projectTitle", "DROP: Detection of RNA Outlier Pipeline")
 
         setKey(config, None, "genomeAssembly", "hg19")
+        setKey(config, None, "scanBamParam", "null")
 
         if self.method is None:
             tmp_dir = submodules.getTmpDir()

diff --git a/drop/modules/aberrant-expression-pipeline b/drop/modules/aberrant-expression-pipeline
diff --git a/drop/modules/aberrant-splicing-pipeline b/drop/modules/aberrant-splicing-pipeline
diff --git a/drop/modules/mae-pipeline b/drop/modules/mae-pipeline
diff --git a/drop/template/Scripts/AberrantExpressionAnalysis/Overview.R b/drop/template/Scripts/AberrantExpressionAnalysis/Overview.R
@@ -22,7 +22,7 @@
 #'    - AE: '`sm drop.getTmpDir() + "/AE.done"`'
 #' output:
 #'   html_document:
-#'    code_folding: hide
+#'    code_folding: show
 #'    code_download: TRUE
 #'---
 
@@ -48,7 +48,7 @@ display_text <- function(caption, links) {
 }
 
 # get parameters
-datasets <- snakemake@params$datasets
+datasets <- sort(snakemake@params$datasets)
 annotations <- snakemake@params$annotations
 htmlDir <- snakemake@params$htmlDir
 
@@ -59,55 +59,57 @@ htmlDir <- snakemake@params$htmlDir
 #' 
 #' **Gene annotations:** `r paste(annotations, collapse = ', ')`
 #'
-#' # Count Summaries
+#' ## Summaries
+#' ### Counts summary
 #+ echo=FALSE
+htmlDir <- './AberrantExpression'
 count_links <- sapply(annotations, get_html_path, 
                       datasets = datasets,
                       htmlDir = file.path(htmlDir, "Counting"), 
                       fileName = paste0('Summary_', datasets, '.html'))
 #' 
 #' `r display_text(caption = 'Gene annotation version ', count_links)`
 #' 
-#' # OUTRIDER Results
+#' ### OUTRIDER summary
 #+ echo=FALSE
 outrider_links <- sapply(annotations, get_html_path, 
                       datasets = datasets,
                       htmlDir = file.path(htmlDir, "Outrider"), 
                       fileName = paste0('Summary_', datasets, '.html'))
 #' 
-#' 
 #' `r display_text(caption = 'Gene annotation version ', outrider_links)`
 #' 
-#' 
-#' OUTRIDER dataset (ods) files 
+#' ## Files
+#' ### OUTRIDER datasets (ods)
 #' 
 #' `r paste('* ', snakemake@params$odsFiles, collapse = '\n')`
 #' 
-#' 
-#' Results tables
+#' ### Results tables
 #' 
 #'  `r paste('* ', snakemake@params$resultTables, collapse = '\n')`
 #' 
-#'   
-#' # Analyze Individual Results
+
+#' ## Analyze Individual Results
 ods <- readRDS(snakemake@params$odsFiles[[1]])
 res <- fread(snakemake@params$resultTables[[1]])
 
-DT::datatable(res)
+#' Display the results table of the first dataset
+#+ echo=FALSE
+DT::datatable(res, filter = 'top')
 
-#' Choose a random gene and sample
+#' Choose a random gene and sample to plot. Outliers are in red.
 #+ echo=TRUE
 gene <- res[1, geneID]
 sample <- res[1, sampleID]
 
-#' ## Volcano plot
+#' ### Volcano plot
 #' setting basePlot = FALSE creates an interactive plot
 #' that allows finding the gene(s) of interest
 OUTRIDER::plotVolcano(ods, sample, basePlot = TRUE)
 
-#' ## Gene expression plot
+#' ### Gene expression plot (normalized counts)
 OUTRIDER::plotExpressionRank(ods, gene, basePlot = TRUE)
 
-#' ## Expected vs observed counts
+#' ### Expected vs observed counts
 OUTRIDER::plotExpectedVsObservedCounts(ods, gene, basePlot = TRUE)
 
diff --git a/drop/template/Scripts/AberrantSplicingAnalysis/Overview.R b/drop/template/Scripts/AberrantSplicingAnalysis/Overview.R
@@ -17,7 +17,7 @@
 #'    - AS: '`sm drop.getTmpDir() + "/AS.done"`'
 #' output:
 #'   html_document:
-#'    code_folding: hide
+#'    code_folding: show
 #'    code_download: TRUE
 #'---
 
@@ -27,38 +27,81 @@ saveRDS(snakemake, file.path(snakemake@params$tmpdir,
 # snakemake <- readRDS(".drop/tmp/AberrantSplicing_FRASER.snakemake")
 
 suppressPackageStartupMessages({
-    library(FRASER)
-    library(magrittr)
+  library(FRASER)
+  library(magrittr)
 })
 
-#' FRASER objects: `r paste(snakemake@params$fds_files)`  
+# define functions
+get_html_path <- function(datasets, htmlDir, fileName) {
+  file_paths <- file.path(htmlDir, fileName)
+  file_link <- paste0('\n* [', datasets ,'](', file_paths, 
+                      '){target="_blank"}\n', collapse = ' ')
+  file_link
+}
+
+display_text <- function(links) {
+  paste0(links, collapse = '\n')
+}
+
+# get parameters
+datasets <- sort(snakemake@config$aberrantSplicing$groups)
+
+## start html
+
+#'
+#' **Datasets:** `r paste(datasets, collapse = ', ')`
+#'
+#' ## Summaries
+#' ### Counts summary
+#+ echo=FALSE
+htmlDir <- './AberrantSplicing'
+count_links <- get_html_path(datasets = datasets,
+                             htmlDir = htmlDir, 
+                             fileName = paste0(datasets, '_countSummary.html'))
+#' 
+#' `r display_text(count_links)`
 #' 
-#' FRASER results: `r paste(snakemake@params$result_tables)`  
+#' ### FRASER summary
+#+ echo=FALSE
+fraser_links <- get_html_path(datasets = datasets,
+                              htmlDir = htmlDir, 
+                              fileName = paste0(datasets, '_summary.html'))
+#' 
+#' `r display_text(fraser_links)`
+
+#' ## Files
+#' ### FRASER datasets (fds)
+#' `r paste('* ', snakemake@params$fds_files, collapse = '\n')`  
 #' 
+#' ### Results tables
+#' `r paste('* ', snakemake@params$result_tables, collapse = '\n')`  
 
 #'
 #' ## Analyze individual results
 #' 
 
+# Load the files
 fds <- loadFraseRDataSet(file = snakemake@params$fds_files[[1]])
-fds <- readRDS(snakemake@params$fds_files[[1]])
 res <- fread(snakemake@params$result_tables[[1]])
 
-sample <- samples(fds)[1]
+#' Display the results table of the first dataset
+#+ echo=FALSE
+DT::datatable(res, filter = 'top')
 
-#' Get a splice site and sample of interest
+#' Get a splice site and sample of interest. Outliers are in red.
+#+ echo=TRUE
 sample <- res[1, sampleID]
 siteIndex <- 4
 
-#' ## Volcano plot
+#' ### Volcano plot
 #' setting basePlot = FALSE creates an interactive plot
 #' that allows finding the junction(s) of interest
 FRASER::plotVolcano(fds, sample, type = 'psi3', basePlot = TRUE)
 
-#' ## Expression plot
+#' ### Expression plot
 FRASER::plotExpression(fds, type = 'psi3', site = siteIndex, basePlot = TRUE)
 
-#' ## Expected vs observed PSI
+#' ### Expected vs observed PSI
 FRASER::plotExpectedVsObservedPsi(fds, type = 'psi3', 
                                   idx = siteIndex, basePlot = TRUE)
 
diff --git a/drop/template/Scripts/MAEAnalysis/Overview.R b/drop/template/Scripts/MAEAnalysis/Overview.R
@@ -5,9 +5,9 @@
 #'  params:
 #'    - tmpdir: '`sm drop.getTmpDir()`'
 #'    - mae_ids: '`sm parser.getMaeAll()`'
-#'    - count_matrices: '`sm expand(parser.getProcDataDir() + 
-#'                       "/mae/allelic_counts/{mae_id}.csv.gz",
-#'                       mae_id=parser.getMaeAll())`'
+#'    - allelic_counts: '`sm expand(parser.getProcDataDir() + 
+#'                          "/mae/allelic_counts/{mae_id}.csv.gz",
+#'                          mae_id=parser.getMaeAll())`'
 #'    - results_obj: '`sm expand(parser.getProcResultsDir() + 
 #'                       "/mae/samples/{mae_id}_res.Rds", 
 #'                       mae_id=parser.getMaeAll())`'
@@ -18,7 +18,6 @@
 #'    - html: '`sm config["htmlOutputPath"] + "/Scripts_MAE_Results_Overview.html"`'
 #'    - qc_matrix: '`sm expand(parser.getProcResultsDir() + "/mae/{qc_group}/" +
 #'                  "dna_rna_qc_matrix.Rds", qc_group=config["mae"]["qcGroups"])`'
-#'    - qc_html: '`sm config["htmlOutputPath"] + "/Scripts_QC_Overview.html"`'
 #'  input:
 #'    - MAE: '`sm drop.getTmpDir() + "/MAE.done"`'
 #' output:
@@ -32,35 +31,50 @@ saveRDS(snakemake, file.path(snakemake@params$tmpdir, "MAE_analysis.snakemake"))
 # snakemake <- readRDS(".drop/tmp/MAE_analysis.snakemake")
 
 suppressPackageStartupMessages({
-    library(tMAE)
+  library(tMAE)
 })
 
-all_files <- paste0("* ", snakemake@params$mae_ids, ": ",
-       snakemake@params$count_matrices, collapse = "\n")
-#'
-#' # All Samples
+
+#' ## Files
+#' ### Allelic counts
+#' Located in `r file.path(snakemake@config$root, 'processed_data/mae/allelic_counts/')`
 #' 
-#'  `r all_files`
+#' ### Results tables of each sample
+#' Located in `r file.path(snakemake@config$root, 'processed_results/mae/samples/')`
 #' 
-#' [MAE Pipeline Output](`r snakemake@params$html`)
+#' ### Aggregated results tables of each group
+#' `r paste('* ', snakemake@params$results_tables, collapse = '\n')`  
+#'
+#' ### MAE Pipeline Output
+#' [MAE Pipeline Output](`r "./Scripts_MAE_Datasets.html"`)
 #' 
 
-#' # Analyze Individual Results
+#' ## Analyze Individual Results
 #' 
 file <- snakemake@params$results_tables[[1]]
 res <- fread(file)
 
 file_location <- strsplit(file, "/")[[1]]
 res_sample <- readRDS(snakemake@params$results_obj[[1]])
 
-plotMA4MAE(res_sample)
-plotAllelicCounts(res_sample)
+if(is.null(res_sample$rare)){
+  g1 <- plotMA4MAE(res_sample)
+  g2 <- plotAllelicCounts(res_sample)
+} else {
+  g1 <- plotMA4MAE(res_sample, rare_column = 'rare')
+  g2 <- plotAllelicCounts(res_sample, rare_column = 'rare')
+}
 
-#' # Quality Control: VCF-BAM Matching
-#' 
-#' DNA-RNA matrix: 
+#' ### MA plot: fold change vs RNA coverage
+g1
+
+#' ### Alternative vs Reference plot
+g2
+
+#' ## Quality Control: VCF-BAM Matching
 #' 
-#' `r paste('    *', snakemake@params$qc_matrix, collapse='\n')`  
+#' [QC Overview](`r "./Scripts_QC_Datasets.html"`)
 #' 
-#' [QC Overview](`r snakemake@params$qc_html`)
+#' DNA-RNA matrix: 
+#' `r paste('* ', snakemake@params$qc_matrix, collapse='\n')`  
 
diff --git a/drop/template/Scripts/Pipeline/SampleAnnotation.R b/drop/template/Scripts/Pipeline/SampleAnnotation.R
@@ -25,17 +25,22 @@ suppressPackageStartupMessages({
 
 sa <- fread(snakemake@input$sampleAnnotation)
 
-#' Number of rows and columns in tha sample annotation
-dim(sa)
-DT::datatable(sa)
+#'
+#' Number of rows and columns in the sample annotation: `r dim(sa)`
+
+#'
+#' ## Sample annotation
+DT::datatable(sa, filter = 'top')
+
+#' ## Quality control checks
 
 # check for duplicated rows
 if(sum(duplicated(sa)) > 0){
   print("The sample annotation has the following duplicated rows. Remove them.")
   sa[duplicated(sa)]
 }
 
-#' Check for NAs
+#' Check for RNA_IDs without a value
 if(nrow(sa[is.na(RNA_ID)]) > 0){
   print("The sample annotation has some non-existent RNA_IDs. Fill them.")
   sa[is.na(RNA_ID)]
@@ -58,13 +63,13 @@ if('DNA_VCF_FILE' %in% colnames(sa)){
     }
 }
 
-#' Check for RNA_IDs with more than 1 RNA_BAM_FILE
+#' Check for RNA_IDs with more than one RNA_BAM_FILE
 if(sum(duplicated(unique(sa[,.(RNA_ID, RNA_BAM_FILE)])$RNA_ID)) > 0){
   print('The following RNA_IDs and RNA_BAM_FILEs do not have a 1:1 match. Correct them.')
   DT::datatable(duplicated(unique(sa[,.(RNA_ID, RNA_BAM_FILE)])$RNA_ID))
 }
 
-#' Barplot with DROP groups
+#' ## Barplot with DROP groups
 unique(sa[,.(RNA_ID, DROP_GROUP)])$DROP_GROUP %>% strsplit(',') %>% unlist %>%
   table %>% barplot(xlab = 'DROP groups', ylab = 'Number of samples')
 
@@ -86,5 +91,5 @@ if(!is.null(sa$HPO_TERMS)){
          na = NA, sep = "\t", row.names = F, quote = F)
 }
 
-file.create(snakemake@output$done)
+file.create(snakemake@output$done) %>% invisible
 
diff --git a/drop/template/Snakefile b/drop/template/Snakefile
@@ -31,6 +31,7 @@ rule all:
         AS(config["finalFiles"]["AS"]),
         MAE(config["finalFiles"]["MAE"]),
         rules.Index.output,
+        config["htmlOutputPath"] + "/readme.html",
         config["tmpdir"] + "/rulegraphs.done"
     output:
         touch(config["tmpdir"] + "/drop_all.done")
@@ -56,7 +57,8 @@ rule aberrantExpression:
     output: touch(drop.getTmpDir() + "/AE.done")
 
 rule aberrantSplicing:
-    input: AS(config["finalFiles"]["AS"])
+    input: 
+        AS(config["finalFiles"]["AS"])
     output: touch(drop.getTmpDir() + "/AS.done")
 
 rule mae:

diff --git a/drop/template/config.yaml b/drop/template/config.yaml
@@ -14,6 +14,7 @@ geneAnnotation:
     v29:  /path/to/gencode29.gtf.gz # example
 
 genomeAssembly: hg19  # either hg19 or hg38
+scanBamParam: null # or a path to an Rds file containing a scanBamParam object
 
 aberrantExpression:
     groups: 

diff --git a/drop/template/readme.md b/drop/template/readme.md
@@ -0,0 +1,21 @@
+# DROP Analysis
+
+Analysis of the demo dataset using the Detection of RNA Outliers Pipeline. For 
+speed purposes, this is only a small subset of the test dataset from GEUVADIS.
+
+Browse either through the tabs to see the different analysis results or through
+the following links to see the pipelines results.
+
+* [**Aberrant expression pipeline**](./aberrant-expression-pipeline_index.html)
+
+* [**Aberrant splicing pipeline**](./aberrant-splicing-pipeline_index.html)
+
+* [**Mono-allelic expression pipeline**](./mae-pipeline_index.html)
+
+## Resources
+
+**Code**: available to download for free in our [github](https://www.github.com/gagneurlab/drop)
+
+**Manuscript**: preprint in [protocol exchange](https://protocolexchange.researchsquare.com/article/pex-787/v1)
+
+**Data**: subset of 100 samples from the GEUVADIS [dataset](https://www.ebi.ac.uk/Tools/geuvadis-das/)