Install needed BSgenome on the fly (#147)

* remove assembly specific packages closes #142 * fix pipeline when no import count matrix in sample annotation #114 * refactored config parsers by submodule * update to FRASER 1.2.0 @c-mertes * update aberrant splicing export @c-mertes * local loading of HPO file if present @vyepez88 * fixed buggy file extension when saving ods @jemten * Add files via upload (#146) * update OUTRIDER version and bugfixes * allow NCBI based assembly in config fixes #135 * move assembly version test to DropConf and adjust the pipeline. * add unlock if snakemake fails * use addAF to check installation * Add badge to readme Co-authored-by: mumichae <51025211+mumichae@users.noreply.github.com> Co-authored-by: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
gagneurlab · Jan 28, 2021 · 0b4bb21 · 0b4bb21
1 parent 3e26ae1
commit 0b4bb21
Show file tree

Hide file tree

Showing 19 changed files with 112 additions and 71 deletions.
diff --git a/._travis.yml b/._travis.yml
@@ -16,7 +16,8 @@ install:
   # install dependencies
   - source $HOME/miniconda/etc/profile.d/conda.sh
   - conda create -q -n drop -c conda-forge -c bioconda "python>=${TRAVIS_PYTHON_VERSION}" "r-base>=4.0.3" "drop>=1.0.1" "wbuild>=1.8" "bioconductor-fraser>=1.2.0"
-  - conda remove -n drop --force drop
+  # remove FRASER to check installation routine
+  - conda remove -n drop --force drop bioconductor-fraser
   - conda activate drop
   - pip install -r tests/requirements.txt
 

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
@@ -1,4 +1,4 @@
-name: Python Package using Conda
+name: Build
 
 on: [push]
 

diff --git a/README.md b/README.md
@@ -1,5 +1,5 @@
 # Detection of RNA Outlier Pipeline
-[![Pipeline status](https://travis-ci.org/gagneurlab/drop.svg?branch=master)](https://travis-ci.org/gagneurlab/drop)
+[![DROP pipeline status](https://github.com/gagneurlab/drop/workflows/Build/badge.svg?branch=bsgenome)](https://github.com/gagneurlab/drop/actions?query=workflow%3ABuild)
 [![Version](https://img.shields.io/github/v/release/gagneurlab/drop?include_prereleases)](https://github.com/gagneurlab/drop/releases)
 [![Version](https://readthedocs.org/projects/gagneurlab-drop/badge/?version=latest)](https://gagneurlab-drop.readthedocs.io/en/latest)
 

diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst
@@ -40,7 +40,7 @@ Parameter            Type        Description
 projectTitle         character   Title of the project to be displayed on the rendered HTML output                                                                         ``Project 1``
 htmlOutputPath       character   Full path of the folder where the HTML files are rendered                                                                                ``/data/project1/htmlOutput``
 indexWithFolderName  boolean     If true, the basename of the project directory will be used as prefix for the index.html file                                            ``true``
-genomeAssembly       character   Either hg19 or hg38, depending on the genome assembly used for mapping                                                                   ``/data/project1``
+genomeAssembly       character   Either hg19/hs37d5 or hg38/GRCh38, depending on the genome assembly used for mapping                                                     ``/data/project1``
 sampleAnnotation     character   Full path of the sample annotation table                                                                                                 ``/data/project1/sample_annotation.tsv``
 root                 character   Full path of the folder where the subdirectories processed_data and processed_results will be created containing DROP's output files.    ``/data/project1``
 geneAnnotation       dictionary  A key-value list of the annotation name (key) and the full path to the GTF file (value). More than one annotation file can be provided.  ``anno1: /path/to/gtf1.gtf``
@@ -197,7 +197,7 @@ Two different files can be downloaded from our `public repository <https://www.c
 
 1. VCF file containing different positions to be used to match DNA with RNA files.
 The file name is ``qc_vcf_1000G_{genome_build}.vcf.gz``. One file is available for each 
-genome build (hg19 and hg38). Download it together with the corresponding .tbi file. 
+genome build (hg19/hs37d5 and hg38/GRCh38). Download it together with the corresponding .tbi file. 
 Indicate the full path to the vcf file in the ``qcVcf`` key in the mono-allelic expression dictionary.
 This file is only needed for the MAE module. Otherwise, write ``null`` in the ``qcVcf`` key.
 

diff --git a/drop/config/DropConfig.py b/drop/config/DropConfig.py
@@ -155,3 +155,39 @@ def getFastaFile(self, str_=True):
 
     def getFastaDict(self, str_=True):
         return utils.returnPath(self.fastaDict, str_)
+
+    def getBSGenomeName(self):
+        assemblyID = self.get("genomeAssembly")
+
+        if assemblyID == 'hg19':
+            return "BSgenome.Hsapiens.UCSC.hg19"
+        if assemblyID == 'hs37d5':
+            return "BSgenome.Hsapiens.1000genomes.hs37d5"
+        if assemblyID == 'hg38':
+            return "BSgenome.Hsapiens.UCSC.hg38"
+        if assemblyID == 'GRCh38':
+            return "BSgenome.Hsapiens.NCBI.GRCh38"
+
+        raise ValueError("Provided genome assembly not known: " + assemblyID)
+
+    def getBSGenomeVersion(self):
+        assemblyID = self.get("genomeAssembly")
+
+        if assemblyID in ['hg19', 'hs37d5']:
+            return 37
+        if assemblyID in ['hg38', 'GRCh38']:
+            return 38
+
+        raise ValueError("Provided genome assembly not known: " + assemblyID)
+
+    def getMafDbName(self):
+        assemblyID = self.get("genomeAssembly")
+
+        if assemblyID in ['hg19', 'hs37d5']:
+            return "MafDb.gnomAD.r2.1.hs37d5"
+        if assemblyID in ['hg38', 'GRCh38']:
+            return "MafDb.gnomAD.r2.1.GRCh38"
+
+        raise ValueError("Provided genome assembly not known: " + assemblyID)
+
+
diff --git a/drop/installRPackages.R b/drop/installRPackages.R
@@ -1,37 +1,42 @@
 options(repos=structure(c(CRAN="https://cloud.r-project.org")), warn = -1)
-suppressPackageStartupMessages(library(data.table))
 
 if (!requireNamespace('BiocManager', quietly = TRUE)) {
     install.packages('BiocManager')
     BiocManager::install("remotes")
 }
+if (!requireNamespace('data.table', quietly = TRUE)) {
+    install.packages('data.table')
+}
 
+suppressPackageStartupMessages(library(data.table))
+
+# do not turn wanrings into errors. E.g. "Package XXX build for R 4.0.X"
+Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
 
 args <- commandArgs(trailingOnly=TRUE)
-packages <- fread(args[1], fill = TRUE)
-packages <- packages[!startsWith(package, "#")]
+if (file.exists(args[1])){
+    packages <- fread(args[1], fill = TRUE)
+} else {
+    packages <- data.table(
+        package=gsub("=.*", "", unlist(args)),
+        version=gsub(".*=", "", unlist(args)))
+    packages[package == version, version:=NA]
+}
 installed <- as.data.table(installed.packages())
 
 for (pckg_name in packages$package) {
     package_dt <- packages[package == pckg_name]
-    pckg_name <- tail(unlist(strsplit(pckg_name, split = "/")), n = 1)
+    pckg_name <- gsub(".*/", "", pckg_name)
     version <- package_dt$version
 
-    if (pckg_name %in% installed$Package &
-      (version == "" || installed[Package == pckg_name, Version] == version)
-    ) {
-        #message(paste(pckg_name, "already installed"))
-    } else {
-        if (package_dt$bioconductor == TRUE) {
-            INSTALL <- BiocManager::install
-        } else {
-            INSTALL <- install.packages
-        }
+    if (!pckg_name %in% installed$Package || (!is.na(version) && compareVersion(
+            installed[Package == pckg_name, Version], version) < 0)) {
+
         package <- package_dt$package
         message(paste("install", package))
-        INSTALL(package)
+        BiocManager::install(package, ask=FALSE, update=FALSE)
         message(paste("installed", package))
     }
 }
 
-options(warn = 0)
+options(warn = 0)
diff --git a/drop/modules/aberrant-expression-pipeline/Snakefile b/drop/modules/aberrant-expression-pipeline/Snakefile
@@ -13,7 +13,7 @@ rule aberrantExpression:
 
 rule aberrantExpression_dependency:
     output: AE_graph_file
-    shell: "snakemake --rulegraph {AE_index_output} | dot -Tsvg -Grankdir=TB > {output}"
+    shell: "snakemake --nolock --rulegraph {AE_index_output} | dot -Tsvg -Grankdir=TB > {output}"
 
 rule aberrantExpression_bamStats:
     input: 

diff --git a/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R b/drop/modules/aberrant-splicing-pipeline/Counting/01_1_countRNA_splitReads_samplewise.R
@@ -25,7 +25,7 @@ library(BSgenome)
 dataset    <- snakemake@wildcards$dataset
 workingDir <- snakemake@params$workingDir
 params <- snakemake@config$aberrantSplicing
-
+genomeAssembly <- snakemake@config$genomeAssembly
 
 # Read FRASER object
 fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
@@ -35,16 +35,8 @@ sample_id <- snakemake@wildcards[["sample_id"]]
 
 # If data is not strand specific, add genome info
 genome <- NULL
-
 if(strandSpecific(fds) == 0){
-  if(snakemake@config$genomeAssembly == 'hg19'){
-    genome <- "BSgenome.Hsapiens.UCSC.hg19"
-  } else if(snakemake@config$genomeAssembly == 'hg38'){
-    genome <- "BSgenome.Hsapiens.UCSC.hg38" 
-  }
-  if(!is.null(genome)){
-    genome <- getBSgenome(genome)
-  }
+  genome <- getBSgenome(genomeAssembly)
 }
 
 # Count splitReads for a given sample id

diff --git a/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R b/drop/modules/aberrant-splicing-pipeline/FRASER/07_extract_results_FraseR.R
@@ -10,6 +10,7 @@
 #'   - zScoreCutoff: '`sm cfg.AS.get("zScoreCutoff")`'
 #'   - deltaPsiCutoff: '`sm cfg.AS.get("deltaPsiCutoff")`'
 #'   - hpoFile: '`sm cfg.get("hpoFile")`'
+#'   - assemblyVersion: '`sm cfg.getBSGenomeVersion()`'
 #'  threads: 10
 #'  input:
 #'   - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
@@ -34,17 +35,15 @@ opts_chunk$set(fig.width=12, fig.height=8)
 dataset    <- snakemake@wildcards$dataset
 fdsFile    <- snakemake@input$fdsin
 workingDir <- snakemake@params$workingDir
+assemblyVersion <- snakemake@params$assemblyVersion
 
 register(MulticoreParam(snakemake@threads))
 # Limit number of threads for DelayedArray operations
 setAutoBPPARAM(MulticoreParam(snakemake@threads))
 
 # Load data and annotate ranges with gene names
 fds <- loadFraserDataSet(dir=workingDir, name=dataset)
-GRCh <- ifelse(snakemake@config$genomeAssembly == 'hg19', 37, 
-               ifelse(snakemake@config$genomeAssembly == 'hg38', 38,
-                      error('Genome assembly must be either hg19 or hg38')))
-fds <- annotateRanges(fds, GRCh = GRCh)
+fds <- annotateRanges(fds, GRCh = assemblyVersion)
 colData(fds)$sampleID <- as.character(colData(fds)$sampleID)
 
 # Extract results per junction

diff --git a/drop/modules/aberrant-splicing-pipeline/Snakefile b/drop/modules/aberrant-splicing-pipeline/Snakefile
@@ -13,4 +13,4 @@ rule aberrantSplicing:
 
 rule aberrantSplicing_dependency:
     output: AS_graph_file
-    shell: "snakemake --rulegraph {AS_index_output} | dot -Tsvg -Grankdir=TB > {output}"
+    shell: "snakemake --nolock --rulegraph {AS_index_output} | dot -Tsvg -Grankdir=TB > {output}"
diff --git a/drop/modules/mae-pipeline/MAE/deseq_mae.R b/drop/modules/mae-pipeline/MAE/deseq_mae.R
@@ -35,19 +35,12 @@ rmae <- DESeq4MAE(mae_counts) ## build test for counting REF and ALT in MAE
 
 ### Add AF information from gnomAD
 if (snakemake@config$mae$addAF == TRUE) {
-  print("Adding gnomAD allele frequencies...")
+    print("Adding gnomAD allele frequencies...")
 
-  # obtain the assembly from the config
-  gene_assembly <- snakemake@config$genomeAssembly
-
-  if(gene_assembly == 'hg19'){
-    suppressPackageStartupMessages(library(MafDb.gnomAD.r2.1.hs37d5))
-  } else if(gene_assembly == 'hg38'){
-    suppressPackageStartupMessages(library(MafDb.gnomAD.r2.1.GRCh38))
-  }
-
-  rmae <- add_gnomAD_AF(rmae, gene_assembly = gene_assembly,
-                          max_af_cutoff = snakemake@config$mae$maxAF)
+    # obtain the assembly from the config
+    genome_assembly <- snakemake@config$genomeAssembly
+    rmae <- add_gnomAD_AF(rmae, genome_assembly = genome_assembly,
+        max_af_cutoff = snakemake@config$mae$maxAF)
 } else {
     rmae[, rare := NA]
 }

diff --git a/drop/modules/mae-pipeline/Snakefile b/drop/modules/mae-pipeline/Snakefile
@@ -13,7 +13,7 @@ rule mae:
 
 rule mae_dependency:
     output: MAE_graph_file
-    shell: "snakemake --rulegraph {MAE_index_output} | dot -Tsvg -Grankdir=TB > {output}"
+    shell: "snakemake --nolock --rulegraph {MAE_index_output} | dot -Tsvg -Grankdir=TB > {output}"
 
 rule sampleQC:
     input: cfg.getHtmlFromScript(MAE_WORKDIR / "QC" / "Datasets.R")

diff --git a/drop/requirementsR.txt b/drop/requirementsR.txt
@@ -1,18 +1,15 @@
-package bioconductor version
-OUTRIDER TRUE 
-c-mertes/FRASER TRUE 1.2.0
-mumichae/tMAE TRUE 
-VariantAnnotation TRUE 
-rmarkdown FALSE 
-knitr FALSE 
-ggplot2 FALSE 
-ggthemes FALSE 
-cowplot FALSE 
-data.table FALSE  
-dplyr FALSE 
-tidyr FALSE 
-magrittr FALSE 
-devtools FALSE 
-BSgenome.Hsapiens.UCSC.hg19 TRUE 
-#MafDb.gnomAD.r2.1.hs37d5 TRUE 
-#MafDb.gnomAD.r2.1.GRCh38 TRUE 
+package	version
+gagneurlab/OUTRIDER	1.6.1
+c-mertes/FRASER	1.2.0
+mumichae/tMAE	1.0.0
+VariantAnnotation	
+rmarkdown	
+knitr	
+ggplot2	
+ggthemes	
+cowplot	
+data.table	
+dplyr	
+tidyr	
+magrittr	
+devtools	
diff --git a/drop/setupDrop.py b/drop/setupDrop.py
@@ -1,4 +1,5 @@
 import drop
+from drop.config.DropConfig import DropConfig
 import subprocess
 from pathlib import Path
 from snakemake.logging import logger
@@ -24,14 +25,26 @@ def setupPaths(projectRoot):
     return repoPaths, projectPaths
 
 
-def installRPackages():
+def installRPackages(config: DropConfig = None):
     logger.info("check for missing R packages")
     script = Path(drop.__file__).parent / "installRPackages.R"
     requirements = Path(drop.__file__).parent / 'requirementsR.txt'
 
+    # install main packages
     response = subprocess.run(["Rscript", script, requirements], stderr=subprocess.STDOUT)
     response.check_returncode()
 
+    # install pipeline depending packages
+    if config is not None:
+        pkg_assembly_name = config.getBSGenomeName()
+        response = subprocess.run(["Rscript", script, pkg_assembly_name], stderr=subprocess.STDOUT)
+        response.check_returncode()
+
+        pkg_mafdb_name = config.getMafDbName()
+        if pkg_mafdb_name is not None and config.get("mae").get('addAF') is True:
+            response = subprocess.run(["Rscript", script, pkg_mafdb_name], stderr=subprocess.STDOUT)
+            response.check_returncode()
+
 
 def checkDropVersion(projectRoot, force=False):
     if projectRoot != Path.cwd().resolve():

diff --git a/drop/template/Snakefile b/drop/template/Snakefile
@@ -6,12 +6,12 @@ from wbuild.createIndex import createIndexRule, ci
 configfile: "config.yaml"
 
 projectDir = Path.cwd().resolve()
-drop.installRPackages()
 drop.checkDropVersion(projectDir)
 _, projectPaths = drop.setupPaths(projectDir)
 tmp_dir = projectPaths["tmpDir"]
 
 cfg = drop.config.DropConfig(wbuild.utils.Config())
+drop.installRPackages(cfg)
 sa = cfg.sampleAnnotation
 config = cfg.config_dict # legacy
 

diff --git a/drop/template/config.yaml b/drop/template/config.yaml
@@ -11,7 +11,7 @@ geneAnnotation:
     # <annotation_name> : <path to gencode v29 annotation>
     v29:  /path/to/gencode29.gtf.gz # example
 
-genomeAssembly: hg19  # either hg19 or hg38
+genomeAssembly: hg19  # either hg19/hs37d5 or hg38/GRCh38
 exportCounts:
     # specify which gene annotations to include and which 
     # groups to exclude when exporting counts

diff --git a/tests/pipeline/test_AE.py b/tests/pipeline/test_AE.py
@@ -7,6 +7,7 @@ class Test_AE_Pipeline:
     def pipeline_run(self, demo_dir):
         LOGGER.info("run aberrant expression pipeline...")
         pipeline_run = run(["snakemake", "aberrantExpression", f"-j{CORES}"], demo_dir)
+        tmp = run(["snakemake", "--unlock"], demo_dir)
         assert "Finished job 0." in pipeline_run.stderr
         return pipeline_run
 
@@ -74,6 +75,8 @@ def no_import(self, demo_dir):
     def test_no_import(self, no_import):
         merged_counts = f"{no_import}/Output/processed_data/aberrant_expression/v29/outrider/outrider/total_counts.Rds"
         r = run(f"snakemake {merged_counts} --configfile config_noimp.yaml -nqF", no_import)
+        tmp = run(["snakemake", "--unlock"], no_import)
+
         print(r.stdout)
         assert "10\tAberrantExpression_pipeline_Counting_countReads_R" in r.stdout
         assert "1\tAberrantExpression_pipeline_Counting_mergeCounts_R" in r.stdout

diff --git a/tests/pipeline/test_AS.py b/tests/pipeline/test_AS.py
@@ -7,6 +7,7 @@ class Test_AS_Pipeline:
     def pipeline_run(self, demo_dir):
         LOGGER.info("run aberrant splicing pipeline")
         pipeline_run = run(["snakemake", "aberrantSplicing", f"-j{CORES}"], demo_dir)
+        tmp = run(["snakemake", "--unlock"], demo_dir)
         assert "Finished job 0." in pipeline_run.stderr
         return pipeline_run
 

diff --git a/tests/pipeline/test_MAE.py b/tests/pipeline/test_MAE.py
@@ -7,6 +7,7 @@ class Test_MAE_Pipeline:
     def pipeline_run(self, demo_dir):
         LOGGER.info("run MAE pipeline")
         pipeline_run = run(["snakemake", "mae", f"-j{CORES}"], demo_dir)
+        tmp = run(["snakemake", "--unlock"], demo_dir)
         assert "Finished job 0." in pipeline_run.stderr
         return pipeline_run