Skip to content

Commit

Permalink
Install needed BSgenome on the fly (#147)
Browse files Browse the repository at this point in the history
* remove assembly specific packages closes #142
* fix pipeline when no import count matrix in sample annotation #114 
* refactored config parsers by submodule
* update to FRASER 1.2.0 @c-mertes 
* update aberrant splicing export @c-mertes 
* local loading of HPO file if present @vyepez88 
* fixed buggy file extension when saving ods @jemten 
* Add files via upload (#146)
* update OUTRIDER version and bugfixes
* allow NCBI based assembly in config fixes #135
* move assembly version test to DropConf and adjust the pipeline.
* add unlock if snakemake fails
* use addAF to check installation
* Add badge to readme
Co-authored-by: mumichae <51025211+mumichae@users.noreply.github.com>
Co-authored-by: Vicente Yepez <30469316+vyepez88@users.noreply.github.com>
  • Loading branch information
c-mertes committed Jan 28, 2021
1 parent 3e26ae1 commit 0b4bb21
Show file tree
Hide file tree
Showing 19 changed files with 112 additions and 71 deletions.
3 changes: 2 additions & 1 deletion ._travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ install:
# install dependencies
- source $HOME/miniconda/etc/profile.d/conda.sh
- conda create -q -n drop -c conda-forge -c bioconda "python>=${TRAVIS_PYTHON_VERSION}" "r-base>=4.0.3" "drop>=1.0.1" "wbuild>=1.8" "bioconductor-fraser>=1.2.0"
- conda remove -n drop --force drop
# remove FRASER to check installation routine
- conda remove -n drop --force drop bioconductor-fraser
- conda activate drop
- pip install -r tests/requirements.txt

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Python Package using Conda
name: Build

on: [push]

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Detection of RNA Outlier Pipeline
[![Pipeline status](https://travis-ci.org/gagneurlab/drop.svg?branch=master)](https://travis-ci.org/gagneurlab/drop)
[![DROP pipeline status](https://github.com/gagneurlab/drop/workflows/Build/badge.svg?branch=bsgenome)](https://github.com/gagneurlab/drop/actions?query=workflow%3ABuild)
[![Version](https://img.shields.io/github/v/release/gagneurlab/drop?include_prereleases)](https://github.com/gagneurlab/drop/releases)
[![Version](https://readthedocs.org/projects/gagneurlab-drop/badge/?version=latest)](https://gagneurlab-drop.readthedocs.io/en/latest)

Expand Down
4 changes: 2 additions & 2 deletions docs/source/prepare.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ Parameter Type Description
projectTitle character Title of the project to be displayed on the rendered HTML output ``Project 1``
htmlOutputPath character Full path of the folder where the HTML files are rendered ``/data/project1/htmlOutput``
indexWithFolderName boolean If true, the basename of the project directory will be used as prefix for the index.html file ``true``
genomeAssembly character Either hg19 or hg38, depending on the genome assembly used for mapping ``/data/project1``
genomeAssembly character Either hg19/hs37d5 or hg38/GRCh38, depending on the genome assembly used for mapping ``/data/project1``
sampleAnnotation character Full path of the sample annotation table ``/data/project1/sample_annotation.tsv``
root character Full path of the folder where the subdirectories processed_data and processed_results will be created containing DROP's output files. ``/data/project1``
geneAnnotation dictionary A key-value list of the annotation name (key) and the full path to the GTF file (value). More than one annotation file can be provided. ``anno1: /path/to/gtf1.gtf``
Expand Down Expand Up @@ -197,7 +197,7 @@ Two different files can be downloaded from our `public repository <https://www.c

1. VCF file containing different positions to be used to match DNA with RNA files.
The file name is ``qc_vcf_1000G_{genome_build}.vcf.gz``. One file is available for each
genome build (hg19 and hg38). Download it together with the corresponding .tbi file.
genome build (hg19/hs37d5 and hg38/GRCh38). Download it together with the corresponding .tbi file.
Indicate the full path to the vcf file in the ``qcVcf`` key in the mono-allelic expression dictionary.
This file is only needed for the MAE module. Otherwise, write ``null`` in the ``qcVcf`` key.

Expand Down
36 changes: 36 additions & 0 deletions drop/config/DropConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,3 +155,39 @@ def getFastaFile(self, str_=True):

def getFastaDict(self, str_=True):
return utils.returnPath(self.fastaDict, str_)

def getBSGenomeName(self):
assemblyID = self.get("genomeAssembly")

if assemblyID == 'hg19':
return "BSgenome.Hsapiens.UCSC.hg19"
if assemblyID == 'hs37d5':
return "BSgenome.Hsapiens.1000genomes.hs37d5"
if assemblyID == 'hg38':
return "BSgenome.Hsapiens.UCSC.hg38"
if assemblyID == 'GRCh38':
return "BSgenome.Hsapiens.NCBI.GRCh38"

raise ValueError("Provided genome assembly not known: " + assemblyID)

def getBSGenomeVersion(self):
assemblyID = self.get("genomeAssembly")

if assemblyID in ['hg19', 'hs37d5']:
return 37
if assemblyID in ['hg38', 'GRCh38']:
return 38

raise ValueError("Provided genome assembly not known: " + assemblyID)

def getMafDbName(self):
assemblyID = self.get("genomeAssembly")

if assemblyID in ['hg19', 'hs37d5']:
return "MafDb.gnomAD.r2.1.hs37d5"
if assemblyID in ['hg38', 'GRCh38']:
return "MafDb.gnomAD.r2.1.GRCh38"

raise ValueError("Provided genome assembly not known: " + assemblyID)


37 changes: 21 additions & 16 deletions drop/installRPackages.R
Original file line number Diff line number Diff line change
@@ -1,37 +1,42 @@
options(repos=structure(c(CRAN="https://cloud.r-project.org")), warn = -1)
suppressPackageStartupMessages(library(data.table))

if (!requireNamespace('BiocManager', quietly = TRUE)) {
install.packages('BiocManager')
BiocManager::install("remotes")
}
if (!requireNamespace('data.table', quietly = TRUE)) {
install.packages('data.table')
}

suppressPackageStartupMessages(library(data.table))

# do not turn wanrings into errors. E.g. "Package XXX build for R 4.0.X"
Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")

args <- commandArgs(trailingOnly=TRUE)
packages <- fread(args[1], fill = TRUE)
packages <- packages[!startsWith(package, "#")]
if (file.exists(args[1])){
packages <- fread(args[1], fill = TRUE)
} else {
packages <- data.table(
package=gsub("=.*", "", unlist(args)),
version=gsub(".*=", "", unlist(args)))
packages[package == version, version:=NA]
}
installed <- as.data.table(installed.packages())

for (pckg_name in packages$package) {
package_dt <- packages[package == pckg_name]
pckg_name <- tail(unlist(strsplit(pckg_name, split = "/")), n = 1)
pckg_name <- gsub(".*/", "", pckg_name)
version <- package_dt$version

if (pckg_name %in% installed$Package &
(version == "" || installed[Package == pckg_name, Version] == version)
) {
#message(paste(pckg_name, "already installed"))
} else {
if (package_dt$bioconductor == TRUE) {
INSTALL <- BiocManager::install
} else {
INSTALL <- install.packages
}
if (!pckg_name %in% installed$Package || (!is.na(version) && compareVersion(
installed[Package == pckg_name, Version], version) < 0)) {

package <- package_dt$package
message(paste("install", package))
INSTALL(package)
BiocManager::install(package, ask=FALSE, update=FALSE)
message(paste("installed", package))
}
}

options(warn = 0)
options(warn = 0)
2 changes: 1 addition & 1 deletion drop/modules/aberrant-expression-pipeline/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rule aberrantExpression:

rule aberrantExpression_dependency:
output: AE_graph_file
shell: "snakemake --rulegraph {AE_index_output} | dot -Tsvg -Grankdir=TB > {output}"
shell: "snakemake --nolock --rulegraph {AE_index_output} | dot -Tsvg -Grankdir=TB > {output}"

rule aberrantExpression_bamStats:
input:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ library(BSgenome)
dataset <- snakemake@wildcards$dataset
workingDir <- snakemake@params$workingDir
params <- snakemake@config$aberrantSplicing

genomeAssembly <- snakemake@config$genomeAssembly

# Read FRASER object
fds <- loadFraserDataSet(dir=workingDir, name=paste0("raw-", dataset))
Expand All @@ -35,16 +35,8 @@ sample_id <- snakemake@wildcards[["sample_id"]]

# If data is not strand specific, add genome info
genome <- NULL

if(strandSpecific(fds) == 0){
if(snakemake@config$genomeAssembly == 'hg19'){
genome <- "BSgenome.Hsapiens.UCSC.hg19"
} else if(snakemake@config$genomeAssembly == 'hg38'){
genome <- "BSgenome.Hsapiens.UCSC.hg38"
}
if(!is.null(genome)){
genome <- getBSgenome(genome)
}
genome <- getBSgenome(genomeAssembly)
}

# Count splitReads for a given sample id
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#' - zScoreCutoff: '`sm cfg.AS.get("zScoreCutoff")`'
#' - deltaPsiCutoff: '`sm cfg.AS.get("deltaPsiCutoff")`'
#' - hpoFile: '`sm cfg.get("hpoFile")`'
#' - assemblyVersion: '`sm cfg.getBSGenomeVersion()`'
#' threads: 10
#' input:
#' - setup: '`sm cfg.AS.getWorkdir() + "/config.R"`'
Expand All @@ -34,17 +35,15 @@ opts_chunk$set(fig.width=12, fig.height=8)
dataset <- snakemake@wildcards$dataset
fdsFile <- snakemake@input$fdsin
workingDir <- snakemake@params$workingDir
assemblyVersion <- snakemake@params$assemblyVersion

register(MulticoreParam(snakemake@threads))
# Limit number of threads for DelayedArray operations
setAutoBPPARAM(MulticoreParam(snakemake@threads))

# Load data and annotate ranges with gene names
fds <- loadFraserDataSet(dir=workingDir, name=dataset)
GRCh <- ifelse(snakemake@config$genomeAssembly == 'hg19', 37,
ifelse(snakemake@config$genomeAssembly == 'hg38', 38,
error('Genome assembly must be either hg19 or hg38')))
fds <- annotateRanges(fds, GRCh = GRCh)
fds <- annotateRanges(fds, GRCh = assemblyVersion)
colData(fds)$sampleID <- as.character(colData(fds)$sampleID)

# Extract results per junction
Expand Down
2 changes: 1 addition & 1 deletion drop/modules/aberrant-splicing-pipeline/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ rule aberrantSplicing:

rule aberrantSplicing_dependency:
output: AS_graph_file
shell: "snakemake --rulegraph {AS_index_output} | dot -Tsvg -Grankdir=TB > {output}"
shell: "snakemake --nolock --rulegraph {AS_index_output} | dot -Tsvg -Grankdir=TB > {output}"
17 changes: 5 additions & 12 deletions drop/modules/mae-pipeline/MAE/deseq_mae.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,12 @@ rmae <- DESeq4MAE(mae_counts) ## build test for counting REF and ALT in MAE

### Add AF information from gnomAD
if (snakemake@config$mae$addAF == TRUE) {
print("Adding gnomAD allele frequencies...")
print("Adding gnomAD allele frequencies...")

# obtain the assembly from the config
gene_assembly <- snakemake@config$genomeAssembly

if(gene_assembly == 'hg19'){
suppressPackageStartupMessages(library(MafDb.gnomAD.r2.1.hs37d5))
} else if(gene_assembly == 'hg38'){
suppressPackageStartupMessages(library(MafDb.gnomAD.r2.1.GRCh38))
}

rmae <- add_gnomAD_AF(rmae, gene_assembly = gene_assembly,
max_af_cutoff = snakemake@config$mae$maxAF)
# obtain the assembly from the config
genome_assembly <- snakemake@config$genomeAssembly
rmae <- add_gnomAD_AF(rmae, genome_assembly = genome_assembly,
max_af_cutoff = snakemake@config$mae$maxAF)
} else {
rmae[, rare := NA]
}
Expand Down
2 changes: 1 addition & 1 deletion drop/modules/mae-pipeline/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rule mae:

rule mae_dependency:
output: MAE_graph_file
shell: "snakemake --rulegraph {MAE_index_output} | dot -Tsvg -Grankdir=TB > {output}"
shell: "snakemake --nolock --rulegraph {MAE_index_output} | dot -Tsvg -Grankdir=TB > {output}"

rule sampleQC:
input: cfg.getHtmlFromScript(MAE_WORKDIR / "QC" / "Datasets.R")
Expand Down
33 changes: 15 additions & 18 deletions drop/requirementsR.txt
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
package bioconductor version
OUTRIDER TRUE
c-mertes/FRASER TRUE 1.2.0
mumichae/tMAE TRUE
VariantAnnotation TRUE
rmarkdown FALSE
knitr FALSE
ggplot2 FALSE
ggthemes FALSE
cowplot FALSE
data.table FALSE
dplyr FALSE
tidyr FALSE
magrittr FALSE
devtools FALSE
BSgenome.Hsapiens.UCSC.hg19 TRUE
#MafDb.gnomAD.r2.1.hs37d5 TRUE
#MafDb.gnomAD.r2.1.GRCh38 TRUE
package version
gagneurlab/OUTRIDER 1.6.1
c-mertes/FRASER 1.2.0
mumichae/tMAE 1.0.0
VariantAnnotation
rmarkdown
knitr
ggplot2
ggthemes
cowplot
data.table
dplyr
tidyr
magrittr
devtools
15 changes: 14 additions & 1 deletion drop/setupDrop.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import drop
from drop.config.DropConfig import DropConfig
import subprocess
from pathlib import Path
from snakemake.logging import logger
Expand All @@ -24,14 +25,26 @@ def setupPaths(projectRoot):
return repoPaths, projectPaths


def installRPackages():
def installRPackages(config: DropConfig = None):
logger.info("check for missing R packages")
script = Path(drop.__file__).parent / "installRPackages.R"
requirements = Path(drop.__file__).parent / 'requirementsR.txt'

# install main packages
response = subprocess.run(["Rscript", script, requirements], stderr=subprocess.STDOUT)
response.check_returncode()

# install pipeline depending packages
if config is not None:
pkg_assembly_name = config.getBSGenomeName()
response = subprocess.run(["Rscript", script, pkg_assembly_name], stderr=subprocess.STDOUT)
response.check_returncode()

pkg_mafdb_name = config.getMafDbName()
if pkg_mafdb_name is not None and config.get("mae").get('addAF') is True:
response = subprocess.run(["Rscript", script, pkg_mafdb_name], stderr=subprocess.STDOUT)
response.check_returncode()


def checkDropVersion(projectRoot, force=False):
if projectRoot != Path.cwd().resolve():
Expand Down
2 changes: 1 addition & 1 deletion drop/template/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ from wbuild.createIndex import createIndexRule, ci
configfile: "config.yaml"

projectDir = Path.cwd().resolve()
drop.installRPackages()
drop.checkDropVersion(projectDir)
_, projectPaths = drop.setupPaths(projectDir)
tmp_dir = projectPaths["tmpDir"]

cfg = drop.config.DropConfig(wbuild.utils.Config())
drop.installRPackages(cfg)
sa = cfg.sampleAnnotation
config = cfg.config_dict # legacy

Expand Down
2 changes: 1 addition & 1 deletion drop/template/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ geneAnnotation:
# <annotation_name> : <path to gencode v29 annotation>
v29: /path/to/gencode29.gtf.gz # example

genomeAssembly: hg19 # either hg19 or hg38
genomeAssembly: hg19 # either hg19/hs37d5 or hg38/GRCh38
exportCounts:
# specify which gene annotations to include and which
# groups to exclude when exporting counts
Expand Down
3 changes: 3 additions & 0 deletions tests/pipeline/test_AE.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class Test_AE_Pipeline:
def pipeline_run(self, demo_dir):
LOGGER.info("run aberrant expression pipeline...")
pipeline_run = run(["snakemake", "aberrantExpression", f"-j{CORES}"], demo_dir)
tmp = run(["snakemake", "--unlock"], demo_dir)
assert "Finished job 0." in pipeline_run.stderr
return pipeline_run

Expand Down Expand Up @@ -74,6 +75,8 @@ def no_import(self, demo_dir):
def test_no_import(self, no_import):
merged_counts = f"{no_import}/Output/processed_data/aberrant_expression/v29/outrider/outrider/total_counts.Rds"
r = run(f"snakemake {merged_counts} --configfile config_noimp.yaml -nqF", no_import)
tmp = run(["snakemake", "--unlock"], no_import)

print(r.stdout)
assert "10\tAberrantExpression_pipeline_Counting_countReads_R" in r.stdout
assert "1\tAberrantExpression_pipeline_Counting_mergeCounts_R" in r.stdout
Expand Down
1 change: 1 addition & 0 deletions tests/pipeline/test_AS.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class Test_AS_Pipeline:
def pipeline_run(self, demo_dir):
LOGGER.info("run aberrant splicing pipeline")
pipeline_run = run(["snakemake", "aberrantSplicing", f"-j{CORES}"], demo_dir)
tmp = run(["snakemake", "--unlock"], demo_dir)
assert "Finished job 0." in pipeline_run.stderr
return pipeline_run

Expand Down
1 change: 1 addition & 0 deletions tests/pipeline/test_MAE.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ class Test_MAE_Pipeline:
def pipeline_run(self, demo_dir):
LOGGER.info("run MAE pipeline")
pipeline_run = run(["snakemake", "mae", f"-j{CORES}"], demo_dir)
tmp = run(["snakemake", "--unlock"], demo_dir)
assert "Finished job 0." in pipeline_run.stderr
return pipeline_run

Expand Down

0 comments on commit 0b4bb21

Please sign in to comment.