Bump to 1.0.3 (#155)

* Fix Issue #114 import counts (#116, #122)
* Refactoring (#118)
* adapt DROP to theta usage in FRASER (#129)
* Offline runs (#137, #139, #160)
* remove file extension in outrider module (#140)
* Update splicing export (#141)
* Version check (#143)
* better check for chromosome style (#148)
* GitHub actions (#151)
* Export counts meta (#144)
* Install needed BSgenome on the fly (#147)
* rename gene_biotype->gene_type in GTF (#162)

Co-authored-by: Michaela Müller <mi.mueller@tum.de>
Co-authored-by: Vicente <yepez@in.tum.de>
Co-authored-by: Christian Mertes <mertes@in.tum.de>
Co-authored-by: Anders Jemt <jemten@users.noreply.github.com>
Co-authored-by: Alex Paul <github@ajpdev.com>
gagneurlab · Feb 5, 2021 · f089b63 · f089b63
1 parent d4afeac
commit f089b63
Show file tree

Hide file tree

Showing 44 changed files with 515 additions and 344 deletions.
diff --git a/.travis.yml → ._travis.yml b/.travis.yml → ._travis.yml
@@ -16,7 +16,8 @@ install:
   # install dependencies
   - source $HOME/miniconda/etc/profile.d/conda.sh
   - conda create -q -n drop -c conda-forge -c bioconda "python>=${TRAVIS_PYTHON_VERSION}" "r-base>=4.0.3" "drop>=1.0.1" "wbuild>=1.8" "bioconductor-fraser>=1.2.0"
-  - conda remove -n drop --force drop
+  # remove FRASER to check installation routine
+  - conda remove -n drop --force drop bioconductor-fraser
   - conda activate drop
   - pip install -r tests/requirements.txt
 

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
@@ -0,0 +1,37 @@
+name: Build
+
+on: [push]
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $CONDA/bin >> $GITHUB_PATH
+    - name: Install dependencies
+      run: |
+        #conda env update --file environment.yml --name base
+        conda install -c conda-forge -c bioconda drop
+        pip install .
+    - name: Lint with flake8
+      run: |
+        conda install flake8
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --builtins="snakemake" --select=E9,F63,F7,F82 --show-source --statistics 
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        conda install pytest
+        pip install -r tests/requirements.txt
+        pytest
diff --git a/README.md b/README.md
@@ -1,10 +1,9 @@
 # Detection of RNA Outlier Pipeline
-[![Pipeline status](https://travis-ci.org/gagneurlab/drop.svg?branch=master)](https://travis-ci.org/gagneurlab/drop)
+[![DROP pipeline status](https://github.com/gagneurlab/drop/workflows/Build/badge.svg?branch=master)](https://github.com/gagneurlab/drop/actions?query=workflow%3ABuild)
 [![Version](https://img.shields.io/github/v/release/gagneurlab/drop?include_prereleases)](https://github.com/gagneurlab/drop/releases)
 [![Version](https://readthedocs.org/projects/gagneurlab-drop/badge/?version=latest)](https://gagneurlab-drop.readthedocs.io/en/latest)
 
-The manuscript main file, supplementary figures and table can be found in the manuscript folder or in 
-[protocol exchange](https://protocolexchange.researchsquare.com/article/993ff4a5-38ce-4261-902a-600dbd528ba2/v1).
+The manuscript is now available in [Nature Protocols](https://www.nature.com/articles/s41596-020-00462-5). [SharedIt link.](https://rdcu.be/cdMmF)
 
 <img src="drop_sticker.png" alt="drop logo" width="200" class="center"/>
 
@@ -45,19 +44,19 @@ Once these files are set up, you can execute a dry run from your project directo
 ```
 snakemake -n
 ```
-This shows you the rules of all subworkflows. Omit `-n` if you are sure that you want you execute all printed rules. You can also invoke single workflows explicitly e.g. for aberrant splicing with 
+This shows you the rules of all subworkflows. Omit `-n` and specify the number of cores with `--cores <n>` if you are sure that you want to execute all printed rules. You can also invoke single workflows explicitly, e.g. for aberrant expression with:
 ```
-snakemake aberrantExpression -n
+snakemake aberrantExpression --cores 10
 ```
 
 ## Datasets
 The following publicly-available datasets of gene counts can be used as controls.
 Please cite as instructed for each dataset.
 
-* 119 non-strand specific fibroblasts: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3887451.svg)](https://doi.org/10.5281/zenodo.3887451)
+* 119 non-strand specific fibroblasts: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3887450.svg)](https://doi.org/10.5281/zenodo.3887450)
 
-* 139 strand specific fibroblasts: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3963474.svg)](https://doi.org/10.5281/zenodo.3963474)
+* 139 strand specific fibroblasts: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3963473.svg)](https://doi.org/10.5281/zenodo.3963473)
 
-* 125 strand specific blood: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3963470.svg)](https://doi.org/10.5281/zenodo.3963470)
+* 125 strand specific blood: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3963469.svg)](https://doi.org/10.5281/zenodo.3963469)
 
 If you want to contribute with your own count matrices, please contact us: yepez at in.tum.de
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -23,7 +23,7 @@
 author = 'Michaela Müller'
 
 # The full version, including alpha/beta/rc tags
-release = '1.0.2'
+release_ = '1.0.3'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst
@@ -40,7 +40,7 @@ Parameter            Type        Description
 projectTitle         character   Title of the project to be displayed on the rendered HTML output                                                                         ``Project 1``
 htmlOutputPath       character   Full path of the folder where the HTML files are rendered                                                                                ``/data/project1/htmlOutput``
 indexWithFolderName  boolean     If true, the basename of the project directory will be used as prefix for the index.html file                                            ``true``
-genomeAssembly       character   Either hg19 or hg38, depending on the genome assembly used for mapping                                                                   ``/data/project1``
+genomeAssembly       character   Either hg19/hs37d5 or hg38/GRCh38, depending on the genome assembly used for mapping                                                     ``/data/project1``
 sampleAnnotation     character   Full path of the sample annotation table                                                                                                 ``/data/project1/sample_annotation.tsv``
 root                 character   Full path of the folder where the subdirectories processed_data and processed_results will be created containing DROP's output files.    ``/data/project1``
 geneAnnotation       dictionary  A key-value list of the annotation name (key) and the full path to the GTF file (value). More than one annotation file can be provided.  ``anno1: /path/to/gtf1.gtf``
@@ -197,7 +197,7 @@ Two different files can be downloaded from our `public repository <https://www.c
 
 1. VCF file containing different positions to be used to match DNA with RNA files.
 The file name is ``qc_vcf_1000G_{genome_build}.vcf.gz``. One file is available for each 
-genome build (hg19 and hg38). Download it together with the corresponding .tbi file. 
+genome build (hg19/hs37d5 and hg38/GRCh38). Download it together with the corresponding .tbi file. 
 Indicate the full path to the vcf file in the ``qcVcf`` key in the mono-allelic expression dictionary.
 This file is only needed for the MAE module. Otherwise, write ``null`` in the ``qcVcf`` key.
 

diff --git a/drop/__init__.py b/drop/__init__.py
@@ -4,4 +4,4 @@
 from . import utils
 from . import demo
 
-__version__ = "1.0.2"
+__version__ = "1.0.3"
diff --git a/drop/cli.py b/drop/cli.py
@@ -15,7 +15,7 @@
 
 @click.group()
 @click_log.simple_verbosity_option(logger)
-@click.version_option('1.0.2',prog_name='drop')
+@click.version_option('1.0.3',prog_name='drop')
 def main():
     pass
 

diff --git a/drop/config/DropConfig.py b/drop/config/DropConfig.py
@@ -94,7 +94,8 @@ def setDefaults(self, config_dict):
         setKey = utils.setKey
         setKey(config_dict, None, "fileRegex", r".*\.(R|md)")
         setKey(config_dict, None, "genomeAssembly", "hg19")
-        setKey(config_dict, None, "hpoFile", None)
+        hpo_url = 'https://www.cmm.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz'
+        setKey(config_dict, None, "hpoFile", hpo_url)
 
         # set submodule dictionaries
         setKey(config_dict, None, "aberrantExpression", dict())
@@ -154,3 +155,39 @@ def getFastaFile(self, str_=True):
 
     def getFastaDict(self, str_=True):
         return utils.returnPath(self.fastaDict, str_)
+
+    def getBSGenomeName(self):
+        assemblyID = self.get("genomeAssembly")
+
+        if assemblyID == 'hg19':
+            return "BSgenome.Hsapiens.UCSC.hg19"
+        if assemblyID == 'hs37d5':
+            return "BSgenome.Hsapiens.1000genomes.hs37d5"
+        if assemblyID == 'hg38':
+            return "BSgenome.Hsapiens.UCSC.hg38"
+        if assemblyID == 'GRCh38':
+            return "BSgenome.Hsapiens.NCBI.GRCh38"
+
+        raise ValueError("Provided genome assembly not known: " + assemblyID)
+
+    def getBSGenomeVersion(self):
+        assemblyID = self.get("genomeAssembly")
+
+        if assemblyID in ['hg19', 'hs37d5']:
+            return 37
+        if assemblyID in ['hg38', 'GRCh38']:
+            return 38
+
+        raise ValueError("Provided genome assembly not known: " + assemblyID)
+
+    def getMafDbName(self):
+        assemblyID = self.get("genomeAssembly")
+
+        if assemblyID in ['hg19', 'hs37d5']:
+            return "MafDb.gnomAD.r2.1.hs37d5"
+        if assemblyID in ['hg38', 'GRCh38']:
+            return "MafDb.gnomAD.r2.1.GRCh38"
+
+        raise ValueError("Provided genome assembly not known: " + assemblyID)
+
+
diff --git a/drop/config/ExportCounts.py b/drop/config/ExportCounts.py
@@ -19,14 +19,15 @@ def __init__(self, dict_, outputRoot, sampleAnnotation, geneAnnotations, genomeA
         """
         self.CONFIG_KEYS = ["geneAnnotations", "excludeGroups"]
         self.config_dict = self.setDefaults(dict_, geneAnnotations)
-        self.outputRoot = outputRoot
+        self.outputRoot = outputRoot / "exported_counts"
         self.sa = sampleAnnotation
         self.genomeAssembly = genomeAssembly
+        self.geneAnnotations = self.get("geneAnnotations")
         self.modules = {
             "aberrantExpression": aberrantExpression,
             "aberrantSplicing": aberrantSplicing
         }
-        self.pattern = self.outputRoot / "exported_counts" / "{dataset}--{genomeAssembly}--{annotation}"
+        self.pattern = self.outputRoot / "{dataset}--{genomeAssembly}--{annotation}"
 
     def setDefaults(self, config_dict, gene_annotations):
         utils.setKey(config_dict, None, "geneAnnotations", gene_annotations)
@@ -49,7 +50,6 @@ def get(self, key):
     def getFilePattern(self, str_=True, expandStr=False):
         pattern = self.pattern
         if expandStr:
-            str_=True
             pattern = pattern.__str__().replace("{", "{{").replace("}", "}}") 
         return utils.returnPath(pattern, str_=str_)
 
@@ -67,23 +67,35 @@ def getExportGroups(self, modules=None):
         for module in modules:
             groups.extend(self.modules[module].groups)
         export_groups = set(groups) - set(self.get("excludeGroups"))
-        return export_groups
+        return sorted(list(export_groups))
 
-    def getExportCountFiles(self, prefix, expandPattern=None, **kwargs):
+    def getFiles(self, filename, datasets=None, **kwargs):
         """
-        Determine export count files.
-        :param prefix: name of file
-        :return: list of files to
+        Determine files for export count groups.
+        :param filename: name of file
+        :return: list of export files
         """
-        if prefix not in self.COUNT_TYPE_MAP.keys():
-            raise ValueError(f"{prefix} not a valid file type for exported counts")
+        if datasets is None:
+            datasets = self.getExportGroups()
+        file_pattern = str(self.pattern / f"{filename}")
+        return expand(
+            file_pattern,
+            dataset=datasets,
+            annotation=self.geneAnnotations,
+            genomeAssembly=self.genomeAssembly,
+            **kwargs
+        )
 
-        datasets = self.getExportGroups([self.COUNT_TYPE_MAP[prefix]])
-        if expandPattern is None:
-            file_pattern = str(self.pattern / f"{prefix}.tsv.gz")
-        else:
-            file_pattern = str(self.pattern / f"{expandPattern}.tsv.gz")
-        count_files = expand(file_pattern, annotation=self.get("geneAnnotations"),
-                             dataset=datasets, genomeAssembly=self.genomeAssembly, **kwargs)
-        return count_files
+    def getExportCountFiles(self, count_type, suffix="tsv.gz", expandPattern=None, **kwargs):
+        """
+        Determine export count files.
+        :param count_type: count type for mapping the submodule
+        :param suffix: file type suffix (without dot)
+        :return: list of export count files
+        """
+        if count_type not in self.COUNT_TYPE_MAP.keys():
+            raise ValueError(f"'{count_type}' not a valid file type for exported counts")
+        datasets = self.getExportGroups([self.COUNT_TYPE_MAP[count_type]])
+        expandPattern = count_type if expandPattern is None else expandPattern
+        return self.getFiles(f"{expandPattern}.{suffix}", datasets, **kwargs)
 
diff --git a/drop/config/SampleAnnotation.py b/drop/config/SampleAnnotation.py
@@ -97,7 +97,10 @@ def createSampleFileMapping(self):
 
     def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
         """
-        Create a mapping of DROP groups to lists of sample IDs
+        :param group_key: name of group column in sample annotation
+        :param file_type: name of file column e.g. "RNA_BAM_FILE", "DNA_VCF_FILE"
+        :param sep: separator of multiple groups in group column
+        :return: mapping of drop group and ID
         """
         if not file_type:
             file_type = "RNA_BAM_FILE"

diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml
@@ -8,6 +8,7 @@ geneAnnotation:
     v29: Data/gencode_annotation_trunc.gtf
 genomeAssembly: hg19
 hpoFile: null
+random_seed: true  # set this to get fully reproducible results in debug mode
 
 exportCounts:
   geneAnnotations:
@@ -30,7 +31,7 @@ aberrantExpression:
         pasWindow: 1000
 
 aberrantSplicing:
-    groups: 
+    groups:
       - fraser
     recount: true
     longRead: false
@@ -63,4 +64,3 @@ tools:
     gatkCmd: gatk
     bcftoolsCmd: bcftools
     samtoolsCmd: samtools
-
diff --git a/drop/installRPackages.R b/drop/installRPackages.R
@@ -1,35 +1,40 @@
 options(repos=structure(c(CRAN="https://cloud.r-project.org")), warn = -1)
-suppressPackageStartupMessages(library(data.table))
 
 if (!requireNamespace('BiocManager', quietly = TRUE)) {
     install.packages('BiocManager')
     BiocManager::install("remotes")
 }
+if (!requireNamespace('data.table', quietly = TRUE)) {
+    install.packages('data.table')
+}
 
+suppressPackageStartupMessages(library(data.table))
+
+# do not turn warnings into errors, e.g. "Package XXX built for R 4.0.X"
+Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true")
 
 args <- commandArgs(trailingOnly=TRUE)
-packages <- fread(args[1], fill = TRUE)
-packages <- packages[!startsWith(package, "#")]
+if (file.exists(args[1])){
+    packages <- fread(args[1], fill = TRUE)
+} else {
+    packages <- data.table(
+        package=gsub("=.*", "", unlist(args)),
+        version=gsub(".*=", "", unlist(args)))
+    packages[package == version, version:=NA]
+}
 installed <- as.data.table(installed.packages())
 
 for (pckg_name in packages$package) {
     package_dt <- packages[package == pckg_name]
-    pckg_name <- tail(unlist(strsplit(pckg_name, split = "/")), n = 1)
+    pckg_name <- gsub(".*/", "", pckg_name)
     version <- package_dt$version
 
-    if (pckg_name %in% installed$Package &
-      (version == "" || installed[Package == pckg_name, Version] == version)
-    ) {
-        #message(paste(pckg_name, "already installed"))
-    } else {
-        if (package_dt$bioconductor == TRUE) {
-            INSTALL <- BiocManager::install
-        } else {
-            INSTALL <- install.packages
-        }
+    if (!pckg_name %in% installed$Package || (!is.na(version) && compareVersion(
+        installed[Package == pckg_name, Version], version) < 0)) {
+
         package <- package_dt$package
         message(paste("install", package))
-        INSTALL(package)
+        BiocManager::install(package, ask=FALSE, update=FALSE)
         message(paste("installed", package))
     }
 }

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/Datasets.R b/drop/modules/aberrant-expression-pipeline/Counting/Datasets.R
@@ -23,11 +23,11 @@ datasets <- snakemake@config$aberrantExpression$groups
 #+ echo=FALSE, results="asis"
 devNull <- sapply(datasets, function(name){
   sapply(gene_annotation_names, function(version){
-  cat(paste0(
-    "<h1>Dataset: ", name, "</h1>",
-    "<p>",
-    "</br>", "<a href='AberrantExpression/Counting/", version, "/Summary_", name, ".html'   >Count Summary</a>",
-    "</br>", "</p>"
-  ))
+    cat(paste0(
+      "<h1>Dataset: ", name, ", annotation: ", version, "</h1>",
+      "<p>",
+      "</br>", "<a href='AberrantExpression/Counting/", version, "/Summary_", name, ".html'   >Count Summary</a>",
+      "</br>", "</p>"
+    ))
   })
 })