Merge branch 'development' v0.6.0 candidate

giesselmann · Apr 30, 2019 · 75ea566 · 75ea566
2 parents d9b1324 + 56ee2fd
commit 75ea566
Show file tree

Hide file tree

Showing 37 changed files with 1,012 additions and 439 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -51,6 +51,7 @@ RUN mkdir -p /src
 WORKDIR /src
 RUN wget https://bootstrap.pypa.io/get-pip.py
 RUN python3 get-pip.py
+RUN pip3 install --upgrade pip
 
 # copy and configure nanopype
 RUN mkdir -p /app

diff --git a/Snakefile b/Snakefile
@@ -38,7 +38,7 @@ from snakemake.utils import min_version
 
 
 # snakemake config
-min_version("5.4.0")
+min_version("5.4.3")
 configfile: "nanopype.yaml"
 
 
@@ -48,7 +48,7 @@ def get_tag():
     try:
         version = subprocess.check_output(cmd.split(), cwd=os.path.dirname(workflow.snakefile)).decode().strip()
     except subprocess.CalledProcessError:
-        raise RuntimeError('Unable to get version number from git tags')
+        raise RuntimeError('[ERROR] Unable to get version number from git tags.')
     if '-' in version:
         return 'latest'
     else:
@@ -59,12 +59,19 @@ nanopype_tag = get_tag()
 config['version'] = {'tag': nanopype_tag}
 
 
+# make raw data directory absolute path
+if os.path.exists(config['storage_data_raw']):
+    config['storage_data_raw'] = os.path.abspath(config['storage_data_raw'])
+else:
+    raise RuntimeError("[ERROR] Raw data archive not found.")
+
+
 # append username to shadow prefix if not present
 if hasattr(workflow, "shadow_prefix") and workflow.shadow_prefix:
     shadow_prefix = workflow.shadow_prefix
     if not os.environ['USER'] in shadow_prefix:
         shadow_prefix = os.path.join(shadow_prefix, os.environ['USER'])
-        print("Shadow prefix is changed from {p1} to {p2} to be user-specific".format(
+        print("[INFO] Shadow prefix is changed from {p1} to {p2} to be user-specific".format(
             p1=workflow.shadow_prefix, p2=shadow_prefix), file=sys.stderr)
     workflow.shadow_prefix = shadow_prefix
 
@@ -81,15 +88,16 @@ if 'references' in nanopype_env:
         config['references'] = {}
     for name, values in nanopype_env['references'].items():
         genome = values['genome']
-        chr_sizes = values['chr_sizes']
+        chr_sizes = values['chr_sizes'] if 'chr_sizes' in values else ''
         if not os.path.isfile(genome):
-            print("Genome for {name} not found in {genome}, skipping entry".format(
+            print("[WARNING] Genome for {name} not found in {genome}, skipping entry".format(
                 name=name, genome=genome), file=sys.stderr)
             continue
-        if not os.path.isfile(chr_sizes):
-            print("Chromosome sizes for {name} not found in {chr_sizes}, skipping entry".format(
+        if chr_sizes and not os.path.isfile(chr_sizes):
+            print("[WARNING] Chromosome sizes for {name} not found in {chr_sizes}, skipping entry".format(
                 name=name, chr_sizes=chr_sizes), file=sys.stderr)
             continue
+        print('add ', name)
         config['references'][name] = {"genome":genome, "chr_sizes":chr_sizes}
 
 
@@ -148,7 +156,7 @@ for s in [s for s in os.listdir(os.path.join(os.path.dirname(workflow.snakefile)
     else:
         config['sbin_singularity'][s] = config['sbin'][s]
 
-		
+
 # helper of submodules are called relative to the pipeline base directory
 config['sbin']['base'] = os.path.join(os.path.dirname(workflow.snakefile))
 if hasattr(workflow, 'use_singularity') and workflow.use_singularity:
@@ -170,10 +178,28 @@ else:
 # names for multi-run rules
 runnames = []
 if os.path.isfile('runnames.txt'):
-    runnames = [line.rstrip('\n') for line in open('runnames.txt')]
+    runnames = [line.rstrip() for line in open('runnames.txt') if line.rstrip() and not line.startswith('#')]
 config['runnames'] = runnames
 
 
+# barcode mappings
+barcodes = {}
+if os.path.isfile('barcodes.yaml'):
+    with open("barcodes.yaml", 'r') as fp:
+        barcode_map = yaml.load(fp)
+        barcodes = barcode_map
+config['barcodes'] = barcodes
+
+
+# region of interest tags
+roi = {}
+if os.path.isfile('roi.yaml'):
+    with open("roi.yaml", 'r') as fp:
+        roi_map = yaml.load(fp)
+        roi = roi_map
+config['roi'] = roi
+
+
 # include modules
 include : "rules/storage.smk"
 include : "rules/basecalling.smk"
@@ -182,3 +208,4 @@ include : "rules/methylation.smk"
 include : "rules/sv.smk"
 include : "rules/demux.smk"
 include : "rules/transcript.smk"
+include : "rules/clean.smk"
diff --git a/docs/examples/align.md b/docs/examples/align.md
@@ -21,7 +21,7 @@ The *env.yaml* in the original repository already contains an entry for this tut
 The easiest way to obtain an alignment is to directly request the output file with:
 
 ```
-snakemake --snakefile ~/src/nanopype/Snakefile -j 4 alignments/minimap2/guppy/20170725_FAH14126_FLO-MIN107_SQK-LSK308_human_Hues64.test.bam
+snakemake --snakefile ~/src/nanopype/Snakefile -j 4 alignments/minimap2/guppy/batches/tutorial/20170725_FAH14126_FLO-MIN107_SQK-LSK308_human_Hues64.test.bam
 ```
 
 This will trigger base calling using guppy and an alignment against our test genome with minimap2. In order to process multiple runs in parallel and merge the results into a single output file, a runnames.txt in the working directory is used:
@@ -49,5 +49,3 @@ Note that the second command only contains jobs running minimap2.
 ## Batch processing
 
 Browsing the working directory after this tutorial you will notice the batch output in form of e.g. *0.fastq.gz* or *0.test.bam* in a specific batches folder. These files are temporary and can be deleted after the processing. They are however required as input for the processing, thus if you delete the sequence batches a subsequent alignment run will include base calling to get the batch-wise input.
-
-We will target this issue in a future release of Nanopype. The merged fastx filex can be split according to the read name and merged bam files already contain a RG tag with the source batch.
diff --git a/docs/installation/configuration.md b/docs/installation/configuration.md
@@ -1,6 +1,6 @@
 # Configuration
 
-Nanopype has two configuration layers: The central **environment** configuration *env.yaml* covers application paths and reference genomes and is set up independent of installation method and operating system once. The environment configuration is stored in the installation directory.
+Nanopype has two configuration layers: The central **environment** configuration *env.yaml* covers application paths and reference genomes and is set up independent of installation method and operating system once. The environment configuration is stored in the installation directory. If a compute cluster is available, the respective Snakemake configuration is only needed once per Nanopype installation and is explained here using the example of a custom scheduler called *mxq*.
 
 For a project an additional **[workflow](../usage/general.md)** configuration is required providing data sources, tool flags and parameters. The workflow config file *nanopype.yaml* is expected in the root of each project directory. Configuration files are in .yaml format.
 
@@ -57,6 +57,7 @@ references:
         bedGraphToBigWig: ~/bin/bedGraphToBigWig
     ```
 
+Additional references possibly only needed once in a **[workflow](../usage/general.md)** can be configured in the *nanopype.yaml* of the working directory.
 
 ## Profiles
 
@@ -89,40 +90,39 @@ Additional profiles can be found at [https://github.com/snakemake-profiles/doc](
 
 The [cluster configuration](https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#cluster-configuration) of Snakemake is separated from the workflow and can be provided in .json or .yaml format. The configuration is composed of default settings mapping to all rules (e.g. basecalling, alignment, etc.) and rule specific settings enabling a fine grained control of resources such as memory and run time.
 
-??? info "example cluster.json for LSF/BSUB ([source](https://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#cluster-configuration))"
+??? info "example cluster.json for MXQ"
     ```
     {
         "__default__" :
         {
-            "queue"     : "medium_priority",
-            "nCPUs"     : "16",
-            "memory"    : 20000,
-            "resources" : "\"select[mem>20000] rusage[mem=20000] span[hosts=1]\"",
-            "name"      : "JOBNAME.{rule}.{wildcards}",
-            "output"    : "logs/cluster/{rule}.{wildcards}.out",
-            "error"     : "logs/cluster/{rule}.{wildcards}.err"
+            "nCPUs"     : "8",
+            "memory"    : "20000",
+            "time"      : "60"
         },
 
         "minimap2" :
         {
-            "memory"    : 30000,
-            "resources" : "\"select[mem>30000] rusage[mem=30000] span[hosts=1]\"",
+            "memory"    : "30000",
+            "time"      : "30",
+            "nCPUs"     : "16"
         }
     }
     ```
 
-Parameters of the cluster config are accessible inside the cluster submission command:
+Rule names of Nanopype can be obtained either by starting a dry run (-n) or directly from the source. Parameters of the cluster config are accessible inside the cluster submission command:
 
-    snakemake --cluster-config cluster.json --cluster "sbatch -A {cluster.account} -p {cluster.partition} -n {cluster.n}  -t {cluster.time}"
+    snakemake --cluster-config cluster.json --cluster "mxqsub --threads={cluster.nCPUs} --memory={cluster.memory} --runtime={cluster.time}"
 
 This cluster configuration is static per target and needs to be conservative enough to handle any type of input data. For a more fine grained control Nanopype specifies per rule a configurable number of threads and calculates the estimated run time and memory consumption accordingly. Integration and customization of these parameters is described in the following section.
 
 ## Job properties
 
 All Nanopype workflows specify **threads**, **time_min** and **mem_mb** resources. Furthermore, runtime and memory are dynamically increased if a cluster job fails or is killed by the cluster management (due to runtime or memory violation). To restart jobs Snakemake needs to be executed with e.g. --restart-times 3. If supported by the cluster engine you could then use:
 
-    snakemake --cluster "qsub {threads} --runtime {resources.time_min} --memory {resources.mem_mb}"
-
+    snakemake --cluster "mxqsub --threads={threads} --runtime={resources.time_min} --memory={resources.mem_mb}"
+
+Please note that the *threads* resource in this example is now not set per rule as in the previous example, but configured per module in the workflow config file *nanopype.yaml* in the working directory.
+
 If the formats of estimated runtime and memory consumption do not match your cluster system a custom [wrapper](https://snakemake.readthedocs.io/en/stable/executable.html#job-properties) script is easily set up. To convert from time in minutes to a custom string the following snippet is a starting point:
 
 ??? info "example cluster_wrapper.py format conversion"
@@ -132,7 +132,7 @@ If the formats of estimated runtime and memory consumption do not match your clu
 
     jobscript = sys.argv[-1]
     job_properties = read_job_properties(jobscript)
-    
+
     # default resources
     threads = '1'
     runtime = '01:00'
@@ -144,21 +144,21 @@ If the formats of estimated runtime and memory consumption do not match your clu
         resources = job_properties["resources"]
         if "mem_mb" in resources: memory = str(resources["mem_mb"]) + 'M'
         if "time_min" in resources: runtime = "{:02d}:{:02d}".format(*divmod(resources["time_min"], 60))
-    
+
     # cmd and submission
-    cmd = 'sub --threads={threads} --memory={memory} --time="{runtime}" {jobscript}'.format(threads=threads, memory=memory, runtime=runtime, jobscript=jobscript)
+    cmd = 'mxqsub --threads={threads} --memory={memory} --time="{runtime}" {jobscript}'.format(threads=threads, memory=memory, runtime=runtime, jobscript=jobscript)
     os.system(cmd)
     ```
 
 The respective Snakemake command line would then be:
 
     snakemake --cluster cluster_wrapper.py
-
-resulting in a cluster submission on the shell similar to:
 
-    sub --threads=1 --memory=16000M --time="00:15" ./snakemake/tmp123/job_xy.sh
-
-
+resulting in a cluster submission of the temporary job script on the shell similar to:
+
+    mxqsub --threads=1 --memory=16000M --time="00:15" ./snakemake/tmp123/job_xy.sh
+
+
 ## Custom cluster integration
 
 A full example on how to enable a not yet supported cluster system is given in *profiles/mxq/* of the git repository. Briefly four components are required:
@@ -168,8 +168,53 @@ A full example on how to enable a not yet supported cluster system is given in *
     * status.py - job status request from cluster management
     * jobscript.sh - wrapper script for rules to be executed
 
-**Important:** When working with batches of raw nanopore reads, Nanopype makes use of Snakemakes shadow mechanism. A shadow directory is temporary per rule execution and can be placed on per node local storage to reduce network overhead. The shadow prefix e.g. */scratch/local/* is set in the profiles config.yaml. The *--profile* argument tells Snakemake to use a specific profile:
+**Important:** When working with batches of raw nanopore reads, Nanopype makes use of Snakemake's shadow mechanism. A shadow directory is temporary per rule execution and can be placed on per-node local storage to reduce network overhead. The shadow prefix, e.g. */scratch/local/*, can be set in the profile's config.yaml. The *--profile* argument tells Snakemake to use a specific profile:
 
     snakemake --profile /path/nanopype/profiles/mxq [OPTIONS...] [FILE]
 
 When running in an environment with **multiple users** the shadow prefix should contain a user specific part to ensure accessibility and to prevent corrupted data. A profile per user is compelling in this case and enforced by Nanopype if not given.
+
+## Logging
+
+Writing log files becomes useful to inspect the output of potentially failing jobs. The actual implementation is system and user specific; two possible scenarios are given here:
+
+### Log per output file
+
+Some cluster schedulers support the redirection of stdout and stderr. These could be caught using the cluster configuration of Snakemake:
+
+??? info "example cluster.json for MXQ logging"
+    ```
+    {
+        "__default__" :
+        {
+            "output"    : "logs/cluster/{rule}.{wildcards}.out",
+            "error"     : "logs/cluster/{rule}.{wildcards}.err"
+        }
+    }
+    ```
+
+Parameters of the cluster config are again accessible inside the cluster submission command:
+
+    snakemake --cluster-config cluster.json --cluster "mxqsub --stdout={cluster.output} --stderr={cluster.error}"
+
+This type of logging will create one log file per output file of Nanopype. Restarting workflows will overwrite previous logging.
+
+### Log per job submission
+
+Logging can also be set up independent of Snakemake by modifying the job script itself.
+
+??? info "example jobscript.sh for MXQ logging"
+    ```
+    H=${{HOSTNAME}}
+    J=${{MXQ_JOBID:-${{PID}}}}
+    DATE=`date +%Y%m%d`
+
+    mkdir -p log
+    ({exec_job}) &> log/${{DATE}}_${{H}}_${{J}}.log
+    ```
+
+Snakemake will replace the *{exec_job}* wildcard with the temporary job script (Double curly brackets are needed for generic environment variables to escape the Snakemake substitution). Note that the environment variables *MXQ_JOBID* and *PID* are specific to the mxq scheduler. The above wrapper will create log files with timestamp and hostname in the filename without overwriting previous logs.
+
+The custom job script is passed to Snakemake on the command line by using:
+
+    snakemake --jobscript jobscript.sh --cluster "mxqsub ..."
diff --git a/docs/installation/singularity.md b/docs/installation/singularity.md
@@ -1,6 +1,6 @@
 # Singularity
 
-In order to use the Singularity driven version of Nanopype a working python3 with Snakemake and the pipeline repository itself are sufficient. At least python version 3.4 is required and we recommend to use a virtual environment. Additionally Singularity needs to be installed system wide. On Ubuntu the package is called *singularity-container*. The installation requires root privileges and might require help of your IT department.
+In order to use the Singularity driven version of Nanopype a working python3 with Snakemake and the pipeline repository itself are sufficient. At least python version 3.4 is required and we recommend to use a virtual environment. Additionally Singularity needs to be installed system wide. On Ubuntu the package is called *singularity-container*. The installation requires root privileges and might require help of your IT department. We currently test workflows with Singularity version 2.4.2.
 
 Start with creating a virtual environment:
 

diff --git a/docs/release-notes.md b/docs/release-notes.md
@@ -1,22 +1,35 @@
 # Release notes
 
-#### v0.5.0
+#### v0.6.0 - 2019-04-30
+Development release:
+
+The output of nanopype >= v0.6.0 is **not** backwards compatible due to major changes in the output filesystem structure.
+
+:   * Introduce tag-concept to re-run the same workflow under different conditions
+    * Transparent demultiplexing with 'special' tags
+    * Rules to clean up batch data
+    * Enhance documentation on singularity usage
+    * Include STRique version v0.3.0
+    * Update Pychopper to version v0.5.0
+    * Update guppy_basecaller to version v2.3.7
 
+
+#### v0.5.0 - 2019-03-26
 Development release:
 
 The output of nanopype >= v0.5.0 is **not** backwards compatible due to major changes in the output filesystem structure.
 
 :   * Rework alignment module to support sequence post-processing
     * Enable Pinfish package for isoform detection
 
-#### v0.4.1
 
+#### v0.4.1 - 2019-03-15
 Development release:
 
 :   * Update guppy_basecaller to version v2.3.5
 
-#### v0.4.0
 
+#### v0.4.0 - 2019-03-13
 Development release:
 
 Version 0.4.0 is the singularity and bulk-fast5 release.
@@ -25,17 +38,17 @@ Version 0.4.0 is the singularity and bulk-fast5 release.
     * Support both, .tar of single fast5 and bulk-fast5 of current MinKNOW versions
     * Enhance documentation details on installation and cluster usage
 
-#### v0.3.0
 
+#### v0.3.0 - 2019-02-05
 Development release:
 
 The output of nanopype >= v0.3.0 is **not** backwards compatible due to major changes in the output filesystem structure.
 
 :   * Fix guppy installation instructions to not overwrite existing library installations
     * Apply general output structure of *tool2/tool1/runs/runname.x* to all processing chains (see module documentation/tutorial for details).
 
-#### v0.2.1
 
+#### v0.2.1 - 2019-01-28
 Development release:
 
 :   * Parse experimental Flappie methylation calls to bedGraph/bigWig
@@ -45,16 +58,15 @@ Known issues:
 
 :   * Installing guppy after flappie overwrites a symlink to *libhdf5.so*, causing the flappie basecaller to load a hdf5 library it was not linked to. This will be fixed in a future release by separating the libraries properly.
 
-#### v0.2.0
 
+#### v0.2.0 - 2019-01-21
 Development release:
 
 :   * Nanopolish                v0.11.0
     * htslib                    v1.9
     * OpenBLAS                  v0.3.4
 
-#### v0.1.0
-
+#### v0.1.0 - 2018-12-24
 Initial release with tool versions (for development or untagged repositories, the master branch with the tested version in brackets is used):
 
 :   * Bedtools                  v2.27.1