Skip to content

Commit

Permalink
Add varfilter to mark-I workflow (#355)
Browse files Browse the repository at this point in the history
This update extends the mark I workflow to include the option to run the `varfilter` step. Closes #329.
  • Loading branch information
standage committed Feb 8, 2019
1 parent 0cfd001 commit 6fa2623
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 56 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
## Unreleased

### Added
- A new Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305).
- A new Snakemake workflow for kevlar's standard processing procedure (see #306).
- A new Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305, #355).
- A new Snakemake workflow for kevlar's standard processing procedure (see #306, #355).
- New `unband` module to merge augmented Fastq files produced with a *k*-mer banding strategy (see #316).
- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318, #342, #354).
- New dependency: `intervaltree` package (see #318).
Expand All @@ -26,7 +26,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- Corrected a bug that called adjacent substitutions as independent SNVs rather than an aggregate MNV (see #332).

### Removed
- The `effcount`, `dump`, and `simplex` modules have been disabled (see #308, #316).
- The `effcount`, `dump`, and `simplex` modules have been dropped (see #308, #316).
- Internal handling of interesting read mate sequences has been dropped (see #353).


Expand Down
56 changes: 13 additions & 43 deletions kevlar/workflows/bam-preproc/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ import os


NUMCONTROLS = len(config['controls'])
SAMPLES = ['case.interleaved'] + ['ctrl{}'.format(i) for i in range(NUMCONTROLS)]
SAMPLES = ['case'] + ['ctrl{}'.format(i) for i in range(NUMCONTROLS)]

# -----------------------------------------------------------------------------
# Primary target
# -----------------------------------------------------------------------------

rule reads:
input: expand('Reads/{prefix}.ec.reads.fq.gz', prefix=SAMPLES)
input: expand('Reads/{prefix}.reads.ec.fq.gz', prefix=SAMPLES)
output: touch('preprocessing.complete')


Expand All @@ -37,54 +37,24 @@ rule link_bams:
os.symlink(os.path.abspath(inbam), outbam)


# -----------------------------------------------------------------------------
# Collate reads, convert to interleaved Fastq, and compress using pipes
# instead of normal files to reduce I/O intensity.
#
# $ samtools collate <args> | samtools fastq <args> | bgzip <args> > seqs.fq.gz
# -----------------------------------------------------------------------------

rule collate_bam:
input: 'Alignments/case.bam'
output: pipe('Alignments/case.collate.bam')
threads: 24
message: 'Collate read pairs.'
shell: 'samtools collate -f -r 1000000 -n 128 -@ {threads} -o {output} {input}'

rule bam_to_paired_fastq:
input: rules.collate_bam.output
output: pipe('Reads/case.interleaved.reads.fq')
threads: 24
message: 'Convert collated alignments to interleaved Fastq format.'
shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output}'

rule compress_paired_fastq:
input: rules.bam_to_paired_fastq.output
output: 'Reads/case.interleaved.reads.fq.gz'
threads: 24
message: 'Compress interleaved reads.'
shell: 'bgzip -@ {threads} -c {input} > {output}'


# -----------------------------------------------------------------------------
# Convert reads to unpaired Fastq and compress.
#
# $ samtools fastq <args> | bgzip <args> > seqs.fq.gz
# -----------------------------------------------------------------------------

rule bam_to_fastq:
input: 'Alignments/ctrl{indiv}.bam'
input: 'Alignments/{indiv}.bam'
output:
pipe('Reads/ctrl{indiv}.reads.fq'),
'Logs/ctrl{indiv}-fastq.log'
pipe('Reads/{indiv}.reads.fq'),
'Logs/{indiv}-fastq.log'
threads: 36
message: 'Convert read alignments to unpaired Fastq format.'
shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output[0]} 2> >(tee {output[1]})'

rule compress_unpaired_fastq:
input: 'Reads/ctrl{indiv}.reads.fq'
output: 'Reads/ctrl{indiv}.reads.fq.gz'
wildcard_constraints: indiv='\d+'
input: 'Reads/{indiv}.reads.fq'
output: 'Reads/{indiv}.reads.fq.gz'
threads: 36
message: 'Compress unpaired reads.'
shell: 'bgzip -@ {threads} -c {input} > {output}'
Expand All @@ -97,7 +67,7 @@ rule compress_unpaired_fastq:
rule qc:
input: 'Reads/{prefix}.reads.fq.gz'
output:
pipe('Reads/{prefix}.qc.reads.fq'),
pipe('Reads/{prefix}.reads.qc.fq'),
'Logs/{prefix}-fastp.json',
'Logs/{prefix}-fastp.html',
'Logs/{prefix}-qc.log'
Expand All @@ -106,8 +76,8 @@ rule qc:
shell: 'fastp -i {input} --interleaved_in --stdout -p --thread {threads} --json {output[1]} --html {output[2]} -q 15 -u 40 -l 15 > {output[0]} 2> >(tee {output[3]})'

rule compress_qc_reads:
input: 'Reads/{prefix}.qc.reads.fq'
output: 'Reads/{prefix}.qc.reads.fq.gz'
input: 'Reads/{prefix}.reads.qc.fq'
output: 'Reads/{prefix}.reads.qc.fq.gz'
threads: 56
message: 'Compress quality controlled reads.'
shell: 'bgzip -@ {threads} -c {input} > {output}'
Expand All @@ -118,7 +88,7 @@ rule compress_qc_reads:
# -----------------------------------------------------------------------------

rule trusted_kmers:
input: expand('Reads/{prefix}.qc.reads.fq.gz', prefix=SAMPLES)
input: expand('Reads/{prefix}.reads.qc.fq.gz', prefix=SAMPLES)
output:
'lighter.trustedkmers',
'Logs/trustedkmers.log'
Expand All @@ -128,10 +98,10 @@ rule trusted_kmers:

rule ec:
input:
'Reads/{prefix}.qc.reads.fq.gz',
'Reads/{prefix}.reads.qc.fq.gz',
'lighter.trustedkmers'
output:
'Reads/{prefix}.ec.reads.fq.gz',
'Reads/{prefix}.reads.ec.fq.gz',
'Logs/{prefix}-ec.log'
threads: 72
message: 'Perform error correction sample by sample.'
Expand Down
22 changes: 15 additions & 7 deletions kevlar/workflows/mark-I/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
# *De novo* variant discovery workflow: Mark I

Make a copy of `config.json` and replace the existing filenames with the full paths of your input filenames.
An arbitrary number of Fasta/Fastq files is supported for each sample.
Paired-end reads are not required, but pairing information will be retained if provided for the case/proband sample (pairing info is always ignored for control samples).
The complete simplex *de novo* variant discovery workflow can be executed manually step-by-step, or as a single coordinated Snakemake workflow.
To invoke the Snakemake workflow:

The other parameters are reasonable for error-corrected human samples sequenced at ≈30x coverage.
Without error correction much more memory will be required, or an alternative workflow will be required.
kevlar will be accurate even when the *k*-mer counting false-positive rate (FPR) is fairly high in the case sample, as this is corrected in subsequent steps.
Sensitivity will be lost if the *k*-mer counting FPR is too high in the control samples, however.
- make sure you have Snakemake 5.0 or greater installed
- copy `kevlar/workflows/mark-I/Snakefile` and `kevlar/workflows/mark-I/config.json` to your working directory
- modify the copy of `config.json` and replace the existing filenames with the full paths of your input files
- execute the workflow with the `snakemake` command, using the examples below as a guide

```
# Do a dry run to make sure there are no warnings or errors.
Expand All @@ -16,3 +15,12 @@ snakemake --configfile myconfig.json --snakefile kevlar/workflows/mark-I/Snakefi
# If everything looks good, go ahead and execute the procedure.
snakemake --configfile myconfig.json --snakefile kevlar/workflows/mark-I/Snakefile --cores 64 --directory /path/to/desired/work/dir/ --printshellcmds calls
```

Some notes on workflow configuration:

- An arbitrary number of Fasta/Fastq files is supported for each sample.
- Paired-end reads are not required, and any pairing information will be ignored.
- Parameters are tuned for error-corrected whole genome shotgun sequencing of human samples at ≈30x coverage. Without error correction, much more memory — or an alternative workflow — will be required.
- kevlar will be accurate even when the *k*-mer counting false-positive rate (FPR) is fairly high in the case sample, as this is corrected in subsequent steps. Sensitivity will be lost if the *k*-mer counting FPR is too high in the control samples, however.
- The mask should include your reference genome and any potential sources of contamination. For example, UniVec includes many sources of technical contamination (such as adapters) that often cause problems.
- Accuracy can often be improved by filtering preliminary calls. Use the `varfilter` setting to provide a BED file of intervals from which to filter out variant predictions (by default, `varfilter: null` will disable this step). It's common to filter out variant calls in segmental duplications or SSRs, for example, as these are usually problematic. Also, filtering out common variants (using dbSNP, for example) will successfully remove inherited variants that are erroneously classified as *de novo* due to low coverage in the donor parent.
28 changes: 26 additions & 2 deletions kevlar/workflows/mark-I/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,20 @@ class SimplexDesign(object):
yield control['label']
return list(getlabels())

@property
def varfilter_input(self):
infiles = list()
if self._config['varfilter']:
infiles.append(self._config['varfilter'])
return infiles + expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))

@property
def simlike_input(self):
if self._config['varfilter']:
return ['calls.filtered.vcf.gz']
else:
return expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))


simplex = SimplexDesign(config)

Expand Down Expand Up @@ -324,18 +338,28 @@ rule call:
shell: 'kevlar --tee --logfile {output[1]} call --mask-mem 10M --refr {input[2]} --ksize {config[ksize]} --out {output[0]} {input[0]} {input[1]}'


rule varfilter:
input: simplex.varfilter_input
output:
'calls.filtered.vcf.gz',
'Logs/varfilter.log'
threads: 1
message: 'Filter out calls from the provided regions (usually common variants, repetitive DNA, etc)'
shell: 'kevlar --tee --logfile {output[1]} varfilter --out {output[0]} {input}'


rule like_scores:
input:
'Reference/refr-counts.smallcounttable',
simplex.all_sample_counts,
expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))
simplex.simlike_input
output:
'calls.scored.sorted.vcf.gz',
'Logs/simlike.log'
threads: 1
message: 'Filter calls, compute likelihood scores, and sort calls by score.'
run:
in_vcfs = expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))
in_vcfs = simplex.simlike_input
cmd = [
'kevlar', '--tee', '--logfile', output[1], 'simlike',
'--mu', config['samples']['coverage']['mean'],
Expand Down
3 changes: 2 additions & 1 deletion kevlar/workflows/mark-I/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,6 @@
"delta": 50,
"seqpattern": ".",
"maxdiff": 10000
}
},
"varfilter": "/data/refr/dbSNP_common.bed"
}

0 comments on commit 6fa2623

Please sign in to comment.