New workflow (#305)

This update adds a proof-of-concept workflow for pre-processing BAM data prior to variant prediction with kevlar.
kevlar-dev · Nov 27, 2018 · 32c0c8c · 32c0c8c
1 parent 24aca29
commit 32c0c8c
Show file tree

Hide file tree

Showing 6 changed files with 188 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,9 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+### Added
+- Included a Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305).
+
 ### Fixed
 - Corrected a bug that reported the reference target sequence instead of the assembled contig sequence in the `CONTIG` attribute of indel calls in the VCF (see #304).
 

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,6 +1,7 @@
 include versioneer.py
 include kevlar/_version.py
 recursive-include kevlar/tests/data *
+recursive-include kevlar/workflows *
 recursive-include src *
 recursive-include inc *
 recursive-include third-party *
diff --git a/kevlar/workflows/Snakefile b/kevlar/workflows/Snakefile
@@ -0,0 +1,128 @@
+# -----------------------------------------------------------------------------
+# Copyright (c) 2018 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+import os
+
+
+NUMCONTROLS = len(config['controls'])
+SAMPLES = ['case.interleaved'] + ['ctrl{}'.format(i) for i in range(NUMCONTROLS)]
+
+# -----------------------------------------------------------------------------
+# Primary target
+# -----------------------------------------------------------------------------
+
+rule reads:
+    input: expand('Reads/{prefix}.ec.reads.fq.gz', prefix=SAMPLES)
+    output: touch('preprocessing.complete')
+
+
+# -----------------------------------------------------------------------------
+# Create standardized named links to input BAM files for internal use.
+# -----------------------------------------------------------------------------
+
+rule link_bams:
+    input:
+        config['case'],
+        *config['controls'],
+    output:
+        'Alignments/case.bam',
+        expand('Alignments/ctrl{serial}.bam', serial=range(NUMCONTROLS)),
+    threads: 1
+    message: 'Create symlinks to input BAM files.'
+    run:
+        for inbam, outbam in zip(input, output):
+            os.symlink(os.path.abspath(inbam), outbam)
+
+
+# -----------------------------------------------------------------------------
+# Collate reads, convert to interleaved Fastq, and compress using pipes
+# instead of normal files to reduce I/O intensity.
+#
+# $ samtools collate <args> | samtools fastq <args> | bgzip <args> > seqs.fq.gz
+# -----------------------------------------------------------------------------
+
+rule collate_bam:
+    input: 'Alignments/case.bam'
+    output: pipe('Alignments/case.collate.bam')
+    threads: 24
+    message: 'Collate read pairs.'
+    shell: 'samtools collate -f -r 1000000 -n 128 -@ {threads} -o {output} {input}'
+
+rule bam_to_paired_fastq:
+    input: rules.collate_bam.output
+    output: pipe('Reads/case.interleaved.reads.fq')
+    threads: 24
+    message: 'Convert collated alignments to interleaved Fastq format.'
+    shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output}'
+
+rule compress_paired_fastq:
+    input: rules.bam_to_paired_fastq.output
+    output: 'Reads/case.interleaved.reads.fq.gz'
+    threads: 24
+    message: 'Compress interleaved reads.'
+    shell: 'bgzip -@ {threads} -c {input} > {output}'
+
+
+# -----------------------------------------------------------------------------
+# Convert reads to unpaired Fastq and compress.
+#
+# $ samtools fastq <args> | bgzip <args> > seqs.fq.gz
+# -----------------------------------------------------------------------------
+
+rule bam_to_fastq:
+    input: 'Alignments/ctrl{indiv}.bam'
+    output: pipe('Reads/ctrl{indiv}.reads.fq')
+    threads: 36
+    message: 'Convert read alignments to unpaired Fastq format.'
+    shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output}'
+
+rule compress_unpaired_fastq:
+    input: 'Reads/ctrl{indiv}.reads.fq'
+    output: 'Reads/ctrl{indiv}.reads.fq.gz'
+    wildcard_constraints: indiv='\d+'
+    threads: 36
+    message: 'Compress unpaired reads.'
+    shell: 'bgzip -@ {threads} -c {input} > {output}'
+
+
+# -----------------------------------------------------------------------------
+# Perform quality control on reads.
+# -----------------------------------------------------------------------------
+
+rule qc:
+    input: 'Reads/{prefix}.reads.fq.gz'
+    output: pipe('Reads/{prefix}.qc.reads.fq')
+    threads: 16
+    message: 'Perform quality control on reads.'
+    shell: 'fastp -i {input} --interleaved_in --stdout -p --thread {threads} -q 15 -u 40 -l 15 > {output}'
+
+rule compress_qc_reads:
+    input: 'Reads/{prefix}.qc.reads.fq'
+    output: 'Reads/{prefix}.qc.reads.fq.gz'
+    threads: 56
+    message: 'Compress quality controlled reads.'
+    shell: 'bgzip -@ {threads} -c {input} > {output}'
+
+
+# -----------------------------------------------------------------------------
+# Perform error correction on reads
+# -----------------------------------------------------------------------------
+
+rule trusted_kmers:
+    input: expand('Reads/{prefix}.qc.reads.fq.gz', prefix=SAMPLES)
+    output: 'lighter.trustedkmers'
+    threads: 72
+    message: 'Compute trusted k-mers across all samples for error correction.'
+    shell: 'lighter -K 27 3100000000 -r {input[0]} -r {input[1]} -r {input[2]} -saveTrustedKmers {output} -t {threads}'
+
+rule ec:
+    input:
+        'Reads/{prefix}.qc.reads.fq.gz',
+        'lighter.trustedkmers'
+    output: 'Reads/{prefix}.ec.reads.fq.gz'
+    threads: 72
+    message: 'Perform error correction sample by sample.'
+    shell: 'lighter -K 27 3100000000 -r {input[0]} -loadTrustedKmers {input[1]} -t {threads} && mv {wildcards.prefix}.qc.reads.cor.fq.gz {output}'
diff --git a/kevlar/workflows/bam-preproc/README.md b/kevlar/workflows/bam-preproc/README.md
@@ -0,0 +1,45 @@
+# Pre-processing procedure for mapped reads
+
+Predicting *de novo* variants with kevlar requires one or more files per individual in Fasta or Fastq format.
+Using error-corrected reads can [drastically reduce the amount of memory](https://standage.github.io/information-content-versus-data-volume-and-k-mer-counting-accuracy.html) required to count *k*-mers and detect variants.
+However, data from human studies are often distributed as pre-aligned reads in BAM format.
+The following procedure will convert the BAM files back into Fastq format, apply basic quality control, and correct sequencing errors in the reads.
+
+
+## Software and hardware configuration
+
+This procedure requires the following software environment.
+
+- [Snakemake](https://bitbucket.org/snakemake/snakemake/), version 5.0 or newer
+- [SAMtools and bgzip](https://github.com/samtools/samtools), version 1.9 or newer
+- [fastp](https://github.com/OpenGene/fastp), tested with version 0.19.5
+- [lighter](https://github.com/mourisl/Lighter), version 1.1.2 or newer
+
+The file `config.json` contains the full paths of the BAM files.
+Edit this as needed.
+Note that only a single case sample is supported, but an arbitrary number of controls is supported.
+
+**NOTE**: The number of required threads is hard-coded for some rules, and because these rules are usually grouped (run simultaneously through a "pipe" file) Snakemake's normal behavior of scaling threads down doesn't work as usual.
+Some steps run 3 commands at a time with 24 threads per command, while others run 2 commands at a time with 36 threads per command (72 total threads).
+I'd like to handle this more gracefully in the future (see [this thread](https://bitbucket.org/snakemake/snakemake/issues/1039/split-number-of-threads-for-a-rule)), but in the mean time it may be necessary to adjust the `threads` directive for the rules in the Snakefile to make better use of the resources available on your machine(s).
+
+
+## Executing the procedure
+
+Do a dry run to make sure there are no warnings or errors.
+
+```
+snakemake --configfile config.json --cores 72 --directory /path/to/desired/work/dir/ --printshellcmds --dryrun reads
+```
+
+If everything looks good, go ahead and execute the procedure.
+
+```
+snakemake --configfile config.json --cores 72 --directory /path/to/desired/work/dir/ --printshellcmds reads
+```
+
+**NOTE**: This procedure can be invoked from any directory on your system, assuming the following conditions are met.
+
+- the file paths in the `config.json` are correct absolute paths
+- the correct relative or absolute path to the `config.json` file is provided via the `--configfile` flag
+- the `--snakefile` flag is appended to the command to specify the correct relative or absolute path to the `Snakefile` file
diff --git a/kevlar/workflows/bam-preproc/config.json b/kevlar/workflows/bam-preproc/config.json
@@ -0,0 +1,7 @@
+{
+    "case": "/scratch/standage/12345/BAMs/12345.proband.bam",
+    "controls": [
+        "/scratch/standage/12345/BAMs/12345.mother.bam",
+        "/scratch/standage/12345/BAMs/12345.father.bam"
+    ]
+}
diff --git a/setup.py b/setup.py
@@ -56,7 +56,10 @@
       license='MIT',
       packages=['kevlar', 'kevlar.cli', 'kevlar.tests'],
       package_data={
-          'kevlar': ['kevlar/tests/data/*', 'kevlar/tests/data/*/*']
+          'kevlar': [
+              'kevlar/tests/data/*', 'kevlar/tests/data/*/*',
+              'kevlar/workflows/bam-preproc/*'
+          ]
       },
       include_package_data=True,
       ext_modules=[ksw2, fermilite, sequencemod],