-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This update adds a proof-of-concept workflow for pre-processing BAM data prior to variant prediction with kevlar.
- Loading branch information
Showing
6 changed files
with
188 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
include versioneer.py | ||
include kevlar/_version.py | ||
recursive-include kevlar/tests/data * | ||
recursive-include kevlar/workflows * | ||
recursive-include src * | ||
recursive-include inc * | ||
recursive-include third-party * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2018 The Regents of the University of California | ||
# | ||
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is | ||
# licensed under the MIT license: see LICENSE. | ||
# ----------------------------------------------------------------------------- | ||
import os | ||
|
||
|
||
NUMCONTROLS = len(config['controls']) | ||
SAMPLES = ['case.interleaved'] + ['ctrl{}'.format(i) for i in range(NUMCONTROLS)] | ||
|
||
# ----------------------------------------------------------------------------- | ||
# Primary target | ||
# ----------------------------------------------------------------------------- | ||
|
||
rule reads: | ||
input: expand('Reads/{prefix}.ec.reads.fq.gz', prefix=SAMPLES) | ||
output: touch('preprocessing.complete') | ||
|
||
|
||
# ----------------------------------------------------------------------------- | ||
# Create standardized named links to input BAM files for internal use. | ||
# ----------------------------------------------------------------------------- | ||
|
||
rule link_bams: | ||
input: | ||
config['case'], | ||
*config['controls'], | ||
output: | ||
'Alignments/case.bam', | ||
expand('Alignments/ctrl{serial}.bam', serial=range(NUMCONTROLS)), | ||
threads: 1 | ||
message: 'Create symlinks to input BAM files.' | ||
run: | ||
for inbam, outbam in zip(input, output): | ||
os.symlink(os.path.abspath(inbam), outbam) | ||
|
||
|
||
# ----------------------------------------------------------------------------- | ||
# Collate reads, convert to interleaved Fastq, and compress using pipes | ||
# instead of normal files to reduce I/O intensity. | ||
# | ||
# $ samtools collate <args> | samtools fastq <args> | bgzip <args> > seqs.fq.gz | ||
# ----------------------------------------------------------------------------- | ||
|
||
rule collate_bam: | ||
input: 'Alignments/case.bam' | ||
output: pipe('Alignments/case.collate.bam') | ||
threads: 24 | ||
message: 'Collate read pairs.' | ||
shell: 'samtools collate -f -r 1000000 -n 128 -@ {threads} -o {output} {input}' | ||
|
||
rule bam_to_paired_fastq: | ||
input: rules.collate_bam.output | ||
output: pipe('Reads/case.interleaved.reads.fq') | ||
threads: 24 | ||
message: 'Convert collated alignments to interleaved Fastq format.' | ||
shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output}' | ||
|
||
rule compress_paired_fastq: | ||
input: rules.bam_to_paired_fastq.output | ||
output: 'Reads/case.interleaved.reads.fq.gz' | ||
threads: 24 | ||
message: 'Compress interleaved reads.' | ||
shell: 'bgzip -@ {threads} -c {input} > {output}' | ||
|
||
|
||
# ----------------------------------------------------------------------------- | ||
# Convert reads to unpaired Fastq and compress. | ||
# | ||
# $ samtools fastq <args> | bgzip <args> > seqs.fq.gz | ||
# ----------------------------------------------------------------------------- | ||
|
||
rule bam_to_fastq: | ||
input: 'Alignments/ctrl{indiv}.bam' | ||
output: pipe('Reads/ctrl{indiv}.reads.fq') | ||
threads: 36 | ||
message: 'Convert read alignments to unpaired Fastq format.' | ||
shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output}' | ||
|
||
rule compress_unpaired_fastq: | ||
input: 'Reads/ctrl{indiv}.reads.fq' | ||
output: 'Reads/ctrl{indiv}.reads.fq.gz' | ||
wildcard_constraints: indiv='\d+' | ||
threads: 36 | ||
message: 'Compress unpaired reads.' | ||
shell: 'bgzip -@ {threads} -c {input} > {output}' | ||
|
||
|
||
# ----------------------------------------------------------------------------- | ||
# Perform quality control on reads. | ||
# ----------------------------------------------------------------------------- | ||
|
||
rule qc: | ||
input: 'Reads/{prefix}.reads.fq.gz' | ||
output: pipe('Reads/{prefix}.qc.reads.fq') | ||
threads: 16 | ||
message: 'Perform quality control on reads.' | ||
shell: 'fastp -i {input} --interleaved_in --stdout -p --thread {threads} -q 15 -u 40 -l 15 > {output}' | ||
|
||
rule compress_qc_reads: | ||
input: 'Reads/{prefix}.qc.reads.fq' | ||
output: 'Reads/{prefix}.qc.reads.fq.gz' | ||
threads: 56 | ||
message: 'Compress quality controlled reads.' | ||
shell: 'bgzip -@ {threads} -c {input} > {output}' | ||
|
||
|
||
# ----------------------------------------------------------------------------- | ||
# Perform error correction on reads | ||
# ----------------------------------------------------------------------------- | ||
|
||
rule trusted_kmers: | ||
input: expand('Reads/{prefix}.qc.reads.fq.gz', prefix=SAMPLES) | ||
output: 'lighter.trustedkmers' | ||
threads: 72 | ||
message: 'Compute trusted k-mers across all samples for error correction.' | ||
shell: 'lighter -K 27 3100000000 -r {input[0]} -r {input[1]} -r {input[2]} -saveTrustedKmers {output} -t {threads}' | ||
|
||
rule ec: | ||
input: | ||
'Reads/{prefix}.qc.reads.fq.gz', | ||
'lighter.trustedkmers' | ||
output: 'Reads/{prefix}.ec.reads.fq.gz' | ||
threads: 72 | ||
message: 'Perform error correction sample by sample.' | ||
shell: 'lighter -K 27 3100000000 -r {input[0]} -loadTrustedKmers {input[1]} -t {threads} && mv {wildcards.prefix}.qc.reads.cor.fq.gz {output}' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Pre-processing procedure for mapped reads | ||
|
||
Predicting *de novo* variants with kevlar requires one or more files per individual in Fasta or Fastq format. | ||
Using error-corrected reads can [drastically reduce the amount of memory](https://standage.github.io/information-content-versus-data-volume-and-k-mer-counting-accuracy.html) required to count *k*-mers and detect variants. | ||
However, data from human studies are often distributed as pre-aligned reads in BAM format. | ||
The following procedure will convert the BAM files back into Fastq format, apply basic quality control, and correct sequencing errors in the reads. | ||
|
||
|
||
## Software and hardware configuration | ||
|
||
This procedure requires the following software environment. | ||
|
||
- [Snakemake](https://bitbucket.org/snakemake/snakemake/), version 5.0 or newer | ||
- [SAMtools and bgzip](https://github.com/samtools/samtools), version 1.9 or newer | ||
- [fastp](https://github.com/OpenGene/fastp), tested with version 0.19.5 | ||
- [lighter](https://github.com/mourisl/Lighter), version 1.1.2 or newer | ||
|
||
The file `config.json` contains the full paths of the BAM files. | ||
Edit this as needed. | ||
Note that only a single case sample is supported, but an arbitrary number of controls is supported. | ||
|
||
**NOTE**: The number of required threads is hard-coded for some rules, and because these rules are usually grouped (run simultaneously through a "pipe" file) Snakemake's normal behavior of scaling threads down doesn't work as usual. | ||
Some steps run 3 commands at a time with 24 threads per command, while others run 2 commands at a time with 36 threads per command (72 total threads). | ||
I'd like to handle this more gracefully in the future (see [this thread](https://bitbucket.org/snakemake/snakemake/issues/1039/split-number-of-threads-for-a-rule)), but in the mean time it may be necessary to adjust the `threads` directive for the rules in the Snakefile to make better use of the resources available on your machine(s). | ||
|
||
|
||
## Executing the procedure | ||
|
||
Do a dry run to make sure there are no warnings or errors. | ||
|
||
``` | ||
snakemake --configfile config.json --cores 72 --directory /path/to/desired/work/dir/ --printshellcmds --dryrun reads | ||
``` | ||
|
||
If everything looks good, go ahead and execute the procedure. | ||
|
||
``` | ||
snakemake --configfile config.json --cores 72 --directory /path/to/desired/work/dir/ --printshellcmds reads | ||
``` | ||
|
||
**NOTE**: This procedure can be invoked from any directory on your system, assuming the following conditions are met. | ||
|
||
- the file paths in the `config.json` are correct absolute paths | ||
- the correct relative or absolute path to the `config.json` file is provided via the `--configfile` flag | ||
- the `--snakefile` flag is appended to the command to specify the correct relative or absolute path to the `Snakefile` file |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"case": "/scratch/standage/12345/BAMs/12345.proband.bam", | ||
"controls": [ | ||
"/scratch/standage/12345/BAMs/12345.mother.bam", | ||
"/scratch/standage/12345/BAMs/12345.father.bam" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters