Skip to content

Commit

Permalink
Add varfilter to mark-I workflow (#355)
Browse files Browse the repository at this point in the history
This update extends the mark I workflow to include the option to run the `varfilter` step. Closes #329.
  • Loading branch information
standage committed Feb 8, 2019
1 parent 0cfd001 commit 6fa2623
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 56 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
## Unreleased

### Added
- A new Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305).
- A new Snakemake workflow for kevlar's standard processing procedure (see #306).
- A new Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305, #355).
- A new Snakemake workflow for kevlar's standard processing procedure (see #306, #355).
- New `unband` module to merge augmented Fastq files produced with a *k*-mer banding strategy (see #316).
- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318, #342, #354).
- New dependency: `intervaltree` package (see #318).
Expand All @@ -26,7 +26,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- Corrected a bug that called adjacent substitutions as independent SNVs rather than an aggregate MNV (see #332).

### Removed
- The `effcount`, `dump`, and `simplex` modules have been disabled (see #308, #316).
- The `effcount`, `dump`, and `simplex` modules have been dropped (see #308, #316).
- Internal handling of interesting read mate sequences has been dropped (see #353).


Expand Down
56 changes: 13 additions & 43 deletions kevlar/workflows/bam-preproc/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@ import os


NUMCONTROLS = len(config['controls'])
SAMPLES = ['case.interleaved'] + ['ctrl{}'.format(i) for i in range(NUMCONTROLS)]
SAMPLES = ['case'] + ['ctrl{}'.format(i) for i in range(NUMCONTROLS)]

# -----------------------------------------------------------------------------
# Primary target
# -----------------------------------------------------------------------------

rule reads:
input: expand('Reads/{prefix}.ec.reads.fq.gz', prefix=SAMPLES)
input: expand('Reads/{prefix}.reads.ec.fq.gz', prefix=SAMPLES)
output: touch('preprocessing.complete')


Expand All @@ -37,54 +37,24 @@ rule link_bams:
os.symlink(os.path.abspath(inbam), outbam)


# -----------------------------------------------------------------------------
# Collate reads, convert to interleaved Fastq, and compress using pipes
# instead of normal files to reduce I/O intensity.
#
# $ samtools collate <args> | samtools fastq <args> | bgzip <args> > seqs.fq.gz
# -----------------------------------------------------------------------------

rule collate_bam:
input: 'Alignments/case.bam'
output: pipe('Alignments/case.collate.bam')
threads: 24
message: 'Collate read pairs.'
shell: 'samtools collate -f -r 1000000 -n 128 -@ {threads} -o {output} {input}'

rule bam_to_paired_fastq:
input: rules.collate_bam.output
output: pipe('Reads/case.interleaved.reads.fq')
threads: 24
message: 'Convert collated alignments to interleaved Fastq format.'
shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output}'

rule compress_paired_fastq:
input: rules.bam_to_paired_fastq.output
output: 'Reads/case.interleaved.reads.fq.gz'
threads: 24
message: 'Compress interleaved reads.'
shell: 'bgzip -@ {threads} -c {input} > {output}'


# -----------------------------------------------------------------------------
# Convert reads to unpaired Fastq and compress.
#
# $ samtools fastq <args> | bgzip <args> > seqs.fq.gz
# -----------------------------------------------------------------------------

rule bam_to_fastq:
input: 'Alignments/ctrl{indiv}.bam'
input: 'Alignments/{indiv}.bam'
output:
pipe('Reads/ctrl{indiv}.reads.fq'),
'Logs/ctrl{indiv}-fastq.log'
pipe('Reads/{indiv}.reads.fq'),
'Logs/{indiv}-fastq.log'
threads: 36
message: 'Convert read alignments to unpaired Fastq format.'
shell: 'samtools fastq -N -F 2304 -@ {threads} {input} > {output[0]} 2> >(tee {output[1]})'

rule compress_unpaired_fastq:
input: 'Reads/ctrl{indiv}.reads.fq'
output: 'Reads/ctrl{indiv}.reads.fq.gz'
wildcard_constraints: indiv='\d+'
input: 'Reads/{indiv}.reads.fq'
output: 'Reads/{indiv}.reads.fq.gz'
threads: 36
message: 'Compress unpaired reads.'
shell: 'bgzip -@ {threads} -c {input} > {output}'
Expand All @@ -97,7 +67,7 @@ rule compress_unpaired_fastq:
rule qc:
input: 'Reads/{prefix}.reads.fq.gz'
output:
pipe('Reads/{prefix}.qc.reads.fq'),
pipe('Reads/{prefix}.reads.qc.fq'),
'Logs/{prefix}-fastp.json',
'Logs/{prefix}-fastp.html',
'Logs/{prefix}-qc.log'
Expand All @@ -106,8 +76,8 @@ rule qc:
shell: 'fastp -i {input} --interleaved_in --stdout -p --thread {threads} --json {output[1]} --html {output[2]} -q 15 -u 40 -l 15 > {output[0]} 2> >(tee {output[3]})'

rule compress_qc_reads:
input: 'Reads/{prefix}.qc.reads.fq'
output: 'Reads/{prefix}.qc.reads.fq.gz'
input: 'Reads/{prefix}.reads.qc.fq'
output: 'Reads/{prefix}.reads.qc.fq.gz'
threads: 56
message: 'Compress quality controlled reads.'
shell: 'bgzip -@ {threads} -c {input} > {output}'
Expand All @@ -118,7 +88,7 @@ rule compress_qc_reads:
# -----------------------------------------------------------------------------

rule trusted_kmers:
input: expand('Reads/{prefix}.qc.reads.fq.gz', prefix=SAMPLES)
input: expand('Reads/{prefix}.reads.qc.fq.gz', prefix=SAMPLES)
output:
'lighter.trustedkmers',
'Logs/trustedkmers.log'
Expand All @@ -128,10 +98,10 @@ rule trusted_kmers:

rule ec:
input:
'Reads/{prefix}.qc.reads.fq.gz',
'Reads/{prefix}.reads.qc.fq.gz',
'lighter.trustedkmers'
output:
'Reads/{prefix}.ec.reads.fq.gz',
'Reads/{prefix}.reads.ec.fq.gz',
'Logs/{prefix}-ec.log'
threads: 72
message: 'Perform error correction sample by sample.'
Expand Down
22 changes: 15 additions & 7 deletions kevlar/workflows/mark-I/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
# *De novo* variant discovery workflow: Mark I

Make a copy of `config.json` and replace the existing filenames with the full paths of your input filenames.
An arbitrary number of Fasta/Fastq files is supported for each sample.
Paired-end reads are not required, but pairing information will be retained if provided for the case/proband sample (pairing info is always ignored for control samples).
The complete simplex *de novo* variant discovery workflow can be executed manually step-by-step, or as a single coordinated Snakemake workflow.
To invoke the Snakemake workflow:

The other parameters are reasonable for error-corrected human samples sequenced at ≈30x coverage.
Without error correction much more memory will be required, or an alternative workflow will be required.
kevlar will be accurate even when the *k*-mer counting false-positive rate (FPR) is fairly high in the case sample, as this is corrected in subsequent steps.
Sensitivity will be lost if the *k*-mer counting FPR is too high in the control samples, however.
- make sure you have Snakemake 5.0 or greater installed
- copy `kevlar/workflows/mark-I/Snakefile` and `kevlar/workflows/mark-I/config.json` to your working directory
- modify the copy of `config.json` and replace the existing filenames with the full paths of your input files
- execute the workflow with the `snakemake` command, using the examples below as a guide

```
# Do a dry run to make sure there are no warnings or errors.
Expand All @@ -16,3 +15,12 @@ snakemake --configfile myconfig.json --snakefile kevlar/workflows/mark-I/Snakefi
# If everything looks good, go ahead and execute the procedure.
snakemake --configfile myconfig.json --snakefile kevlar/workflows/mark-I/Snakefile --cores 64 --directory /path/to/desired/work/dir/ --printshellcmds calls
```

Some notes on workflow configuration:

- An arbitrary number of Fasta/Fastq files is supported for each sample.
- Paired-end reads are not required, and any pairing information will be ignored.
- Parameters are tuned for error-corrected whole genome shotgun sequencing of human samples at ≈30x coverage. Without error correction, much more memory — or an alternative workflow — will be required.
- kevlar will be accurate even when the *k*-mer counting false-positive rate (FPR) is fairly high in the case sample, as this is corrected in subsequent steps. Sensitivity will be lost if the *k*-mer counting FPR is too high in the control samples, however.
- The mask should include your reference genome and any potential sources of contamination. For example, UniVec includes many sources of technical contamination (such as adapters) that often cause problems.
- Accuracy can often be improved by filtering preliminary calls. Use the `varfilter` setting to provide a BED file of intervals from which to filter out variant predictions (by default, `varfilter: null` will disable this step). It's common to filter out variant calls in segmental duplications or SSRs, for example, as these are usually problematic. Also, filtering out common variants (using dbSNP, for example) will successfully remove inherited variants that are erroneously classified as *de novo* due to low coverage in the donor parent.
28 changes: 26 additions & 2 deletions kevlar/workflows/mark-I/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,20 @@ class SimplexDesign(object):
yield control['label']
return list(getlabels())

@property
def varfilter_input(self):
infiles = list()
if self._config['varfilter']:
infiles.append(self._config['varfilter'])
return infiles + expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))

@property
def simlike_input(self):
if self._config['varfilter']:
return ['calls.filtered.vcf.gz']
else:
return expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))


simplex = SimplexDesign(config)

Expand Down Expand Up @@ -324,18 +338,28 @@ rule call:
shell: 'kevlar --tee --logfile {output[1]} call --mask-mem 10M --refr {input[2]} --ksize {config[ksize]} --out {output[0]} {input[0]} {input[1]}'


rule varfilter:
input: simplex.varfilter_input
output:
'calls.filtered.vcf.gz',
'Logs/varfilter.log'
threads: 1
message: 'Filter out calls from the provided regions (usually common variants, repetitive DNA, etc)'
shell: 'kevlar --tee --logfile {output[1]} varfilter --out {output[0]} {input}'


rule like_scores:
input:
'Reference/refr-counts.smallcounttable',
simplex.all_sample_counts,
expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))
simplex.simlike_input
output:
'calls.scored.sorted.vcf.gz',
'Logs/simlike.log'
threads: 1
message: 'Filter calls, compute likelihood scores, and sort calls by score.'
run:
in_vcfs = expand('calls.{num}.prelim.vcf.gz', num=range(config['numsplit']))
in_vcfs = simplex.simlike_input
cmd = [
'kevlar', '--tee', '--logfile', output[1], 'simlike',
'--mu', config['samples']['coverage']['mean'],
Expand Down
3 changes: 2 additions & 1 deletion kevlar/workflows/mark-I/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,6 @@
"delta": 50,
"seqpattern": ".",
"maxdiff": 10000
}
},
"varfilter": "/data/refr/dbSNP_common.bed"
}

0 comments on commit 6fa2623

Please sign in to comment.