Skip to content

Commit

Permalink
Extend variant-spanning-kmer-mask capability to kevlar call (#302)
Browse files Browse the repository at this point in the history
This update extends the ability to create a mask of variant-spanning k-mers to `kevlar call` via the `--gen-mask` option, This was previously introduced to `kevlar alac` only. Both implementations now actually report if the FPR is higher than the user-supplied max FPR.

Closes #295.
  • Loading branch information
standage committed Nov 16, 2018
1 parent 151b4f1 commit 5fc24f5
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 6 deletions.
6 changes: 6 additions & 0 deletions kevlar/alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ def alac(pstream, refrfile, threads=1, ksize=31, maxreads=10000, delta=50,
window = call.attribute('ALTWINDOW')
if window is not None and len(window) >= ksize:
mask.consume(window)
fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
if fpr > maskmaxfpr:
message = 'WARNING: mask FPR is {:.4f}'.format(fpr)
message += '; exceeds user-specified limit'
message += ' of {:.4f}'.format(maskmaxfpr)
print('[kevlar::alac]', message, file=logstream)
mask.save(maskfile)
for call in calls:
yield call
Expand Down
21 changes: 21 additions & 0 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from kevlar.reference import bwa_align
from kevlar.varmap import VariantMapping
from kevlar.vcf import VariantFilter as vf
import khmer
from khmer import _buckets_per_byte


def align_mates(record, refrfile):
Expand Down Expand Up @@ -156,6 +158,13 @@ def main(args):
gdnastream = kevlar.parse_partitioned_reads(
kevlar.reference.load_refr_cutouts(kevlar.open(args.targetseq, 'r'))
)
mask = None
if args.gen_mask:
message = 'generating mask of variant-spanning k-mers'
print('[kevlar::call]', message, file=args.logfile)
ntables = 4
buckets = args.mask_mem * _buckets_per_byte['nodegraph'] / ntables
mask = khmer.Nodetable(args.ksize, buckets, ntables)
progress_indicator = kevlar.ProgressIndicator(
'[kevlar::call] processed contigs/gDNAs for {counter} partitions',
interval=10, breaks=[100, 1000, 10000], logstream=args.logfile,
Expand All @@ -170,4 +179,16 @@ def main(args):
logstream=args.logfile
)
for varcall in caller:
if args.gen_mask:
window = varcall.attribute('ALTWINDOW')
if window is not None and len(window) >= args.ksize:
mask.consume(window)
writer.write(varcall)
if args.gen_mask:
fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
if fpr > args.mask_max_fpr:
message = 'WARNING: mask FPR is {:.4f}'.format(fpr)
message += '; exceeds user-specified limit'
message += ' of {:.4f}'.format(args.mask_max_fpr)
print('[kevlar::call]', message, file=args.logfile)
mask.save(args.gen_mask)
15 changes: 15 additions & 0 deletions kevlar/cli/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import khmer
from khmer import khmer_args


def subparser(subparsers):
"""Define the `kevlar call` command-line interface."""
Expand All @@ -29,6 +32,18 @@ def subparser(subparsers):
score_args.add_argument('-E', '--extend', type=int, default=0, metavar='E',
help='gap extension penalty; default is 0')

mask_args = subparser.add_argument_group('Mask generation settings')
mask_args.add_argument('--gen-mask', metavar='FILE', help='generate '
'a nodetable containing all k-mers that span any '
'variant call')
mask_args.add_argument('--mask-mem', type=khmer_args.memory_setting,
default=1e6, metavar='MEM',
help='memory to allocate for the node table')
mask_args.add_argument('--mask-max-fpr', type=float, default=0.01,
metavar='FPR', help='terminate if the estimated '
'false positive rate is higher than "FPR"; default '
'is 0.01')

misc_args = subparser.add_argument_group('Miscellaneous settings')
misc_args.add_argument('-h', '--help', action='help',
help='show this help message and exit')
Expand Down
Binary file added kevlar/tests/data/fiveparts.gdnas.fa.gz
Binary file not shown.
20 changes: 18 additions & 2 deletions kevlar/tests/test_alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
# -----------------------------------------------------------------------------

import filecmp
import kevlar
from kevlar.tests import data_file
import pytest
from tempfile import NamedTemporaryFile
import sys
import kevlar
from kevlar.tests import data_file


def test_pico_4(capsys):
Expand Down Expand Up @@ -186,6 +186,22 @@ def test_alac_generate_mask():
assert filecmp.cmp(testfilename, maskfile.name) is True


def test_alac_generate_mask_lowmem(capsys):
readfile = data_file('fiveparts.augfastq.gz')
refrfile = data_file('fiveparts-refr.fa.gz')
readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
partstream = kevlar.parse_partitioned_reads(readstream)
with NamedTemporaryFile(suffix='.nt') as maskfile:
calls = list(
kevlar.alac.alac(partstream, refrfile, maskfile=maskfile.name,
maskmem=100, logstream=sys.stderr)
)
assert len(calls) == 5
out, err = capsys.readouterr()
message = 'WARNING: mask FPR is 0.8065; exceeds user-specified limit'
assert message in out or message in err


def test_alac_matedist():
readfile = data_file('mate-dist/cc130.augfastq.gz')
refrfile = data_file('mate-dist/cc130.refr.fa.gz')
Expand Down
39 changes: 35 additions & 4 deletions kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
# -----------------------------------------------------------------------------


import sys
import khmer
import filecmp
import kevlar
from kevlar.call import call
from kevlar.sequence import Record
from kevlar.tests import data_file
import pytest
from tempfile import NamedTemporaryFile


def test_align():
Expand Down Expand Up @@ -127,7 +127,7 @@ def test_funky_cigar(part, coord, window):

calls = list(call(targets, contigs))
assert len(calls) == 1
print('DEBUG', calls[0].vcf, file=sys.stderr)
print('DEBUG', calls[0].vcf)
assert calls[0].seqid == '17'
assert calls[0].position == coord - 1
assert calls[0].attribute('ALTWINDOW') == window
Expand Down Expand Up @@ -220,7 +220,7 @@ def test_align_mates():
positions = list(kevlar.call.align_mates(record, refrfile))
seqids = set([seqid for seqid, start, end in positions])
coords = sorted([(start, end) for seqid, start, end in positions])
print('DEBUG', coords, file=sys.stderr)
print('DEBUG', coords)
assert seqids == set(['seq1'])
assert coords == [
(45332, 45432), (45377, 45477), (45393, 45493), (45428, 45528),
Expand Down Expand Up @@ -274,3 +274,34 @@ def test_debug_mode(capsys):
out, err = capsys.readouterr()
alignstr = kevlar.open(data_file('wasp-align.txt'), 'r').read().strip()
assert alignstr in err


def test_call_generate_mask():
contigfile = data_file('fiveparts.contigs.augfasta.gz')
gdnafile = data_file('fiveparts.gdnas.fa.gz')
refrfile = data_file('fiveparts-refr.fa.gz')
with NamedTemporaryFile(suffix='.nt') as maskfile:
arglist = [
'call', '--gen-mask', maskfile.name, '--mask-mem', '1M',
'--refr', refrfile, contigfile, gdnafile
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.call.main(args)
testfilename = data_file('fiveparts-genmask.nodetable')
assert filecmp.cmp(testfilename, maskfile.name) is True


def test_call_generate_mask_lowmem(capsys):
contigfile = data_file('fiveparts.contigs.augfasta.gz')
gdnafile = data_file('fiveparts.gdnas.fa.gz')
refrfile = data_file('fiveparts-refr.fa.gz')
with NamedTemporaryFile(suffix='.nt') as maskfile:
arglist = [
'call', '--gen-mask', maskfile.name, '--mask-mem', '100',
'--refr', refrfile, contigfile, gdnafile
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.call.main(args)
out, err = capsys.readouterr()
message = 'WARNING: mask FPR is 0.8065; exceeds user-specified limit'
assert message in out or message in err

0 comments on commit 5fc24f5

Please sign in to comment.