Skip to content

Commit

Permalink
Generate a mask from variant-spanning k-mers (#289)
Browse files Browse the repository at this point in the history
The two stages of the kevlar workflow that are currently the most memory-intensive, and therefore the most expensive, are the novel k-mer finding and the likelihood calculations. Both stages currently require loading k-mer counts for all 3-4 samples into memory.

We can drastically reduce memory during the novel k-mer finding by 1) performing error correction on the reads; and 2) reducing the CountMin sketch sizes. While 2) will reduce the accuracy of the k-mer abundance estimates, this can be overcome during the subsequent filtering step.

However, we still need accurate k-mer abundances for all samples while calculating variant likelihoods. This update adds a feature to kevlar alac which creates a mask of all variant-spanning k-mers. This allows the user to exchange additional runtime (recount the k-mers) for a drastic reduction in memory required for the final step of the pipeline.
  • Loading branch information
standage committed Oct 30, 2018
1 parent c64eaf1 commit de712ec
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 1 deletion.
14 changes: 13 additions & 1 deletion kevlar/alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from kevlar.assemble import assemble_fml_asm
from kevlar.localize import localize
from kevlar.call import call
import khmer


def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,
Expand Down Expand Up @@ -73,7 +74,7 @@ def make_call_from_reads(queue, idx, calls, refrfile, ksize=31, delta=50,
def alac(pstream, refrfile, threads=1, ksize=31, bigpart=10000, delta=50,
seedsize=31, maxdiff=None, inclpattern=None, exclpattern=None,
match=1, mismatch=2, gapopen=5, gapextend=0, min_ikmers=None,
logstream=sys.stderr):
maskfile=None, maskmem=1e6, maskmaxfpr=0.01, logstream=sys.stderr):
part_queue = Queue(maxsize=max(32, 12 * threads))

refrstream = kevlar.open(refrfile, 'r')
Expand Down Expand Up @@ -106,6 +107,17 @@ def alac(pstream, refrfile, threads=1, ksize=31, bigpart=10000, delta=50,

part_queue.join()
allcalls = sorted(chain(*call_lists), key=lambda c: (c.seqid, c.position))
if maskfile:
message = 'generating mask of variant-spanning k-mers'
print('[kevlar::alac]', message, file=logstream)
numtables = 4
buckets = maskmem * khmer._buckets_per_byte['nodegraph'] / numtables
mask = khmer.Nodetable(ksize, buckets, numtables)
for call in allcalls:
window = call.attribute('ALTWINDOW')
if window is not None:
mask.consume(window)
mask.save(maskfile)
for call in allcalls:
yield call

Expand Down
14 changes: 14 additions & 0 deletions kevlar/cli/alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import khmer
from khmer import khmer_args
import re


Expand Down Expand Up @@ -58,6 +60,18 @@ def subparser(subparsers):
score_args.add_argument('-E', '--extend', type=int, default=0, metavar='E',
help='gap extension penalty; default is 0')

mask_args = subparser.add_argument_group('Mask generation settings')
mask_args.add_argument('--gen-mask', metavar='FILE', help='generate '
'a nodetable containing all k-mers that span any '
'variant call')
mask_args.add_argument('--mask-mem', type=khmer_args.memory_setting,
default=1e6, metavar='MEM',
help='memory to allocate for the node table')
mask_args.add_argument('--mask-max-fpr', type=float, default=0.01,
metavar='FPR', help='terminate if the estimated '
'false positive rate is higher than "FPR"; default '
'is 0.01')

misc_args = subparser.add_argument_group('Miscellaneous settings')
misc_args.add_argument('-h', '--help', action='help',
help='show this help message and exit')
Expand Down
Binary file added kevlar/tests/data/fiveparts-genmask.nodetable
Binary file not shown.
19 changes: 19 additions & 0 deletions kevlar/tests/test_alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import filecmp
import pytest
from tempfile import NamedTemporaryFile
import sys
import kevlar
from kevlar.tests import data_file
Expand Down Expand Up @@ -166,6 +168,23 @@ def test_alac_bigpart():
assert len(calls) == 3


def test_alac_generate_mask():
readfile = data_file('fiveparts.augfastq.gz')
refrfile = data_file('fiveparts-refr.fa.gz')
readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
partstream = kevlar.parse_partitioned_reads(readstream)
with NamedTemporaryFile(suffix='.nt') as maskfile:
calls = list(
kevlar.alac.alac(partstream, refrfile, maskfile=maskfile.name,
maskmem=1e6)
)
assert len(calls) == 5
for c in calls:
print(c.vcf)
testfilename = data_file('fiveparts-genmask.nodetable')
assert filecmp.cmp(testfilename, maskfile.name) is True


def test_alac_matedist():
readfile = data_file('mate-dist/cc130.augfastq.gz')
refrfile = data_file('mate-dist/cc130.refr.fa.gz')
Expand Down

0 comments on commit de712ec

Please sign in to comment.