Skip to content

Commit

Permalink
Merge pull request #277 from dib-lab/count/mask
Browse files Browse the repository at this point in the history
Counter size and mask support for kevlar count
  • Loading branch information
standage committed Jul 10, 2018
2 parents 09abff0 + a4849ee commit ab27713
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 12 deletions.
13 changes: 13 additions & 0 deletions kevlar/cli/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,26 @@ def subparser(subparsers):

subparser.add_argument('-k', '--ksize', type=int, default=31, metavar='K',
help='k-mer size; default is 31')
subparser.add_argument('-c', '--counter-size', type=int, choices=(1, 4, 8),
metavar='C', default=8, help='number of bits to '
'allocate for counting each k-mer; options are 1 '
'(max count: 1), 4 (max count: 15), and 8 (max '
'count: 255); default is 8)')
subparser.add_argument('-M', '--memory', type=khmer_args.memory_setting,
default=1e6, metavar='MEM',
help='memory to allocate for the count table')
subparser.add_argument('--max-fpr', type=float, default=0.2,
metavar='FPR', help='terminate if the estimated '
'false positive rate for any sample is higher than '
'"FPR"; default is 0.2')
subparser.add_argument('--mask', metavar='MSK', help='counttable or '
'nodetable of k-mers to ignore when counting '
'k-mers')
subparser.add_argument('--count-masked', action='store_true',
help='by default, when a mask is provided k-mers '
'in the mask are ignored; this setting inverts the '
'behavior so that only k-mers in the mask are '
'counted')
subparser.add_argument('--num-bands', type=int, metavar='N', default=None,
help='number of bands into which to divide the '
'hashed k-mer space')
Expand Down
36 changes: 25 additions & 11 deletions kevlar/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,36 +12,44 @@
import sys
import khmer
import kevlar
from kevlar.sketch import allocate, get_extension


def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2,
mask=None, maskmaxabund=1, numbands=None, band=None,
def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2, count=True,
smallcount=False, mask=None, maskmaxabund=0,
consume_masked=False, numbands=None, band=None,
outfile=None, numthreads=1, logfile=sys.stderr):
"""
Compute k-mer abundances for the specified sequence input.
"""Compute k-mer abundances for the specified sequence input.
Expected input is a list of one or more FASTA/FASTQ files corresponding
to a single sample. A counttable is created and populated with abundances
to a single sample. A sketch is created and populated with abundances
of all k-mers observed in the input. If `mask` is provided, only k-mers not
present in the mask will be loaded.
"""
sketch = khmer.Counttable(ksize, memory / 4, 4)
sketch = allocate(ksize, memory / 4, count=count, smallcount=smallcount)
numreads = 0
for seqfile in seqfiles:
print('[kevlar::count] loading from', seqfile, file=logfile)
parser = khmer.ReadParser(seqfile)
threads = list()
for _ in range(numthreads):
if mask:
threshold = 1 if consume_masked else maskmaxabund
kwargs = {
'consume_masked': consume_masked,
'threshold': threshold
}
if numbands:
thread = threading.Thread(
target=sketch.consume_seqfile_banding_with_mask,
args=(parser, numbands, band, mask, ),
kwargs=kwargs,
)
else:
thread = threading.Thread(
target=sketch.consume_seqfile_with_mask,
args=(parser, mask, ),
kwargs=kwargs,
)
else:
if numbands:
Expand Down Expand Up @@ -74,8 +82,9 @@ def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2,
raise kevlar.sketch.KevlarUnsuitableFPRError(message)

if outfile:
if not outfile.endswith(('.ct', '.counttable')):
outfile += '.counttable'
extensions = get_extension(count=count, smallcount=smallcount)
if not outfile.endswith(extensions):
outfile += extensions[1]
sketch.save(outfile)
message += ';\n saved to "{:s}"'.format(outfile)
print('[kevlar::count] ', message, file=logfile)
Expand All @@ -87,14 +96,19 @@ def main(args):
if (args.num_bands is None) is not (args.band is None):
raise ValueError('Must specify --num-bands and --band together')
myband = args.band - 1 if args.band else None
if args.mask:
args.mask = kevlar.sketch.load(args.mask)

timer = kevlar.Timer()
timer.start()

docount = args.counter_size > 1
dosmallcount = args.counter_size == 4
sketch = load_sample_seqfile(
args.seqfile, args.ksize, args.memory, args.max_fpr,
numbands=args.num_bands, band=myband, numthreads=args.threads,
outfile=args.counttable, logfile=args.logfile
args.seqfile, args.ksize, args.memory, args.max_fpr, count=docount,
smallcount=dosmallcount, mask=args.mask,
consume_masked=args.count_masked, numbands=args.num_bands, band=myband,
numthreads=args.threads, outfile=args.counttable, logfile=args.logfile
)

total = timer.stop()
Expand Down
28 changes: 28 additions & 0 deletions kevlar/sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,30 @@
'.smallcountgraph': khmer.SmallCountgraph.load,
}

# count(?)->graph(?)->small(?)
sketch_extensions_by_trait = {
True: {
True: {
True: ('.scg', '.smallcountgraph'),
False: ('.cg', '.countgraph'),
},
False: {
True: ('.sct', '.smallcounttable'),
False: ('.ct', '.counttable'),
},
},
False: {
True: {
True: ('.ng', '.nodegraph'),
False: ('.ng', '.nodegraph'),
},
False: {
True: ('.nt', '.nodetable'),
False: ('.nt', '.nodetable'),
},
},
}


class KevlarSketchTypeError(ValueError):
pass
Expand Down Expand Up @@ -68,6 +92,10 @@ def load(filename):
return loadfunc(filename)


def get_extension(count=False, graph=False, smallcount=False):
return sketch_extensions_by_trait[count][graph][smallcount]


def allocate(ksize, target_tablesize, num_tables=4, count=False, graph=False,
smallcount=False):
"""Convenience function for allocating memory for a new sketch."""
Expand Down
73 changes: 72 additions & 1 deletion kevlar/tests/test_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@
# -----------------------------------------------------------------------------

import glob
import os
import pytest
import re
from tempfile import NamedTemporaryFile
import screed
import time
import kevlar
from khmer import Nodetable
from kevlar.tests import data_file, data_glob


Expand Down Expand Up @@ -106,7 +109,8 @@ def test_effcount_smoketest():
'--sample', data_file('trio1/ctrl2.fq'),
'--sample', data_file('trio1/case2.fq'),
'--ksize', '21', '--memory', '200K', '--memfrac', '0.005',
'--max-fpr', '0.1', '--threads', '2', o1.name, o2.name, o3.name
'--max-fpr', '0.1', '--max-abund', '0', '--threads', '2',
o1.name, o2.name, o3.name
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.effcount.main(args)
Expand Down Expand Up @@ -140,3 +144,70 @@ def test_effcount_problematic():
with pytest.raises(ValueError) as ve:
kevlar.effcount.main(args)
assert 'Must specify --num-bands and --band together' in str(ve)


@pytest.mark.parametrize('count,smallcount,extension,shortext', [
(True, True, '.smallcounttable', '.sct'),
(True, False, '.counttable', '.ct'),
(False, True, '.nodetable', '.nt'),
(False, False, '.nodetable', '.nt'),
])
def test_load_sample_seqfile(count, smallcount, extension, shortext):
infile = data_file('bogus-genome/refr.fa')
with NamedTemporaryFile() as outfile:
sketch = kevlar.count.load_sample_seqfile(
[infile], 21, 1e6, count=count, smallcount=smallcount,
outfile=outfile.name
)
assert sketch.get('GAATCGGTGGCTGGTTGCCGT') > 0
assert sketch.get('GATTACAGATTACAGATTACA') == 0
assert os.path.exists(outfile.name + extension)

with NamedTemporaryFile(suffix=shortext) as outfile:
sketch = kevlar.count.load_sample_seqfile(
[infile], 21, 1e6, count=count, smallcount=smallcount,
outfile=outfile.name
)
assert sketch.get('GAATCGGTGGCTGGTTGCCGT') > 0
assert sketch.get('GATTACAGATTACAGATTACA') == 0
assert not os.path.exists(outfile.name + extension)
assert os.path.exists(outfile.name)


@pytest.mark.parametrize('count,smallcount,count_masked,kpresent,kabsent', [
(True, True, True, 'CACCAATCCGTACGGAGAGCC', 'GAATCGGTGGCTGGTTGCCGT'),
(True, False, True, 'CACCAATCCGTACGGAGAGCC', 'GAATCGGTGGCTGGTTGCCGT'),
(False, True, True, 'CACCAATCCGTACGGAGAGCC', 'GAATCGGTGGCTGGTTGCCGT'),
(False, False, True, 'CACCAATCCGTACGGAGAGCC', 'GAATCGGTGGCTGGTTGCCGT'),
(True, True, False, 'GAATCGGTGGCTGGTTGCCGT', 'CACCAATCCGTACGGAGAGCC'),
(True, False, False, 'GAATCGGTGGCTGGTTGCCGT', 'CACCAATCCGTACGGAGAGCC'),
(False, True, False, 'GAATCGGTGGCTGGTTGCCGT', 'CACCAATCCGTACGGAGAGCC'),
(False, False, False, 'GAATCGGTGGCTGGTTGCCGT', 'CACCAATCCGTACGGAGAGCC'),
])
def test_load_sample_seqfile_withmask(count, smallcount, count_masked,
kpresent, kabsent):
mask = Nodetable(21, 1e4, 4)
mask.consume('CACCAATCCGTACGGAGAGCCGTATATATAGACTGCTATACTATTGGATCGTACGGGGC')
sketch = kevlar.count.load_sample_seqfile(
[data_file('bogus-genome/refr.fa')], 21, 1e6, mask=mask,
consume_masked=count_masked, count=count, smallcount=smallcount,
)
assert sketch.get(kpresent) > 0
assert sketch.get(kabsent) == 0
assert sketch.get('GATTACAGATTACAGATTACA') == 0


def test_count_cli_with_mask(capsys):
mask = Nodetable(21, 1e4, 4)
mask.consume('CACCAATCCGTACGGAGAGCCGTATATATAGACTGCTATACTATTGGATCGTACGGGGC')
with NamedTemporaryFile(suffix='.nt') as maskfile, \
NamedTemporaryFile(suffix='.sct') as countfile:
mask.save(maskfile.name)
arglist = [
'count', '--ksize', '21', '--mask', maskfile.name,
'--memory', '1M', countfile.name, data_file('bogus-genome/refr.fa')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.count.main(args)
out, err = capsys.readouterr()
assert '36898 distinct k-mers stored' in err
8 changes: 8 additions & 0 deletions kevlar/tests/test_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest
import khmer
import kevlar
from kevlar.sketch import get_extension
from kevlar.tests import data_file, data_glob


Expand Down Expand Up @@ -97,3 +98,10 @@ def test_load_sketches_fpr_fail():
with pytest.raises(kevlar.sketch.KevlarUnsuitableFPRError) as e:
sketches = kevlar.sketch.load_sketchfiles(infiles, maxfpr=0.001)
assert 'FPR too high, bailing out!!!' in str(e)


def test_get_extensions():
assert get_extension() == ('.nt', '.nodetable')
assert get_extension(count=True) == ('.ct', '.counttable')
assert get_extension(count=True, smallcount=True) == ('.sct',
'.smallcounttable')

0 comments on commit ab27713

Please sign in to comment.