Skip to content

Commit

Permalink
Merge pull request #220 from dib-lab/feature/novel-save-counts
Browse files Browse the repository at this point in the history
Add option to save counts to kevlar novel and kevlar simplex
  • Loading branch information
standage committed Mar 16, 2018
2 parents 20c4a6a + 90a49e8 commit ccbf343
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 13 deletions.
17 changes: 14 additions & 3 deletions kevlar/cli/novel.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ def subparser(subparsers):
kevlar novel --out output.augfastq \\
--case proband1.fq --case proband2.fq \\
--control control1a.fq control1b.fq \\
--control control2a.fq control2b.fq"""
--control control2a.fq control2b.fq \\
--save-case-counts p1.ct p2.ct --save-ctrl-counts c1.ct c2.ct"""
epilog = textwrap.dedent(epilog)
subparser = subparsers.add_parser(
'novel', description=desc, epilog=epilog, add_help=False,
Expand Down Expand Up @@ -125,13 +126,23 @@ def subparser(subparsers):
help='a number between 1 and N (inclusive) '
'indicating the band to be processed')

out_args = subparser.add_argument_group('Output settings')
out_args.add_argument('-o', '--out', metavar='FILE',
help='file to which interesting reads will be '
'written; default is terminal (stdout)')
out_args.add_argument('--save-case-counts', metavar='CT', nargs='+',
help='save the computed k-mer counts for each case '
'sample to the specified count table file(s)')
out_args.add_argument('--save-ctrl-counts', metavar='CT', nargs='+',
help='save the computed k-mer counts for each '
'control sample to the specified count table '
'file(s)')

misc_args = subparser.add_argument_group('Miscellaneous settings')
misc_args.add_argument('-h', '--help', action='help',
help='show this help message and exit')
misc_args.add_argument('-k', '--ksize', type=int, default=31, metavar='K',
help='k-mer size; default is 31')
misc_args.add_argument('-o', '--out', metavar='FILE',
help='output file; default is terminal (stdout)')
misc_args.add_argument('--upint', type=float, default=1e6, metavar='INT',
help='update interval for log messages; default is '
'1000000 (1 update message per millon reads)')
Expand Down
3 changes: 3 additions & 0 deletions kevlar/cli/simplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from argparse import SUPPRESS
from khmer import khmer_args


Expand Down Expand Up @@ -163,3 +164,5 @@ def subparser(subparsers):
'-k', '--ksize', type=int, default=31, metavar='K', help='k-mer size; '
'default is 31'
)
misc_args.add_argument('--save-case-counts', nargs='+', help=SUPPRESS)
misc_args.add_argument('--save-ctrl-counts', nargs='+', help=SUPPRESS)
30 changes: 30 additions & 0 deletions kevlar/novel.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,33 @@ def load_samples(counttables=None, filelists=None, ksize=31, memory=1e6,
return samples


def save_counts(filelist, tablelist, logstream=sys.stderr):
if len(filelist) != len(tablelist):
msg = 'number of filenames provided ({:d})'.format(len(filelist))
msg += 'does not match the number of '
msg += 'samples provided ({:d})'.format(len(tablelist))
msg += '; stubbornly refusing to save k-mer counts'
print('[kevlar::novel] WARNING:', msg, file=logstream)
return
for outfile, counttable in zip(filelist, tablelist):
if not outfile.endswith(('.ct', '.counttable')):
outfile += '.counttable'
print(' ', outfile, file=logstream)
counttable.save(outfile)


def save_all_counts(casecounts, casefiles, ctrlcounts, ctrlfiles,
logstream=sys.stderr):
if casefiles:
message = 'saving k-mer counts for case samples'
print('[kevlar::novel]', message, file=logstream)
save_counts(casefiles, casecounts, logstream=logstream)
if ctrlfiles:
message = 'saving k-mer counts for control samples'
print('[kevlar::novel]', message, file=logstream)
save_counts(ctrlfiles, ctrlcounts, logstream=logstream)


def novel(casestream, casecounts, controlcounts, ksize=31, abundscreen=None,
casemin=5, ctrlmax=0, numbands=None, band=None, skipuntil=None,
updateint=10000, logstream=sys.stderr):
Expand Down Expand Up @@ -190,6 +217,9 @@ def main(args):
print('[kevlar::novel] All samples loaded in {:.2f} sec'.format(elapsed),
file=args.logfile)

save_all_counts(cases, args.save_case_counts, controls,
args.save_ctrl_counts, logstream=args.logfile)

timer.start('iter')
ncases = len(args.case)
message = 'Iterating over reads from {:d} case sample(s)'.format(ncases)
Expand Down
4 changes: 4 additions & 0 deletions kevlar/simplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ def main(args):
args.mask_files, args.ksize, args.mask_memory, maxfpr=args.filter_fpr,
logstream=args.logfile
)
kevlar.novel.save_all_counts(
cases, args.save_case_counts, controls, args.save_ctrl_counts,
logstream=args.logfile
)

outstream = kevlar.open(args.out, 'w')
caserecords = kevlar.multi_file_iter_screed(args.case)
Expand Down
74 changes: 65 additions & 9 deletions kevlar/tests/test_novel.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import filecmp
import glob
import json
import pytest
Expand All @@ -16,6 +17,7 @@
import screed
from shutil import rmtree
import kevlar
from kevlar.tests import data_file
from khmer import Counttable


Expand Down Expand Up @@ -87,7 +89,7 @@ def test_assumptions(kmer):
])
def test_novel_single_mutation(case, ctrl, mem, capsys):
from sys import stdout, stderr
casestr = kevlar.tests.data_file(case)
casestr = data_file(case)
ctrls = kevlar.tests.data_glob(ctrl)
arglist = ['novel', '--case', casestr, '--ksize', '13', '--case-min', '8',
'--control', ctrls[0], '--control', ctrls[1],
Expand Down Expand Up @@ -180,8 +182,8 @@ def test_iter_read_multi_file():


def test_novel_abund_screen(capsys):
case = kevlar.tests.data_file('screen-case.fa')
ctrl = kevlar.tests.data_file('screen-ctrl.fa')
case = data_file('screen-case.fa')
ctrl = data_file('screen-ctrl.fa')
arglist = ['novel', '--ksize', '25', '--ctrl-max', '1', '--case-min', '8',
'--case', case, '--control', ctrl, '--abund-screen', '3']
args = kevlar.cli.parser().parse_args(arglist)
Expand All @@ -193,7 +195,7 @@ def test_novel_abund_screen(capsys):

def test_skip_until(capsys):
readname = 'bogus-genome-chr1_115_449_0:0:0_0:0:0_1f4/1'
case = kevlar.tests.data_file('trio1/case1.fq')
case = data_file('trio1/case1.fq')
ctrls = kevlar.tests.data_glob('trio1/ctrl[1,2].fq')
arglist = ['novel', '--ctrl-max', '0', '--case-min', '6',
'--skip-until', readname, '--upint', '50',
Expand All @@ -219,11 +221,11 @@ def test_skip_until(capsys):


def test_novel_output_has_mates():
kid = kevlar.tests.data_file('minitrio/trio-proband.fq.gz')
mom = kevlar.tests.data_file('minitrio/trio-mother.fq.gz')
dad = kevlar.tests.data_file('minitrio/trio-father.fq.gz')
testnovel = kevlar.tests.data_file('minitrio/novel.augfastq.gz')
testmates = kevlar.tests.data_file('minitrio/novel-mates.fastq.gz')
kid = data_file('minitrio/trio-proband.fq.gz')
mom = data_file('minitrio/trio-mother.fq.gz')
dad = data_file('minitrio/trio-father.fq.gz')
testnovel = data_file('minitrio/novel.augfastq.gz')
testmates = data_file('minitrio/novel-mates.fastq.gz')

with NamedTemporaryFile(suffix='.augfastq') as novelfile:
arglist = [
Expand All @@ -248,3 +250,57 @@ def test_novel_output_has_mates():
stream = kevlar.parse_augmented_fastx(kevlar.open(testmates, 'r'))
test_mate_seqs = set([r.sequence for r in stream])
assert mate_seqs == test_mate_seqs


def test_novel_save_counts():
outdir = mkdtemp()
try:
for ind in ('father', 'mother', 'proband'):
outfile = '{:s}/{:s}.ct'.format(outdir, ind)
infile = data_file('minitrio/trio-{:s}.fq.gz'.format(ind))
arglist = ['count', '--ksize', '27', '--memory', '5M', outfile,
infile]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.count.main(args)

arglist = [
'novel', '--ksize', '27', '--out', outdir + '/novel.augfastq.gz',
'--save-case-counts', outdir + '/kid.ct', '--save-ctrl-counts',
outdir + '/mom.ct', outdir + '/dad.ct', '--case',
data_file('minitrio/trio-proband.fq.gz'),
'--control', data_file('minitrio/trio-mother.fq.gz'),
'--control', data_file('minitrio/trio-father.fq.gz'),
'--memory', '5M'
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.novel.main(args)

counts = ('father', 'mother', 'proband')
testcounts = ('dad', 'mom', 'kid')
for c1, c2 in zip(counts, testcounts):
f1 = '{:s}/{:s}.ct'.format(outdir, c1)
f2 = '{:s}/{:s}.ct'.format(outdir, c2)
assert filecmp.cmp(f1, f2)
finally:
rmtree(outdir)


def test_novel_save_counts_mismatch(capsys):
outdir = mkdtemp()
try:
arglist = [
'novel', '--ksize', '27', '--out', outdir + '/novel.augfastq.gz',
'--save-case-counts', outdir + '/kid.ct', '--save-ctrl-counts',
outdir + '/mom.ct', outdir + '/dad.ct', outdir + '/sibling.ct',
'--case', data_file('minitrio/trio-proband.fq.gz'),
'--control', data_file('minitrio/trio-mother.fq.gz'),
'--control', data_file('minitrio/trio-father.fq.gz'),
'--memory', '5M'
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.novel.main(args)
finally:
rmtree(outdir)

out, err = capsys.readouterr()
assert 'stubbornly refusing to save k-mer counts' in err
37 changes: 36 additions & 1 deletion kevlar/tests/test_simplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import filecmp
from os import remove
import pytest
import subprocess
import sys
from tempfile import NamedTemporaryFile
from tempfile import NamedTemporaryFile, mkdtemp
from shutil import rmtree
import kevlar
from kevlar.tests import data_file, data_glob

Expand Down Expand Up @@ -111,3 +113,36 @@ def test_simplex_minitrio():
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.simplex.main(args)


def test_simplex_save_counts():
outdir = mkdtemp()
try:
for ind in ('father', 'mother', 'proband'):
outfile = '{:s}/{:s}.ct'.format(outdir, ind)
infile = data_file('minitrio/trio-{:s}.fq.gz'.format(ind))
arglist = ['count', '--ksize', '27', '--memory', '5M', outfile,
infile]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.count.main(args)

arglist = [
'simplex', '--ksize', '27', '--out', outdir + '/calls.vcf',
'--save-case-counts', outdir + '/kid', '--save-ctrl-counts',
outdir + '/mom', outdir + '/dad', '--case',
data_file('minitrio/trio-proband.fq.gz'),
'--control', data_file('minitrio/trio-mother.fq.gz'),
'--control', data_file('minitrio/trio-father.fq.gz'),
'--novel-memory', '5M', data_file('minitrio/refr.fa')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.simplex.main(args)

counts = ('father', 'mother', 'proband')
testcounts = ('dad', 'mom', 'kid')
for c1, c2 in zip(counts, testcounts):
f1 = '{:s}/{:s}.ct'.format(outdir, c1)
f2 = '{:s}/{:s}.counttable'.format(outdir, c2)
assert filecmp.cmp(f1, f2)
finally:
rmtree(outdir)

0 comments on commit ccbf343

Please sign in to comment.