Skip to content

Commit

Permalink
Merge pull request #69 from dib-lab/feature/2pass
Browse files Browse the repository at this point in the history
Second abundance recalculation in kevlar filter
  • Loading branch information
standage committed May 4, 2017
2 parents 08d7d6f + 00cb491 commit fe2ff5f
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 3 deletions.
16 changes: 13 additions & 3 deletions kevlar/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ def subparser(subparsers):
help='minimum abundance required to call a '
'k-mer novel; should be the same value used for '
'--case_min in `kevlar find`; default is 5')
filter_args.add_argument('--skip2', default=False, action='store_true',
help='skip the second pass over the reads that '
'recalculates abundance after reference and '
'contaminant k-mers are discarded')
filter_args.add_argument('--ignore', metavar='KM', nargs='+',
help='ignore the specified k-mer(s)')

Expand Down Expand Up @@ -162,8 +166,14 @@ def load_input(filelist, ksize, memory, maxfpr=0.001, logfile=sys.stderr):


def validate_and_print(readset, countgraph, refr=None, contam=None, minabund=5,
outfile=sys.stdout, augout=None, logfile=sys.stderr):
readset.validate(countgraph, refr, contam, minabund)
skip2=False, outfile=sys.stdout, augout=None,
logfile=sys.stderr):
readset.validate(countgraph, refr=refr, contam=contam, minabund=minabund)
if not skip2:
ksize, tablesizes = countgraph.ksize(), countgraph.hashsizes()
countgraph = khmer._Countgraph(ksize, tablesizes)
readset.recalc_abund(countgraph, minabund)

n = 0 # Get an unbound var error later (printing report) without this?!?!
for n, record in enumerate(readset):
khmer.utils.write_record(record, outfile)
Expand Down Expand Up @@ -250,7 +260,7 @@ def main(args):
print('[kevlar::filter] Validate k-mers and print reads',
file=args.logfile)
validate_and_print(readset, countgraph, refr, contam, args.min_abund,
args.out, args.aug_out, args.logfile)
args.skip2, args.out, args.aug_out, args.logfile)
elapsed = timer.stop('validate')
print('[kevlar::filter] k-mers validated and reads printed',
'in {:.2f} sec'.format(elapsed), file=args.logfile)
Expand Down
7 changes: 7 additions & 0 deletions kevlar/seqio.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,13 @@ def validate(self, counts, refr=None, contam=None, minabund=5):
if len(validated_kmers) == 0:
self._novalidkmers_count += 1

def recalc_abund(self, newcounts, minabund=5):
for read in self:
newcounts.consume(read.sequence)
self._valid = defaultdict(int)
self._novalidkmers_count = 0
self.validate(newcounts, minabund=minabund)

def group_reads_by_novel_kmers(self, ccprefix, upint=10000,
logstream=None):
try:
Expand Down
133 changes: 133 additions & 0 deletions kevlar/tests/data/collect.gamma.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
@notobviouscontam1
CGTGGGCGGACGCCCATACCGCGATAGCACGGGAGCTCACTTCGTTGACG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
GTGGGCGGACGCCCATACC 12 0 0#
@notobviouscontam2
CGTGGGCGGACGCCCATACCGCGATAGCACGGGAGCTCACTTCGTTGACG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
GTGGGCGGACGCCCATACC 12 0 0#
@notobviouscontam3
CGTGGGCGGACGCCCATACCGCGATAGCACGGGAGCTCACTTCGTTGACG
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam1
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam2
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam3
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam4
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam5
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam6
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam7
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam8
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@obviouscontam9
CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
CAGGCCAGGGATCGCCGTG 9 0 0#
GTGGGCGGACGCCCATACC 12 0 0#
@good1
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
@good2
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
@good3
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
@good4
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
@good5
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
@good6
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
@good7
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
@good8
TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT
+
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
TAGGGGCGTGACTTAATAA 8 0 0#
AGGGGCGTGACTTAATAAG 8 0 0#
GGGGCGTGACTTAATAAGG 8 0 0#
GGGCGTGACTTAATAAGGT 8 0 0#
24 changes: 24 additions & 0 deletions kevlar/tests/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,30 @@ def test_validate_withrefr():
assert kevlar.revcom(ikmer.sequence) != kmer


def test_validate_withcontam():
contam = khmer.Nodetable(19, 1e3, 2)
contam.consume('CCAGCTGCAGGCCAGGGATCGCCGTGGGCGGACGCCCATACCGCGATAGC')
filelist = kevlar.tests.data_glob('collect.gamma.txt')

# First, without second pass
readset, countgraph = kevlar.filter.load_input(filelist, 19, 5e3)
kevlar.filter.validate_and_print(readset, countgraph, contam=contam,
minabund=8, skip2=True)
assert readset.valid == (5, 35)
assert readset.lowabund == (0, 0)
assert readset.discarded == 0
assert readset.contam == 9

# Then, with second pass
readset, countgraph = kevlar.filter.load_input(filelist, 19, 5e3)
kevlar.filter.validate_and_print(readset, countgraph, contam=contam,
minabund=8)
assert readset.valid == (4, 32)
assert readset.lowabund == (2, 21)
assert readset.discarded == 12
assert readset.contam == 9


def test_ctrl3(ctrl3):
readset, countgraph = ctrl3
kevlar.filter.validate_and_print(readset, countgraph, minabund=6)
Expand Down

0 comments on commit fe2ff5f

Please sign in to comment.