Discard short alignment blocks prior to variant calling (#303)

Short alignment blocks are a cause for suspicion, and in any case can result in variant-spanning windows shorter than ksize, for which likelihood scores cannot be computed. This update discards alignment blocks shorter than ksize in length prior to variant calling. Closes #260.
kevlar-dev · Nov 16, 2018 · 7a4ba76 · 7a4ba76
1 parent 5fc24f5
commit 7a4ba76
Show file tree

Hide file tree

Showing 5 changed files with 29 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,7 +9,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 - A Jupyter notebook and supporting code and data for evaluating kevlar's performance on a simulated data set (see #271).
 - New flags for filtering gDNA cutouts or calls from specified sequences (see #285).
 - New filter that discards any contig/gDNA alignment with more than 4 mismatches (see #288).
-- A new feature that generates a Nodetable containing only variant-spanning k-mers to support re-counting k-mers and computing likelihood scores in low memory (see #289, #292).
+- A new feature that generates a Nodetable containing only variant-spanning k-mers to support re-counting k-mers and computing likelihood scores in low memory (see #289, #292, #302).
 - A new `ProgressIndicator` class that provides gradually less frequent updates over time (see #299).
 
 ### Changed
@@ -18,6 +18,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 - Cleaned up overlap-related code with a new `ReadPair` class (see #283).
 - Updated `kevlar assemble`, `kevlar localize`, and `kevlar call` to accept streams of partitioned reads; previously, only reads for a single partition were permitted (see #294).
 - Overhauled the `kevlar localize` command to compute seed locations for all seeds in all partitions with a single BWA call, massively improving efficiency (see #294 and #301).
+- Updated the variant calling procedure to discard alignment blocks less than `ksize` in length (see #303).
 
 ### Fixed
 - Minor bug with .gml output due to a change in the networkx package (see #278).

diff --git a/kevlar/cli/alac.py b/kevlar/cli/alac.py
@@ -38,12 +38,14 @@ def subparser(subparsers):
                             'reference by extending beyond the span of all '
                             'k-mer starting positions by D bp')
     local_args.add_argument('-x', '--max-diff', type=int, metavar='X',
-                            default=None, help='split seed matches into '
-                            'distinct bins if the distance between two seed '
-                            'matches is > X; by default, X is 3 times the '
-                            'length of the longest contig; each bin specifies '
-                            'a reference target sequence against which '
-                            'assembled contigs will be aligned')
+                            default=None, help='split and report multiple '
+                            'reference targets if the distance between two '
+                            'seed matches is > X; by default, X is set '
+                            'dynamically for each partition and is equal to 3 '
+                            'times the length of the longest contig in the '
+                            'partition; each resulting bin specifies a '
+                            'reference target sequence to which assembled '
+                            'contigs will subsequently be aligned')
     local_args.add_argument('--include', metavar='REGEX', type=str,
                             help='discard alignments to any chromosomes whose '
                             'sequence IDs do not match the given pattern')

diff --git a/kevlar/cli/localize.py b/kevlar/cli/localize.py
@@ -33,8 +33,12 @@ def subparser(subparsers):
     subparser.add_argument('-x', '--max-diff', type=int, metavar='X',
                            default=None, help='split and report multiple '
                            'reference targets if the distance between two '
-                           'seed matches is > X; by default, X is 3 times the '
-                           'length of the longest contig')
+                           'seed matches is > X; by default, X is set '
+                           'dynamically for each partition and is equal to 3 '
+                           'times the length of the longest contig in the '
+                           'partition; each resulting bin specifies a '
+                           'reference target sequence to which assembled '
+                           'contigs will subsequently be aligned')
     subparser.add_argument('--include', metavar='REGEX', type=str,
                            help='discard alignments to any chromosomes whose '
                            'sequence IDs do not match the given pattern')

diff --git a/kevlar/tests/test_varmap.py b/kevlar/tests/test_varmap.py
@@ -7,7 +7,6 @@
 # licensed under the MIT license: see LICENSE.
 # -----------------------------------------------------------------------------
 
-from io import StringIO
 import sys
 import kevlar
 from kevlar.varmap import VariantMapping
@@ -131,14 +130,13 @@ def test_variant_mapping():
     assert mapping.interval == ('chr1', 10000, 10060)
 
 
-@pytest.mark.parametrize('query,target,dist,n,msgcount', [
+@pytest.mark.parametrize('query,target,dist,n,trimcount', [
     ('phony-snv-01b.contig.fa', 'phony-snv-01.gdna.fa', 5, 1, 1),
     ('phony-snv-02b.contig.fa', 'phony-snv-02.gdna.fa', 5, 1, 1),
     ('phony-snv-01b.contig.fa', 'phony-snv-01.gdna.fa', 2, 2, 0),
     ('phony-snv-02b.contig.fa', 'phony-snv-02.gdna.fa', None, 2, 0),
 ])
-def test_call_near_end(query, target, dist, n, msgcount):
-    log = StringIO()
+def test_call_near_end(query, target, dist, n, trimcount):
     contig = next(
         kevlar.parse_augmented_fastx(
             kevlar.open(data_file(query), 'r')
@@ -150,12 +148,9 @@ def test_call_near_end(query, target, dist, n, msgcount):
         )
     )
     aln = VariantMapping(contig, cutout)
-    calls = list(aln.call_variants(31, mindist=dist, logstream=log))
+    calls = list(aln.call_variants(31, mindist=dist))
     assert len(calls) == n
-
-    err = log.getvalue()
-    count = err.count('discarding SNV due to proximity to end of the contig')
-    assert count == msgcount
+    assert aln.trimmed == trimcount
 
 
 @pytest.mark.parametrize('query,target,vw,rw', [

diff --git a/kevlar/varmap.py b/kevlar/varmap.py
@@ -36,6 +36,7 @@ def __init__(self, contig, cutout, score=None, cigar=None, strand=1,
         self.score = score
         self.strand = strand
         self.matedist = None
+        self.trimmed = 0
 
         self.tok = AlignmentTokenizer(self.varseq, self.refrseq, cigar)
         self.cigar = self.tok._cigar
@@ -218,9 +219,11 @@ def call_snv(self, qseq, tseq, offset, ksize, mindist=6, donocall=True,
         """
         length = len(qseq)
         assert len(tseq) == length
+        if length < ksize:
+            return
         diffs = [i for i in range(length) if tseq[i] != qseq[i]]
         if mindist:
-            diffs = trim_terminal_snvs(diffs, length, mindist, logstream)
+            self.trimmed, diffs = trim_terminal_snvs(diffs, length, mindist)
         if len(diffs) == 0 or len(diffs) > 4:
             if donocall:
                 nocall = Variant(
@@ -295,10 +298,12 @@ def n_ikmers_present(record, window):
 
 def trim_terminal_snvs(mismatches, alnlength, mindist=5, logstream=sys.stderr):
     valid = list()
+    trimcount = 0
     for mm in mismatches:
         if mm < mindist or alnlength - mm < mindist:
-            msg = 'discarding SNV due to proximity to end of the contig'
-            print('[kevlar::call] NOTE:', msg, file=logstream)
+            trimcount += 1
+            # msg = 'discarding SNV due to proximity to end of the contig'
+            # print('[kevlar::call] NOTE:', msg, file=logstream)
         else:
             valid.append(mm)
-    return valid
+    return trimcount, valid