Skip to content

Commit

Permalink
Discard short alignment blocks prior to variant calling (#303)
Browse files Browse the repository at this point in the history
Short alignment blocks are a cause for suspicion, and in any case can result in variant-spanning windows shorter than ksize, for which likelihood scores cannot be computed. This update discards alignment blocks shorter than ksize in length prior to variant calling.

Closes #260.
  • Loading branch information
standage committed Nov 16, 2018
1 parent 5fc24f5 commit 7a4ba76
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 22 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- A Jupyter notebook and supporting code and data for evaluating kevlar's performance on a simulated data set (see #271).
- New flags for filtering gDNA cutouts or calls from specified sequences (see #285).
- New filter that discards any contig/gDNA alignment with more than 4 mismatches (see #288).
- A new feature that generates a Nodetable containing only variant-spanning k-mers to support re-counting k-mers and computing likelihood scores in low memory (see #289, #292).
- A new feature that generates a Nodetable containing only variant-spanning k-mers to support re-counting k-mers and computing likelihood scores in low memory (see #289, #292, #302).
- A new `ProgressIndicator` class that provides gradually less frequent updates over time (see #299).

### Changed
Expand All @@ -18,6 +18,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- Cleaned up overlap-related code with a new `ReadPair` class (see #283).
- Updated `kevlar assemble`, `kevlar localize`, and `kevlar call` to accept streams of partitioned reads; previously, only reads for a single partition were permitted (see #294).
- Overhauled the `kevlar localize` command to compute seed locations for all seeds in all partitions with a single BWA call, massively improving efficiency (see #294 and #301).
- Updated the variant calling procedure to discard alignment blocks less than `ksize` in length (see #303).

### Fixed
- Minor bug with .gml output due to a change in the networkx package (see #278).
Expand Down
14 changes: 8 additions & 6 deletions kevlar/cli/alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ def subparser(subparsers):
'reference by extending beyond the span of all '
'k-mer starting positions by D bp')
local_args.add_argument('-x', '--max-diff', type=int, metavar='X',
default=None, help='split seed matches into '
'distinct bins if the distance between two seed '
'matches is > X; by default, X is 3 times the '
'length of the longest contig; each bin specifies '
'a reference target sequence against which '
'assembled contigs will be aligned')
default=None, help='split and report multiple '
'reference targets if the distance between two '
'seed matches is > X; by default, X is set '
'dynamically for each partition and is equal to 3 '
'times the length of the longest contig in the '
'partition; each resulting bin specifies a '
'reference target sequence to which assembled '
'contigs will subsequently be aligned')
local_args.add_argument('--include', metavar='REGEX', type=str,
help='discard alignments to any chromosomes whose '
'sequence IDs do not match the given pattern')
Expand Down
8 changes: 6 additions & 2 deletions kevlar/cli/localize.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,12 @@ def subparser(subparsers):
subparser.add_argument('-x', '--max-diff', type=int, metavar='X',
default=None, help='split and report multiple '
'reference targets if the distance between two '
'seed matches is > X; by default, X is 3 times the '
'length of the longest contig')
'seed matches is > X; by default, X is set '
'dynamically for each partition and is equal to 3 '
'times the length of the longest contig in the '
'partition; each resulting bin specifies a '
'reference target sequence to which assembled '
'contigs will subsequently be aligned')
subparser.add_argument('--include', metavar='REGEX', type=str,
help='discard alignments to any chromosomes whose '
'sequence IDs do not match the given pattern')
Expand Down
13 changes: 4 additions & 9 deletions kevlar/tests/test_varmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from io import StringIO
import sys
import kevlar
from kevlar.varmap import VariantMapping
Expand Down Expand Up @@ -131,14 +130,13 @@ def test_variant_mapping():
assert mapping.interval == ('chr1', 10000, 10060)


@pytest.mark.parametrize('query,target,dist,n,msgcount', [
@pytest.mark.parametrize('query,target,dist,n,trimcount', [
('phony-snv-01b.contig.fa', 'phony-snv-01.gdna.fa', 5, 1, 1),
('phony-snv-02b.contig.fa', 'phony-snv-02.gdna.fa', 5, 1, 1),
('phony-snv-01b.contig.fa', 'phony-snv-01.gdna.fa', 2, 2, 0),
('phony-snv-02b.contig.fa', 'phony-snv-02.gdna.fa', None, 2, 0),
])
def test_call_near_end(query, target, dist, n, msgcount):
log = StringIO()
def test_call_near_end(query, target, dist, n, trimcount):
contig = next(
kevlar.parse_augmented_fastx(
kevlar.open(data_file(query), 'r')
Expand All @@ -150,12 +148,9 @@ def test_call_near_end(query, target, dist, n, msgcount):
)
)
aln = VariantMapping(contig, cutout)
calls = list(aln.call_variants(31, mindist=dist, logstream=log))
calls = list(aln.call_variants(31, mindist=dist))
assert len(calls) == n

err = log.getvalue()
count = err.count('discarding SNV due to proximity to end of the contig')
assert count == msgcount
assert aln.trimmed == trimcount


@pytest.mark.parametrize('query,target,vw,rw', [
Expand Down
13 changes: 9 additions & 4 deletions kevlar/varmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(self, contig, cutout, score=None, cigar=None, strand=1,
self.score = score
self.strand = strand
self.matedist = None
self.trimmed = 0

self.tok = AlignmentTokenizer(self.varseq, self.refrseq, cigar)
self.cigar = self.tok._cigar
Expand Down Expand Up @@ -218,9 +219,11 @@ def call_snv(self, qseq, tseq, offset, ksize, mindist=6, donocall=True,
"""
length = len(qseq)
assert len(tseq) == length
if length < ksize:
return
diffs = [i for i in range(length) if tseq[i] != qseq[i]]
if mindist:
diffs = trim_terminal_snvs(diffs, length, mindist, logstream)
self.trimmed, diffs = trim_terminal_snvs(diffs, length, mindist)
if len(diffs) == 0 or len(diffs) > 4:
if donocall:
nocall = Variant(
Expand Down Expand Up @@ -295,10 +298,12 @@ def n_ikmers_present(record, window):

def trim_terminal_snvs(mismatches, alnlength, mindist=5, logstream=sys.stderr):
valid = list()
trimcount = 0
for mm in mismatches:
if mm < mindist or alnlength - mm < mindist:
msg = 'discarding SNV due to proximity to end of the contig'
print('[kevlar::call] NOTE:', msg, file=logstream)
trimcount += 1
# msg = 'discarding SNV due to proximity to end of the contig'
# print('[kevlar::call] NOTE:', msg, file=logstream)
else:
valid.append(mm)
return valid
return trimcount, valid

0 comments on commit 7a4ba76

Please sign in to comment.