Skip to content

Commit

Permalink
Merge pull request #222 from dib-lab/fix/contig-ends
Browse files Browse the repository at this point in the history
Exclude SNV calls too close to the end of the contig alignment
  • Loading branch information
standage committed Mar 22, 2018
2 parents ccbf343 + 148ab94 commit 69ef9d2
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 7 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- New `-p/--part-id` flag in `kevlar alac` for processing a single partition in a partitioned augfastq file (see #206).
- New reader/parser for parititioned augfastx files (see #206).
- New strategy for discriminating between variants and off-target calls using pairing information (see #210).
- New "fallback" assembly strategy: if fermi-lite fails, try our homegrown greedy assembly algorithm (see #214).
- New optional "fallback" assembly strategy: if fermi-lite fails, try our homegrown greedy assembly algorithm (see #214 and #219).
- New parameter for excluding SNV calls too near to the end of a contig (see #222).

### Changed
- Replaced `pep8` with `pycodestyle` for enforcing code style in development (see #167).
Expand Down
19 changes: 13 additions & 6 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import khmer
import kevlar
import re
import sys


class VariantMapping(object):
Expand Down Expand Up @@ -45,21 +46,21 @@ def seqid(self):
def pos(self):
return self.cutout._startpos

def call_variants(self, ksize):
def call_variants(self, ksize, mindist=5, logstream=sys.stderr):
snvmatch = re.search('^(\d+)([DI])(\d+)M(\d+)[DI]$', self.cigar)
snvmatch2 = re.search('^(\d+)([DI])(\d+)M(\d+)[DI](\d+)M$', self.cigar)
if snvmatch:
offset = int(snvmatch.group(1))
if snvmatch.group(2) == 'I':
offset *= -1
length = int(snvmatch.group(3))
return call_snv(self, offset, length, ksize)
return call_snv(self, offset, length, ksize, mindist, logstream)
elif snvmatch2 and int(snvmatch2.group(5)) <= 5:
offset = int(snvmatch2.group(1))
if snvmatch2.group(2) == 'I':
offset *= -1
length = int(snvmatch2.group(3))
return call_snv(self, offset, length, ksize)
return call_snv(self, offset, length, ksize, mindist, logstream)

indelmatch = re.search(
'^(\d+)([DI])(\d+)M(\d+)([ID])(\d+)M(\d+)[DI]$', self.cigar
Expand Down Expand Up @@ -228,7 +229,7 @@ def __str__(self):
return '{:s}:{:d}:I->{:s}'.format(self._seqid, pos, insertion)


def call_snv(aln, offset, length, ksize):
def call_snv(aln, offset, length, ksize, mindist, logstream=sys.stderr):
targetshort = False
if offset < 0:
offset *= -1
Expand All @@ -248,6 +249,11 @@ def call_snv(aln, offset, length, ksize):

snvs = list()
for diff in diffs:
if mindist:
if diff[0] < mindist or length - diff[0] < mindist:
msg = 'discarding SNV due to proximity to end of the contig'
print('[kevlar::call] NOTE:', msg, file=logstream)
continue
minpos = max(diff[0] - ksize + 1, 0)
maxpos = min(diff[0] + ksize, length)
window = q[minpos:maxpos]
Expand Down Expand Up @@ -424,7 +430,8 @@ def alignments_to_report(alignments):


def call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
gapextend=0, ksize=31, refrfile=None):
gapextend=0, ksize=31, refrfile=None, mindist=5,
logstream=sys.stderr):
"""Wrap the `kevlar call` procedure as a generator function.
Input is the following.
Expand Down Expand Up @@ -460,7 +467,7 @@ def call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
aligns2report.sort(key=lambda aln: aln.matedist)

for n, alignment in enumerate(aligns2report):
for varcall in alignment.call_variants(ksize):
for varcall in alignment.call_variants(ksize, mindist, logstream):
if alignment.matedist:
varcall.info['MD'] = '{:.2f}'.format(alignment.matedist)
if n > 0:
Expand Down
2 changes: 2 additions & 0 deletions kevlar/tests/data/phony-snv-01b.contig.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>contig1:cc=2
TAGCATACAGGAAGTCAGGGGGTGTCTGCGACCACAGGTGAACATGACGAAACGGGTGGCATCCACATAATAGCCCGTCTATCTGATGCCTCTTGTTC
2 changes: 2 additions & 0 deletions kevlar/tests/data/phony-snv-02b.contig.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>contig4:cc=1
CTCGGGACCCATCACAACAGATACGATTCGTATTACCCCTGGGATATGGGAGCTGGTCTATATAGGTTAACTGGTCCCAGATGCGAAATTAATGGACGC
28 changes: 28 additions & 0 deletions kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from io import StringIO
import pytest
import re
import sys
Expand Down Expand Up @@ -372,3 +373,30 @@ def test_mate_distance():
positions = [('seq2', 4000), ('seq2', 3000), ('seq2', 5100), ('seq3', 1)]
gdna_pos = ('seq2', 5000, 5500)
assert kevlar.call.mate_distance(positions, gdna_pos) == 1000.0


@pytest.mark.parametrize('query,target,dist,n,msgcount', [
('phony-snv-01b.contig.fa', 'phony-snv-01.gdna.fa', 5, 1, 1),
('phony-snv-02b.contig.fa', 'phony-snv-02.gdna.fa', 5, 1, 1),
('phony-snv-01b.contig.fa', 'phony-snv-01.gdna.fa', 2, 2, 0),
('phony-snv-02b.contig.fa', 'phony-snv-02.gdna.fa', None, 2, 0),
])
def test_call_near_end(query, target, dist, n, msgcount):
log = StringIO()
contig = next(
kevlar.parse_augmented_fastx(
kevlar.open(data_file(query), 'r')
)
)
cutout = next(
kevlar.reference.load_refr_cutouts(
kevlar.open(data_file(target), 'r')
)
)
aln = kevlar.call.align_both_strands(cutout, contig)
calls = list(aln.call_variants(31, mindist=dist, logstream=log))
assert len(calls) == n

err = log.getvalue()
count = err.count('discarding SNV due to proximity to end of the contig')
assert count == msgcount

0 comments on commit 69ef9d2

Please sign in to comment.