Skip to content

Commit

Permalink
Merge pull request #224 from dib-lab/fix/trunc
Browse files Browse the repository at this point in the history
Add bounds checking on window calculations
  • Loading branch information
standage committed Mar 22, 2018
2 parents 69ef9d2 + d5c9444 commit 9b6f08e
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 5 deletions.
9 changes: 4 additions & 5 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,8 @@ def call_snv(aln, offset, length, ksize, mindist, logstream=sys.stderr):


def deletion_allele(target, query, offset, ksize, leftmatch, indellength):
minpos = leftmatch - ksize + 1
maxpos = leftmatch + ksize - 1
minpos = max(leftmatch - ksize + 1, 0)
maxpos = min(leftmatch + ksize - 1, len(query))
altwindow = query[minpos:maxpos]
minpos += offset
maxpos += offset + indellength
Expand All @@ -287,8 +287,8 @@ def deletion_allele(target, query, offset, ksize, leftmatch, indellength):


def insertion_allele(target, query, offset, ksize, leftmatch, indellength):
minpos = leftmatch - ksize + 1
maxpos = leftmatch + ksize + indellength - 1
minpos = max(leftmatch - ksize + 1, 0)
maxpos = min(leftmatch + ksize + indellength - 1, len(query))
altwindow = query[minpos:maxpos]
minpos += offset
maxpos += offset - indellength
Expand Down Expand Up @@ -333,7 +333,6 @@ def call_insertion(aln, offset, ksize, leftmatch, indellength):
refr, alt, refrwindow, altwindow = insertion_allele(
aln.refrseq, aln.varseq, offset, ksize, leftmatch, indellength
)

# This assertion is no longer valid when query is longer than target
# assert len(alt) == indellength + 1
localcoord = leftmatch
Expand Down
56 changes: 56 additions & 0 deletions kevlar/tests/data/trunc-indel.contig.augfasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
>contig1
ACACATTATATACACATATATGTGTATATATGTGTACACATACATATACACATGTGTACATACACATGTGTATATACACACATATACACATATATACATACACACATGTATATACACATATATACACATATATACACATATATGTGTATATAC
CATACATATACACATGTGTACATACACATGT 19 0 0#
ATACATATACACATGTGTACATACACATGTG 18 0 0#
TACATATACACATGTGTACATACACATGTGT 18 0 0#
ACATATACACATGTGTACATACACATGTGTA 17 0 0#
CATATACACATGTGTACATACACATGTGTAT 17 0 1#
ATATACACATGTGTACATACACATGTGTATA 17 0 0#
TATACACATGTGTACATACACATGTGTATAT 18 0 1#
ATACACATGTGTACATACACATGTGTATATA 18 0 0#
TACACATGTGTACATACACATGTGTATATAC 18 0 0#
ACACATGTGTACATACACATGTGTATATACA 18 0 0#
CACATGTGTACATACACATGTGTATATACAC 16 0 1#
ACATGTGTACATACACATGTGTATATACACA 16 1 0#
CATGTGTACATACACATGTGTATATACACAC 17 1 0#
ATGTGTACATACACATGTGTATATACACACA 19 0 0#
TGTGTACATACACATGTGTATATACACACAT 18 1 1#
GTGTACATACACATGTGTATATACACACATA 19 0 0#
TGTACATACACATGTGTATATACACACATAT 19 0 1#
GTACATACACATGTGTATATACACACATATA 19 1 0#
TACATACACATGTGTATATACACACATATAC 19 0 0#
ACATACACATGTGTATATACACACATATACA 19 0 0#
CATACACATGTGTATATACACACATATACAC 18 0 0#
ATACACATGTGTATATACACACATATACACA 18 0 0#
TACACATGTGTATATACACACATATACACAT 18 0 0#
ACACATGTGTATATACACACATATACACATA 18 0 1#
CACATGTGTATATACACACATATACACATAT 17 0 0#
ACATGTGTATATACACACATATACACATATA 17 0 0#
CATGTGTATATACACACATATACACATATAT 17 0 1#
ATGTGTATATACACACATATACACATATATA 17 0 0#
TGTGTATATACACACATATACACATATATAC 17 0 0#
#mateseq=AATAAATCATATGGTTCCTTTAAAGGTATTGATGGAGGCCAGGCATGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCTGGTGGATCAC#
#mateseq=ACTGAAGCTATTTTTCCTCCAATTTCTCTCAAAACTTATGCTATGAATAAATCATATGGTTCCTTTAAAGGTATTGATGGAGGCCAGGCATGGTGGCTCA#
#mateseq=ATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTTTATATATGTGTAAATATGTGTATATATGTGT#
#mateseq=ATATACATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTAT#
#mateseq=ATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATATTTGTATATACATATATGT#
#mateseq=ATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGT#
#mateseq=ATGTGTATATGTGTGTATATATGTGTATATATGTGTATATACAAATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTAT#
#mateseq=CAGGAGTTCAAGACCAGCCTGGCCAAGATAGTGAAACCCTATCTCTACTAAAAATACAAAAATTAGCTAGGCGAGCACCTGTAAATCCCAGCTATTTGGG#
#mateseq=CCTTTAAAGGTATTGATGGAGGCCAGGCATGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCTGGTGGATCACAAGGTCAGGAGTTCAA#
#mateseq=GAAATAAAAGATTTGGCACAGTGATGGAGTGATCACTGAAGCTATTTTTCCTCCAATTTCTCTCAAAACTTATGCTATGAATAAATCATATGGTACCTTT#
#mateseq=GCAATGAATAAATCATATGGTTCCTTTAAAGGTATTGATGGAGGCCAGGCATGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCTGGTG#
#mateseq=GGCCGAGGCTGGTGGATCACAAGGTCAGGAGTTCAAGACCAGCCTGGCCAAGATAGTGACACCCTATCTCTACTAAAAATACAAAAATTAGCTAGGCGAG#
#mateseq=GGCTGGTGGATCACAAGGTCAGGAGTTCAAGACCAGCCTGTCCAAGATAGTGAAACCCTATCTCTACTAAAAATACAAAAATTAGCTAGGCGATCACCTG#
#mateseq=GTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATAA#
#mateseq=GTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATAT#
#mateseq=TAATCCCAGCACTTTGGGAGGCCGAGGCTGGTGGATCACAAGGTCAGGAGTTCAAGACCAGCCTGGCCAAGATAGTGAAACCCTATCTCTACTAAAAATA#
#mateseq=TATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACA#
#mateseq=TATGAATAAATCATATGGTTCCTTTAAAGGTATTGATGGAGGCCAGGCATGGTGGCTCATGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCTGGTGGA#
#mateseq=TATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTA#
#mateseq=TATGTGTATATATGTGTATATACATATCTGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTA#
#mateseq=TATGTGTATATATGTTTATATACATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTA#
#mateseq=TGATCACTGAAGCTATTTTTCCTCCAATTTCTCTCAAAACTTATGCTATGCATAAATCATATGGTTCCTTTAAAGGTATTGATGGAGGCCAGGCATGGTG#
#mateseq=TGTATATACATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTG#
#mateseq=TGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATA#
#mateseq=TGTGTATATGTGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATA#
2 changes: 2 additions & 0 deletions kevlar/tests/data/trunc-indel.gdna.augfasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>chr17_70858900-70859145
ATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATACATATATGTGTATATATGTGTATATATGTGTATATACATGTGTGTATGTATATATGTGTATATATGTGTATATACATATGTGTATGTACACATGTGTATATGTATGTGTACACATATATACACATATATGTGTATATAATGTGTATATATACACACATAATATAGATACGTGTGTGTGTATATATATATACACA
2 changes: 2 additions & 0 deletions kevlar/tests/data/trunc-snv.contig.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>contig1:cc=2
TAGCATACAGGTAGTCAGGGGGTGTCTGCGACCACAGCTGAACATGACGAAACGGGTGGCATCCACATAATAGCCCGTCTATCTGATGCCTCTTTTTC
2 changes: 2 additions & 0 deletions kevlar/tests/data/trunc-snv.gdna.augfasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>scaf1_2030-2178
CTGTACTATACCGTTATCGCTCTAATAGCATACAGGAAGTCAGGGGGTGTCTGCGACCACAGCTGAACATGACGAAACGGGTGGCATCCACATAATAGCCCGTCTATCTGATGCCTCTTTTTCAATGCAGGCGGAAGTGGCGCTTCAT
2 changes: 2 additions & 0 deletions kevlar/tests/data/trunc-snv.gdna.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>scaf1_2030-2178
CTGTACTATACCGTTATCGCTCTAATAGCATACAGGAAGTCAGGGGGTGTCTGCGACCACAGCTGAACATGACGAAACGGGTGGCATCCACATAATAGCCCGTCTATCTGATGCCTCTTTTTCAATGCAGGCGGAAGTGGCGCTTCAT
37 changes: 37 additions & 0 deletions kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,3 +400,40 @@ def test_call_near_end(query, target, dist, n, msgcount):
err = log.getvalue()
count = err.count('discarding SNV due to proximity to end of the contig')
assert count == msgcount


def test_call_truncated_windows_snv():
contig = next(
kevlar.parse_augmented_fastx(
kevlar.open(data_file('trunc-snv.contig.fa'), 'r')
)
)
cutout = next(
kevlar.reference.load_refr_cutouts(
kevlar.open(data_file('trunc-snv.gdna.fa'), 'r')
)
)
aln = kevlar.call.align_both_strands(cutout, contig)
calls = list(aln.call_variants(31))
assert len(calls) == 1
assert calls[0].window == 'TAGCATACAGGTAGTCAGGGGGTGTCTGCGACCACAGCTGAA'
assert calls[0].refrwindow == 'TAGCATACAGGAAGTCAGGGGGTGTCTGCGACCACAGCTGAA'


def test_call_truncated_windows_indel():
contig = next(
kevlar.parse_augmented_fastx(
kevlar.open(data_file('trunc-indel.contig.augfasta'), 'r')
)
)
cutout = next(
kevlar.reference.load_refr_cutouts(
kevlar.open(data_file('trunc-indel.gdna.augfasta'), 'r')
)
)
aln = kevlar.call.align_both_strands(cutout, contig)
calls = list(aln.call_variants(31))
assert len(calls) == 1
assert calls[0].window == 'GTATATACACATATATGTGTATATATGTGTATATATGT'
assert calls[0].refrwindow == ('GTATATACATATATGTGTATATATGTGTATATACATATATGT'
'GTATATATGTGTATATATGT')

0 comments on commit 9b6f08e

Please sign in to comment.