Skip to content

Commit

Permalink
Fix CIGAR filter
Browse files Browse the repository at this point in the history
PR #242 introduced some improvements in kevlar's calling procedure, including more flexible CIGAR pattern matching to determine variant-associated alignments, and a CIGAR tokenizer to facilitate a gap re-alignment strategy to simplify variant calling. This update includes a small bugfix in the gap re-alignment code, along with associated regression tests.
  • Loading branch information
standage committed Oct 29, 2018
1 parent 3b45276 commit 462ef53
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 17 deletions.
8 changes: 5 additions & 3 deletions kevlar/cigar.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,15 @@ def _endcheck(self):
if self.blocks[-2].type == 'D':
prevseq = self.blocks[-2].target
lastseq = self.blocks[-1].target
endseq = self.blocks[-1].query
else:
prevseq = self.blocks[-2].query
lastseq = self.blocks[-1].query
if prevseq.startswith(lastseq):
endseq = self.blocks[-1].target
longseq = prevseq + lastseq
if longseq.startswith(endseq):
self.blocks[-3] = AlignmentBlock(
self.blocks[-3].length + self.blocks[-1].length,
'M',
self.blocks[-3].length + self.blocks[-1].length, 'M',
self.blocks[-3].target + self.blocks[-1].target,
self.blocks[-3].query + self.blocks[-1].query,
)
Expand Down
Binary file added kevlar/tests/data/14153.cc5463.contig.augfasta.gz
Binary file not shown.
Binary file added kevlar/tests/data/14153.cc5463.gdna.augfasta.gz
Binary file not shown.
2 changes: 2 additions & 0 deletions kevlar/tests/data/cigar/d.contig.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>contig1
AGATTGTCTATTTTTATCAACATATATAAATATGTGAGTTGATAAAAATAAGCCTAGATCACACCTAAATCATCCTGTACACTCTATATACCACCCTCATTATACCATACAGTATTCAAGTTAGCCATGAAGCAACTATCTTTTTAAGAATTTATGGTACACACACACACAC
2 changes: 2 additions & 0 deletions kevlar/tests/data/cigar/d.gdna.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>6_154734072-154734293
GACCCAATACTCAGGAAAGATATGCTGAATAAATGAGCCAGATTGTCTAGTTTTATCAACATATATAAATATGTGAGTTGATAAAAATAAGCCTAGATCACACCTAAATCATCCTGTACACTCTATATACCACCCTCATTATACCATACAGTATTCAAGTTAGCCATGAAGAAAGCAACTATCTTTTTAAGAATTTATGGTACACACACACACACACACAC
21 changes: 21 additions & 0 deletions kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,27 @@ def test_perfect_match():
assert calls[0].filterstr == 'PerfectMatch'


def test_cigar_filter_regression():
contigfile = data_file('14153.cc5463.contig.augfasta.gz')
contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
contigs = list(contigstream)

gdnafile = data_file('14153.cc5463.gdna.augfasta.gz')
gdnastream = kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
targets = list(gdnastream)

calls = sorted(call(targets, contigs), key=lambda c: c.position)
assert len(calls) == 2
assert calls[1].seqid == '6'

# Equally valid calls from equally optimal alignments
c1 = ('AGAAA', 'A', 154734241)
c2 = ('GAAGA', 'G', 154734239)

varcall = (calls[1]._refr, calls[1]._alt, calls[1].position)
assert varcall in (c1, c2)


def test_multibest_revcom():
contigfile = data_file('multibestrc.contig.fa')
contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
Expand Down
37 changes: 23 additions & 14 deletions kevlar/tests/test_cigar.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
(data_file('cigar/a.contig.fa'), data_file('cigar/a.gdna.fa')),
(data_file('cigar/b.contig.fa'), data_file('cigar/b.gdna.fa')),
(data_file('cigar/c.contig.fa'), data_file('cigar/c.gdna.fa')),
(data_file('cigar/d.contig.fa'), data_file('cigar/d.gdna.fa')),
(data_file('phony-snv-01.contig.fa'), data_file('phony-snv-01.gdna.fa')),
(data_file('phony-snv-02.contig.fa'), data_file('phony-snv-02.gdna.fa')),
])
Expand All @@ -38,19 +39,27 @@ def test_blocks(contig, gdna):
assert block.query is None


def test_gap_center_aligned():
query = next(
kevlar.parse_augmented_fastx(
kevlar.open(data_file('cigar/b.contig.fa'), 'r')
)
)
target = next(
kevlar.parse_augmented_fastx(
kevlar.open(data_file('cigar/b.gdna.fa'), 'r')
)
)
def test_nomargin():
qfile = kevlar.open(data_file('nomargin-r-indel-contigs.augfasta'), 'r')
tfile = kevlar.open(data_file('nomargin-r-gdna.fa'), 'r')
query = next(kevlar.parse_augmented_fastx(qfile))
target = next(kevlar.parse_augmented_fastx(tfile))
cigar, score = kevlar.align(target.sequence, query.sequence)
tok = AlignmentTokenizer(query.sequence, target.sequence, cigar)
assert len(tok.blocks) == 3
assert tok._cigar == '41D150M50D'
assert tok._origcigar == '41D144M50D6M'
assert tok._cigar == tok._origcigar


@pytest.mark.parametrize('contig,gdna,newcigar,origcigar,nblocks', [
('b.contig.fa', 'b.gdna.fa', '41D150M50D', '41D144M50D6M', 3),
('d.contig.fa', 'd.gdna.fa', '39D129M4D43M6D', '39D129M4D29M6D14M', 5),
])
def test_gap_center_aligned(contig, gdna, newcigar, origcigar, nblocks):
qfile = kevlar.open(data_file('cigar/' + contig), 'r')
tfile = kevlar.open(data_file('cigar/' + gdna), 'r')
query = next(kevlar.parse_augmented_fastx(qfile))
target = next(kevlar.parse_augmented_fastx(tfile))
cigar, score = kevlar.align(target.sequence, query.sequence)
tok = AlignmentTokenizer(query.sequence, target.sequence, cigar)
assert len(tok.blocks) == nblocks
assert tok._cigar == newcigar
assert tok._origcigar == origcigar

0 comments on commit 462ef53

Please sign in to comment.