Skip to content

Commit

Permalink
Merge pull request #232 from dib-lab/feature/call-dedup-take2
Browse files Browse the repository at this point in the history
Deduplicate variant calls
  • Loading branch information
standage committed Mar 27, 2018
2 parents e02e177 + 621d7d7 commit aa81092
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 4 deletions.
33 changes: 29 additions & 4 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from collections import defaultdict
import sys
import kevlar
from kevlar.varmap import VariantMapping
Expand Down Expand Up @@ -64,10 +65,25 @@ def alignments_to_report(alignments):
return aligns2report


def call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
gapextend=0, ksize=31, refrfile=None, mindist=5,
logstream=sys.stderr):
"""Wrap the `kevlar call` procedure as a generator function.
def dedup(callstream):
calls = dict()
for call in callstream:
if call.seqid not in calls:
calls[call.seqid] = defaultdict(set)
calls[call.seqid][call.position].add(call)
for seqid in calls:
for position in calls[seqid]:
sortedcalls = sorted(
calls[seqid][position], key=lambda call: call.windowlength,
reverse=True
)
yield sortedcalls[0]


def prelim_call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
gapextend=0, ksize=31, refrfile=None, mindist=5,
logstream=sys.stderr):
"""Implement the `kevlar call` procedure as a generator function.
Input is the following.
- an iterable containing one or more target sequences from the reference
Expand Down Expand Up @@ -112,6 +128,15 @@ def call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
yield varcall


def call(*args, **kwargs):
"""Thin wrapper over the `prelim_call` function.
This function applies a deduplication procedure to preliminary calls.
"""
for call in dedup(prelim_call(*args, **kwargs)):
yield call


def main(args):
outstream = kevlar.open(args.out, 'w')
qinstream = kevlar.parse_augmented_fastx(kevlar.open(args.queryseq, 'r'))
Expand Down
18 changes: 18 additions & 0 deletions kevlar/tests/data/bee-dupl.contigs.augfasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
>contig1
TCCTCGGACCCCCGTCGGTAAAGAAGAGGCCGGAGCGATAAACAATAAGGCTAAGATGTAACGGTAAACGGCAGATCCTGGAAAGTGTATTATGAACTTTTGCAGGCGCAGGTGGCCGCGCGAGGCAGGTGCGAGCCTGACCTAGCCCAGCGGGGAGGCTC
AGGCTAAGATGTAACGGTAAACGGCAG 18 1 1#
TAAGATGTAACGGTAAACGGCAGATCC 19 0 1#
AGATGTAACGGTAAACGGCAGATCCTG 18 1 1#
GATGTAACGGTAAACGGCAGATCCTGG 23 0 1#
GGTAAACGGCAGATCCTGGAAAGTGTA 20 1 0#
TAAACGGCAGATCCTGGAAAGTGTATT 19 1 1#
GATCCTGGAAAGTGTATTATGAACTTT 22 0 1#
>contig2
GTCGGTAAAGAAGAGGCCGGAGCGATAAACAATAAGGCTAAGATGTAACGGTAAACGGCAGATCCTGGAAAGTGTATTATGAACTTTTGCAGGCGCAGGTGGCCGCGCGAGGCAGGTGCGAGCCTGACCTAGCCCAGCGGGGAGGC
AGGCTAAGATGTAACGGTAAACGGCAG 18 1 1#
TAAGATGTAACGGTAAACGGCAGATCC 19 0 1#
AGATGTAACGGTAAACGGCAGATCCTG 18 1 1#
GATGTAACGGTAAACGGCAGATCCTGG 23 0 1#
GGTAAACGGCAGATCCTGGAAAGTGTA 20 1 0#
TAAACGGCAGATCCTGGAAAGTGTATT 19 1 1#
GATCCTGGAAAGTGTATTATGAACTTT 22 0 1#
2 changes: 2 additions & 0 deletions kevlar/tests/data/bee-dupl.gdna.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>linkagegroup5_8050-8311
CGAAACGACCTTGCAACTCCGTTCCGTAATAATCGCGGATACGATTATTTTCCTCGGACCCCCGTCGGTAAAGAAGAGGCCGGAGCGATAAACAATAAGGCTAAGATGTAACGGTAAACGGCAAATCCTGGAAAGTGTATTATGAACTTTTGCAGGCGCAGGTGGCCGCGCGAGGCAGGTGCGAGCCTGACCTAGCCCAGCGGGGAGGCTCTCGAGAATCGAGAAGTGGAAAGAGAAATGGCAAATGGAAGGAACGAGAGT
15 changes: 15 additions & 0 deletions kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,3 +218,18 @@ def test_mate_distance():
positions = [('seq2', 4000), ('seq2', 3000), ('seq2', 5100), ('seq3', 1)]
gdna_pos = ('seq2', 5000, 5500)
assert kevlar.call.mate_distance(positions, gdna_pos) == 1000.0


def test_snv_dedup():
contigfile = data_file('bee-dupl.contigs.augfasta')
contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
contigs = list(contigstream)

gdnafile = data_file('bee-dupl.gdna.fa')
gdnastream = kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
targets = list(gdnastream)

calls = list(call(targets, contigs, ksize=27))
assert len(calls) == 1
assert calls[0].seqid == 'linkagegroup5'
assert calls[0].position == 8174 - 1
7 changes: 7 additions & 0 deletions kevlar/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ def window(self):
"""
return self.attribute('VW')

@property
def windowlength(self):
window = self.window
if window is None:
return 0
return len(window)

@property
def refrwindow(self):
"""Similar to `window`, but encapsulating the reference allele."""
Expand Down

0 comments on commit aa81092

Please sign in to comment.