Skip to content

Commit

Permalink
Merge pull request #235 from dib-lab/vcf/filter
Browse files Browse the repository at this point in the history
Improved handling of VCF `FILTER` field
  • Loading branch information
standage committed Apr 3, 2018
2 parents daf8771 + 52606fe commit e01fa72
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 16 deletions.
3 changes: 2 additions & 1 deletion kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import sys
import kevlar
from kevlar.varmap import VariantMapping
from kevlar.vcf import VariantFilter as vf


def align_mates(record, refrfile):
Expand Down Expand Up @@ -124,7 +125,7 @@ def prelim_call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
if alignment.matedist:
varcall.annotate('MD', '{:.2f}'.format(alignment.matedist))
if n > 0:
varcall.annotate('NC', 'matefail')
varcall.filter(vf.MateFail)
yield varcall


Expand Down
4 changes: 2 additions & 2 deletions kevlar/tests/test_alac.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_alac_inf_mate_dist(cc, numrawcalls):
calls = list(caller)
print(*[c.vcf for c in calls], sep='\n', file=sys.stderr)
assert len(calls) in numrawcalls
filtcalls = [c for c in calls if c.attribute('NC') is None]
filtcalls = [c for c in calls if c.filterstr == 'PASS']
print(*[c.vcf for c in filtcalls], sep='\n', file=sys.stderr)
assert len(filtcalls) == 1

Expand All @@ -169,5 +169,5 @@ def test_alac_no_mates():
calls = list(caller)
print(*[c.vcf for c in calls], sep='\n', file=sys.stderr)
assert len(calls) in [3, 4, 5, 7]
filtcalls = [c for c in calls if c.attribute('NC') is None]
filtcalls = [c for c in calls if c.filterstr == 'PASS']
assert len(filtcalls) == 2
2 changes: 1 addition & 1 deletion kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_perfect_match():
assert len(calls) == 1
assert calls[0].seqid == 'chr99'
assert calls[0].position == 2899377
assert calls[0].info['NC'] == 'perfectmatch'
assert calls[0].filterstr == 'PerfectMatch'


def test_multibest_revcom():
Expand Down
8 changes: 4 additions & 4 deletions kevlar/tests/test_varmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@ def test_nocall():
variants = list(aln.call_variants(21))
assert len(variants) == 1
assert variants[0].vcf == (
'yourchr\t801\t.\t.\t.\t.\t.\t'
'CG=25D5M22I5M46D8M13D2M35I;NC=inscrutablecigar;QN=contig4:cc=1;'
'yourchr\t801\t.\t.\t.\t.\tInscrutableCigar\t'
'CG=25D5M22I5M46D8M13D2M35I;QN=contig4:cc=1;'
'QS=AACTGGTGGGCTCAAGACTAAAAAGACTTTTTTGGTGACAAGCAGGGCGGCCTGCCCTTCCTGTAG'
'TGCAAGAAAAT'
)
Expand Down Expand Up @@ -259,5 +259,5 @@ def test_passenger_screen():
aln = VariantMapping(contig, cutout)
calls = list(aln.call_variants(29))
assert len(calls) == 2
assert calls[0].attribute('NC') is None
assert calls[1].attribute('NC') == 'passenger'
assert calls[0].filterstr == 'PASS'
assert calls[1].filterstr == 'PassengerVariant'
48 changes: 48 additions & 0 deletions kevlar/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import kevlar
from kevlar.vcf import Variant, VariantIndel, VariantSNV
from kevlar.vcf import VariantFilter as vf


def test_snv_obj():
Expand Down Expand Up @@ -42,3 +43,50 @@ def test_indel_obj():
assert str(indel2) == 'chr6:75522412:I->ATTACA'
vcfvalues = ['chr6', '75522412', '.', 'G', 'GATTACA', '.', 'PASS', '.']
assert indel2.vcf == '\t'.join(vcfvalues)


def test_filter_field():
v = Variant('scaffold1', 12345, '.', '.')
assert v.filterstr == '.'
v.filter(vf.InscrutableCigar)
assert v.filterstr == 'InscrutableCigar'

v = Variant('chr1', 55555, '.', '.')
v.filter(vf.PerfectMatch)
assert v.filterstr == 'PerfectMatch'

v = Variant('1', 809768, 'C', 'CAT')
assert v.filterstr == 'PASS'
v.filter(vf.PassengerVariant)
assert v.filterstr == 'PassengerVariant'
v.filter(vf.MateFail)
assert v.filterstr == 'MateFail;PassengerVariant'

v = Variant('one', 112358, 'T', 'A')
v.filter('SNPyMcSNPface')
v.filter(6.022e23)
v.filter(dict(chicken='waffles', biscuits='gravy'))
v.filterstr == 'PASS' # These "filters" shouldn't actually do anything


def test_info():
"""Test handling of "info" field attributes.
Note: this is testing the mechanics of the .annotate() and .attribute()
API. The `VW` attribute should not be handled in this way.
"""
v = Variant('1', 12345, 'G', 'C')
assert v.attribute('VW') is None

v.annotate('VW', 'GATTACA')
assert v.attribute('VW') == 'GATTACA'
assert v.attribute('VW', pair=True) == 'VW=GATTACA'

v.annotate('VW', 'ATGCCCTAG')
assert v.info['VW'] == set(['GATTACA', 'ATGCCCTAG'])
assert v.attribute('VW') == 'ATGCCCTAG,GATTACA'
assert v.attribute('VW', pair=True) == 'VW=ATGCCCTAG,GATTACA'

v.annotate('VW', 'AAAAAAAAA')
assert v.attribute('VW') == 'AAAAAAAAA,ATGCCCTAG,GATTACA'
assert v.attribute('VW', pair=True) == 'VW=AAAAAAAAA,ATGCCCTAG,GATTACA'
15 changes: 9 additions & 6 deletions kevlar/varmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import kevlar
from kevlar.alignment import align_both_strands
from kevlar.vcf import Variant, VariantSNV, VariantIndel
from kevlar.vcf import VariantFilter as vf
import re
import sys

Expand Down Expand Up @@ -138,25 +139,26 @@ def call_variants(self, ksize, mindist=5, logstream=sys.stderr):
if self.vartype == 'snv':
for call in self.call_snv(ksize, mindist, logstream=logstream):
if self.is_passenger(call):
call.annotate('NC', 'passenger')
call.filter(vf.PassengerVariant)
yield call
elif self.vartype == 'indel':
caller = self.call_insertion
if self.indeltype == 'D':
caller = self.call_deletion
for call in caller(ksize):
if self.is_passenger(call):
call.annotate('NC', 'passenger')
call.filter(vf.PassengerVariant)
yield call
for call in self.call_indel_snvs(ksize, mindist, logstream):
if self.is_passenger(call):
call.annotate('NC', 'passenger')
call.filter(vf.PassengerVariant)
yield call
else:
nocall = Variant(
self.seqid, self.pos, '.', '.', NC='inscrutablecigar',
QN=self.contig.name, QS=self.varseq, CG=self.cigar,
self.seqid, self.pos, '.', '.', QN=self.contig.name,
QS=self.varseq, CG=self.cigar,
)
nocall.filter(vf.InscrutableCigar)
yield nocall

def snv_variant(self, qseq, tseq, mismatches, offset, ksize):
Expand Down Expand Up @@ -206,8 +208,9 @@ def call_snv(self, ksize, mindist=5, logstream=sys.stderr):
if len(diffs) == 0:
nocall = Variant(
self.seqid, self.cutout.local_to_global(gdnaoffset), '.', '.',
NC='perfectmatch', QN=self.contig.name, QS=q
QN=self.contig.name, QS=q
)
nocall.filter(vf.PerfectMatch)
yield nocall
return

Expand Down
26 changes: 24 additions & 2 deletions kevlar/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,18 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from enum import Enum
import sys
import kevlar


class VariantFilter(Enum):
PerfectMatch = 1
InscrutableCigar = 2
PassengerVariant = 3
MateFail = 4


class Variant(object):
"""Base class for handling variant calls and no-calls."""

Expand All @@ -26,6 +34,7 @@ def __init__(self, seqid, pos, refr, alt, **kwargs):
self._pos = pos
self._refr = refr
self._alt = alt
self._filters = set()
self.info = dict()
for key, value in kwargs.items():
self.annotate(key, value)
Expand All @@ -52,9 +61,8 @@ def vcf(self):
kvpairs.append(queryseq)
attrstr = ';'.join(kvpairs)

filterstr = 'PASS' if self._refr != '.' else '.'
return '{:s}\t{:d}\t.\t{:s}\t{:s}\t.\t{:s}\t{:s}'.format(
self._seqid, self._pos + 1, self._refr, self._alt, filterstr,
self._seqid, self._pos + 1, self._refr, self._alt, self.filterstr,
attrstr
)

Expand Down Expand Up @@ -120,6 +128,20 @@ def attribute(self, key, pair=False):
else:
return value

def filter(self, filtertype):
if not isinstance(filtertype, VariantFilter):
return
self._filters.add(filtertype)

@property
def filterstr(self):
if len(self._filters) > 0:
return ';'.join(sorted([vf.name for vf in self._filters]))
elif self._refr == '.':
return '.'
else:
return 'PASS'

@property
def genotypes(self):
gt = self.attribute('GT')
Expand Down

0 comments on commit e01fa72

Please sign in to comment.