Skip to content

Commit

Permalink
Merge pull request #261 from dib-lab/feature/fmt-lists
Browse files Browse the repository at this point in the history
Clean up variant attribute handling with smart lists
  • Loading branch information
standage committed May 18, 2018
2 parents 3aa4ca3 + 5c492b8 commit ce2f8b1
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 35 deletions.
3 changes: 1 addition & 2 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,7 @@ def prelim_call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
end='\n\n', file=logstream)
for varcall in alignment.call_variants(ksize, mindist, logstream):
if alignment.matedist:
avgdistance = '{:.2f}'.format(alignment.matedist)
varcall.annotate('MATEDIST', avgdistance)
varcall.annotate('MATEDIST', alignment.matedist)
if n > 0:
varcall.filter(vf.MateFail)
yield varcall
Expand Down
16 changes: 8 additions & 8 deletions kevlar/simlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,10 @@ def calc_likescore(call, altabund, refrabund, mu, sigma, epsilon):
llfp = likelihood_false(altabund, refrabund, mean=mu, error=epsilon)
llih = likelihood_inherited(altabund, mean=mu, sd=sigma, error=epsilon)
likescore = lldn - max(llfp, llih)
call.annotate('LLDN', '{:.3f}'.format(lldn))
call.annotate('LLFP', '{:.3f}'.format(llfp))
call.annotate('LLIH', '{:.3f}'.format(llih))
call.annotate('LIKESCORE', '{:.3f}'.format(likescore))
call.annotate('LLDN', lldn)
call.annotate('LLFP', llfp)
call.annotate('LLIH', llih)
call.annotate('LIKESCORE', likescore)


def default_sample_labels(nsamples):
Expand Down Expand Up @@ -188,13 +188,13 @@ def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,
message += ' for variant-spanning window {:s}'.format(call.window)
message += ', shorter than k size {:s}'.format(case.ksize())
print(message, file=logstream)
call.annotate('LIKESCORE', '-inf')
call.annotate('LIKESCORE', float('-inf'))
calls_by_partition[call.attribute('PART')].append(call)
continue
altabund, refrabund, ndropped = get_abundances(
call.window, call.refrwindow, case, controls, refr
)
call.annotate('DROPPED', str(ndropped))
call.annotate('DROPPED', ndropped)
abovethresh = [a for a in altabund[0] if a > casemin]
if len(abovethresh) == 0:
call.filter(kevlar.vcf.VariantFilter.PassengerVariant)
Expand All @@ -204,14 +204,14 @@ def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,

allcalls = list()
for partition, calls in calls_by_partition.items():
scores = [float(c.attribute('LIKESCORE')) for c in calls]
scores = [c.attribute('LIKESCORE') for c in calls]
maxscore = max(scores)
for call, score in zip(calls, scores):
if score < maxscore:
call.filter(kevlar.vcf.VariantFilter.PartitionScore)
allcalls.append(call)

allcalls.sort(key=lambda c: float(c.attribute('LIKESCORE')), reverse=True)
allcalls.sort(key=lambda c: c.attribute('LIKESCORE'), reverse=True)
for call in allcalls:
yield call

Expand Down
2 changes: 1 addition & 1 deletion kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def test_funky_cigar(part, coord, window):
print('DEBUG', calls[0].vcf, file=sys.stderr)
assert calls[0].seqid == '17'
assert calls[0].position == coord - 1
assert calls[0].info['ALTWINDOW'] == window
assert calls[0].attribute('ALTWINDOW') == window


def test_funky_cigar_deletion():
Expand Down
42 changes: 35 additions & 7 deletions kevlar/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import pytest
import kevlar
from kevlar.tests import data_file
from kevlar.vcf import Variant
from kevlar.vcf import Variant, FormattedList
from kevlar.vcf import VariantFilter as vf


Expand Down Expand Up @@ -85,8 +85,19 @@ def test_filter_field():
def test_info():
"""Test handling of "info" field attributes.
This tests the mechanics of the .annotate() and .attribute() API.
This tests the mechanics of the .annotate() and .attribute() API, and the
FormattedList class underpinning it.
"""
values = FormattedList()
assert str(values) == '.'
values.append(42)
assert str(values) == '42'
values.append(1776)
assert str(values) == '42,1776'
values.append('B0gU$')
with pytest.raises(kevlar.vcf.KevlarMixedDataTypeError):
str(values)

v = Variant('1', 12345, 'G', 'C')
assert v.attribute('VW') is None

Expand All @@ -95,13 +106,30 @@ def test_info():
assert v.attribute('VW', pair=True) == 'VW=GATTACA'

v.annotate('VW', 'ATGCCCTAG')
assert v.info['VW'] == set(['GATTACA', 'ATGCCCTAG'])
assert v.attribute('VW') == 'ATGCCCTAG,GATTACA'
assert v.attribute('VW', pair=True) == 'VW=ATGCCCTAG,GATTACA'
assert v.info['VW'] == ['GATTACA', 'ATGCCCTAG']
assert v.attribute('VW') == ['GATTACA', 'ATGCCCTAG']
assert v.attribute('VW', string=True) == 'GATTACA,ATGCCCTAG'
assert v.attribute('VW', pair=True) == 'VW=GATTACA,ATGCCCTAG'

v.annotate('VW', 'AAAAAAAAA')
assert v.attribute('VW') == 'AAAAAAAAA,ATGCCCTAG,GATTACA'
assert v.attribute('VW', pair=True) == 'VW=AAAAAAAAA,ATGCCCTAG,GATTACA'
assert v.attribute('VW') == ['GATTACA', 'ATGCCCTAG', 'AAAAAAAAA']
assert v.attribute('VW', pair=True) == 'VW=GATTACA,ATGCCCTAG,AAAAAAAAA'

v.annotate('DROPPED', 3)
assert v.attribute('DROPPED') == 3
assert v.attribute('DROPPED', string=True) == '3'

v.annotate('DROPPED', 31)
assert v.attribute('DROPPED') == [3, 31]
assert v.attribute('DROPPED', string=True) == '3,31'
assert v.attribute('DROPPED', pair=True) == 'DROPPED=3,31'

v.annotate('MATEDIST', 432.1234)
v.annotate('MATEDIST', 8765.4321)
assert v.attribute('MATEDIST', string=True) == '432.123,8765.432'

v.annotate('LLIH', -436.0111857750478)
assert v.attribute('LLIH', pair=True) == 'LLIH=-436.011'


def test_format():
Expand Down
70 changes: 53 additions & 17 deletions kevlar/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,17 @@
from enum import Enum
import sys
import kevlar
from numpy import float64


class VariantAnnotationError(ValueError):
pass


class KevlarMixedDataTypeError(ValueError):
pass


class VariantFilter(Enum):
PerfectMatch = 1
InscrutableCigar = 2
Expand All @@ -26,6 +31,31 @@ class VariantFilter(Enum):
PartitionScore = 5


class FormattedList(list):
"""Convenience class for printing lists of various types.
With VCF we need to store free-form data with various different types. This
class helps us project lists of diverse data types to a string
representation, even if they are stored in memory as a list of ints or
floats.
"""
def __str__(self):
types = set([type(v) for v in self])
if len(types) == 0:
return '.'
elif len(types) > 1:
typelist = sorted([str(t) for t in types])
message = 'mixed data type: ' + ','.join(typelist)
raise KevlarMixedDataTypeError(message)
else:
listtype = next(iter(types))
if listtype in (float, float64):
strlist = ['{:.3f}'.format(v) for v in self]
else:
strlist = [str(v) for v in self]
return ','.join(strlist)


class Variant(object):
"""Base class for handling variant calls and no-calls."""

Expand All @@ -42,7 +72,7 @@ def __init__(self, seqid, pos, refr, alt, **kwargs):
self._refr = refr
self._alt = alt
self._filters = set()
self.info = dict()
self.info = defaultdict(FormattedList)
for key, value in kwargs.items():
self.annotate(key, value)
self._sample_data = defaultdict(dict)
Expand Down Expand Up @@ -137,28 +167,34 @@ def refrwindow(self):
return self.attribute('REFRWINDOW')

def annotate(self, key, value):
value = str(value)
if key in self.info:
if isinstance(self.info[key], set):
self.info[key].add(value)
else:
oldvalue = self.info[key]
self.info[key] = set((oldvalue, value))
else:
self.info[key] = value
self.info[key].append(value)

def attribute(self, key, pair=False, string=False):
"""Query annotated INFO data.
def attribute(self, key, pair=False):
By default, returns the value in its native type (str, int, float). If
there are multiple values annotated for the given key, they are
returned as a `FormattedList`.
Set `string=True` to coerce the value(s) to a string representation.
Set `pair=True` to retrieve data as a key/value pair string, such as
`DROPPED=2` or `SCORES=10,12,18`.
"""
if key not in self.info:
return None
value = self.info[key]
if isinstance(value, set):
value = ','.join(sorted(value))
value = value.replace(';', ':')
values = self.info[key]
if pair:
keyvaluepair = '{:s}={:s}'.format(key, value)
keyvaluepair = '{:s}={:s}'.format(key, str(values))
return keyvaluepair
else:
return value
if string:
return str(values)
else:
if len(values) == 1:
return values[0]
else:
return values

def filter(self, filtertype):
if not isinstance(filtertype, VariantFilter):
Expand Down

0 comments on commit ce2f8b1

Please sign in to comment.