Skip to content

Commit

Permalink
Save and load pickled varfilter index (#342)
Browse files Browse the repository at this point in the history
This update enables the `kevlar varfilter` command to save and load from pickles of the variant index. Closes #340.
  • Loading branch information
standage committed Jan 22, 2019
1 parent 5dcd514 commit e899720
Show file tree
Hide file tree
Showing 7 changed files with 115 additions and 6 deletions.
11 changes: 10 additions & 1 deletion kevlar/cli/varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,16 @@ def subparser(subparsers):
'will be written; default is terminal (standard output)'
)
subparser.add_argument(
'bed', help='BED file containing regions to filter out'
'--load-index', action='store_true', help='the "filt" argument is a '
'pre-loaded index, not a BED file'
)
subparser.add_argument(
'--save-index', metavar='FILE', help='save the index to "FILE" once '
'loaded'
)
subparser.add_argument(
'filt', help='BED file (or pre-loaded index) containing regions to '
'filter out'
)
subparser.add_argument(
'vcf', nargs='+', help='VCF file(s) with calls to filter'
Expand Down
4 changes: 3 additions & 1 deletion kevlar/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from collections import defaultdict
import kevlar
from kevlar.intervalforest import IntervalForest
import sys


def populate_index_from_bed(instream):
Expand Down Expand Up @@ -62,14 +63,15 @@ def compact(variants, index, delta=10):
nmatches += 1
if match is None:
match = varcall
localfound = hits
if nmatches == 0:
calllist[0].annotate('EVAL', 'False')
calls.append(calllist[0])
else:
assert nmatches > 0, nmatches
if nmatches > 1:
print('WARNING: found', nmatches, 'matches for CALLCLASS',
callclass, file=sys.stderr)
match.annotate('EVAL', 'True')
calls.append(match)

calls.sort(key=lambda c: float(c.attribute('LIKESCORE')), reverse=True)
Expand Down
Binary file added kevlar/tests/data/fiveparts-varmask.pickle
Binary file not shown.
Binary file added kevlar/tests/data/hg38.segdups.sample.bed.gz
Binary file not shown.
56 changes: 53 additions & 3 deletions kevlar/tests/test_varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import kevlar
from kevlar.tests import data_file
import pytest
from tempfile import NamedTemporaryFile


def test_load_variant_mask():
Expand Down Expand Up @@ -53,12 +54,16 @@ def test_varfilter_single():
assert filtered[0].position == 36385017


def test_varfilter_main(capsys):
@pytest.mark.parametrize('load_index,varmaskfile', [
(False, data_file('fiveparts-ignore.bed')),
(True, data_file('fiveparts-varmask.pickle')),
])
def test_varfilter_main(load_index, varmaskfile, capsys):
arglist = [
'varfilter', data_file('fiveparts-ignore.bed'),
data_file('five-snvs-with-likelihood.vcf')
'varfilter', varmaskfile, data_file('five-snvs-with-likelihood.vcf')
]
args = kevlar.cli.parser().parse_args(arglist)
args.load_index = load_index
kevlar.varfilter.main(args)

out, err = capsys.readouterr()
Expand All @@ -68,3 +73,48 @@ def test_varfilter_main(capsys):
filtered = [c for c in calls if '\tUserFilter\t' in c]
assert len(filtered) == 2
assert [c.split('\t')[1] for c in filtered] == ['36385018', '3547691']


def test_varfilter_main_save():
with NamedTemporaryFile(suffix='.pickle') as picklefile:
arglist = [
'varfilter', '--save-index', picklefile.name,
data_file('fiveparts-ignore.bed'),
data_file('five-snvs-with-likelihood.vcf')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.varfilter.main(args)
index = kevlar.varfilter.load_index_from_file(picklefile.name)
assert index.query('chr17', 3500000) != set()
assert index.query('chr17', 10) == set()


def test_load_and_save_pickle():
with NamedTemporaryFile(suffix='.pickle') as picklefile:
segdups = data_file('hg38.segdups.sample.bed.gz')
with kevlar.open(segdups, 'r') as fh:
index1 = kevlar.varfilter.load_variant_mask(fh)
kevlar.varfilter.save_index_to_file(index1, picklefile.name)
index2 = kevlar.varfilter.load_index_from_file(picklefile.name)

in_index = [
('chr1', 144140505, 144141551),
('chr1', 143201412, 143203950),
('chr15', 20561471, 20576628),
('chr16', 5134281, 5141968),
('chr16', 32647088, 32651261),
]
not_in_index = [
('chr3', 144140505, 144141551),
('chr4', 143201412, 143203950),
('chr5', 20561471, 20576628),
('chr6', 5134281, 5141968),
('chr7', 32647088, 32651261),
]

for chrom, start, end in in_index:
assert index1.query(chrom, start, end) != set()
assert index2.query(chrom, start, end) != set()
for chrom, start, end in not_in_index:
assert index2.query(chrom, start, end) == set()
assert index1.query(chrom, start, end) == set()
19 changes: 18 additions & 1 deletion kevlar/varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,18 @@
# -----------------------------------------------------------------------------

import kevlar
import pickle


def save_index_to_file(index, filename):
with open(filename, 'wb') as fh:
pickle.dump(index, fh)


def load_index_from_file(filename):
with open(filename, 'rb') as fh:
index = pickle.load(fh)
return index


def load_variant_mask(bedstream):
Expand Down Expand Up @@ -43,6 +55,11 @@ def main(args):
outstream = kevlar.open(args.out, 'w')
writer = kevlar.vcf.VCFWriter(outstream, source='kevlar::varfilter')
writer.write_header()
varmask = load_variant_mask(kevlar.open(args.bed, 'r'))
if args.load_index:
varmask = load_index_from_file(args.filt)
else:
varmask = load_variant_mask(kevlar.open(args.filt, 'r'))
if args.save_index:
save_index_to_file(varmask, args.save_index)
for varcall in varfilter(reader, varmask):
writer.write(varcall)
31 changes: 31 additions & 0 deletions notebook/bigsim/evalutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,18 @@ def populate_index_from_simulation(instream, chrlabel):
return index


def populate_index_from_bed(instream):
index = IntervalForest()
for line in instream:
if line.strip() == '':
continue
values = line.strip().split()
chrom = values[0]
start, end = [int(coord) for coord in values[1:3]]
index.insert(chrom, start, end)
return index


def compact(reader, index, delta=10):
"""Compact variants by call class
Expand Down Expand Up @@ -264,6 +276,25 @@ def subset_variants(variants, vartype, minlength=None, maxlength=None):
continue
yield line


def subset_variants_bed(variants, vartype, minlength=None, maxlength=None):
assert vartype in ('SNV', 'INDEL')
for line in variants:
if line.strip() == '':
continue
values = line.strip().split()
if len(values) == 5:
if vartype == 'SNV':
yield line
continue
if vartype == 'INDEL':
indellength = int(values[3])
if minlength and indellength < minlength:
continue
if maxlength and indellength > maxlength:
continue
yield line


def subset_vcf(varcalls, vartype, minlength=None, maxlength=None):
assert vartype in ('SNV', 'INDEL')
Expand Down

0 comments on commit e899720

Please sign in to comment.