Skip to content

Commit

Permalink
Reimplement varfilter module (#354)
Browse files Browse the repository at this point in the history
This update includes a complete rewrite of the varfilter module. Rather than loading regions to be masked into memory, varfilter now loads calls to be filtered into memory and processes regions to be masked in a streaming fashion. Drastically improves memory usage and runtime. Closes #345.
  • Loading branch information
standage committed Feb 7, 2019
1 parent af7ae4e commit eb61d90
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 129 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- A new Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305).
- A new Snakemake workflow for kevlar's standard processing procedure (see #306).
- New `unband` module to merge augmented Fastq files produced with a *k*-mer banding strategy (see #316).
- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318, #342).
- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318, #342, #354).
- New dependency: `intervaltree` package (see #318).
- A new `sandbox` directory with convenience scripts for development and analysis (see #335).
- A new `--min-like-score` filter for the `simlike` module (see #343).
Expand Down
13 changes: 1 addition & 12 deletions kevlar/cli/varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@


def subparser(subparsers):
"""Define the `kevlar varfilter` command-line interface."""

subparser = subparsers.add_parser(
'varfilter', description='Filter out variants falling in the genomic '
'regions specified by the BED file(s). This can be used to exclude '
Expand All @@ -23,16 +21,7 @@ def subparser(subparsers):
'will be written; default is terminal (standard output)'
)
subparser.add_argument(
'--load-index', action='store_true', help='the "filt" argument is a '
'pre-loaded index, not a BED file'
)
subparser.add_argument(
'--save-index', metavar='FILE', help='save the index to "FILE" once '
'loaded'
)
subparser.add_argument(
'filt', help='BED file (or pre-loaded index) containing regions to '
'filter out'
'filt', help='BED file containing regions to filter out'
)
subparser.add_argument(
'vcf', nargs='+', help='VCF file(s) with calls to filter'
Expand Down
5 changes: 5 additions & 0 deletions kevlar/intervalforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ def __iter__(self):
def __len__(self):
    """Return the total number of intervals across all per-label trees."""
    # Generator expression instead of a throwaway list: sum() consumes the
    # counts lazily, so no intermediate list is materialized.
    return sum(len(tree) for tree in self.trees.values())

def __iter__(self):
    """Yield the payload (`interval.data`) of every interval in the forest.

    Iterates all per-label trees; the label itself is not needed, so only
    the tree values are traversed (avoids unpacking unused keys).
    """
    for tree in self.trees.values():
        for interval in tree:
            yield interval.data

def insert(self, label, start, end, data=None):
assert label is not None
if data is None:
Expand Down
Binary file removed kevlar/tests/data/fiveparts-varmask.pickle
Binary file not shown.
112 changes: 32 additions & 80 deletions kevlar/tests/test_varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,63 +7,59 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from intervaltree import Interval
import kevlar
from kevlar.tests import data_file
import pytest
from tempfile import NamedTemporaryFile


def test_load_variant_mask():
bedfile = data_file('fiveparts-ignore-single.bed')
with kevlar.open(bedfile, 'r') as bedstream:
index = kevlar.varfilter.load_variant_mask(bedstream)
assert len(index) == 1
def test_load_predictions():
with kevlar.open(data_file('five-snvs-with-likelihood.vcf'), 'r') as vcf:
vcfreader = kevlar.vcf.VCFReader(vcf)
index = kevlar.varfilter.load_predictions(vcfreader)
assert len(index) == 5
assert list(index.trees.keys()) == ['chr17']
assert index.query('chr1', 1, 10000000) == set()
assert index.query('chr17', 1, 10000000) == set()
itvl = Interval(36385017, 36385018, 'chr17:36385017-36385018')
assert index.query('chr17', 36385017) == set([itvl])
assert index.query('chr1', 1, 1000000) == set()
assert index.query('chr17', 1, 1000000) == set()
result = [i.data.region for i in index.query('chr17', 36385017)]
assert result == [('chr17', 36385017, 36385018)]


def test_load_variant_mask_multi_chrom():
bedfile = data_file('intervals.bed')
with kevlar.open(bedfile, 'r') as bedstream:
index = kevlar.varfilter.load_variant_mask(bedstream)
def test_load_predictions_multi_chrom():
with kevlar.open(data_file('case-low-abund/calls.vcf.gz'), 'r') as vcf:
vcfreader = kevlar.vcf.VCFReader(vcf)
index = kevlar.varfilter.load_predictions(vcfreader)
assert len(index) == 5
assert sorted(index.trees.keys()) == ['1', '2', '3', '4']
assert index.query('5', 1, 10000000) == set()
itvls = [
Interval(40, 400, '4:40-400'),
Interval(44, 444, '4:44-444'),
]
assert index.query('4', 300) == set(itvls)
assert set(index.trees.keys()) == set(['1', '9', '14'])
assert index.query('chr1', 1, 1000000) == set()
assert index.query('1', 1, 1000000) == set()
result = [i.data.region for i in index.query('1', 91850000, 91860000)]
assert set(result) == set([
('1', 91853096, 91853097),
('1', 91853110, 91853111),
])
result = [i.data.region for i in index.query('14', 82461000, 82462000)]
assert result == [('14', 82461856, 82461857)]


def test_varfilter_single():
bedfile = data_file('fiveparts-ignore-single.bed')
with kevlar.open(bedfile, 'r') as bedstream:
index = kevlar.varfilter.load_variant_mask(bedstream)
bedstream = kevlar.parse_bed(
kevlar.open(data_file('fiveparts-ignore-single.bed'), 'r')
)
vcffile = data_file('five-snvs-with-likelihood.vcf')
with kevlar.open(vcffile, 'r') as vcfstream:
reader = kevlar.vcf.VCFReader(vcfstream)
varcalls = list(kevlar.varfilter.varfilter(reader, index))
varcalls = list(kevlar.varfilter.varfilter(reader, bedstream))
assert len(varcalls) == 5
filtered = [vc for vc in varcalls if vc.filterstr != 'PASS']
assert len(filtered) == 1
assert filtered[0].position == 36385017


@pytest.mark.parametrize('load_index,varmaskfile', [
(False, data_file('fiveparts-ignore.bed')),
(True, data_file('fiveparts-varmask.pickle')),
])
def test_varfilter_main(load_index, varmaskfile, capsys):
def test_varfilter_main(capsys):
arglist = [
'varfilter', varmaskfile, data_file('five-snvs-with-likelihood.vcf')
'varfilter', data_file('fiveparts-ignore.bed'),
data_file('five-snvs-with-likelihood.vcf')
]
args = kevlar.cli.parser().parse_args(arglist)
args.load_index = load_index
kevlar.varfilter.main(args)

out, err = capsys.readouterr()
Expand All @@ -72,49 +68,5 @@ def test_varfilter_main(load_index, varmaskfile, capsys):
assert len(calls) == 5
filtered = [c for c in calls if '\tUserFilter\t' in c]
assert len(filtered) == 2
assert [c.split('\t')[1] for c in filtered] == ['36385018', '3547691']


def test_varfilter_main_save():
with NamedTemporaryFile(suffix='.pickle') as picklefile:
arglist = [
'varfilter', '--save-index', picklefile.name,
data_file('fiveparts-ignore.bed'),
data_file('five-snvs-with-likelihood.vcf')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.varfilter.main(args)
index = kevlar.varfilter.load_index_from_file(picklefile.name)
assert index.query('chr17', 3500000) != set()
assert index.query('chr17', 10) == set()


def test_load_and_save_pickle():
with NamedTemporaryFile(suffix='.pickle') as picklefile:
segdups = data_file('hg38.segdups.sample.bed.gz')
with kevlar.open(segdups, 'r') as fh:
index1 = kevlar.varfilter.load_variant_mask(fh)
kevlar.varfilter.save_index_to_file(index1, picklefile.name)
index2 = kevlar.varfilter.load_index_from_file(picklefile.name)

in_index = [
('chr1', 144140505, 144141551),
('chr1', 143201412, 143203950),
('chr15', 20561471, 20576628),
('chr16', 5134281, 5141968),
('chr16', 32647088, 32651261),
]
not_in_index = [
('chr3', 144140505, 144141551),
('chr4', 143201412, 143203950),
('chr5', 20561471, 20576628),
('chr6', 5134281, 5141968),
('chr7', 32647088, 32651261),
]

for chrom, start, end in in_index:
assert index1.query(chrom, start, end) != set()
assert index2.query(chrom, start, end) != set()
for chrom, start, end in not_in_index:
assert index2.query(chrom, start, end) == set()
assert index1.query(chrom, start, end) == set()
positions = [c.split('\t')[1] for c in filtered]
assert sorted(positions) == sorted(['36385018', '3547691'])
11 changes: 11 additions & 0 deletions kevlar/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,14 @@ def test_vcf_roundtrip(capsys):
assert [c.position for c in calls] == [c.position for c in calls2]
assert [str(c) for c in calls] == [str(c) for c in calls2]
assert [c.window for c in calls] == [c.window for c in calls2]


def test_region():
    """Check the `region` property for an SNV, an insertion, and deletions.

    The region spans the reference allele as a half-open (seqid, start, end)
    tuple, so its length equals len(refr) regardless of the alternate allele.
    """
    cases = [
        ('A', 'G', ('chr12', 1033773, 1033774)),
        ('A', 'AGTG', ('chr12', 1033773, 1033774)),
        ('AT', 'TG', ('chr12', 1033773, 1033775)),
        ('ATACCG', 'A', ('chr12', 1033773, 1033779)),
    ]
    for refr, alt, expected in cases:
        variant = Variant('chr12', 1033773, refr, alt)
        assert variant.region == expected
52 changes: 16 additions & 36 deletions kevlar/varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,58 +8,38 @@
# -----------------------------------------------------------------------------

import kevlar
import pickle


def save_index_to_file(index, filename):
with open(filename, 'wb') as fh:
pickle.dump(index, fh)


def load_index_from_file(filename):
with open(filename, 'rb') as fh:
index = pickle.load(fh)
return index


def load_variant_mask(bedstream):
message = 'Loading genomic intervals to be filtered into memory'
kevlar.plog('[kevlar::varfilter]', message)
progress_indictator = kevlar.ProgressIndicator(
'[kevlar::varfilter] {counter} intervals loaded', interval=1e4,
breaks=[1e5, 1e6, 1e7], usetimer=True,
)
def load_predictions(varcalls):
    """Load preliminary variant calls into an in-memory interval index.

    Each call is inserted over the genomic interval given by its `region`
    property, with the call object itself attached as the interval payload
    so that masked calls can later be flagged in place.

    NOTE(review): the captured diff interleaved three stale lines from the
    deleted `load_variant_mask` function (referencing undefined names
    `bedstream` and `progress_indictator`); they are removed here.
    """
    kevlar.plog('[kevlar::varfilter] Loading predictions to filter')
    index = kevlar.IntervalForest()
    for call in varcalls:
        # call.region is a (seqid, start, end) tuple; unpack it as the
        # insert coordinates and keep the call itself as payload.
        index.insert(*call.region, data=call)
    return index


def varfilter(callstream, maskstream):
    """Flag variant calls that overlap masked genomic regions.

    Loads all calls from `callstream` into an interval index, then streams
    BED-style mask regions from `maskstream`, applying the UserFilter to
    every call overlapping a masked region. Yields every call (flagged or
    not) once all regions are processed.

    NOTE(review): the captured diff interleaved the superseded
    implementation's lines (old signature, old progress message, old
    per-call masking loop); they are removed here. The duplicated `1e6`
    in `breaks` is corrected to the ascending sequence the old code used
    (each break 10x the previous), and the `progress_indictator` local is
    spelled correctly.
    """
    callindex = load_predictions(callstream)
    message = 'Filtering preliminary variant calls'
    kevlar.plog('[kevlar::varfilter]', message)
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::varfilter] {counter} regions processed', interval=1e5,
        breaks=[1e6, 1e7, 1e8], usetimer=True,
    )
    for chrom, start, end, _ in maskstream:
        hits = callindex.query(chrom, start, end)
        for interval in hits:
            # Flag the call in place; it is yielded later from the index.
            interval.data.filter(kevlar.vcf.VariantFilter.UserFilter)
        progress_indicator.update()
    for varcall in callindex:
        yield varcall


def main(args):
    """Entry point for the `kevlar varfilter` command.

    Streams mask regions from the BED file in `args.filt` against the calls
    in `args.vcf`, writing all calls (flagged or unflagged) to `args.out`.

    NOTE(review): the captured diff interleaved the deleted
    `--load-index`/`--save-index` handling (referencing removed CLI flags
    and the old `varmask` index); those stale lines are removed here.
    """
    reader = kevlar.vcf.vcfstream(args.vcf)
    bedstream = kevlar.parse_bed(kevlar.open(args.filt, 'r'))
    outstream = kevlar.open(args.out, 'w')
    writer = kevlar.vcf.VCFWriter(outstream, source='kevlar::varfilter')
    writer.write_header()
    for varcall in varfilter(reader, bedstream):
        writer.write(varcall)
4 changes: 4 additions & 0 deletions kevlar/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ def seqid(self):
def position(self):
return self._pos

@property
def region(self):
    """Genomic interval spanned by this variant's reference allele.

    Returns a (seqid, start, end) tuple; the half-open interval covers
    exactly len(refr) positions starting at the variant position.
    """
    start = self.position
    end = start + len(self._refr)
    return self.seqid, start, end

@property
def vcf(self):
"""Print variant to VCF."""
Expand Down

0 comments on commit eb61d90

Please sign in to comment.