Skip to content

Commit

Permalink
Reimplement varfilter module (#354)
Browse files Browse the repository at this point in the history
This update includes a complete rewrite of the varfilter module. Rather than loading regions to be masked into memory, varfilter now loads calls to be filtered into memory and processes regions to be masked in a streaming fashion. Drastically improves memory usage and runtime. Closes #345.
  • Loading branch information
standage committed Feb 7, 2019
1 parent af7ae4e commit eb61d90
Show file tree
Hide file tree
Showing 8 changed files with 70 additions and 129 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ This project adheres to [Semantic Versioning](http://semver.org/).
- A new Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305).
- A new Snakemake workflow for kevlar's standard processing procedure (see #306).
- New `unband` module to merge augmented Fastq files produced with a *k*-mer banding strategy (see #316).
- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318, #342).
- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318, #342, #354).
- New dependency: `intervaltree` package (see #318).
- A new `sandbox` directory with convenience scripts for development and analysis (see #335).
- A new `--min-like-score` filter for the `simlike` module (see #343).
Expand Down
13 changes: 1 addition & 12 deletions kevlar/cli/varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@


def subparser(subparsers):
"""Define the `kevlar varfilter` command-line interface."""

subparser = subparsers.add_parser(
'varfilter', description='Filter out variants falling in the genomic '
'regions specified by the BED file(s). This can be used to exclude '
Expand All @@ -23,16 +21,7 @@ def subparser(subparsers):
'will be written; default is terminal (standard output)'
)
subparser.add_argument(
'--load-index', action='store_true', help='the "filt" argument is a '
'pre-loaded index, not a BED file'
)
subparser.add_argument(
'--save-index', metavar='FILE', help='save the index to "FILE" once '
'loaded'
)
subparser.add_argument(
'filt', help='BED file (or pre-loaded index) containing regions to '
'filter out'
'filt', help='BED file containing regions to filter out'
)
subparser.add_argument(
'vcf', nargs='+', help='VCF file(s) with calls to filter'
Expand Down
5 changes: 5 additions & 0 deletions kevlar/intervalforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ def __iter__(self):
def __len__(self):
    """Return the total number of intervals across all per-label trees."""
    # Generator expression instead of a throwaway list: sum() consumes the
    # counts lazily, so no intermediate list is materialized.
    return sum(len(tree) for tree in self.trees.values())

def __iter__(self):
    """Yield the payload (`interval.data`) of every interval in the forest.

    Iterates all per-label trees; the label itself is not needed, so only
    the tree values are traversed (avoids unpacking unused keys).
    """
    for tree in self.trees.values():
        for interval in tree:
            yield interval.data

def insert(self, label, start, end, data=None):
assert label is not None
if data is None:
Expand Down
Binary file removed kevlar/tests/data/fiveparts-varmask.pickle
Binary file not shown.
112 changes: 32 additions & 80 deletions kevlar/tests/test_varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,63 +7,59 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from intervaltree import Interval
import kevlar
from kevlar.tests import data_file
import pytest
from tempfile import NamedTemporaryFile


def test_load_variant_mask():
bedfile = data_file('fiveparts-ignore-single.bed')
with kevlar.open(bedfile, 'r') as bedstream:
index = kevlar.varfilter.load_variant_mask(bedstream)
assert len(index) == 1
def test_load_predictions():
with kevlar.open(data_file('five-snvs-with-likelihood.vcf'), 'r') as vcf:
vcfreader = kevlar.vcf.VCFReader(vcf)
index = kevlar.varfilter.load_predictions(vcfreader)
assert len(index) == 5
assert list(index.trees.keys()) == ['chr17']
assert index.query('chr1', 1, 10000000) == set()
assert index.query('chr17', 1, 10000000) == set()
itvl = Interval(36385017, 36385018, 'chr17:36385017-36385018')
assert index.query('chr17', 36385017) == set([itvl])
assert index.query('chr1', 1, 1000000) == set()
assert index.query('chr17', 1, 1000000) == set()
result = [i.data.region for i in index.query('chr17', 36385017)]
assert result == [('chr17', 36385017, 36385018)]


def test_load_variant_mask_multi_chrom():
bedfile = data_file('intervals.bed')
with kevlar.open(bedfile, 'r') as bedstream:
index = kevlar.varfilter.load_variant_mask(bedstream)
def test_load_predictions_multi_chrom():
with kevlar.open(data_file('case-low-abund/calls.vcf.gz'), 'r') as vcf:
vcfreader = kevlar.vcf.VCFReader(vcf)
index = kevlar.varfilter.load_predictions(vcfreader)
assert len(index) == 5
assert sorted(index.trees.keys()) == ['1', '2', '3', '4']
assert index.query('5', 1, 10000000) == set()
itvls = [
Interval(40, 400, '4:40-400'),
Interval(44, 444, '4:44-444'),
]
assert index.query('4', 300) == set(itvls)
assert set(index.trees.keys()) == set(['1', '9', '14'])
assert index.query('chr1', 1, 1000000) == set()
assert index.query('1', 1, 1000000) == set()
result = [i.data.region for i in index.query('1', 91850000, 91860000)]
assert set(result) == set([
('1', 91853096, 91853097),
('1', 91853110, 91853111),
])
result = [i.data.region for i in index.query('14', 82461000, 82462000)]
assert result == [('14', 82461856, 82461857)]


def test_varfilter_single():
bedfile = data_file('fiveparts-ignore-single.bed')
with kevlar.open(bedfile, 'r') as bedstream:
index = kevlar.varfilter.load_variant_mask(bedstream)
bedstream = kevlar.parse_bed(
kevlar.open(data_file('fiveparts-ignore-single.bed'), 'r')
)
vcffile = data_file('five-snvs-with-likelihood.vcf')
with kevlar.open(vcffile, 'r') as vcfstream:
reader = kevlar.vcf.VCFReader(vcfstream)
varcalls = list(kevlar.varfilter.varfilter(reader, index))
varcalls = list(kevlar.varfilter.varfilter(reader, bedstream))
assert len(varcalls) == 5
filtered = [vc for vc in varcalls if vc.filterstr != 'PASS']
assert len(filtered) == 1
assert filtered[0].position == 36385017


@pytest.mark.parametrize('load_index,varmaskfile', [
(False, data_file('fiveparts-ignore.bed')),
(True, data_file('fiveparts-varmask.pickle')),
])
def test_varfilter_main(load_index, varmaskfile, capsys):
def test_varfilter_main(capsys):
arglist = [
'varfilter', varmaskfile, data_file('five-snvs-with-likelihood.vcf')
'varfilter', data_file('fiveparts-ignore.bed'),
data_file('five-snvs-with-likelihood.vcf')
]
args = kevlar.cli.parser().parse_args(arglist)
args.load_index = load_index
kevlar.varfilter.main(args)

out, err = capsys.readouterr()
Expand All @@ -72,49 +68,5 @@ def test_varfilter_main(load_index, varmaskfile, capsys):
assert len(calls) == 5
filtered = [c for c in calls if '\tUserFilter\t' in c]
assert len(filtered) == 2
assert [c.split('\t')[1] for c in filtered] == ['36385018', '3547691']


def test_varfilter_main_save():
with NamedTemporaryFile(suffix='.pickle') as picklefile:
arglist = [
'varfilter', '--save-index', picklefile.name,
data_file('fiveparts-ignore.bed'),
data_file('five-snvs-with-likelihood.vcf')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.varfilter.main(args)
index = kevlar.varfilter.load_index_from_file(picklefile.name)
assert index.query('chr17', 3500000) != set()
assert index.query('chr17', 10) == set()


def test_load_and_save_pickle():
with NamedTemporaryFile(suffix='.pickle') as picklefile:
segdups = data_file('hg38.segdups.sample.bed.gz')
with kevlar.open(segdups, 'r') as fh:
index1 = kevlar.varfilter.load_variant_mask(fh)
kevlar.varfilter.save_index_to_file(index1, picklefile.name)
index2 = kevlar.varfilter.load_index_from_file(picklefile.name)

in_index = [
('chr1', 144140505, 144141551),
('chr1', 143201412, 143203950),
('chr15', 20561471, 20576628),
('chr16', 5134281, 5141968),
('chr16', 32647088, 32651261),
]
not_in_index = [
('chr3', 144140505, 144141551),
('chr4', 143201412, 143203950),
('chr5', 20561471, 20576628),
('chr6', 5134281, 5141968),
('chr7', 32647088, 32651261),
]

for chrom, start, end in in_index:
assert index1.query(chrom, start, end) != set()
assert index2.query(chrom, start, end) != set()
for chrom, start, end in not_in_index:
assert index2.query(chrom, start, end) == set()
assert index1.query(chrom, start, end) == set()
positions = [c.split('\t')[1] for c in filtered]
assert sorted(positions) == sorted(['36385018', '3547691'])
11 changes: 11 additions & 0 deletions kevlar/tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,3 +244,14 @@ def test_vcf_roundtrip(capsys):
assert [c.position for c in calls] == [c.position for c in calls2]
assert [str(c) for c in calls] == [str(c) for c in calls2]
assert [c.window for c in calls] == [c.window for c in calls2]


def test_region():
    """Check the `region` property for an SNV, an insertion, and deletions.

    The region spans the reference allele as a half-open (seqid, start, end)
    tuple, so its length equals len(refr) regardless of the alternate allele.
    """
    cases = [
        ('A', 'G', ('chr12', 1033773, 1033774)),
        ('A', 'AGTG', ('chr12', 1033773, 1033774)),
        ('AT', 'TG', ('chr12', 1033773, 1033775)),
        ('ATACCG', 'A', ('chr12', 1033773, 1033779)),
    ]
    for refr, alt, expected in cases:
        variant = Variant('chr12', 1033773, refr, alt)
        assert variant.region == expected
52 changes: 16 additions & 36 deletions kevlar/varfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,58 +8,38 @@
# -----------------------------------------------------------------------------

import kevlar
import pickle


def save_index_to_file(index, filename):
with open(filename, 'wb') as fh:
pickle.dump(index, fh)


def load_index_from_file(filename):
with open(filename, 'rb') as fh:
index = pickle.load(fh)
return index


def load_variant_mask(bedstream):
message = 'Loading genomic intervals to be filtered into memory'
kevlar.plog('[kevlar::varfilter]', message)
progress_indictator = kevlar.ProgressIndicator(
'[kevlar::varfilter] {counter} intervals loaded', interval=1e4,
breaks=[1e5, 1e6, 1e7], usetimer=True,
)
def load_predictions(varcalls):
    """Load preliminary variant calls into an in-memory interval index.

    Each call is inserted over the genomic interval given by its `region`
    property, with the call object itself attached as the interval payload
    so that masked calls can later be flagged in place.

    NOTE(review): the captured diff interleaved three stale lines from the
    deleted `load_variant_mask` function (referencing undefined names
    `bedstream` and `progress_indictator`); they are removed here.
    """
    kevlar.plog('[kevlar::varfilter] Loading predictions to filter')
    index = kevlar.IntervalForest()
    for call in varcalls:
        # call.region is a (seqid, start, end) tuple; unpack it as the
        # insert coordinates and keep the call itself as payload.
        index.insert(*call.region, data=call)
    return index


def varfilter(callstream, maskstream):
    """Flag variant calls that overlap masked genomic regions.

    Loads all calls from `callstream` into an interval index, then streams
    BED-style mask regions from `maskstream`, applying the UserFilter to
    every call overlapping a masked region. Yields every call (flagged or
    not) once all regions are processed.

    NOTE(review): the captured diff interleaved the superseded
    implementation's lines (old signature, old progress message, old
    per-call masking loop); they are removed here. The duplicated `1e6`
    in `breaks` is corrected to the ascending sequence the old code used
    (each break 10x the previous), and the `progress_indictator` local is
    spelled correctly.
    """
    callindex = load_predictions(callstream)
    message = 'Filtering preliminary variant calls'
    kevlar.plog('[kevlar::varfilter]', message)
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::varfilter] {counter} regions processed', interval=1e5,
        breaks=[1e6, 1e7, 1e8], usetimer=True,
    )
    for chrom, start, end, _ in maskstream:
        hits = callindex.query(chrom, start, end)
        for interval in hits:
            # Flag the call in place; it is yielded later from the index.
            interval.data.filter(kevlar.vcf.VariantFilter.UserFilter)
        progress_indicator.update()
    for varcall in callindex:
        yield varcall


def main(args):
    """Entry point for the `kevlar varfilter` command.

    Streams mask regions from the BED file in `args.filt` against the calls
    in `args.vcf`, writing all calls (flagged or unflagged) to `args.out`.

    NOTE(review): the captured diff interleaved the deleted
    `--load-index`/`--save-index` handling (referencing removed CLI flags
    and the old `varmask` index); those stale lines are removed here.
    """
    reader = kevlar.vcf.vcfstream(args.vcf)
    bedstream = kevlar.parse_bed(kevlar.open(args.filt, 'r'))
    outstream = kevlar.open(args.out, 'w')
    writer = kevlar.vcf.VCFWriter(outstream, source='kevlar::varfilter')
    writer.write_header()
    for varcall in varfilter(reader, bedstream):
        writer.write(varcall)
4 changes: 4 additions & 0 deletions kevlar/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ def seqid(self):
def position(self):
return self._pos

@property
def region(self):
    """Genomic interval spanned by this variant's reference allele.

    Returns a (seqid, start, end) tuple; the half-open interval covers
    exactly len(refr) positions starting at the variant position.
    """
    start = self.position
    end = start + len(self._refr)
    return self.seqid, start, end

@property
def vcf(self):
"""Print variant to VCF."""
Expand Down

0 comments on commit eb61d90

Please sign in to comment.