Skip to content

Commit

Permalink
Filter based on proband k-mer abundance (#339)
Browse files Browse the repository at this point in the history
It is often unreasonable to expect that all k-mers spanning the variant in the proband will be high abundance, and so kevlar has never required that all spanning k-mers are "interesting" or putatively novel. Even filtering based on the number of spanning k-mers that are "interesting" has been problematic, since some true variants sometimes have few spanning k-mers that meet the required thresholds.

However, kevlar has never filtered calls based on what is *not* expected in the k-mer spanning the variant in the proband. While it is common to have low abundance k-mers intermittently, it is very rare to have long stretches of low abundance k-mers spanning the variant in a true variant.

This update introduces a new filter that will discard a variant prediction if there are 5 or more low-abundance k-mers spanning the variant call. Based on observations, this will eliminate many false positives, while having little to no effect on true calls.

This update also makes the behavior of this new filter configurable, along with two other recently implemented filters.
  • Loading branch information
standage committed Jan 19, 2019
1 parent d9f0838 commit 5dcd514
Show file tree
Hide file tree
Showing 14 changed files with 188 additions and 40 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ This project adheres to [Semantic Versioning](http://semver.org/).
### Changed
- Added a new flag to print to the terminal (stderr) and a logfile simultaneously (see #308).
- The functionality of the previous `filter` module is now split between the new `unband` module and a reimplementation of the `filter` module (see #316).
- Added `--ctrl-max` flag to the `simlike` module, enabling a new `ControlAbundance` filter (see #327).
- Added a "fast mode" to the `simlike` module, prematurely halting computations for calls already marked for filtering (see #328).
- Added a filter for problematic short indels adjacent to homopolymers (see #336, #338).
- Added a filter for problematic short indels adjacent to homopolymers (see #336, #338, #339).
- Implemented new filters in the `simlike` module based on thresholds and k-mer abundances: the `ControlAbundance` filter for predictions with too many high-abundance parent/control k-mers spanning the variant, and the `CaseAbundance` filter for predictions with too many consecutive proband/child k-mers spanning the variant (see #327, #339).

### Fixed
- Corrected a bug that reported the reference target sequence instead of the assembled contig sequence in the `CONTIG` attribute of indel calls in the VCF (see #304).
Expand Down
5 changes: 3 additions & 2 deletions kevlar/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,14 @@ def merge_adjacent(callstream):

def prelim_call(targetlist, querylist, partid=None, match=1, mismatch=2,
gapopen=5, gapextend=0, ksize=31, refrfile=None, debug=False,
mindist=5):
mindist=5, homopolyfilt=True):
"""Implement the `kevlar call` procedure as a generator function."""
for query in sorted(querylist, reverse=True, key=len):
alignments = list()
for target in sorted(targetlist, key=lambda cutout: cutout.defline):
mapping = VariantMapping(
query, target, match=match, mismatch=mismatch, gapopen=gapopen,
gapextend=gapextend
gapextend=gapextend, homopolyfilt=homopolyfilt,
)
alignments.append(mapping)
aligns2report = alignments_to_report(alignments)
Expand Down Expand Up @@ -192,6 +192,7 @@ def main(args):
gdnas, contigs, partid, match=args.match, mismatch=args.mismatch,
gapopen=args.open, gapextend=args.extend, ksize=args.ksize,
refrfile=args.refr, debug=args.debug, mindist=5,
homopolyfilt=not args.no_homopoly_filter,
)
for varcall in caller:
if args.gen_mask:
Expand Down
4 changes: 4 additions & 0 deletions kevlar/cli/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ def subparser(subparsers):
help='show this help message and exit')
misc_args.add_argument('-d', '--debug', action='store_true',
help='show debugging output')
misc_args.add_argument('--no-homopoly-filter', action='store_true',
help='by default, short indels adjacent to '
'homopolymers are filtered out; use this flag to '
'disable that filter')
misc_args.add_argument('--refr', metavar='FILE',
help='reference genome indexed for BWA search; if '
'provided, mates of interesting reads will be used '
Expand Down
22 changes: 20 additions & 2 deletions kevlar/cli/simlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import sys


def subparser(subparsers):
"""Define the `kevlar simlike` command-line interface."""
Expand Down Expand Up @@ -60,6 +58,26 @@ def subparser(subparsers):
'disable dynamic error rate (scaled by abundance of '
'reference allele k-mer in the reference genome)')

filt_args = subparser.add_argument_group(
'Heuristic filters',
'The following heuristic filters can improve accuracy when calling '
'de novo variants but may require tuning for your particular data set.'
)
filt_args.add_argument(
'--ctrl-abund-high', metavar='H', type=int, default=4,
help='a variant call will be filtered out if either of the control '
'samples has >H high abundance k-mers spanning the variant (where '
'high abundance means > --ctrl-max); by default, H=4; set H<=0 to '
'disable the filter'
)
filt_args.add_argument(
'--case-abund-low', metavar='L', type=int, default=5,
help='a variant call will be filtered out if the case sample has L or '
'more consecutive low abundance k-mers spanning the variant (where '
'low abundance means < --case-min); by default, L=5; set L<=0 to '
'disable the filter'
)

misc_args = subparser.add_argument_group('Miscellaneous settings')
misc_args.add_argument('-h', '--help', action='help',
help='show this help message and exit')
Expand Down
60 changes: 42 additions & 18 deletions kevlar/simlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,15 +186,18 @@ def annotate_abundances(call, abundances, samplelabels):


def process_partition(partitionid, calls):
maxscore = max([c.attribute('LIKESCORE') for c in calls])
passcalls = [c for c in calls if c.filterstr == 'PASS']
if len(passcalls) == 0:
return
maxscore = max([c.attribute('LIKESCORE') for c in passcalls])
maxcalls = list()
for call in calls:
if call.attribute('LIKESCORE') == maxscore:
maxcalls.append(call)
for c in calls:
if c.attribute('LIKESCORE') == maxscore and c.filterstr == 'PASS':
maxcalls.append(c)
else:
call.filter(kevlar.vcf.VariantFilter.PartitionScore)
for call in maxcalls:
call.annotate('CALLCLASS', partitionid)
c.filter(kevlar.vcf.VariantFilter.PartitionScore)
for c in maxcalls:
c.annotate('CALLCLASS', partitionid)
if calls[0].attribute('MATEDIST') is not None:
matedists = set([c.attribute('MATEDIST') for c in calls])
if matedists == set([float('inf')]):
Expand Down Expand Up @@ -231,9 +234,34 @@ def window_check(call, ksize=31):
return False


def check_hash_spanning_novel_kmers(call, caseabundlist, casemin):
abovethresh = [a for a in caseabundlist if a >= casemin]
if len(abovethresh) == 0:
call.filter(kevlar.vcf.VariantFilter.PassengerVariant)


def check_case_abund_low(call, caseabundlist, casemin, caseabundlow):
if not caseabundlow or caseabundlow <= 0:
return
belowthresh = [a < casemin for a in caseabundlist]
toomanykmers = [True] * caseabundlow
if ''.join(map(str, toomanykmers)) in ''.join(map(str, belowthresh)):
call.filter(kevlar.vcf.VariantFilter.CaseAbundance)


def check_ctrl_abund_high(call, ctrlabundlists, ctrlmax, ctrlabundhigh):
if not ctrlabundhigh or ctrlabundhigh <= 0:
return
for abundlist in ctrlabundlists:
toohigh = [a for a in abundlist if a > ctrlmax]
if len(toohigh) > ctrlabundhigh:
call.filter(kevlar.vcf.VariantFilter.ControlAbundance)
break


def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,
dynamic=True, casemin=6, ctrlmax=1, samplelabels=None,
fastmode=False):
dynamic=True, casemin=6, ctrlmax=1, caseabundlow=5,
ctrlabundhigh=4, samplelabels=None, fastmode=False):
calls_by_partition = defaultdict(list)
if samplelabels is None:
samplelabels = default_sample_labels(len(controls) + 1)
Expand All @@ -250,14 +278,9 @@ def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,
call.window, call.refrwindow, case, controls, refr
)
call.annotate('DROPPED', ndropped)
abovethresh = [a for a in altabund[0] if a > casemin]
if len(abovethresh) == 0:
call.filter(kevlar.vcf.VariantFilter.PassengerVariant)
for abundlist in altabund[1:]:
toohigh = [a for a in abundlist if a > ctrlmax]
if len(toohigh) > 4:
call.filter(kevlar.vcf.VariantFilter.ControlAbundance)
break
check_hash_spanning_novel_kmers(call, altabund[0], casemin)
check_case_abund_low(call, altabund[0], casemin, caseabundlow)
check_ctrl_abund_high(call, altabund[1:], ctrlmax, ctrlabundhigh)
skipvar = fastmode and call.filterstr != 'PASS'
if skipvar:
call.annotate('LIKESCORE', float('-inf'))
Expand Down Expand Up @@ -310,7 +333,8 @@ def main(args):
calculator = simlike(
reader, case, controls, refr, mu=args.mu, sigma=args.sigma,
epsilon=args.epsilon, dynamic=args.dynamic, casemin=args.case_min,
ctrlmax=args.ctrl_max, samplelabels=args.sample_labels,
ctrlmax=args.ctrl_max, caseabundlow=args.case_abund_low,
ctrlabundhigh=args.ctrl_abund_high, samplelabels=args.sample_labels,
fastmode=args.fast_mode,
)
for call in calculator:
Expand Down
Binary file added kevlar/tests/data/case-low-abund/calls.vcf.gz
Binary file not shown.
Binary file added kevlar/tests/data/case-low-abund/dad.ct
Binary file not shown.
Binary file added kevlar/tests/data/case-low-abund/kid.ct
Binary file not shown.
Binary file added kevlar/tests/data/case-low-abund/mom.ct
Binary file not shown.
Binary file added kevlar/tests/data/case-low-abund/refr.sct
Binary file not shown.
48 changes: 42 additions & 6 deletions kevlar/tests/test_call.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,19 +376,20 @@ def test_call_homopolymers_mixed_results():
partstream = kevlar.parse_partitioned_reads(gdnastream)
targets = kevlar.call.load_contigs(partstream)

kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))

prelimcalls = list()
for partid in contigs:
contiglist = contigs[partid]
gdnalist = targets[partid]
caller = kevlar.call.call(gdnalist, contiglist, partid=partid)
prelimcalls.extend(list(caller))

kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
scorer = kevlar.simlike.simlike(
prelimcalls, kid, [mom, dad], refr, samplelabels=['Kid', 'Mom', 'Dad'],
prelimcalls, kid, [mom, dad], refr,
samplelabels=['Proband', 'Mother', 'Father'],
)
calls = list(scorer)

Expand All @@ -409,3 +410,38 @@ def test_call_homopolymers_mixed_results():
assert call2._alt == 'T'
assert call3.position == 128660727
assert call3.filterstr == 'Homopolymer' # positive control


def test_call_homopolymer_filter_disabled():
contigfile = data_file('homopolymer/12175-3parts.contigs.augfasta')
contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
partstream = kevlar.parse_partitioned_reads(contigstream)
contigs = kevlar.call.load_contigs(partstream)

gdnafile = data_file('homopolymer/12175-3parts.targets.fasta')
gdnastream = kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
partstream = kevlar.parse_partitioned_reads(gdnastream)
targets = kevlar.call.load_contigs(partstream)

prelimcalls = list()
for partid in contigs:
contiglist = contigs[partid]
gdnalist = targets[partid]
caller = kevlar.call.call(
gdnalist, contiglist, partid=partid, homopolyfilt=False
)
prelimcalls.extend(list(caller))

kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
scorer = kevlar.simlike.simlike(
prelimcalls, kid, [mom, dad], refr,
samplelabels=['Proband', 'Mother', 'Father'],
)
calls = list(scorer)

assert len(calls) == 6
for c in calls:
assert 'Homopolymer' not in c.filterstr
69 changes: 63 additions & 6 deletions kevlar/tests/test_simlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,24 @@ def miniabund(minitrio):
return altabund, refrabund


@pytest.fixture
def caselowsketches(scope="module"):
kid = kevlar.sketch.load(data_file('case-low-abund/kid.ct'))
mom = kevlar.sketch.load(data_file('case-low-abund/mom.ct'))
dad = kevlar.sketch.load(data_file('case-low-abund/dad.ct'))
refr = kevlar.sketch.load(data_file('case-low-abund/refr.sct'))
return kid, mom, dad, refr


@pytest.fixture
def ctrlhighsketches(scope="module"):
kid = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.kid.sct'))
mom = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.mom.sct'))
dad = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.dad.sct'))
refr = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.refr.sct'))
return kid, mom, dad, refr


def test_get_abundances(minitrio):
kid, mom, dad, ref = minitrio
altseq = 'TGTCTCCCTCCCCTCCACCCCCAGAAATGGGTTTTTGATAGTCTTCCAAAGTTAGGGTAGT'
Expand Down Expand Up @@ -233,16 +251,55 @@ def test_simlike_fastmode():
]


def test_simlike_ctrl_high_abund():
kid = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.kid.sct'))
mom = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.mom.sct'))
dad = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.dad.sct'))
refr = kevlar.sketch.load(data_file('ctrl-high-abund/cc57120.refr.sct'))
@pytest.mark.parametrize('threshold,filterstatus', [
(-10, False),
(-1, False),
(0, False),
(None, False),
(False, False),
(1, True),
(3, True),
(5, True),
(15, False),
(49, False),
])
def test_simlike_ctrl_high_abund(threshold, filterstatus, ctrlhighsketches):
kid, mom, dad, refr = ctrlhighsketches
vcfin = kevlar.open(data_file('ctrl-high-abund/cc57120.calls.vcf'), 'r')
prelimcalls = kevlar.vcf.VCFReader(vcfin)
scorer = kevlar.simlike.simlike(
prelimcalls, kid, [mom, dad], refr, samplelabels=['Kid', 'Mom', 'Dad'],
ctrlabundhigh=threshold,
)
calls = list(scorer)
assert len(calls) == 2
assert len([c for c in calls if 'ControlAbundance' in c.filterstr])
print([c.filterstr for c in calls])
for c in calls:
assert ('ControlAbundance' in c.filterstr) is filterstatus


@pytest.mark.parametrize('casemin,abund,numfilt', [
(6, -10, 0),
(6, -1, 0),
(6, 0, 0),
(6, None, 0),
(6, False, 0),
(6, 5, 4),
(7, 5, 5),
(6, 4, 5),
(6, 9, 4),
(6, 10, 3),
])
def test_simlike_case_low_abund(casemin, abund, numfilt, caselowsketches):
kid, mom, dad, refr = caselowsketches
vcfin = kevlar.open(data_file('case-low-abund/calls.vcf.gz'), 'r')
prelimcalls = kevlar.vcf.VCFReader(vcfin)
scorer = kevlar.simlike.simlike(
prelimcalls, kid, [mom, dad], refr, samplelabels=['Kid', 'Mom', 'Dad'],
casemin=casemin, caseabundlow=abund,
)
calls = list(scorer)
assert len(calls) == 5
print([c.filterstr for c in calls])
filtered = [c for c in calls if 'CaseAbundance' in c.filterstr]
assert len(filtered) == numfilt
10 changes: 7 additions & 3 deletions kevlar/varmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ class VariantMapping(object):
procedures.
"""
def __init__(self, contig, cutout, score=None, cigar=None, strand=1,
match=1, mismatch=2, gapopen=5, gapextend=0):
match=1, mismatch=2, gapopen=5, gapextend=0,
homopolyfilt=True):
if score is None:
score, cigar, strand = align_both_strands(
cutout, contig, match, mismatch, gapopen, gapextend
Expand All @@ -34,6 +35,7 @@ def __init__(self, contig, cutout, score=None, cigar=None, strand=1,
self.cutout = cutout
self.score = score
self.strand = strand
self.do_homopolymer_filter = homopolyfilt
self.matedist = None
self.trimmed = 0

Expand Down Expand Up @@ -156,14 +158,16 @@ def is_passenger(self, call):
return numikmers == 0

def homopolymer_filter(self):
if not self.do_homopolymer_filter:
return False
rf = self.rightflank
if rf is None or len(rf.target) < 4:
return False
rf = rf.target
firstchar = rf[0]
poly4 = firstchar * 4
first6 = rf[0:6]
return poly4 in first6
first7 = rf[0:7]
return poly4 in first7

def call_variants(self, ksize, mindist=6):
"""Attempt to call variants from this contig alignment.
Expand Down
6 changes: 5 additions & 1 deletion kevlar/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ class VariantFilter(Enum):
NumerousMismatches = 7
UserFilter = 8
ControlAbundance = 9
Homopolymer = 10
CaseAbundance = 10
Homopolymer = 11


class FormattedList(list):
Expand Down Expand Up @@ -274,6 +275,9 @@ class VCFWriter(object):
VariantFilter.ControlAbundance:
'Too many variant-spanning k-mers have high abundance in one or '
'more control samples.',
VariantFilter.CaseAbundance:
'Too many consecutive variant-spanning k-mers have low abundance '
'in the case/proband sample.',
VariantFilter.Homopolymer:
'Indels associate with homopolymers are most often spurious and '
'very difficult to verify with confidence.'
Expand Down

0 comments on commit 5dcd514

Please sign in to comment.