New --min-like-score filter for simlike module (#343)

This update allows users to adjust the threshold used to filter out low-scoring variant predictions. Closes #341.
kevlar-dev · Jan 22, 2019 · 519b595 · 519b595
1 parent e899720
commit 519b595
Show file tree

Hide file tree

Showing 4 changed files with 39 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,9 +8,10 @@ This project adheres to [Semantic Versioning](http://semver.org/).
 - A new Snakemake workflow for preprocessing BAM inputs for analysis with kevlar (see #305).
 - A new Snakemake workflow for kevlar's standard processing procedure (see #306).
 - New `unband` module to merge augmented Fastq files produced with a *k*-mer banding strategy (see #316).
-- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318).
+- New `varfilter` module to filter out preliminary variant calls overlapping with problematic/unwanted loci or features (see #318, #342).
 - New dependency: `intervaltree` package (see #318).
 - A new `sandbox` directory with convenience scripts for development and analysis (see #335).
+- A new `--min-like-score` filter for the `simlike` module (see #343).
 
 ### Changed
 - Added a new flag to print to the terminal (stderr) and a logfile simultaneously (see #308).

diff --git a/kevlar/cli/simlike.py b/kevlar/cli/simlike.py
@@ -77,6 +77,12 @@ def subparser(subparsers):
         'low abundance means < --case-min); by default, L=5; set L<=0 to '
         'disable the filter'
     )
+    filt_args.add_argument(
+        '--min-like-score', metavar='S', type=float, default=0.0,
+        help='filter out variant predictions with likelihood scores < S; by '
+        'default, S = 0.0, but it\'s often possible to improve specificity '
+        'without sacrificing sensitivity by raising S to, for example, 50.0'
+    )
 
     misc_args = subparser.add_argument_group('Miscellaneous settings')
     misc_args.add_argument('-h', '--help', action='help',

diff --git a/kevlar/simlike.py b/kevlar/simlike.py
@@ -261,7 +261,8 @@ def check_ctrl_abund_high(call, ctrlabundlists, ctrlmax, ctrlabundhigh):
 
 def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,
             dynamic=True, casemin=6, ctrlmax=1, caseabundlow=5,
-            ctrlabundhigh=4, samplelabels=None, fastmode=False):
+            ctrlabundhigh=4, samplelabels=None, fastmode=False,
+            minlikescore=0.0):
     calls_by_partition = defaultdict(list)
     if samplelabels is None:
         samplelabels = default_sample_labels(len(controls) + 1)
@@ -299,7 +300,7 @@ def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,
 
     allcalls.sort(key=lambda c: c.attribute('LIKESCORE'), reverse=True)
     for call in allcalls:
-        if call.attribute('LIKESCORE') < 0.0:
+        if call.attribute('LIKESCORE') < minlikescore:
             call.filter(kevlar.vcf.VariantFilter.LikelihoodFail)
         yield call
 
@@ -335,7 +336,7 @@ def main(args):
         epsilon=args.epsilon, dynamic=args.dynamic, casemin=args.case_min,
         ctrlmax=args.ctrl_max, caseabundlow=args.case_abund_low,
         ctrlabundhigh=args.ctrl_abund_high, samplelabels=args.sample_labels,
-        fastmode=args.fast_mode,
+        fastmode=args.fast_mode, minlikescore=args.min_like_score,
     )
     for call in calculator:
         writer.write(call)
diff --git a/kevlar/tests/test_simlike.py b/kevlar/tests/test_simlike.py
@@ -303,3 +303,30 @@ def test_simlike_case_low_abund(casemin, abund, numfilt, caselowsketches):
     print([c.filterstr for c in calls])
     filtered = [c for c in calls if 'CaseAbundance' in c.filterstr]
     assert len(filtered) == numfilt
+
+
+def test_simlike_min_like_score(ctrlhighsketches):
+    kid, mom, dad, refr = ctrlhighsketches
+    vcfin = kevlar.open(data_file('ctrl-high-abund/cc57120.calls.vcf'), 'r')
+    prelimcalls = kevlar.vcf.VCFReader(vcfin)
+    scorer = kevlar.simlike.simlike(
+        prelimcalls, kid, [mom, dad], refr, samplelabels=['Kid', 'Mom', 'Dad'],
+        ctrlabundhigh=0, caseabundlow=0, minlikescore=0.0,
+    )
+    calls = list(scorer)
+    passing = [c for c in calls if c.filterstr == 'PASS']
+    notpassing = [c for c in calls if c.filterstr != 'PASS']
+    assert len(passing) == 1
+    assert len(notpassing) == 1
+
+    vcfin = kevlar.open(data_file('ctrl-high-abund/cc57120.calls.vcf'), 'r')
+    prelimcalls = kevlar.vcf.VCFReader(vcfin)
+    scorer = kevlar.simlike.simlike(
+        prelimcalls, kid, [mom, dad], refr, samplelabels=['Kid', 'Mom', 'Dad'],
+        ctrlabundhigh=0, caseabundlow=0, minlikescore=400.0,
+    )
+    calls = list(scorer)
+    passing = [c for c in calls if c.filterstr == 'PASS']
+    notpassing = [c for c in calls if c.filterstr != 'PASS']
+    assert len(passing) == 0
+    assert len(notpassing) == 2