Merge pull request #259 from dib-lab/feature/simlike

New "simlike" subcommand
kevlar-dev · May 17, 2018 · 3aa4ca3 · 3aa4ca3
2 parents dc568ab + e41d398
commit 3aa4ca3
Show file tree

Hide file tree

Showing 10 changed files with 537 additions and 24 deletions.
diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -2,4 +2,5 @@ sphinx-argparse
 pandas
 pysam>=0.11.2
 networkx>=2.0
+scipy>=1.0
 git+https://github.com/dib-lab/khmer.git
diff --git a/kevlar/__init__.py b/kevlar/__init__.py
@@ -52,6 +52,7 @@
 from kevlar import call
 from kevlar import alac
 from kevlar import simplex
+from kevlar import simlike
 from kevlar import split
 from kevlar import gentrio
 from kevlar import cli

diff --git a/kevlar/cli/__init__.py b/kevlar/cli/__init__.py
@@ -25,6 +25,7 @@
 from . import call
 from . import alac
 from . import simplex
+from . import simlike
 from . import split
 
 mains = {
@@ -42,6 +43,7 @@
     'call': kevlar.call.main,
     'alac': kevlar.alac.main,
     'simplex': kevlar.simplex.main,
+    'simlike': kevlar.simlike.main,
     'split': kevlar.split.main,
 }
 
@@ -60,6 +62,7 @@
     'call': call.subparser,
     'alac': alac.subparser,
     'simplex': simplex.subparser,
+    'simlike': simlike.subparser,
     'split': split.subparser,
 }
 

diff --git a/kevlar/cli/simlike.py b/kevlar/cli/simlike.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2018 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+import sys
+
+
+def subparser(subparsers):
+    """Define the `kevlar simlike` command-line interface."""
+
+    desc = "Sort variants by likelihood score"
+    subparser = subparsers.add_parser('simlike', description=desc,
+                                      add_help=False)
+
+    count_args = subparser.add_argument_group(
+        'K-mer counts',
+        'Likelihood scores are based on the abundance of alternate allele '
+        'k-mers in each sample and on the abundance of reference allele '
+        'k-mers in the reference genome.'
+    )
+    count_args.add_argument('--case', metavar='CT',
+                            help='k-mer counttable for case/proband')
+    count_args.add_argument('--controls', nargs='+', metavar='CT',
+                            help='k-mer counttables for controls/parents/'
+                            'siblings, 1 counttable per sample')
+    count_args.add_argument('--refr', metavar='REFR', help='k-mer '
+                            'smallcounttable for reference genome')
+
+    cov_args = subparser.add_argument_group(
+        'K-mer coverage',
+        'Likelihood scores also depend on the estimated or observed '
+        'distribution of k-mer abundances in each sample.'
+    )
+    cov_args.add_argument('--mu', metavar='μ', type=float, default=30.0,
+                          help='mean k-mer abundance; default is 30.0')
+    cov_args.add_argument('--sigma', metavar='σ', type=float, default=8.0,
+                          help='standard deviation of k-mer abundance; '
+                          'default is 8.0')
+    cov_args.add_argument('--epsilon', metavar='ε', type=float, default=0.001,
+                          help='error rate; default is 0.001')
+
+    misc_args = subparser.add_argument_group('Miscellaneous settings')
+    misc_args.add_argument('-h', '--help', action='help',
+                           help='show this help message and exit')
+    misc_args.add_argument('--sample-labels', metavar='LBL', type=str,
+                           nargs='+', help='list of sample labels (with case/'
+                           'proband first)')
+    misc_args.add_argument('-o', '--out', metavar='OUT', default='-',
+                           help='output file; default is terminal (standard '
+                           'output)')
+    misc_args.add_argument('--case-min', metavar='Y', type=int, default=5,
+                           help='minimum abundance threshold for proband; '
+                           'default is 5')
+    subparser.add_argument('vcf')
diff --git a/kevlar/simlike.py b/kevlar/simlike.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2018 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+from collections import defaultdict
+from math import log
+import sys
+
+import kevlar
+from kevlar.vcf import Variant
+import khmer
+import scipy.stats
+
+
+class KevlarSampleLabelingError(ValueError):
+    pass
+
+
+def get_abundances(altseq, refrseq, case, controls, refr):
+    """Create a nested list of k-mer abundances.
+
+    abundances = [
+        [15, 14, 13, 16, 14, 15, 14, 14],  # k-mer abundances from case/proband
+        [0, 0, 1, 0, 2, 10, 0, 0],  # k-mer abundances from parent/control 1
+        [0, 1, 1, 0, 1, 0, 2, 0],  # k-mer abundances from parent/control 2
+    ]
+    refr_abunds = [1, 1, 2, 1, 4, 2, 1, 1]  # genomic freq of refr allele kmers
+    """
+    altkmers = case.get_kmers(altseq)
+    refrkmers = case.get_kmers(refrseq)
+    if len(altseq) == len(refrseq):
+        valid_alt_kmers = list()
+        refr_abunds = list()
+        for altkmer, refrkmer in zip(altkmers, refrkmers):
+            if refr.get(altkmer) == 0:
+                valid_alt_kmers.append(altkmer)
+                refr_abunds.append(refr.get(refrkmer))
+    else:
+        valid_alt_kmers = [k for k in altkmers if refr.get(k) == 0]
+        refr_abunds = [None] * len(valid_alt_kmers)
+    ndropped = len(altkmers) - len(valid_alt_kmers)
+
+    abundances = list()
+    for _ in range(len(controls) + 1):
+        abundances.append(list())
+    for kmer in valid_alt_kmers:
+        abund = case.get(kmer)
+        abundances[0].append(abund)
+        for control, abundlist in zip(controls, abundances[1:]):
+            abund = control.get(kmer)
+            abundlist.append(abund)
+    return abundances, refr_abunds, ndropped
+
+
+def abund_log_prob(genotype, abundance, refrabund=None, mean=30.0, sd=8.0,
+                   error=0.001):
+    """Calculate probability of k-mer abundance conditioned on genotype.
+
+    The `genotype` variable represents the number of assumed allele copies and
+    is one of {0, 1, 2} (corresponding to genotypes {0/0, 0/1, and 1/1}). The
+    `mean` and `sd` variables describe a normal distribution of observed
+    abundances of k-mers with copy number 2. The `error` parameter is the error
+    rate.
+
+    For SNVs, there is a 1-to-1 correspondence of alternate allele k-mers to
+    reference allele k-mers. We can therefore check the frequency of the
+    reference allele in the reference genome and increase the error rate if it
+    is repetitive. There is no such mapping of alt allele k-mers to refr allele
+    k-mers for indels, so we use a flat error rate.
+    """
+    if genotype == 0:
+        if refrabund:  # SNV
+            erate = error * mean * refrabund
+        else:  # INDEL
+            erate = error * mean * 0.1
+        return abundance * log(erate)
+    elif genotype == 1:
+        return scipy.stats.norm.logpdf(abundance, mean / 2, sd / 2)
+    elif genotype == 2:
+        return scipy.stats.norm.logpdf(abundance, mean, sd)
+
+
+def likelihood_denovo(abunds, refrabunds, mean=30.0, sd=8.0, error=0.001):
+    assert len(abunds[1]) == len(refrabunds)
+    assert len(abunds[2]) == len(refrabunds)
+    logsum = 0.0
+
+    # Case
+    for abund in abunds[0]:
+        logsum += abund_log_prob(1, abund, mean=mean, sd=sd)
+    # Controls
+    for altabunds in abunds[1:]:
+        for alt, refr in zip(altabunds, refrabunds):
+            logsum += abund_log_prob(0, alt, refrabund=refr, mean=mean,
+                                     error=error)
+    return logsum
+
+
+def likelihood_false(abunds, refrabunds, mean=30.0, error=0.001):
+    assert len(abunds[1]) == len(refrabunds)
+    assert len(abunds[2]) == len(refrabunds)
+    logsum = 0.0
+    for altabunds in abunds:
+        for alt, refr in zip(altabunds, refrabunds):
+            logsum += abund_log_prob(0, alt, refrabund=refr, mean=mean,
+                                     error=error)
+    return logsum
+
+
+def likelihood_inherited(abunds, mean=30.0, sd=8.0, error=0.001):
+    """Compute the likelihood that a variant is inherited.
+
+    There are 15 valid inheritance scenarios, 11 of which (shown below) result
+    in the proband carrying at least one copy of the alternate allele. Select
+    the one with the highest likelihood.
+
+    The other likelihood calculations are implemented to handle an arbitrary
+    number of controls, but this can only handle trios.
+    """
+    scenarios = [
+        (1, 0, 1), (1, 0, 2),
+        (1, 1, 0), (1, 1, 1), (1, 1, 2),
+        (1, 2, 0), (1, 2, 1),
+        (2, 1, 1), (2, 1, 2),
+        (2, 2, 1), (2, 2, 2),
+    ]
+    logsum = 0.0
+    abundances = zip(abunds[0], abunds[1], abunds[2])
+    for a_c, a_m, a_f in abundances:
+        maxval = None
+        for g_c, g_m, g_f in scenarios:
+            p_c = abund_log_prob(g_c, a_c, mean=mean, sd=sd, error=error)
+            p_m = abund_log_prob(g_m, a_m, mean=mean, sd=sd, error=error)
+            p_f = abund_log_prob(g_f, a_f, mean=mean, sd=sd, error=error)
+            testsum = p_c + p_m + p_f + log(1.0 / 15.0)
+            if maxval is None or testsum > maxval:
+                maxval = testsum
+        logsum += maxval
+    return log(15.0 / 11.0) + logsum  # 1 / (11/15)
+
+
+def joinlist(thelist):
+    if len(thelist) == 0:
+        return '.'
+    else:
+        return ','.join([str(v) for v in thelist])
+
+
+def calc_likescore(call, altabund, refrabund, mu, sigma, epsilon):
+    lldn = likelihood_denovo(altabund, refrabund, mean=mu, sd=sigma,
+                             error=epsilon)
+    llfp = likelihood_false(altabund, refrabund, mean=mu, error=epsilon)
+    llih = likelihood_inherited(altabund, mean=mu, sd=sigma, error=epsilon)
+    likescore = lldn - max(llfp, llih)
+    call.annotate('LLDN', '{:.3f}'.format(lldn))
+    call.annotate('LLFP', '{:.3f}'.format(llfp))
+    call.annotate('LLIH', '{:.3f}'.format(llih))
+    call.annotate('LIKESCORE', '{:.3f}'.format(likescore))
+
+
+def default_sample_labels(nsamples):
+    samples = list()
+    for i in range(nsamples):
+        samples.append('Control{:d}'.format(i))
+    samples[0] = 'Case'
+    return samples
+
+
+def annotate_abundances(call, abundances, samplelabels):
+    for sample, abundlist in zip(samplelabels, abundances):
+        abundstr = joinlist(abundlist)
+        call.format(sample, 'ALTABUND', abundstr)
+
+
+def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,
+            casemin=5, samplelabels=None, logstream=sys.stderr):
+    calls_by_partition = defaultdict(list)
+    if samplelabels is None:
+        samplelabels = default_sample_labels(len(controls) + 1)
+    for call in variants:
+        if len(call.window) < case.ksize():
+            message = 'WARNING: stubbornly refusing to compute likelihood '
+            message += ' for variant-spanning window {:s}'.format(call.window)
+            message += ', shorter than k size {:s}'.format(case.ksize())
+            print(message, file=logstream)
+            call.annotate('LIKESCORE', '-inf')
+            calls_by_partition[call.attribute('PART')].append(call)
+            continue
+        altabund, refrabund, ndropped = get_abundances(
+            call.window, call.refrwindow, case, controls, refr
+        )
+        call.annotate('DROPPED', str(ndropped))
+        abovethresh = [a for a in altabund[0] if a > casemin]
+        if len(abovethresh) == 0:
+            call.filter(kevlar.vcf.VariantFilter.PassengerVariant)
+        calc_likescore(call, altabund, refrabund, mu, sigma, epsilon)
+        annotate_abundances(call, altabund, samplelabels)
+        calls_by_partition[call.attribute('PART')].append(call)
+
+    allcalls = list()
+    for partition, calls in calls_by_partition.items():
+        scores = [float(c.attribute('LIKESCORE')) for c in calls]
+        maxscore = max(scores)
+        for call, score in zip(calls, scores):
+            if score < maxscore:
+                call.filter(kevlar.vcf.VariantFilter.PartitionScore)
+            allcalls.append(call)
+
+    allcalls.sort(key=lambda c: float(c.attribute('LIKESCORE')), reverse=True)
+    for call in allcalls:
+        yield call
+
+
+def main(args):
+    nsamples = len(args.controls) + 1
+    if args.sample_labels:
+        nlabels = len(args.sample_labels)
+        if nlabels and nlabels != nsamples:
+            message = 'provided {:d} labels'.format(nlabels)
+            message += ' but {:d} samples'.format(nsamples)
+            raise KevlarSampleLabelingError(message)
+    else:
+        args.sample_labels = default_sample_labels(nsamples)
+
+    case = khmer.Counttable.load(args.case)
+    controls = [khmer.Counttable.load(c) for c in args.controls]
+    refr = khmer.SmallCounttable.load(args.refr)
+
+    instream = kevlar.open(args.vcf, 'r')
+    outstream = kevlar.open(args.out, 'w')
+    reader = kevlar.vcf.VCFReader(instream)
+    writer = kevlar.vcf.VCFWriter(outstream, source='kevlar::simlike')
+
+    for label in args.sample_labels:
+        writer.register_sample(label)
+    writer.write_header()
+
+    calculator = simlike(
+        reader, case, controls, refr, mu=args.mu, sigma=args.sigma,
+        epsilon=args.epsilon, casemin=args.case_min,
+        samplelabels=args.sample_labels, logstream=args.logfile,
+    )
+    for call in calculator:
+        writer.write(call)
diff --git a/kevlar/tests/data/minitrio/calls.vcf b/kevlar/tests/data/minitrio/calls.vcf
@@ -0,0 +1 @@
+seq1	45813	.	C	G	.	PASS	ALTWINDOW=TGTCTCCCTCCCCTCCACCCCCAGAAATGGGTTTTTGATAGTCTTCCAAAGTTAGGGTAGT;CIGAR=50D139M50D;IKMERS=30;KSW2=126;PART=1;REFRWINDOW=TGTCTCCCTCCCCTCCACCCCCAGAAATGGCTTTTTGATAGTCTTCCAAAGTTAGGGTAGT;CONTIG=TGCTATGCCCTAAGCAATGCAATTCTATTGTGTCTCCCTCCCCTCCACCCCCAGAAATGGGTTTTTGATAGTCTTCCAAAGTTAGGGTAGTTTTTCCAATAGGTGATTTGGTAAGCCCAGAAGACTTTAGGATGCCTTG