Skip to content

Commit

Permalink
Merge pull request #259 from dib-lab/feature/simlike
Browse files Browse the repository at this point in the history
New "simlike" subcommand
  • Loading branch information
standage committed May 17, 2018
2 parents dc568ab + e41d398 commit 3aa4ca3
Show file tree
Hide file tree
Showing 10 changed files with 537 additions and 24 deletions.
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ sphinx-argparse
pandas
pysam>=0.11.2
networkx>=2.0
scipy>=1.0
git+https://github.com/dib-lab/khmer.git
1 change: 1 addition & 0 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
from kevlar import call
from kevlar import alac
from kevlar import simplex
from kevlar import simlike
from kevlar import split
from kevlar import gentrio
from kevlar import cli
Expand Down
3 changes: 3 additions & 0 deletions kevlar/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from . import call
from . import alac
from . import simplex
from . import simlike
from . import split

mains = {
Expand All @@ -42,6 +43,7 @@
'call': kevlar.call.main,
'alac': kevlar.alac.main,
'simplex': kevlar.simplex.main,
'simlike': kevlar.simlike.main,
'split': kevlar.split.main,
}

Expand All @@ -60,6 +62,7 @@
'call': call.subparser,
'alac': alac.subparser,
'simplex': simplex.subparser,
'simlike': simlike.subparser,
'split': split.subparser,
}

Expand Down
59 changes: 59 additions & 0 deletions kevlar/cli/simlike.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2018 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import sys


def subparser(subparsers):
"""Define the `kevlar simlike` command-line interface."""

desc = "Sort variants by likelihood score"
subparser = subparsers.add_parser('simlike', description=desc,
add_help=False)

count_args = subparser.add_argument_group(
'K-mer counts',
'Likelihood scores are based on the abundance of alternate allele '
'k-mers in each sample and on the abundance of reference allele '
'k-mers in the reference genome.'
)
count_args.add_argument('--case', metavar='CT',
help='k-mer counttable for case/proband')
count_args.add_argument('--controls', nargs='+', metavar='CT',
help='k-mer counttables for controls/parents/'
'siblings, 1 counttable per sample')
count_args.add_argument('--refr', metavar='REFR', help='k-mer '
'smallcounttable for reference genome')

cov_args = subparser.add_argument_group(
'K-mer coverage',
'Likelihood scores also depend on the estimated or observed '
'distribution of k-mer abundances in each sample.'
)
cov_args.add_argument('--mu', metavar='μ', type=float, default=30.0,
help='mean k-mer abundance; default is 30.0')
cov_args.add_argument('--sigma', metavar='σ', type=float, default=8.0,
help='standard deviation of k-mer abundance; '
'default is 8.0')
cov_args.add_argument('--epsilon', metavar='ε', type=float, default=0.001,
help='error rate; default is 0.001')

misc_args = subparser.add_argument_group('Miscellaneous settings')
misc_args.add_argument('-h', '--help', action='help',
help='show this help message and exit')
misc_args.add_argument('--sample-labels', metavar='LBL', type=str,
nargs='+', help='list of sample labels (with case/'
'proband first)')
misc_args.add_argument('-o', '--out', metavar='OUT', default='-',
help='output file; default is terminal (standard '
'output)')
misc_args.add_argument('--case-min', metavar='Y', type=int, default=5,
help='minimum abundance threshold for proband; '
'default is 5')
subparser.add_argument('vcf')
249 changes: 249 additions & 0 deletions kevlar/simlike.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2018 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from collections import defaultdict
from math import log
import sys

import kevlar
from kevlar.vcf import Variant
import khmer
import scipy.stats


class KevlarSampleLabelingError(ValueError):
pass


def get_abundances(altseq, refrseq, case, controls, refr):
"""Create a nested list of k-mer abundances.
abundances = [
[15, 14, 13, 16, 14, 15, 14, 14], # k-mer abundances from case/proband
[0, 0, 1, 0, 2, 10, 0, 0], # k-mer abundances from parent/control 1
[0, 1, 1, 0, 1, 0, 2, 0], # k-mer abundances from parent/control 2
]
refr_abunds = [1, 1, 2, 1, 4, 2, 1, 1] # genomic freq of refr allele kmers
"""
altkmers = case.get_kmers(altseq)
refrkmers = case.get_kmers(refrseq)
if len(altseq) == len(refrseq):
valid_alt_kmers = list()
refr_abunds = list()
for altkmer, refrkmer in zip(altkmers, refrkmers):
if refr.get(altkmer) == 0:
valid_alt_kmers.append(altkmer)
refr_abunds.append(refr.get(refrkmer))
else:
valid_alt_kmers = [k for k in altkmers if refr.get(k) == 0]
refr_abunds = [None] * len(valid_alt_kmers)
ndropped = len(altkmers) - len(valid_alt_kmers)

abundances = list()
for _ in range(len(controls) + 1):
abundances.append(list())
for kmer in valid_alt_kmers:
abund = case.get(kmer)
abundances[0].append(abund)
for control, abundlist in zip(controls, abundances[1:]):
abund = control.get(kmer)
abundlist.append(abund)
return abundances, refr_abunds, ndropped


def abund_log_prob(genotype, abundance, refrabund=None, mean=30.0, sd=8.0,
error=0.001):
"""Calculate probability of k-mer abundance conditioned on genotype.
The `genotype` variable represents the number of assumed allele copies and
is one of {0, 1, 2} (corresponding to genotypes {0/0, 0/1, and 1/1}). The
`mean` and `sd` variables describe a normal distribution of observed
abundances of k-mers with copy number 2. The `error` parameter is the error
rate.
For SNVs, there is a 1-to-1 correspondence of alternate allele k-mers to
reference allele k-mers. We can therefore check the frequency of the
reference allele in the reference genome and increase the error rate if it
is repetitive. There is no such mapping of alt allele k-mers to refr allele
k-mers for indels, so we use a flat error rate.
"""
if genotype == 0:
if refrabund: # SNV
erate = error * mean * refrabund
else: # INDEL
erate = error * mean * 0.1
return abundance * log(erate)
elif genotype == 1:
return scipy.stats.norm.logpdf(abundance, mean / 2, sd / 2)
elif genotype == 2:
return scipy.stats.norm.logpdf(abundance, mean, sd)


def likelihood_denovo(abunds, refrabunds, mean=30.0, sd=8.0, error=0.001):
assert len(abunds[1]) == len(refrabunds)
assert len(abunds[2]) == len(refrabunds)
logsum = 0.0

# Case
for abund in abunds[0]:
logsum += abund_log_prob(1, abund, mean=mean, sd=sd)
# Controls
for altabunds in abunds[1:]:
for alt, refr in zip(altabunds, refrabunds):
logsum += abund_log_prob(0, alt, refrabund=refr, mean=mean,
error=error)
return logsum


def likelihood_false(abunds, refrabunds, mean=30.0, error=0.001):
assert len(abunds[1]) == len(refrabunds)
assert len(abunds[2]) == len(refrabunds)
logsum = 0.0
for altabunds in abunds:
for alt, refr in zip(altabunds, refrabunds):
logsum += abund_log_prob(0, alt, refrabund=refr, mean=mean,
error=error)
return logsum


def likelihood_inherited(abunds, mean=30.0, sd=8.0, error=0.001):
"""Compute the likelihood that a variant is inherited.
There are 15 valid inheritance scenarios, 11 of which (shown below) result
in the proband carrying at least one copy of the alternate allele. Select
the one with the highest likelihood.
The other likelihood calculations are implemented to handle an arbitrary
number of controls, but this can only handle trios.
"""
scenarios = [
(1, 0, 1), (1, 0, 2),
(1, 1, 0), (1, 1, 1), (1, 1, 2),
(1, 2, 0), (1, 2, 1),
(2, 1, 1), (2, 1, 2),
(2, 2, 1), (2, 2, 2),
]
logsum = 0.0
abundances = zip(abunds[0], abunds[1], abunds[2])
for a_c, a_m, a_f in abundances:
maxval = None
for g_c, g_m, g_f in scenarios:
p_c = abund_log_prob(g_c, a_c, mean=mean, sd=sd, error=error)
p_m = abund_log_prob(g_m, a_m, mean=mean, sd=sd, error=error)
p_f = abund_log_prob(g_f, a_f, mean=mean, sd=sd, error=error)
testsum = p_c + p_m + p_f + log(1.0 / 15.0)
if maxval is None or testsum > maxval:
maxval = testsum
logsum += maxval
return log(15.0 / 11.0) + logsum # 1 / (11/15)


def joinlist(thelist):
if len(thelist) == 0:
return '.'
else:
return ','.join([str(v) for v in thelist])


def calc_likescore(call, altabund, refrabund, mu, sigma, epsilon):
lldn = likelihood_denovo(altabund, refrabund, mean=mu, sd=sigma,
error=epsilon)
llfp = likelihood_false(altabund, refrabund, mean=mu, error=epsilon)
llih = likelihood_inherited(altabund, mean=mu, sd=sigma, error=epsilon)
likescore = lldn - max(llfp, llih)
call.annotate('LLDN', '{:.3f}'.format(lldn))
call.annotate('LLFP', '{:.3f}'.format(llfp))
call.annotate('LLIH', '{:.3f}'.format(llih))
call.annotate('LIKESCORE', '{:.3f}'.format(likescore))


def default_sample_labels(nsamples):
samples = list()
for i in range(nsamples):
samples.append('Control{:d}'.format(i))
samples[0] = 'Case'
return samples


def annotate_abundances(call, abundances, samplelabels):
for sample, abundlist in zip(samplelabels, abundances):
abundstr = joinlist(abundlist)
call.format(sample, 'ALTABUND', abundstr)


def simlike(variants, case, controls, refr, mu=30.0, sigma=8.0, epsilon=0.001,
casemin=5, samplelabels=None, logstream=sys.stderr):
calls_by_partition = defaultdict(list)
if samplelabels is None:
samplelabels = default_sample_labels(len(controls) + 1)
for call in variants:
if len(call.window) < case.ksize():
message = 'WARNING: stubbornly refusing to compute likelihood '
message += ' for variant-spanning window {:s}'.format(call.window)
message += ', shorter than k size {:s}'.format(case.ksize())
print(message, file=logstream)
call.annotate('LIKESCORE', '-inf')
calls_by_partition[call.attribute('PART')].append(call)
continue
altabund, refrabund, ndropped = get_abundances(
call.window, call.refrwindow, case, controls, refr
)
call.annotate('DROPPED', str(ndropped))
abovethresh = [a for a in altabund[0] if a > casemin]
if len(abovethresh) == 0:
call.filter(kevlar.vcf.VariantFilter.PassengerVariant)
calc_likescore(call, altabund, refrabund, mu, sigma, epsilon)
annotate_abundances(call, altabund, samplelabels)
calls_by_partition[call.attribute('PART')].append(call)

allcalls = list()
for partition, calls in calls_by_partition.items():
scores = [float(c.attribute('LIKESCORE')) for c in calls]
maxscore = max(scores)
for call, score in zip(calls, scores):
if score < maxscore:
call.filter(kevlar.vcf.VariantFilter.PartitionScore)
allcalls.append(call)

allcalls.sort(key=lambda c: float(c.attribute('LIKESCORE')), reverse=True)
for call in allcalls:
yield call


def main(args):
nsamples = len(args.controls) + 1
if args.sample_labels:
nlabels = len(args.sample_labels)
if nlabels and nlabels != nsamples:
message = 'provided {:d} labels'.format(nlabels)
message += ' but {:d} samples'.format(nsamples)
raise KevlarSampleLabelingError(message)
else:
args.sample_labels = default_sample_labels(nsamples)

case = khmer.Counttable.load(args.case)
controls = [khmer.Counttable.load(c) for c in args.controls]
refr = khmer.SmallCounttable.load(args.refr)

instream = kevlar.open(args.vcf, 'r')
outstream = kevlar.open(args.out, 'w')
reader = kevlar.vcf.VCFReader(instream)
writer = kevlar.vcf.VCFWriter(outstream, source='kevlar::simlike')

for label in args.sample_labels:
writer.register_sample(label)
writer.write_header()

calculator = simlike(
reader, case, controls, refr, mu=args.mu, sigma=args.sigma,
epsilon=args.epsilon, casemin=args.case_min,
samplelabels=args.sample_labels, logstream=args.logfile,
)
for call in calculator:
writer.write(call)
1 change: 1 addition & 0 deletions kevlar/tests/data/minitrio/calls.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
seq1 45813 . C G . PASS ALTWINDOW=TGTCTCCCTCCCCTCCACCCCCAGAAATGGGTTTTTGATAGTCTTCCAAAGTTAGGGTAGT;CIGAR=50D139M50D;IKMERS=30;KSW2=126;PART=1;REFRWINDOW=TGTCTCCCTCCCCTCCACCCCCAGAAATGGCTTTTTGATAGTCTTCCAAAGTTAGGGTAGT;CONTIG=TGCTATGCCCTAAGCAATGCAATTCTATTGTGTCTCCCTCCCCTCCACCCCCAGAAATGGGTTTTTGATAGTCTTCCAAAGTTAGGGTAGTTTTTCCAATAGGTGATTTGGTAAGCCCAGAAGACTTTAGGATGCCTTG

0 comments on commit 3aa4ca3

Please sign in to comment.