Skip to content

Commit

Permalink
Merge pull request #268 from dib-lab/extend/simplex
Browse files Browse the repository at this point in the history
Integrate likelihood calculations into simplex
  • Loading branch information
standage committed Jun 14, 2018
2 parents 21d8712 + 05c620e commit 52f6f53
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 10 deletions.
32 changes: 32 additions & 0 deletions kevlar/cli/simplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,38 @@ def subparser(subparsers):
'gap extension penalty; default is 0'
)

like_args = subparser.add_argument_group(
'Likelihood calculations',
'The abundance of each k-mer in each sample and in the reference '
'genome is used to compute a likelihood score for each variant.'
)
like_args.add_argument(
'--refr-sct', metavar='SCT', help='SmallCounttable of k-mer '
'abundances in the reference genome; if not provided, this will be '
'populated from scratch from the `refr` argument.'
)
like_args.add_argument(
'--refr-sct-mem', default='1e9', type=khmer_args.memory_setting,
metavar='MEM', help='memory allocated to storing reference genome '
'k-mer abundances; only used if `--refr-sct` is not provided'
)
like_args.add_argument(
'--mu', metavar='μ', type=float, default=30.0,
help='mean k-mer abundance; default is 30.0'
)
like_args.add_argument(
'--sigma', metavar='σ', type=float, default=8.0,
help='standard deviation of k-mer abundance; default is 8.0'
)
like_args.add_argument(
'--epsilon', metavar='ε', type=float, default=0.001,
help='error rate; default is 0.001'
)
like_args.add_argument(
'--labels', metavar='LB', nargs='+',
help='list of labels for each sample, case/proband first'
)

misc_args = subparser.add_argument_group('Miscellaneous settings')
misc_args.add_argument(
'-h', '--help', action='help', help='show this help message and exit'
Expand Down
52 changes: 49 additions & 3 deletions kevlar/simplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,40 @@
# -----------------------------------------------------------------------------

import sys
import khmer
import kevlar
from kevlar.novel import novel, load_samples
from kevlar.filter import filter as kfilter
from kevlar.filter import load_mask
from kevlar.partition import partition
from kevlar.alac import alac
from kevlar.simlike import simlike


def populate_refrsct(refrseqfile, refrsctfile=None, refrsctmem=1e9, ksize=31,
logstream=sys.stderr):
if refrsctfile:
return khmer.SmallCounttable.load(refrsctfile)
else:
sct = khmer.SmallCounttable(ksize, int(refrsctmem) / 4, 4)
sct.consume_seqfile(refrseqfile)
fpr = kevlar.sketch.estimate_fpr(sct)
if fpr > 0.05:
message = 'WARNING: false positive rate of {:.4f}'.format(fpr)
message += ' may be problematic for likelihood calculations.'
message += ' Consider increasing memory for reference '
message += ' k-mer counts.'
print('[kevlar::simplex]', warning, file=logstream)
return sct


def simplex(case, casecounts, controlcounts, refrfile, ctrlmax=0, casemin=5,
mask=None, filtermem=1e6, filterfpr=0.001,
partminabund=2, partmaxabund=200, dedup=True,
delta=50, seedsize=31, match=1, mismatch=2, gapopen=5, gapextend=0,
fallback=False, ksize=31, threads=1, logstream=sys.stderr):
fallback=False, refrsctfile=None, refrsctmem=1e9, mu=30.0,
sigma=8.0, epsilon=0.001, labels=None, ksize=31, threads=1,
logstream=sys.stderr):
"""
Execute the simplex germline variant discovery workflow.
Expand Down Expand Up @@ -57,6 +78,17 @@ def simplex(case, casecounts, controlcounts, refrfile, ctrlmax=0, casemin=5,
- gapextend: alignment gap extension penalty
- fallback: try assembly with home-grown greedy assembly algorithm if
assembly with fermi-lite fails for a partition
Parameters for computing likelihood scores
- refrsctfile: smallcounttable of k-mer abundances in the reference genome;
if not provided, will be populated from the refrfile
parameter
- refrsctmem: memory to allocate for reference smallcounttable (if it needs
to be populated from scratch)
- mu: observed/expected average k-mer coverage
- sigma: observed/expected standard deviation for k-mer coverage
- epsilon: base error rate
- labels: human-readable lables for each sample, case/proband first
"""
discoverer = novel(
case, [casecounts], controlcounts, ksize=ksize, casemin=casemin,
Expand All @@ -77,7 +109,15 @@ def simplex(case, casecounts, controlcounts, refrfile, ctrlmax=0, casemin=5,
gapextend=gapextend, fallback=fallback, logstream=logstream
)

for variant in caller:
refrsct = populate_refrsct(refrfile, refrsctfile, refrsctmem=refrsctmem,
ksize=ksize, logstream=logstream)
scorer = simlike(
caller, casecounts, controlcounts, refrsct, mu=mu, sigma=sigma,
epsilon=epsilon, casemin=casemin, samplelabels=labels,
logstream=logstream
)

for variant in scorer:
yield variant


Expand All @@ -101,6 +141,8 @@ def main(args):
logstream=args.logfile
)

if not args.labels:
args.labels = kevlar.simlike.default_sample_labels(len(controls) + 1)
outstream = kevlar.open(args.out, 'w')
caserecords = kevlar.multi_file_iter_screed(args.case)
workflow = simplex(
Expand All @@ -110,11 +152,15 @@ def main(args):
partminabund=args.part_min_abund, partmaxabund=args.part_max_abund,
dedup=args.dedup, delta=args.delta, seedsize=args.seed_size,
match=args.match, mismatch=args.mismatch, gapopen=args.open,
gapextend=args.extend, threads=args.threads, logstream=args.logfile
gapextend=args.extend, threads=args.threads, refrsctfile=args.refr_sct,
refrsctmem=args.refr_sct_mem, mu=args.mu, sigma=args.sigma,
epsilon=args.epsilon, labels=args.labels, logstream=args.logfile
)
writer = kevlar.vcf.VCFWriter(
outstream, source='kevlar::simplex', refr=args.refr,
)
for label in args.labels:
writer.register_sample(label)
writer.write_header()
for varcall in workflow:
writer.write(varcall)
20 changes: 13 additions & 7 deletions kevlar/tests/test_simplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_simplex_pico(pico_trio, capsys):
workflow = kevlar.simplex.simplex(
caserecords, cases[0], controls, refr, ksize=25, ctrlmax=0,
casemin=6, mask=mask, filtermem=1e7, filterfpr=0.005,
logstream=sys.stderr
refrsctmem=1e8, logstream=sys.stderr
)
variants = [v for v in workflow]
variants = sorted(variants, key=lambda v: v._pos)
Expand All @@ -85,7 +85,8 @@ def test_simplex_trio1(capsys):
controls[1], '--case-min', '6', '--ctrl-max', '0', '--novel-memory',
'1M', '--novel-fpr', '0.2', '--filter-memory', '50K', '--mask-files',
refr, '--mask-memory', '1M', '--filter-fpr', '0.005', '--ksize', '21',
'--seed-size', '31', '--delta', '25', refr
'--seed-size', '31', '--delta', '25',
'--labels', 'Case', 'Ctrl1', 'Ctrl2', '--refr-sct-mem', '500M', refr
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.simplex.main(args)
Expand All @@ -96,12 +97,17 @@ def test_simplex_trio1(capsys):

testvcf = '\t'.join([
'bogus-genome-chr1', '3567', '.', 'A', 'C', '.', 'PASS',
'ALTWINDOW=GAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAA;CIGAR=25D95M25D;'
'IKMERS=21;KSW2=82;'
'REFRWINDOW=GAAGGGCACACCTAACCGCAACATTTGCCGTGGAAGCATAA;'
'CONTIG=TTGGTGCCACGATCCGGCTATGGCGGAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGC'
'ATAAAGGTCATCATTGAGGTGGTTCGTTCCGAT'
'ALTWINDOW=GAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAA;CALLCLASS=None;'
'CIGAR=25D95M25D;DROPPED=0;IKMERS=21;KSW2=82;LIKESCORE=252.625;'
'LLDN=-62.504;LLFP=-778.456;LLIH=-315.129;REFRWINDOW=GAAGGGCACACCTAACC'
'GCAACATTTGCCGTGGAAGCATAA;CONTIG=TTGGTGCCACGATCCGGCTATGGCGGAAGGGCACACC'
'TAACCGCACCATTTGCCGTGGAAGCATAAAGGTCATCATTGAGGTGGTTCGTTCCGAT',
'ALTABUND',
'9,9,10,10,10,8,10,10,10,10,10,10,10,10,11,12,13,13,13,12,12',
'0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0',
'0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0'
])
print(out.strip())
assert out.strip() == testvcf


Expand Down

0 comments on commit 52f6f53

Please sign in to comment.