-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #264 from dib-lab/feature/dist
Compute k-mer abundance distribution: mean, stdev, and plot
- Loading branch information
Showing
10 changed files
with
318 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
sphinx-argparse | ||
pandas | ||
matplotlib | ||
pysam>=0.11.2 | ||
networkx>=2.0 | ||
scipy>=1.0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python | ||
# | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2016 The Regents of the University of California | ||
# | ||
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is | ||
# licensed under the MIT license: see LICENSE. | ||
# ----------------------------------------------------------------------------- | ||
|
||
import khmer | ||
from khmer import khmer_args | ||
|
||
|
||
def subparser(subparsers): | ||
"""Define the `kevlar dist` command-line interface.""" | ||
|
||
desc = 'Compute the k-mer abundance distribution for a data set.' | ||
|
||
subparser = subparsers.add_parser('dist', description=desc) | ||
subparser.add_argument('-o', '--out', metavar='FILE', help='output file; ' | ||
'default is terminal (stdout)') | ||
subparser.add_argument('-k', '--ksize', metavar='K', type=int, default=31, | ||
help='k-mer size; default is 31') | ||
subparser.add_argument('-M', '--memory', type=khmer_args.memory_setting, | ||
default=1e6, metavar='MEM', | ||
help='memory to allocate for k-mer counting') | ||
subparser.add_argument('-t', '--threads', type=int, metavar='T', default=1, | ||
help='number of threads to use for k-mer counting; ' | ||
'default is 1') | ||
subparser.add_argument('-p', '--plot', metavar='PNG', help='plot k-mer ' | ||
'abundance distribution to file `PNG`') | ||
subparser.add_argument('--plot-xlim', metavar=('MIN', 'MAX'), type=int, | ||
nargs=2, default=(0, 100), help='define the ' | ||
'minimum and maximum x values (k-mer abundance) ' | ||
'for the plot; default is `0 100`') | ||
subparser.add_argument('mask', help='nodetable containing target k-mers ' | ||
'to count (such as single-copy exonic k-mers)') | ||
subparser.add_argument('infiles', nargs='+', help='input files in ' | ||
'FASTA/FASTQ format') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
#!/usr/bin/env python | ||
# | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2018 The Regents of the University of California | ||
# | ||
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is | ||
# licensed under the MIT license: see LICENSE. | ||
# ----------------------------------------------------------------------------- | ||
|
||
from collections import defaultdict | ||
import json | ||
import math | ||
import sys | ||
import threading | ||
|
||
import kevlar | ||
import khmer | ||
import numpy | ||
import pandas | ||
|
||
|
||
class KevlarZeroAbundanceDistError(ValueError): | ||
pass | ||
|
||
|
||
def count_first_pass(infiles, counts, mask, nthreads=1, logstream=sys.stderr): | ||
message = 'Processing input with {:d} threads'.format(nthreads) | ||
print('[kevlar::dist]', message, file=logstream) | ||
|
||
for filename in infiles: | ||
print(' -', filename, file=logstream) | ||
parser = khmer.ReadParser(filename) | ||
threads = list() | ||
for _ in range(nthreads): | ||
thread = threading.Thread( | ||
target=counts.consume_seqfile_with_mask, | ||
args=(parser, mask,), | ||
kwargs={'threshold': 1, 'consume_masked': True}, | ||
) | ||
threads.append(thread) | ||
thread.start() | ||
for thread in threads: | ||
thread.join() | ||
|
||
print('[kevlar::dist] Done processing input!', file=logstream) | ||
|
||
|
||
def count_second_pass(infiles, counts, nthreads=1, logstream=sys.stderr): | ||
print('[kevlar::dist] Second pass over the data', file=logstream) | ||
tracking = khmer.Nodetable(counts.ksize(), 1, 1, primes=counts.hashsizes()) | ||
abund_lists = list() | ||
|
||
def __do_abund_dist(parser): | ||
abund = counts.abundance_distribution(parser, tracking) | ||
abund_lists.append(abund) | ||
|
||
for filename in infiles: | ||
print(' -', filename, file=logstream) | ||
parser = khmer.ReadParser(filename) | ||
threads = list() | ||
for _ in range(nthreads): | ||
thread = threading.Thread( | ||
target=__do_abund_dist, | ||
args=(parser,) | ||
) | ||
threads.append(thread) | ||
thread.start() | ||
for thread in threads: | ||
thread.join() | ||
|
||
assert len(abund_lists) == len(infiles) * nthreads | ||
abundance = defaultdict(int) | ||
for abund in abund_lists: | ||
for i, count in enumerate(abund): | ||
if i > 0 and count > 0: | ||
abundance[i] += count | ||
|
||
print('[kevlar::dist] Done second pass over input!', file=logstream) | ||
|
||
return abundance | ||
|
||
|
||
def weighted_mean_std_dev(values, weights): | ||
mu = numpy.average(values, weights=weights) | ||
sigma = math.sqrt(numpy.average((values-mu)**2, weights=weights)) | ||
return mu, sigma | ||
|
||
|
||
def calc_mu_sigma(abundance): | ||
total = sum(abundance.values()) | ||
if total == 0: | ||
message = 'all k-mer abundances are 0, please check input files' | ||
raise KevlarZeroAbundanceDistError(message) | ||
mu, sigma = weighted_mean_std_dev( | ||
list(abundance.keys()), | ||
list(abundance.values()), | ||
) | ||
return mu, sigma | ||
|
||
|
||
def compute_dist(abundance): | ||
total = sum(abundance.values()) | ||
fields = ['Abundance', 'Count', 'CumulativeCount', 'CumulativeFraction'] | ||
data = pandas.DataFrame(columns=fields) | ||
cuml = 0 | ||
for abund, count in sorted(abundance.items()): | ||
assert count > 0, (abund, count) | ||
cuml += count | ||
frac = cuml / total | ||
row = { | ||
'Abundance': abund, | ||
'Count': count, | ||
'CumulativeCount': cuml, | ||
'CumulativeFraction': frac, | ||
} | ||
data = data.append(row, ignore_index=True) | ||
return data | ||
|
||
|
||
def dist(infiles, mask, ksize=31, memory=1e6, threads=1, logstream=sys.stderr): | ||
counts = khmer.Counttable(ksize, memory / 4, 4) | ||
count_first_pass(infiles, counts, mask, nthreads=threads, | ||
logstream=logstream) | ||
abundance = count_second_pass(infiles, counts, nthreads=threads, | ||
logstream=logstream) | ||
mu, sigma = calc_mu_sigma(abundance) | ||
data = compute_dist(abundance) | ||
return mu, sigma, data | ||
|
||
|
||
def main(args): | ||
mask = khmer.Nodetable.load(args.mask) | ||
mu, sigma, data = dist( | ||
args.infiles, mask, ksize=args.ksize, memory=args.memory, | ||
threads=args.threads, logstream=args.logfile | ||
) | ||
out = {'mu': mu, 'sigma': sigma} | ||
print(json.dumps(out)) | ||
|
||
if args.plot: | ||
import os | ||
try: # pragma: no cover | ||
import matplotlib | ||
if os.environ.get('DISPLAY', '') == '': | ||
matplotlib.use('Agg') | ||
from matplotlib import pyplot as plt | ||
except RuntimeError as rerr: # pragma: no cover | ||
if 'Python is not installed as a framework' not in str(rerr): | ||
raise rerr | ||
message = 'There was a problem loading matplotlib. ' | ||
message += 'Try https://stackoverflow.com/q/21784641/459780 ' | ||
message += 'for troubleshooting ideas.' | ||
raise RuntimeError(message) | ||
matplotlib.rcParams["figure.figsize"] = [12, 6] | ||
matplotlib.rcParams['axes.labelsize'] = 16 | ||
matplotlib.rcParams['xtick.labelsize'] = 16 | ||
plt.plot(data['Abundance'], data['Count'], color='blue') | ||
plt.axvline(x=mu, color='blue', linestyle='--') | ||
plt.axvline(x=mu - sigma, color='red', linestyle=':') | ||
plt.axvline(x=mu + sigma, color='red', linestyle=':') | ||
plt.xlim(args.plot_xlim) | ||
plt.xlabel('K-mer abundance') | ||
plt.ylabel('Frequency') | ||
plt.savefig(args.plot, dpi=300) |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
#!/usr/bin/env python | ||
# | ||
# ----------------------------------------------------------------------------- | ||
# Copyright (c) 2018 The Regents of the University of California | ||
# | ||
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is | ||
# licensed under the MIT license: see LICENSE. | ||
# ----------------------------------------------------------------------------- | ||
|
||
import filecmp | ||
import json | ||
import sys | ||
from tempfile import NamedTemporaryFile | ||
|
||
import pytest | ||
import kevlar | ||
from kevlar.dist import count_first_pass, count_second_pass | ||
from kevlar.dist import calc_mu_sigma, compute_dist | ||
from kevlar.dist import KevlarZeroAbundanceDistError | ||
from kevlar.tests import data_file | ||
from khmer import Counttable, Nodetable | ||
|
||
|
||
def test_count_first_pass(): | ||
mask = Nodetable.load(data_file('minitrio/mask.nt')) | ||
counts = Counttable(31, 1e4, 4) | ||
seqfile = data_file('minitrio/trio-proband.fq.gz') | ||
count_first_pass([seqfile], counts, mask) | ||
with NamedTemporaryFile(suffix='.ct') as countfile: | ||
counts.save(countfile.name) | ||
testcountfile = data_file('minitrio/trio-proband-mask-counts.ct') | ||
assert filecmp.cmp(testcountfile, countfile.name) is True | ||
|
||
|
||
def test_count_second_pass(): | ||
mask = Nodetable.load(data_file('minitrio/mask.nt')) | ||
counts = Counttable.load(data_file('minitrio/trio-proband-mask-counts.ct')) | ||
seqfile = data_file('minitrio/trio-proband.fq.gz') | ||
abund = count_second_pass([seqfile], counts) | ||
assert abund == {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9, | ||
17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3} | ||
|
||
|
||
def test_calc_mu_sigma(): | ||
abund = {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9, | ||
17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3} | ||
mu, sigma = calc_mu_sigma(abund) | ||
assert pytest.approx(15.32558, mu) | ||
assert pytest.approx(3.280581, sigma) | ||
|
||
|
||
def test_musigma_empty_dist(): | ||
with pytest.raises(KevlarZeroAbundanceDistError): | ||
empty = dict() | ||
calc_mu_sigma(empty) | ||
|
||
|
||
def test_compute_dist(): | ||
abund = {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9, | ||
17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3} | ||
data = compute_dist(abund) | ||
print(data) | ||
assert list(data['Count'][:5]) == [6.0, 10.0, 12.0, 18.0, 16.0] | ||
assert list(data['CumulativeCount'][:5]) == [6.0, 16.0, 28.0, 46.0, 62.0] | ||
|
||
|
||
def test_dist(): | ||
mask = Nodetable.load(data_file('minitrio/mask.nt')) | ||
filenames = [data_file('minitrio/trio-proband.fq.gz')] | ||
mu, sigma, data = kevlar.dist.dist(filenames, mask, memory=4e4) | ||
assert pytest.approx(15.32558, mu) | ||
assert pytest.approx(3.280581, sigma) | ||
assert list(data['Count'][-5:]) == [11.0, 8.0, 9.0, 7.0, 3.0] | ||
|
||
|
||
def test_dist_empty(): | ||
mask = Nodetable(31, 1e4, 4) | ||
mask.consume('GATTACA' * 10) | ||
mask.consume('A' * 50) | ||
filenames = [data_file('minitrio/trio-proband.fq.gz')] | ||
with pytest.raises(KevlarZeroAbundanceDistError): | ||
kevlar.dist.dist(filenames, mask, memory=4e4) | ||
|
||
|
||
def test_main(capsys): | ||
arglist = [ | ||
'dist', data_file('minitrio/mask.nt'), | ||
data_file('minitrio/trio-proband.fq.gz') | ||
] | ||
args = kevlar.cli.parser().parse_args(arglist) | ||
kevlar.dist.main(args) | ||
|
||
out, err = capsys.readouterr() | ||
print(out) | ||
js = json.loads(out) | ||
assert pytest.approx(15.32558, js['mu']) | ||
assert pytest.approx(3.280581, js['sigma']) | ||
|
||
|
||
def test_main_plot(capsys): | ||
with NamedTemporaryFile(suffix='.png') as plotfile: | ||
arglist = [ | ||
'dist', '--plot', plotfile.name, '--plot-xlim', '0', '50', | ||
data_file('minitrio/mask.nt'), | ||
data_file('minitrio/trio-proband.fq.gz') | ||
] | ||
args = kevlar.cli.parser().parse_args(arglist) | ||
kevlar.dist.main(args) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
pandas | ||
matplotlib | ||
pysam>=0.11.2 | ||
networkx>=2.0 | ||
scipy>=1.0 | ||
|