Skip to content

Commit

Permalink
Merge pull request #264 from dib-lab/feature/dist
Browse files Browse the repository at this point in the history
Compute k-mer abundance distribution: mean, stdev, and plot
  • Loading branch information
standage committed Jun 2, 2018
2 parents ce2f8b1 + e0c9ce4 commit af22493
Show file tree
Hide file tree
Showing 10 changed files with 318 additions and 0 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ addons:
packages:
- cloc
- git
- python3-tk
branches:
only: # don't build "pushes" except on the master branch
- master
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
sphinx-argparse
pandas
matplotlib
pysam>=0.11.2
networkx>=2.0
scipy>=1.0
Expand Down
1 change: 1 addition & 0 deletions kevlar/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
from kevlar import alac
from kevlar import simplex
from kevlar import simlike
from kevlar import dist
from kevlar import split
from kevlar import gentrio
from kevlar import cli
Expand Down
3 changes: 3 additions & 0 deletions kevlar/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from . import simplex
from . import simlike
from . import split
from . import dist

mains = {
'dump': kevlar.dump.main,
Expand All @@ -45,6 +46,7 @@
'simplex': kevlar.simplex.main,
'simlike': kevlar.simlike.main,
'split': kevlar.split.main,
'dist': kevlar.dist.main,
}

subparser_funcs = {
Expand All @@ -64,6 +66,7 @@
'simplex': simplex.subparser,
'simlike': simlike.subparser,
'split': split.subparser,
'dist': dist.subparser,
}


Expand Down
39 changes: 39 additions & 0 deletions kevlar/cli/dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2016 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import khmer
from khmer import khmer_args


def subparser(subparsers):
"""Define the `kevlar dist` command-line interface."""

desc = 'Compute the k-mer abundance distribution for a data set.'

subparser = subparsers.add_parser('dist', description=desc)
subparser.add_argument('-o', '--out', metavar='FILE', help='output file; '
'default is terminal (stdout)')
subparser.add_argument('-k', '--ksize', metavar='K', type=int, default=31,
help='k-mer size; default is 31')
subparser.add_argument('-M', '--memory', type=khmer_args.memory_setting,
default=1e6, metavar='MEM',
help='memory to allocate for k-mer counting')
subparser.add_argument('-t', '--threads', type=int, metavar='T', default=1,
help='number of threads to use for k-mer counting; '
'default is 1')
subparser.add_argument('-p', '--plot', metavar='PNG', help='plot k-mer '
'abundance distribution to file `PNG`')
subparser.add_argument('--plot-xlim', metavar=('MIN', 'MAX'), type=int,
nargs=2, default=(0, 100), help='define the '
'minimum and maximum x values (k-mer abundance) '
'for the plot; default is `0 100`')
subparser.add_argument('mask', help='nodetable containing target k-mers '
'to count (such as single-copy exonic k-mers)')
subparser.add_argument('infiles', nargs='+', help='input files in '
'FASTA/FASTQ format')
164 changes: 164 additions & 0 deletions kevlar/dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2018 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

from collections import defaultdict
import json
import math
import sys
import threading

import kevlar
import khmer
import numpy
import pandas


class KevlarZeroAbundanceDistError(ValueError):
pass


def count_first_pass(infiles, counts, mask, nthreads=1, logstream=sys.stderr):
message = 'Processing input with {:d} threads'.format(nthreads)
print('[kevlar::dist]', message, file=logstream)

for filename in infiles:
print(' -', filename, file=logstream)
parser = khmer.ReadParser(filename)
threads = list()
for _ in range(nthreads):
thread = threading.Thread(
target=counts.consume_seqfile_with_mask,
args=(parser, mask,),
kwargs={'threshold': 1, 'consume_masked': True},
)
threads.append(thread)
thread.start()
for thread in threads:
thread.join()

print('[kevlar::dist] Done processing input!', file=logstream)


def count_second_pass(infiles, counts, nthreads=1, logstream=sys.stderr):
print('[kevlar::dist] Second pass over the data', file=logstream)
tracking = khmer.Nodetable(counts.ksize(), 1, 1, primes=counts.hashsizes())
abund_lists = list()

def __do_abund_dist(parser):
abund = counts.abundance_distribution(parser, tracking)
abund_lists.append(abund)

for filename in infiles:
print(' -', filename, file=logstream)
parser = khmer.ReadParser(filename)
threads = list()
for _ in range(nthreads):
thread = threading.Thread(
target=__do_abund_dist,
args=(parser,)
)
threads.append(thread)
thread.start()
for thread in threads:
thread.join()

assert len(abund_lists) == len(infiles) * nthreads
abundance = defaultdict(int)
for abund in abund_lists:
for i, count in enumerate(abund):
if i > 0 and count > 0:
abundance[i] += count

print('[kevlar::dist] Done second pass over input!', file=logstream)

return abundance


def weighted_mean_std_dev(values, weights):
mu = numpy.average(values, weights=weights)
sigma = math.sqrt(numpy.average((values-mu)**2, weights=weights))
return mu, sigma


def calc_mu_sigma(abundance):
total = sum(abundance.values())
if total == 0:
message = 'all k-mer abundances are 0, please check input files'
raise KevlarZeroAbundanceDistError(message)
mu, sigma = weighted_mean_std_dev(
list(abundance.keys()),
list(abundance.values()),
)
return mu, sigma


def compute_dist(abundance):
total = sum(abundance.values())
fields = ['Abundance', 'Count', 'CumulativeCount', 'CumulativeFraction']
data = pandas.DataFrame(columns=fields)
cuml = 0
for abund, count in sorted(abundance.items()):
assert count > 0, (abund, count)
cuml += count
frac = cuml / total
row = {
'Abundance': abund,
'Count': count,
'CumulativeCount': cuml,
'CumulativeFraction': frac,
}
data = data.append(row, ignore_index=True)
return data


def dist(infiles, mask, ksize=31, memory=1e6, threads=1, logstream=sys.stderr):
counts = khmer.Counttable(ksize, memory / 4, 4)
count_first_pass(infiles, counts, mask, nthreads=threads,
logstream=logstream)
abundance = count_second_pass(infiles, counts, nthreads=threads,
logstream=logstream)
mu, sigma = calc_mu_sigma(abundance)
data = compute_dist(abundance)
return mu, sigma, data


def main(args):
mask = khmer.Nodetable.load(args.mask)
mu, sigma, data = dist(
args.infiles, mask, ksize=args.ksize, memory=args.memory,
threads=args.threads, logstream=args.logfile
)
out = {'mu': mu, 'sigma': sigma}
print(json.dumps(out))

if args.plot:
import os
try: # pragma: no cover
import matplotlib
if os.environ.get('DISPLAY', '') == '':
matplotlib.use('Agg')
from matplotlib import pyplot as plt
except RuntimeError as rerr: # pragma: no cover
if 'Python is not installed as a framework' not in str(rerr):
raise rerr
message = 'There was a problem loading matplotlib. '
message += 'Try https://stackoverflow.com/q/21784641/459780 '
message += 'for troubleshooting ideas.'
raise RuntimeError(message)
matplotlib.rcParams["figure.figsize"] = [12, 6]
matplotlib.rcParams['axes.labelsize'] = 16
matplotlib.rcParams['xtick.labelsize'] = 16
plt.plot(data['Abundance'], data['Count'], color='blue')
plt.axvline(x=mu, color='blue', linestyle='--')
plt.axvline(x=mu - sigma, color='red', linestyle=':')
plt.axvline(x=mu + sigma, color='red', linestyle=':')
plt.xlim(args.plot_xlim)
plt.xlabel('K-mer abundance')
plt.ylabel('Frequency')
plt.savefig(args.plot, dpi=300)
Binary file added kevlar/tests/data/minitrio/mask.nt
Binary file not shown.
Binary file not shown.
108 changes: 108 additions & 0 deletions kevlar/tests/test_dist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python
#
# -----------------------------------------------------------------------------
# Copyright (c) 2018 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import filecmp
import json
import sys
from tempfile import NamedTemporaryFile

import pytest
import kevlar
from kevlar.dist import count_first_pass, count_second_pass
from kevlar.dist import calc_mu_sigma, compute_dist
from kevlar.dist import KevlarZeroAbundanceDistError
from kevlar.tests import data_file
from khmer import Counttable, Nodetable


def test_count_first_pass():
mask = Nodetable.load(data_file('minitrio/mask.nt'))
counts = Counttable(31, 1e4, 4)
seqfile = data_file('minitrio/trio-proband.fq.gz')
count_first_pass([seqfile], counts, mask)
with NamedTemporaryFile(suffix='.ct') as countfile:
counts.save(countfile.name)
testcountfile = data_file('minitrio/trio-proband-mask-counts.ct')
assert filecmp.cmp(testcountfile, countfile.name) is True


def test_count_second_pass():
mask = Nodetable.load(data_file('minitrio/mask.nt'))
counts = Counttable.load(data_file('minitrio/trio-proband-mask-counts.ct'))
seqfile = data_file('minitrio/trio-proband.fq.gz')
abund = count_second_pass([seqfile], counts)
assert abund == {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9,
17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3}


def test_calc_mu_sigma():
abund = {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9,
17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3}
mu, sigma = calc_mu_sigma(abund)
assert pytest.approx(15.32558, mu)
assert pytest.approx(3.280581, sigma)


def test_musigma_empty_dist():
with pytest.raises(KevlarZeroAbundanceDistError):
empty = dict()
calc_mu_sigma(empty)


def test_compute_dist():
abund = {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9,
17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3}
data = compute_dist(abund)
print(data)
assert list(data['Count'][:5]) == [6.0, 10.0, 12.0, 18.0, 16.0]
assert list(data['CumulativeCount'][:5]) == [6.0, 16.0, 28.0, 46.0, 62.0]


def test_dist():
mask = Nodetable.load(data_file('minitrio/mask.nt'))
filenames = [data_file('minitrio/trio-proband.fq.gz')]
mu, sigma, data = kevlar.dist.dist(filenames, mask, memory=4e4)
assert pytest.approx(15.32558, mu)
assert pytest.approx(3.280581, sigma)
assert list(data['Count'][-5:]) == [11.0, 8.0, 9.0, 7.0, 3.0]


def test_dist_empty():
mask = Nodetable(31, 1e4, 4)
mask.consume('GATTACA' * 10)
mask.consume('A' * 50)
filenames = [data_file('minitrio/trio-proband.fq.gz')]
with pytest.raises(KevlarZeroAbundanceDistError):
kevlar.dist.dist(filenames, mask, memory=4e4)


def test_main(capsys):
arglist = [
'dist', data_file('minitrio/mask.nt'),
data_file('minitrio/trio-proband.fq.gz')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.dist.main(args)

out, err = capsys.readouterr()
print(out)
js = json.loads(out)
assert pytest.approx(15.32558, js['mu'])
assert pytest.approx(3.280581, js['sigma'])


def test_main_plot(capsys):
with NamedTemporaryFile(suffix='.png') as plotfile:
arglist = [
'dist', '--plot', plotfile.name, '--plot-xlim', '0', '50',
data_file('minitrio/mask.nt'),
data_file('minitrio/trio-proband.fq.gz')
]
args = kevlar.cli.parser().parse_args(arglist)
kevlar.dist.main(args)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pandas
matplotlib
pysam>=0.11.2
networkx>=2.0
scipy>=1.0
Expand Down

0 comments on commit af22493

Please sign in to comment.