Merge pull request #264 from dib-lab/feature/dist

Compute k-mer abundance distribution: mean, stdev, and plot
kevlar-dev · Jun 2, 2018 · af22493 · af22493
2 parents ce2f8b1 + e0c9ce4
commit af22493
Show file tree

Hide file tree

Showing 10 changed files with 318 additions and 0 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -5,6 +5,7 @@ addons:
         packages:
             - cloc
             - git
+            - python3-tk
 branches:
   only:  # don't build "pushes" except on the master branch
     - master

diff --git a/docs/requirements.txt b/docs/requirements.txt
@@ -1,5 +1,6 @@
 sphinx-argparse
 pandas
+matplotlib
 pysam>=0.11.2
 networkx>=2.0
 scipy>=1.0

diff --git a/kevlar/__init__.py b/kevlar/__init__.py
@@ -53,6 +53,7 @@
 from kevlar import alac
 from kevlar import simplex
 from kevlar import simlike
+from kevlar import dist
 from kevlar import split
 from kevlar import gentrio
 from kevlar import cli

diff --git a/kevlar/cli/__init__.py b/kevlar/cli/__init__.py
@@ -27,6 +27,7 @@
 from . import simplex
 from . import simlike
 from . import split
+from . import dist
 
 mains = {
     'dump': kevlar.dump.main,
@@ -45,6 +46,7 @@
     'simplex': kevlar.simplex.main,
     'simlike': kevlar.simlike.main,
     'split': kevlar.split.main,
+    'dist': kevlar.dist.main,
 }
 
 subparser_funcs = {
@@ -64,6 +66,7 @@
     'simplex': simplex.subparser,
     'simlike': simlike.subparser,
     'split': split.subparser,
+    'dist': dist.subparser,
 }
 
 

diff --git a/kevlar/cli/dist.py b/kevlar/cli/dist.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2016 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+import khmer
+from khmer import khmer_args
+
+
+def subparser(subparsers):
+    """Define the `kevlar dist` command-line interface."""
+
+    desc = 'Compute the k-mer abundance distribution for a data set.'
+
+    subparser = subparsers.add_parser('dist', description=desc)
+    subparser.add_argument('-o', '--out', metavar='FILE', help='output file; '
+                           'default is terminal (stdout)')
+    subparser.add_argument('-k', '--ksize', metavar='K', type=int, default=31,
+                           help='k-mer size; default is 31')
+    subparser.add_argument('-M', '--memory', type=khmer_args.memory_setting,
+                           default=1e6, metavar='MEM',
+                           help='memory to allocate for k-mer counting')
+    subparser.add_argument('-t', '--threads', type=int, metavar='T', default=1,
+                           help='number of threads to use for k-mer counting; '
+                           'default is 1')
+    subparser.add_argument('-p', '--plot', metavar='PNG', help='plot k-mer '
+                           'abundance distribution to file `PNG`')
+    subparser.add_argument('--plot-xlim', metavar=('MIN', 'MAX'), type=int,
+                           nargs=2, default=(0, 100), help='define the '
+                           'minimum and maximum x values (k-mer abundance) '
+                           'for the plot; default is `0 100`')
+    subparser.add_argument('mask', help='nodetable containing target k-mers '
+                           'to count (such as single-copy exonic k-mers)')
+    subparser.add_argument('infiles', nargs='+', help='input files in '
+                           'FASTA/FASTQ format')
diff --git a/kevlar/dist.py b/kevlar/dist.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2018 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+from collections import defaultdict
+import json
+import math
+import sys
+import threading
+
+import kevlar
+import khmer
+import numpy
+import pandas
+
+
+class KevlarZeroAbundanceDistError(ValueError):
+    pass
+
+
+def count_first_pass(infiles, counts, mask, nthreads=1, logstream=sys.stderr):
+    message = 'Processing input with {:d} threads'.format(nthreads)
+    print('[kevlar::dist]', message, file=logstream)
+
+    for filename in infiles:
+        print('    -', filename, file=logstream)
+        parser = khmer.ReadParser(filename)
+        threads = list()
+        for _ in range(nthreads):
+            thread = threading.Thread(
+                target=counts.consume_seqfile_with_mask,
+                args=(parser, mask,),
+                kwargs={'threshold': 1, 'consume_masked': True},
+            )
+            threads.append(thread)
+            thread.start()
+        for thread in threads:
+            thread.join()
+
+    print('[kevlar::dist] Done processing input!', file=logstream)
+
+
+def count_second_pass(infiles, counts, nthreads=1, logstream=sys.stderr):
+    print('[kevlar::dist] Second pass over the data', file=logstream)
+    tracking = khmer.Nodetable(counts.ksize(), 1, 1, primes=counts.hashsizes())
+    abund_lists = list()
+
+    def __do_abund_dist(parser):
+        abund = counts.abundance_distribution(parser, tracking)
+        abund_lists.append(abund)
+
+    for filename in infiles:
+        print('    -', filename, file=logstream)
+        parser = khmer.ReadParser(filename)
+        threads = list()
+        for _ in range(nthreads):
+            thread = threading.Thread(
+                target=__do_abund_dist,
+                args=(parser,)
+            )
+            threads.append(thread)
+            thread.start()
+        for thread in threads:
+            thread.join()
+
+    assert len(abund_lists) == len(infiles) * nthreads
+    abundance = defaultdict(int)
+    for abund in abund_lists:
+        for i, count in enumerate(abund):
+            if i > 0 and count > 0:
+                abundance[i] += count
+
+    print('[kevlar::dist] Done second pass over input!', file=logstream)
+
+    return abundance
+
+
+def weighted_mean_std_dev(values, weights):
+    mu = numpy.average(values, weights=weights)
+    sigma = math.sqrt(numpy.average((values-mu)**2, weights=weights))
+    return mu, sigma
+
+
+def calc_mu_sigma(abundance):
+    total = sum(abundance.values())
+    if total == 0:
+        message = 'all k-mer abundances are 0, please check input files'
+        raise KevlarZeroAbundanceDistError(message)
+    mu, sigma = weighted_mean_std_dev(
+        list(abundance.keys()),
+        list(abundance.values()),
+    )
+    return mu, sigma
+
+
+def compute_dist(abundance):
+    total = sum(abundance.values())
+    fields = ['Abundance', 'Count', 'CumulativeCount', 'CumulativeFraction']
+    data = pandas.DataFrame(columns=fields)
+    cuml = 0
+    for abund, count in sorted(abundance.items()):
+        assert count > 0, (abund, count)
+        cuml += count
+        frac = cuml / total
+        row = {
+            'Abundance': abund,
+            'Count': count,
+            'CumulativeCount': cuml,
+            'CumulativeFraction': frac,
+        }
+        data = data.append(row, ignore_index=True)
+    return data
+
+
+def dist(infiles, mask, ksize=31, memory=1e6, threads=1, logstream=sys.stderr):
+    counts = khmer.Counttable(ksize, memory / 4, 4)
+    count_first_pass(infiles, counts, mask, nthreads=threads,
+                     logstream=logstream)
+    abundance = count_second_pass(infiles, counts, nthreads=threads,
+                                  logstream=logstream)
+    mu, sigma = calc_mu_sigma(abundance)
+    data = compute_dist(abundance)
+    return mu, sigma, data
+
+
+def main(args):
+    mask = khmer.Nodetable.load(args.mask)
+    mu, sigma, data = dist(
+        args.infiles, mask, ksize=args.ksize, memory=args.memory,
+        threads=args.threads, logstream=args.logfile
+    )
+    out = {'mu': mu, 'sigma': sigma}
+    print(json.dumps(out))
+
+    if args.plot:
+        import os
+        try:  # pragma: no cover
+            import matplotlib
+            if os.environ.get('DISPLAY', '') == '':
+                matplotlib.use('Agg')
+            from matplotlib import pyplot as plt
+        except RuntimeError as rerr:  # pragma: no cover
+            if 'Python is not installed as a framework' not in str(rerr):
+                raise rerr
+            message = 'There was a problem loading matplotlib. '
+            message += 'Try https://stackoverflow.com/q/21784641/459780 '
+            message += 'for troubleshooting ideas.'
+            raise RuntimeError(message)
+        matplotlib.rcParams["figure.figsize"] = [12, 6]
+        matplotlib.rcParams['axes.labelsize'] = 16
+        matplotlib.rcParams['xtick.labelsize'] = 16
+        plt.plot(data['Abundance'], data['Count'], color='blue')
+        plt.axvline(x=mu, color='blue', linestyle='--')
+        plt.axvline(x=mu - sigma, color='red', linestyle=':')
+        plt.axvline(x=mu + sigma, color='red', linestyle=':')
+        plt.xlim(args.plot_xlim)
+        plt.xlabel('K-mer abundance')
+        plt.ylabel('Frequency')
+        plt.savefig(args.plot, dpi=300)
diff --git a/kevlar/tests/data/minitrio/mask.nt b/kevlar/tests/data/minitrio/mask.nt
diff --git a/kevlar/tests/data/minitrio/trio-proband-mask-counts.ct b/kevlar/tests/data/minitrio/trio-proband-mask-counts.ct
diff --git a/kevlar/tests/test_dist.py b/kevlar/tests/test_dist.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+#
+# -----------------------------------------------------------------------------
+# Copyright (c) 2018 The Regents of the University of California
+#
+# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
+# licensed under the MIT license: see LICENSE.
+# -----------------------------------------------------------------------------
+
+import filecmp
+import json
+import sys
+from tempfile import NamedTemporaryFile
+
+import pytest
+import kevlar
+from kevlar.dist import count_first_pass, count_second_pass
+from kevlar.dist import calc_mu_sigma, compute_dist
+from kevlar.dist import KevlarZeroAbundanceDistError
+from kevlar.tests import data_file
+from khmer import Counttable, Nodetable
+
+
+def test_count_first_pass():
+    mask = Nodetable.load(data_file('minitrio/mask.nt'))
+    counts = Counttable(31, 1e4, 4)
+    seqfile = data_file('minitrio/trio-proband.fq.gz')
+    count_first_pass([seqfile], counts, mask)
+    with NamedTemporaryFile(suffix='.ct') as countfile:
+        counts.save(countfile.name)
+        testcountfile = data_file('minitrio/trio-proband-mask-counts.ct')
+        assert filecmp.cmp(testcountfile, countfile.name) is True
+
+
+def test_count_second_pass():
+    mask = Nodetable.load(data_file('minitrio/mask.nt'))
+    counts = Counttable.load(data_file('minitrio/trio-proband-mask-counts.ct'))
+    seqfile = data_file('minitrio/trio-proband.fq.gz')
+    abund = count_second_pass([seqfile], counts)
+    assert abund == {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9,
+                     17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3}
+
+
+def test_calc_mu_sigma():
+    abund = {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9,
+             17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3}
+    mu, sigma = calc_mu_sigma(abund)
+    assert pytest.approx(15.32558, mu)
+    assert pytest.approx(3.280581, sigma)
+
+
+def test_musigma_empty_dist():
+    with pytest.raises(KevlarZeroAbundanceDistError):
+        empty = dict()
+        calc_mu_sigma(empty)
+
+
+def test_compute_dist():
+    abund = {10: 6, 11: 10, 12: 12, 13: 18, 14: 16, 15: 11, 16: 9,
+             17: 9, 18: 11, 19: 8, 20: 9, 21: 7, 22: 3}
+    data = compute_dist(abund)
+    print(data)
+    assert list(data['Count'][:5]) == [6.0, 10.0, 12.0, 18.0, 16.0]
+    assert list(data['CumulativeCount'][:5]) == [6.0, 16.0, 28.0, 46.0, 62.0]
+
+
+def test_dist():
+    mask = Nodetable.load(data_file('minitrio/mask.nt'))
+    filenames = [data_file('minitrio/trio-proband.fq.gz')]
+    mu, sigma, data = kevlar.dist.dist(filenames, mask, memory=4e4)
+    assert pytest.approx(15.32558, mu)
+    assert pytest.approx(3.280581, sigma)
+    assert list(data['Count'][-5:]) == [11.0, 8.0, 9.0, 7.0, 3.0]
+
+
+def test_dist_empty():
+    mask = Nodetable(31, 1e4, 4)
+    mask.consume('GATTACA' * 10)
+    mask.consume('A' * 50)
+    filenames = [data_file('minitrio/trio-proband.fq.gz')]
+    with pytest.raises(KevlarZeroAbundanceDistError):
+        kevlar.dist.dist(filenames, mask, memory=4e4)
+
+
+def test_main(capsys):
+    arglist = [
+        'dist', data_file('minitrio/mask.nt'),
+        data_file('minitrio/trio-proband.fq.gz')
+    ]
+    args = kevlar.cli.parser().parse_args(arglist)
+    kevlar.dist.main(args)
+
+    out, err = capsys.readouterr()
+    print(out)
+    js = json.loads(out)
+    assert pytest.approx(15.32558, js['mu'])
+    assert pytest.approx(3.280581, js['sigma'])
+
+
+def test_main_plot(capsys):
+    with NamedTemporaryFile(suffix='.png') as plotfile:
+        arglist = [
+            'dist', '--plot', plotfile.name, '--plot-xlim', '0', '50',
+            data_file('minitrio/mask.nt'),
+            data_file('minitrio/trio-proband.fq.gz')
+        ]
+        args = kevlar.cli.parser().parse_args(arglist)
+        kevlar.dist.main(args)
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 pandas
+matplotlib
 pysam>=0.11.2
 networkx>=2.0
 scipy>=1.0