Merge pull request #1380 from dib-lab/update/python_api

Cleanups from khmer onboarding
dib-lab · Jun 27, 2016 · 5a83929 · 5a83929
2 parents 98d075d + 492789c
commit 5a83929
Show file tree

Hide file tree

Showing 14 changed files with 327 additions and 143 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -3,6 +3,19 @@
    * doc/requirements.txt: updated sphinx-autoprogram requirement for building
    documentation, to point at released version.
 
+2016-06-26  Titus Brown  <titus@idyll.org>
+
+   * khmer/_khmer.cc, tests/test_nodegraph.py: 'add' is now a synonym for
+   graph.count(kmer).
+   * khmer/thread_utils.py: update verbose_loader function to take 'verbose'
+   argument.
+   * scripts/{abundance-dist-single.py, abundance-dist.py,
+   filter-abund-single.py, filter-abund.py, load-into-counting.py,
+   normalize-by-median.py,trim-low-abund.py}: updated to
+   respect -q/--quiet.
+   * tests/test_filter_abund.py: tests for '-q'.
+   * scripts/partition-graph.py: fix typo in args help message.
+
 2016-06-26  Titus Brown  <titus@idyll.org>
 
    * khmer/_khmer.cc, lib/{hashtable.cc, hashtable.hh},

diff --git a/khmer/_khmer.cc b/khmer/_khmer.cc
@@ -2852,6 +2852,11 @@ static PyMethodDef khmer_hashtable_methods[] = {
         (PyCFunction)hashtable_count, METH_VARARGS,
         "Increment the count of this k-mer."
     },
+    {
+        "add",
+        (PyCFunction)hashtable_count, METH_VARARGS,
+        "Increment the count of this k-mer. (Synonym for 'count'.)"
+    },
     {
         "consume",
         (PyCFunction)hashtable_consume, METH_VARARGS,

diff --git a/khmer/thread_utils.py b/khmer/thread_utils.py
@@ -41,6 +41,7 @@
 import sys
 import screed
 from khmer.utils import write_record, check_is_pair
+from khmer.khmer_logger import log_info
 # stdlib queue module was renamed on Python 3
 try:
     import queue
@@ -56,7 +57,7 @@ def verbose_loader(filename):
     screed_iter = screed.open(filename)
     for num, record in enumerate(screed_iter):
         if num % 100000 == 0:
-            print('... filtering', num, file=sys.stderr)
+            log_info('... filtering {num}', num=num)
         yield record
 
 verbose_fasta_iter = verbose_loader  # pylint: disable=invalid-name

diff --git a/scripts/abundance-dist-single.py b/scripts/abundance-dist-single.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 # This file is part of khmer, https://github.com/dib-lab/khmer/, and is
 # Copyright (C) 2010-2015, Michigan State University.
-# Copyright (C) 2015, The Regents of the University of California.
+# Copyright (C) 2015-2016, The Regents of the University of California.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
@@ -55,6 +55,8 @@
                               report_on_config, info, calculate_graphsize,
                               sanitize_help)
 from khmer.kfile import (check_input_files, check_space_for_graph)
+from khmer.khmer_logger import (configure_logging, log_info, log_error,
+                                log_warn)
 
 
 def get_parser():
@@ -97,12 +99,17 @@ def get_parser():
                         "filename.")
     parser.add_argument('-f', '--force', default=False, action='store_true',
                         help='Overwrite output file if it exists')
+    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
+                        action='store_true')
     return parser
 
 
 def main():  # pylint: disable=too-many-locals,too-many-branches
-    info('abundance-dist-single.py', ['counting', 'SeqAn'])
     args = sanitize_help(get_parser()).parse_args()
+    if not args.quiet:
+        info('abundance-dist-single.py', ['counting', 'SeqAn'])
+
+    configure_logging(args.quiet)
     report_on_config(args)
 
     check_input_files(args.input_sequence_filename, args.force)
@@ -111,8 +118,8 @@ def main():  # pylint: disable=too-many-locals,too-many-branches
         check_space_for_graph(args.savegraph, graphsize, args.force)
     if (not args.squash_output and
             os.path.exists(args.output_histogram_filename)):
-        print('ERROR: %s exists; not squashing.' %
-              args.output_histogram_filename, file=sys.stderr)
+        log_error('ERROR: {output} exists; not squashing.',
+                  output=args.output_histogram_filename)
         sys.exit(1)
     else:
         hist_fp = open(args.output_histogram_filename, 'w')
@@ -121,23 +128,22 @@ def main():  # pylint: disable=too-many-locals,too-many-branches
         hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
                               'cumulative_fraction'])
 
-    print('making countgraph', file=sys.stderr)
+    log_info('making countgraph')
     countgraph = khmer_args.create_countgraph(args, multiplier=1.1)
     countgraph.set_use_bigcount(args.bigcount)
 
-    print('building k-mer tracking graph', file=sys.stderr)
+    log_info('building k-mer tracking graph')
     tracking = khmer_args.create_nodegraph(args, multiplier=1.1)
 
-    print('kmer_size:', countgraph.ksize(), file=sys.stderr)
-    print('k-mer countgraph sizes:',
-          countgraph.hashsizes(), file=sys.stderr)
-    print('outputting to', args.output_histogram_filename, file=sys.stderr)
+    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
+    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
+    log_info('outputting to {output}', output=args.output_histogram_filename)
 
     # start loading
     rparser = khmer.ReadParser(args.input_sequence_filename)
     threads = []
-    print('consuming input, round 1 --',
-          args.input_sequence_filename, file=sys.stderr)
+    log_info('consuming input, round 1 -- {input}',
+             input=args.input_sequence_filename)
     for _ in range(args.threads):
         thread = \
             threading.Thread(
@@ -150,8 +156,8 @@ def main():  # pylint: disable=too-many-locals,too-many-branches
     for thread in threads:
         thread.join()
 
-    print('Total number of unique k-mers: {0}'.format(
-        countgraph.n_unique_kmers()), file=sys.stderr)
+    log_info('Total number of unique k-mers: {nk}',
+             nk=countgraph.n_unique_kmers())
 
     abundance_lists = []
 
@@ -160,12 +166,12 @@ def __do_abundance_dist__(read_parser):
             read_parser, tracking)
         abundance_lists.append(abundances)
 
-    print('preparing hist from %s...' %
-          args.input_sequence_filename, file=sys.stderr)
+    log_info('preparing hist from {seqfile}...',
+             seqfile=args.input_sequence_filename)
     rparser = khmer.ReadParser(args.input_sequence_filename)
     threads = []
-    print('consuming input, round 2 --',
-          args.input_sequence_filename, file=sys.stderr)
+    log_info('consuming input, round 2 -- {filename}',
+             filename=args.input_sequence_filename)
     for _ in range(args.threads):
         thread = \
             threading.Thread(
@@ -187,10 +193,9 @@ def __do_abundance_dist__(read_parser):
     total = sum(abundance.values())
 
     if 0 == total:
-        print("ERROR: abundance distribution is uniformly zero; "
-              "nothing to report.", file=sys.stderr)
-        print(
-            "\tPlease verify that the input files are valid.", file=sys.stderr)
+        log_error("ERROR: abundance distribution is uniformly zero; "
+                  "nothing to report.")
+        log_error("\tPlease verify that the input files are valid.")
         sys.exit(1)
 
     sofar = 0
@@ -207,11 +212,12 @@ def __do_abundance_dist__(read_parser):
             break
 
     if args.savegraph:
-        print('Saving k-mer countgraph ', args.savegraph, file=sys.stderr)
-        print('...saving to', args.savegraph, file=sys.stderr)
+        log_info('Saving k-mer countgraph to {savegraph}',
+                 savegraph=args.savegraph)
         countgraph.save(args.savegraph)
 
-    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
+    log_info('wrote to: {output}', output=args.output_histogram_filename)
+
 
 if __name__ == '__main__':
     main()

diff --git a/scripts/abundance-dist.py b/scripts/abundance-dist.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 # This file is part of khmer, https://github.com/dib-lab/khmer/, and is
 # Copyright (C) 2010-2015, Michigan State University.
-# Copyright (C) 2015, The Regents of the University of California.
+# Copyright (C) 2015-2016, The Regents of the University of California.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are
@@ -53,6 +53,8 @@
 from khmer.kfile import check_input_files
 from khmer.khmer_args import (info, sanitize_help, ComboFormatter,
                               _VersionStdErrAction)
+from khmer.khmer_logger import (configure_logging, log_info, log_error,
+                                log_warn)
 
 
 def get_parser():
@@ -89,28 +91,32 @@ def get_parser():
     parser.add_argument('-f', '--force', default=False, action='store_true',
                         help='Continue even if specified input files '
                         'do not exist or are empty.')
+    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
+                        action='store_true')
     return parser
 
 
 def main():
-    info('abundance-dist.py', ['counting'])
     args = sanitize_help(get_parser()).parse_args()
+    if not args.quiet:
+        info('abundance-dist.py', ['counting'])
+
+    configure_logging(args.quiet)
 
     infiles = [args.input_count_graph_filename,
                args.input_sequence_filename]
     for infile in infiles:
         check_input_files(infile, False)
 
-    print('Counting graph from', args.input_count_graph_filename,
-          file=sys.stderr)
+    log_info('Loading counting graph from {graph}',
+             graph=args.input_count_graph_filename)
     countgraph = khmer.load_countgraph(
         args.input_count_graph_filename)
 
     if not countgraph.get_use_bigcount() and args.bigcount:
-        print("WARNING: The loaded graph has bigcount DISABLED while bigcount"
-              " reporting is ENABLED--counts higher than 255 will not be "
-              "reported.",
-              file=sys.stderr)
+        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
+                 "bigcount reporting is ENABLED--counts higher than 255 will "
+                 "not be reported.")
 
     countgraph.set_use_bigcount(args.bigcount)
 
@@ -119,31 +125,29 @@ def main():
     tracking = khmer._Nodegraph(  # pylint: disable=protected-access
         kmer_size, hashsizes)
 
-    print('K:', kmer_size, file=sys.stderr)
-    print('outputting to', args.output_histogram_filename, file=sys.stderr)
+    log_info('K: {ksize}', ksize=kmer_size)
+    log_info('outputting to {output}', output=args.output_histogram_filename)
 
     if args.output_histogram_filename in ('-', '/dev/stdout'):
         pass
     elif os.path.exists(args.output_histogram_filename):
         if not args.squash_output:
-            print('ERROR: %s exists; not squashing.' %
-                  args.output_histogram_filename,
-                  file=sys.stderr)
+            log_error('ERROR: {output} exists; not squashing.',
+                      output=args.output_histogram_filename)
             sys.exit(1)
 
-        print('** squashing existing file %s' %
-              args.output_histogram_filename, file=sys.stderr)
+        log_info('** squashing existing file {output}',
+                 output=args.output_histogram_filename)
 
-    print('preparing hist...', file=sys.stderr)
+    log_info('preparing hist...')
     abundances = countgraph.abundance_distribution(
         args.input_sequence_filename, tracking)
     total = sum(abundances)
 
     if 0 == total:
-        print("ERROR: abundance distribution is uniformly zero; "
-              "nothing to report.", file=sys.stderr)
-        print("\tPlease verify that the input files are valid.",
-              file=sys.stderr)
+        log_error("ERROR: abundance distribution is uniformly zero; "
+                  "nothing to report.")
+        log_error("\tPlease verify that the input files are valid.")
         sys.exit(1)
 
     if args.output_histogram_filename in ('-', '/dev/stdout'):

diff --git a/scripts/filter-abund-single.py b/scripts/filter-abund-single.py
@@ -60,6 +60,9 @@
                          check_space_for_graph,
                          add_output_compression_type,
                          get_file_writer)
+from khmer.khmer_logger import (configure_logging, log_info, log_error,
+                                log_warn)
+
 
 DEFAULT_CUTOFF = 2
 
@@ -96,14 +99,18 @@ def get_parser():
                         help="FAST[AQ] sequence file to trim")
     parser.add_argument('-f', '--force', default=False, action='store_true',
                         help='Overwrite output file if it exists')
+    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
+                        action='store_true')
     add_output_compression_type(parser)
     return parser
 
 
 def main():
-    info('filter-abund-single.py', ['counting', 'SeqAn'])
     args = sanitize_help(get_parser()).parse_args()
+    if not args.quiet:
+        info('filter-abund-single.py', ['counting', 'SeqAn'])
 
+    configure_logging(args.quiet)
     check_input_files(args.datafile, args.force)
     check_space([args.datafile], args.force)
 
@@ -113,13 +120,13 @@ def main():
 
     report_on_config(args)
 
-    print('making countgraph', file=sys.stderr)
+    log_info('making countgraph')
     graph = khmer_args.create_countgraph(args)
 
     # first, load reads into graph
     rparser = khmer.ReadParser(args.datafile)
     threads = []
-    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
+    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
     for _ in range(args.threads):
         cur_thread = \
             threading.Thread(
@@ -132,11 +139,10 @@ def main():
     for _ in threads:
         _.join()
 
-    print('Total number of unique k-mers: {0}'.format(
-        graph.n_unique_kmers()), file=sys.stderr)
+    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())
 
     fp_rate = khmer.calc_expected_collisions(graph, args.force)
-    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
+    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)
 
     # now, trim.
 
@@ -156,22 +162,22 @@ def process_fn(record):
         return None, None
 
     # the filtering loop
-    print('filtering', args.datafile, file=sys.stderr)
+    log_info('filtering {datafile}', datafile=args.datafile)
     if args.outfile is None:
         outfile = os.path.basename(args.datafile) + '.abundfilt'
     else:
         outfile = args.outfile
     outfp = open(outfile, 'wb')
     outfp = get_file_writer(outfp, args.gzip, args.bzip)
 
-    tsp = ThreadedSequenceProcessor(process_fn)
+    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
     tsp.start(verbose_loader(args.datafile), outfp)
 
-    print('output in', outfile, file=sys.stderr)
+    log_info('output in {outfile}', outfile=outfile)
 
     if args.savegraph:
-        print('Saving k-mer countgraph filename',
-              args.savegraph, file=sys.stderr)
+        log_info('Saving k-mer countgraph filename {graph}',
+                 graph=args.savegraph)
         graph.save(args.savegraph)
 
 if __name__ == '__main__':