Skip to content

Commit

Permalink
Merge pull request #1380 from dib-lab/update/python_api
Browse files Browse the repository at this point in the history
Cleanups from khmer onboarding
  • Loading branch information
standage committed Jun 27, 2016
2 parents 98d075d + 492789c commit 5a83929
Show file tree
Hide file tree
Showing 14 changed files with 327 additions and 143 deletions.
13 changes: 13 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
* doc/requirements.txt: updated sphinx-autoprogram requirement for building
documentation, to point at released version.

2016-06-26 Titus Brown <titus@idyll.org>

* khmer/_khmer.cc, tests/test_nodegraph.py: 'add' is now a synonym for
graph.count(kmer).
* khmer/thread_utils.py: update verbose_loader function to take 'verbose'
argument.
* scripts/{abundance-dist-single.py, abundance-dist.py,
filter-abund-single.py, filter-abund.py, load-into-counting.py,
normalize-by-median.py,trim-low-abund.py}: updated to
respect -q/--quiet.
* tests/test_filter_abund.py: tests for '-q'.
* scripts/partition-graph.py: fix typo in args help message.

2016-06-26 Titus Brown <titus@idyll.org>

* khmer/_khmer.cc, lib/{hashtable.cc, hashtable.hh},
Expand Down
5 changes: 5 additions & 0 deletions khmer/_khmer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2852,6 +2852,11 @@ static PyMethodDef khmer_hashtable_methods[] = {
(PyCFunction)hashtable_count, METH_VARARGS,
"Increment the count of this k-mer."
},
{
"add",
(PyCFunction)hashtable_count, METH_VARARGS,
"Increment the count of this k-mer. (Synonym for 'count'.)"
},
{
"consume",
(PyCFunction)hashtable_consume, METH_VARARGS,
Expand Down
3 changes: 2 additions & 1 deletion khmer/thread_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import sys
import screed
from khmer.utils import write_record, check_is_pair
from khmer.khmer_logger import log_info
# stdlib queue module was renamed on Python 3
try:
import queue
Expand All @@ -56,7 +57,7 @@ def verbose_loader(filename):
screed_iter = screed.open(filename)
for num, record in enumerate(screed_iter):
if num % 100000 == 0:
print('... filtering', num, file=sys.stderr)
log_info('... filtering {num}', num=num)
yield record

verbose_fasta_iter = verbose_loader # pylint: disable=invalid-name
Expand Down
56 changes: 31 additions & 25 deletions scripts/abundance-dist-single.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) 2010-2015, Michigan State University.
# Copyright (C) 2015, The Regents of the University of California.
# Copyright (C) 2015-2016, The Regents of the University of California.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -55,6 +55,8 @@
report_on_config, info, calculate_graphsize,
sanitize_help)
from khmer.kfile import (check_input_files, check_space_for_graph)
from khmer.khmer_logger import (configure_logging, log_info, log_error,
log_warn)


def get_parser():
Expand Down Expand Up @@ -97,12 +99,17 @@ def get_parser():
"filename.")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
parser.add_argument('-q', '--quiet', dest='quiet', default=False,
action='store_true')
return parser


def main(): # pylint: disable=too-many-locals,too-many-branches
info('abundance-dist-single.py', ['counting', 'SeqAn'])
args = sanitize_help(get_parser()).parse_args()
if not args.quiet:
info('abundance-dist-single.py', ['counting', 'SeqAn'])

configure_logging(args.quiet)
report_on_config(args)

check_input_files(args.input_sequence_filename, args.force)
Expand All @@ -111,8 +118,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
check_space_for_graph(args.savegraph, graphsize, args.force)
if (not args.squash_output and
os.path.exists(args.output_histogram_filename)):
print('ERROR: %s exists; not squashing.' %
args.output_histogram_filename, file=sys.stderr)
log_error('ERROR: {output} exists; not squashing.',
output=args.output_histogram_filename)
sys.exit(1)
else:
hist_fp = open(args.output_histogram_filename, 'w')
Expand All @@ -121,23 +128,22 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
'cumulative_fraction'])

print('making countgraph', file=sys.stderr)
log_info('making countgraph')
countgraph = khmer_args.create_countgraph(args, multiplier=1.1)
countgraph.set_use_bigcount(args.bigcount)

print('building k-mer tracking graph', file=sys.stderr)
log_info('building k-mer tracking graph')
tracking = khmer_args.create_nodegraph(args, multiplier=1.1)

print('kmer_size:', countgraph.ksize(), file=sys.stderr)
print('k-mer countgraph sizes:',
countgraph.hashsizes(), file=sys.stderr)
print('outputting to', args.output_histogram_filename, file=sys.stderr)
log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
log_info('outputting to {output}', output=args.output_histogram_filename)

# start loading
rparser = khmer.ReadParser(args.input_sequence_filename)
threads = []
print('consuming input, round 1 --',
args.input_sequence_filename, file=sys.stderr)
log_info('consuming input, round 1 -- {input}',
input=args.input_sequence_filename)
for _ in range(args.threads):
thread = \
threading.Thread(
Expand All @@ -150,8 +156,8 @@ def main(): # pylint: disable=too-many-locals,too-many-branches
for thread in threads:
thread.join()

print('Total number of unique k-mers: {0}'.format(
countgraph.n_unique_kmers()), file=sys.stderr)
log_info('Total number of unique k-mers: {nk}',
nk=countgraph.n_unique_kmers())

abundance_lists = []

Expand All @@ -160,12 +166,12 @@ def __do_abundance_dist__(read_parser):
read_parser, tracking)
abundance_lists.append(abundances)

print('preparing hist from %s...' %
args.input_sequence_filename, file=sys.stderr)
log_info('preparing hist from {seqfile}...',
seqfile=args.input_sequence_filename)
rparser = khmer.ReadParser(args.input_sequence_filename)
threads = []
print('consuming input, round 2 --',
args.input_sequence_filename, file=sys.stderr)
log_info('consuming input, round 2 -- {filename}',
filename=args.input_sequence_filename)
for _ in range(args.threads):
thread = \
threading.Thread(
Expand All @@ -187,10 +193,9 @@ def __do_abundance_dist__(read_parser):
total = sum(abundance.values())

if 0 == total:
print("ERROR: abundance distribution is uniformly zero; "
"nothing to report.", file=sys.stderr)
print(
"\tPlease verify that the input files are valid.", file=sys.stderr)
log_error("ERROR: abundance distribution is uniformly zero; "
"nothing to report.")
log_error("\tPlease verify that the input files are valid.")
sys.exit(1)

sofar = 0
Expand All @@ -207,11 +212,12 @@ def __do_abundance_dist__(read_parser):
break

if args.savegraph:
print('Saving k-mer countgraph ', args.savegraph, file=sys.stderr)
print('...saving to', args.savegraph, file=sys.stderr)
log_info('Saving k-mer countgraph to {savegraph}',
savegraph=args.savegraph)
countgraph.save(args.savegraph)

print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
log_info('wrote to: {output}', output=args.output_histogram_filename)


if __name__ == '__main__':
main()
Expand Down
44 changes: 24 additions & 20 deletions scripts/abundance-dist.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env python
# This file is part of khmer, https://github.com/dib-lab/khmer/, and is
# Copyright (C) 2010-2015, Michigan State University.
# Copyright (C) 2015, The Regents of the University of California.
# Copyright (C) 2015-2016, The Regents of the University of California.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -53,6 +53,8 @@
from khmer.kfile import check_input_files
from khmer.khmer_args import (info, sanitize_help, ComboFormatter,
_VersionStdErrAction)
from khmer.khmer_logger import (configure_logging, log_info, log_error,
log_warn)


def get_parser():
Expand Down Expand Up @@ -89,28 +91,32 @@ def get_parser():
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Continue even if specified input files '
'do not exist or are empty.')
parser.add_argument('-q', '--quiet', dest='quiet', default=False,
action='store_true')
return parser


def main():
info('abundance-dist.py', ['counting'])
args = sanitize_help(get_parser()).parse_args()
if not args.quiet:
info('abundance-dist.py', ['counting'])

configure_logging(args.quiet)

infiles = [args.input_count_graph_filename,
args.input_sequence_filename]
for infile in infiles:
check_input_files(infile, False)

print('Counting graph from', args.input_count_graph_filename,
file=sys.stderr)
log_info('Loading counting graph from {graph}',
graph=args.input_count_graph_filename)
countgraph = khmer.load_countgraph(
args.input_count_graph_filename)

if not countgraph.get_use_bigcount() and args.bigcount:
print("WARNING: The loaded graph has bigcount DISABLED while bigcount"
" reporting is ENABLED--counts higher than 255 will not be "
"reported.",
file=sys.stderr)
log_warn("WARNING: The loaded graph has bigcount DISABLED while "
"bigcount reporting is ENABLED--counts higher than 255 will "
"not be reported.")

countgraph.set_use_bigcount(args.bigcount)

Expand All @@ -119,31 +125,29 @@ def main():
tracking = khmer._Nodegraph( # pylint: disable=protected-access
kmer_size, hashsizes)

print('K:', kmer_size, file=sys.stderr)
print('outputting to', args.output_histogram_filename, file=sys.stderr)
log_info('K: {ksize}', ksize=kmer_size)
log_info('outputting to {output}', output=args.output_histogram_filename)

if args.output_histogram_filename in ('-', '/dev/stdout'):
pass
elif os.path.exists(args.output_histogram_filename):
if not args.squash_output:
print('ERROR: %s exists; not squashing.' %
args.output_histogram_filename,
file=sys.stderr)
log_error('ERROR: {output} exists; not squashing.',
output=args.output_histogram_filename)
sys.exit(1)

print('** squashing existing file %s' %
args.output_histogram_filename, file=sys.stderr)
log_info('** squashing existing file {output}',
output=args.output_histogram_filename)

print('preparing hist...', file=sys.stderr)
log_info('preparing hist...')
abundances = countgraph.abundance_distribution(
args.input_sequence_filename, tracking)
total = sum(abundances)

if 0 == total:
print("ERROR: abundance distribution is uniformly zero; "
"nothing to report.", file=sys.stderr)
print("\tPlease verify that the input files are valid.",
file=sys.stderr)
log_error("ERROR: abundance distribution is uniformly zero; "
"nothing to report.")
log_error("\tPlease verify that the input files are valid.")
sys.exit(1)

if args.output_histogram_filename in ('-', '/dev/stdout'):
Expand Down
28 changes: 17 additions & 11 deletions scripts/filter-abund-single.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@
check_space_for_graph,
add_output_compression_type,
get_file_writer)
from khmer.khmer_logger import (configure_logging, log_info, log_error,
log_warn)


DEFAULT_CUTOFF = 2

Expand Down Expand Up @@ -96,14 +99,18 @@ def get_parser():
help="FAST[AQ] sequence file to trim")
parser.add_argument('-f', '--force', default=False, action='store_true',
help='Overwrite output file if it exists')
parser.add_argument('-q', '--quiet', dest='quiet', default=False,
action='store_true')
add_output_compression_type(parser)
return parser


def main():
info('filter-abund-single.py', ['counting', 'SeqAn'])
args = sanitize_help(get_parser()).parse_args()
if not args.quiet:
info('filter-abund-single.py', ['counting', 'SeqAn'])

configure_logging(args.quiet)
check_input_files(args.datafile, args.force)
check_space([args.datafile], args.force)

Expand All @@ -113,13 +120,13 @@ def main():

report_on_config(args)

print('making countgraph', file=sys.stderr)
log_info('making countgraph')
graph = khmer_args.create_countgraph(args)

# first, load reads into graph
rparser = khmer.ReadParser(args.datafile)
threads = []
print('consuming input, round 1 --', args.datafile, file=sys.stderr)
log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
for _ in range(args.threads):
cur_thread = \
threading.Thread(
Expand All @@ -132,11 +139,10 @@ def main():
for _ in threads:
_.join()

print('Total number of unique k-mers: {0}'.format(
graph.n_unique_kmers()), file=sys.stderr)
log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

fp_rate = khmer.calc_expected_collisions(graph, args.force)
print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)
log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

# now, trim.

Expand All @@ -156,22 +162,22 @@ def process_fn(record):
return None, None

# the filtering loop
print('filtering', args.datafile, file=sys.stderr)
log_info('filtering {datafile}', datafile=args.datafile)
if args.outfile is None:
outfile = os.path.basename(args.datafile) + '.abundfilt'
else:
outfile = args.outfile
outfp = open(outfile, 'wb')
outfp = get_file_writer(outfp, args.gzip, args.bzip)

tsp = ThreadedSequenceProcessor(process_fn)
tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
tsp.start(verbose_loader(args.datafile), outfp)

print('output in', outfile, file=sys.stderr)
log_info('output in {outfile}', outfile=outfile)

if args.savegraph:
print('Saving k-mer countgraph filename',
args.savegraph, file=sys.stderr)
log_info('Saving k-mer countgraph filename {graph}',
graph=args.savegraph)
graph.save(args.savegraph)

if __name__ == '__main__':
Expand Down
Loading

0 comments on commit 5a83929

Please sign in to comment.