Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We鈥檒l occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CLI flag to load-into-counting.py allowing TSV output of basic stats #649

Closed
wants to merge 12 commits into from
6 changes: 6 additions & 0 deletions ChangeLog
@@ -1,3 +1,9 @@
2014-11-20 Kevin Murray <spam@kdmurray.id.au>

* load-into-counting.py: Add a CLI parameter to output a machine-readable
summary of the run, including number of k-mers, FPR, input files etc in
json or TSV format.

2014-11-19 Michael R. Crusoe <mcrusoe@msu.edu>

* CODE_OF_CONDUT.RST,doc/dev/{index,CODE_OF_CONDUCT}.txt: added a code of
Expand Down
63 changes: 49 additions & 14 deletions scripts/load-into-counting.py
Expand Up @@ -13,6 +13,8 @@
Use '-h' for parameter help.
"""

import json
import os
import sys
import threading
import textwrap
Expand Down Expand Up @@ -40,7 +42,7 @@ def get_parser():

Example::

load_into_counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
"""

parser = build_counting_args("Build a k-mer counting table from the given"
Expand All @@ -54,6 +56,10 @@ def get_parser():
parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
action='store_false',
help='Do not count k-mers past 255')
parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT",
choices=['json', 'tsv'],
help="What format should the machine readable run "
"summary be in? (json or tsv, disabled by default)")
parser.add_argument('--report-total-kmers', '-t', action='store_true',
help="Prints the total number of k-mers to stderr")
return parser
Expand All @@ -62,6 +68,7 @@ def get_parser():
def main():

info('load-into-counting.py', ['counting'])

args = get_parser().parse_args()
report_on_config(args)

Expand All @@ -74,9 +81,13 @@ def main():
check_space(args.input_sequence_filename)
check_space_for_hashtable(args.n_tables * args.min_tablesize)

print >>sys.stderr, 'Saving k-mer counting table to %s' % base
print >>sys.stderr, 'Saving k-mer counting table to %s' % base
print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

# clobber the '.info' file now, as we always open in append mode below
if os.path.exists(base + '.info'):
os.remove(base + '.info')

print >>sys.stderr, 'making k-mer counting table'
htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
args.n_tables, args.threads)
Expand Down Expand Up @@ -108,32 +119,56 @@ def main():
check_space_for_hashtable(args.n_tables * args.min_tablesize)
print >>sys.stderr, 'mid-save', base
htable.save(base)
open(base + '.info', 'w').write('through %s' % filename)
with open(base + '.info', 'a') as info_fh:
print >> info_fh, 'through', filename

n_kmers = htable.n_unique_kmers()
if args.report_total_kmers:
print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
htable.n_unique_kmers())
print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
with open(base + '.info', 'a') as info_fp:
print >>info_fp, 'Total number of unique k-mers:', n_kmers

print >>sys.stderr, 'saving', base
htable.save(base)

info_fp = open(base + '.info', 'w')
info_fp.write('through end: %s\n' % filename)

# Change 0.2 only if you really grok it. HINT: You don't.
fp_rate = khmer.calc_expected_collisions(htable)
print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

with open(base + '.info', 'a') as info_fp:
print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

if args.summary_info:
mr_fmt = args.summary_info.lower()
mr_file = base + '.info.' + mr_fmt
print >> sys.stderr, "Writing summmary info to", mr_file
with open(mr_file, 'w') as mr_fh:
if mr_fmt == 'json':
mr_data = {
"ht_name": os.path.basename(base),
"fpr": fp_rate,
"num_kmers": n_kmers,
"files": filenames,
"mrinfo_version": "0.1.0",
}
json.dump(mr_data, mr_fh)
mr_fh.write('\n')
elif mr_fmt == 'tsv':
mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
fls=";".join(filenames)))

print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

# Change 0.2 only if you really grok it. HINT: You don't.
if fp_rate > 0.20:
print >> sys.stderr, "**"
print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
" this data set. Increase tablesize/# tables.")
print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
print >> sys.stderr, "for this data set. Increase tablesize/# tables."
print >> sys.stderr, "**"
sys.exit(1)

print >>sys.stderr, 'DONE.'
print>>sys.stderr, 'wrote to: ' + base + '.info'
print >>sys.stderr, 'wrote to:', base + '.info'

if __name__ == '__main__':
main()
Expand Down
66 changes: 66 additions & 0 deletions tests/test_scripts.py
Expand Up @@ -7,6 +7,7 @@

# pylint: disable=C0111,C0103,E1103,W0612

import json
import sys
import os
import shutil
Expand Down Expand Up @@ -66,6 +67,71 @@ def test_load_into_counting_fail():
assert "ERROR:" in err


def test_load_into_counting_tsv():
script = scriptpath('load-into-counting.py')
args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'tsv']

outfile = utils.get_temp_filename('out.kh')
tabfile = outfile + '.info.tsv'
infile = utils.get_test_data('test-abund-read-2.fa')

args.extend([outfile, infile])

(status, out, err) = utils.runscript(script, args)
assert 'Total number of unique k-mers: 95' in err, err
assert os.path.exists(outfile)
assert os.path.exists(tabfile)
with open(tabfile) as tabfh:
tabfile_lines = tabfh.readlines()
assert len(tabfile_lines) == 2
outbase = os.path.basename(outfile)
expected_tsv_line = '\t'.join([outbase, '0.000', '95', infile]) + '\n'
assert tabfile_lines[1] == expected_tsv_line, tabfile_lines


def test_load_into_counting_json():
script = scriptpath('load-into-counting.py')
args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'json']

outfile = utils.get_temp_filename('out.kh')
jsonfile = outfile + '.info.json'
infile = utils.get_test_data('test-abund-read-2.fa')

args.extend([outfile, infile])

(status, out, err) = utils.runscript(script, args)
assert 'Total number of unique k-mers: 95' in err, err
assert os.path.exists(outfile)
assert os.path.exists(jsonfile)

with open(jsonfile) as jsonfh:
got_json = json.load(jsonfh)
outbase = os.path.basename(outfile)
expected_json = {
"files": [infile],
"ht_name": outbase,
"num_kmers": 95,
"fpr": 9.024965705097741e-11,
"mrinfo_version": "0.1.0",
}

assert got_json == expected_json, got_json


def test_load_into_counting_bad_summary_fmt():
script = scriptpath('load-into-counting.py')
args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'badfmt']

outfile = utils.get_temp_filename('out.kh')
infile = utils.get_test_data('test-abund-read-2.fa')

args.extend([outfile, infile])

(status, out, err) = utils.runscript(script, args, fail_ok=True)
assert status != 0, status
assert "invalid choice: 'badfmt'" in err, err


def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True):
script = scriptpath('load-into-counting.py')
args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]
Expand Down