Skip to content

Commit

Permalink
Merge branch 'kdmurray91-feature/load-into-counting_tsv-output'
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-c committed Dec 1, 2014
2 parents 365e399 + 5b04e2d commit 820710c
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 14 deletions.
5 changes: 5 additions & 0 deletions ChangeLog
@@ -1,3 +1,8 @@
2014-12-01 Kevin Murray <spam@kdmurray.id.au>

* load-into-counting.py: Add a CLI parameter to output a machine-readable
summary of the run, including number of k-mers, FPR, input files etc in
json or TSV format.

2014-12-01 Titus Brown <t@idyll.org>

Expand Down
63 changes: 49 additions & 14 deletions scripts/load-into-counting.py
Expand Up @@ -13,6 +13,8 @@
Use '-h' for parameter help.
"""

import json
import os
import sys
import threading
import textwrap
Expand Down Expand Up @@ -40,7 +42,7 @@ def get_parser():
Example::
load_into_counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
"""

parser = build_counting_args("Build a k-mer counting table from the given"
Expand All @@ -54,6 +56,10 @@ def get_parser():
parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
action='store_false',
help='Do not count k-mers past 255')
parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT",
choices=['json', 'tsv'],
help="What format should the machine readable run "
"summary be in? (json or tsv, disabled by default)")
parser.add_argument('--report-total-kmers', '-t', action='store_true',
help="Prints the total number of k-mers to stderr")
return parser
Expand All @@ -62,6 +68,7 @@ def get_parser():
def main():

info('load-into-counting.py', ['counting'])

args = get_parser().parse_args()
report_on_config(args)

Expand All @@ -74,9 +81,13 @@ def main():
check_space(args.input_sequence_filename)
check_space_for_hashtable(args.n_tables * args.min_tablesize)

print >>sys.stderr, 'Saving k-mer counting table to %s' % base
print >>sys.stderr, 'Saving k-mer counting table to %s' % base
print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

# clobber the '.info' file now, as we always open in append mode below
if os.path.exists(base + '.info'):
os.remove(base + '.info')

print >>sys.stderr, 'making k-mer counting table'
htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
args.n_tables, args.threads)
Expand Down Expand Up @@ -108,32 +119,56 @@ def main():
check_space_for_hashtable(args.n_tables * args.min_tablesize)
print >>sys.stderr, 'mid-save', base
htable.save(base)
open(base + '.info', 'w').write('through %s' % filename)
with open(base + '.info', 'a') as info_fh:
print >> info_fh, 'through', filename

n_kmers = htable.n_unique_kmers()
if args.report_total_kmers:
print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
htable.n_unique_kmers())
print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
with open(base + '.info', 'a') as info_fp:
print >>info_fp, 'Total number of unique k-mers:', n_kmers

print >>sys.stderr, 'saving', base
htable.save(base)

info_fp = open(base + '.info', 'w')
info_fp.write('through end: %s\n' % filename)

# Change 0.2 only if you really grok it. HINT: You don't.
fp_rate = khmer.calc_expected_collisions(htable)
print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

with open(base + '.info', 'a') as info_fp:
print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

if args.summary_info:
mr_fmt = args.summary_info.lower()
mr_file = base + '.info.' + mr_fmt
print >> sys.stderr, "Writing summmary info to", mr_file
with open(mr_file, 'w') as mr_fh:
if mr_fmt == 'json':
mr_data = {
"ht_name": os.path.basename(base),
"fpr": fp_rate,
"num_kmers": n_kmers,
"files": filenames,
"mrinfo_version": "0.1.0",
}
json.dump(mr_data, mr_fh)
mr_fh.write('\n')
elif mr_fmt == 'tsv':
mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
fls=";".join(filenames)))

print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

# Change 0.2 only if you really grok it. HINT: You don't.
if fp_rate > 0.20:
print >> sys.stderr, "**"
print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
" this data set. Increase tablesize/# tables.")
print >> sys.stderr, "** ERROR: the k-mer counting table is too small",
print >> sys.stderr, "for this data set. Increase tablesize/# tables."
print >> sys.stderr, "**"
sys.exit(1)

print >>sys.stderr, 'DONE.'
print>>sys.stderr, 'wrote to: ' + base + '.info'
print >>sys.stderr, 'wrote to:', base + '.info'

if __name__ == '__main__':
main()
Expand Down
66 changes: 66 additions & 0 deletions tests/test_scripts.py
Expand Up @@ -7,6 +7,7 @@

# pylint: disable=C0111,C0103,E1103,W0612

import json
import sys
import os
import shutil
Expand Down Expand Up @@ -66,6 +67,71 @@ def test_load_into_counting_fail():
assert "ERROR:" in err


def test_load_into_counting_tsv():
script = scriptpath('load-into-counting.py')
args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'tsv']

outfile = utils.get_temp_filename('out.kh')
tabfile = outfile + '.info.tsv'
infile = utils.get_test_data('test-abund-read-2.fa')

args.extend([outfile, infile])

(status, out, err) = utils.runscript(script, args)
assert 'Total number of unique k-mers: 95' in err, err
assert os.path.exists(outfile)
assert os.path.exists(tabfile)
with open(tabfile) as tabfh:
tabfile_lines = tabfh.readlines()
assert len(tabfile_lines) == 2
outbase = os.path.basename(outfile)
expected_tsv_line = '\t'.join([outbase, '0.000', '95', infile]) + '\n'
assert tabfile_lines[1] == expected_tsv_line, tabfile_lines


def test_load_into_counting_json():
script = scriptpath('load-into-counting.py')
args = ['-x', '1e7', '-N', '2', '-k', '20', '-t', '-s', 'json']

outfile = utils.get_temp_filename('out.kh')
jsonfile = outfile + '.info.json'
infile = utils.get_test_data('test-abund-read-2.fa')

args.extend([outfile, infile])

(status, out, err) = utils.runscript(script, args)
assert 'Total number of unique k-mers: 95' in err, err
assert os.path.exists(outfile)
assert os.path.exists(jsonfile)

with open(jsonfile) as jsonfh:
got_json = json.load(jsonfh)
outbase = os.path.basename(outfile)
expected_json = {
"files": [infile],
"ht_name": outbase,
"num_kmers": 95,
"fpr": 9.024965705097741e-11,
"mrinfo_version": "0.1.0",
}

assert got_json == expected_json, got_json


def test_load_into_counting_bad_summary_fmt():
script = scriptpath('load-into-counting.py')
args = ['-x', '1e7', '-N', '2', '-k', '20', '-s', 'badfmt']

outfile = utils.get_temp_filename('out.kh')
infile = utils.get_test_data('test-abund-read-2.fa')

args.extend([outfile, infile])

(status, out, err) = utils.runscript(script, args, fail_ok=True)
assert status != 0, status
assert "invalid choice: 'badfmt'" in err, err


def _make_counting(infilename, SIZE=1e7, N=2, K=20, BIGCOUNT=True):
script = scriptpath('load-into-counting.py')
args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]
Expand Down

0 comments on commit 820710c

Please sign in to comment.