From d70254589cc98ae10e9c91f0633437cdbdb13ab8 Mon Sep 17 00:00:00 2001 From: Dan Blanchard Date: Thu, 19 Oct 2017 10:21:37 -0400 Subject: [PATCH] add benchmark script --- bench.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 bench.py diff --git a/bench.py b/bench.py new file mode 100644 index 00000000..81d4a84c --- /dev/null +++ b/bench.py @@ -0,0 +1,125 @@ +""" +Run chardet on a bunch of documents and see that we get the correct encodings. + +:author: Dan Blanchard +:author: Ian Cordasco +""" + +from __future__ import print_function, with_statement + +import argparse +import sys +import timeit +from collections import defaultdict +from io import open +from os import listdir +from os.path import dirname, isdir, join, realpath, relpath, splitext + +import chardet + +try: + import cchardet + HAVE_CCHARDET = True +except: + HAVE_CCHARDET = False + + +# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we +# retrain model. +MISSING_ENCODINGS = set(['iso-8859-2', 'iso-8859-6', 'windows-1250', + 'windows-1254', 'windows-1256']) +EXPECTED_FAILURES = set(['tests/iso-8859-7-greek/disabled.gr.xml', + 'tests/iso-8859-9-turkish/divxplanet.com.xml', + 'tests/iso-8859-9-turkish/subtitle.srt', + 'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt']) + +def get_test_files(): + """Yields filenames to use for timing chardet.detect""" + base_path = relpath(join(dirname(realpath(__file__)), 'tests')) + for encoding in listdir(base_path): + path = join(base_path, encoding) + # Skip files in tests directory + if not isdir(path): + continue + # Remove language suffixes from encoding if pressent + encoding = encoding.lower() + for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek', + '-hebrew', '-hungarian', '-turkish']: + if encoding.endswith(postfix): + encoding = encoding.rpartition(postfix)[0] + break + # Skip directories for encodings we don't handle yet. + if encoding in MISSING_ENCODINGS: + continue + # Test encoding detection for each file we have of encoding for + for file_name in listdir(path): + ext = splitext(file_name)[1].lower() + if ext not in ['.html', '.txt', '.xml', '.srt']: + continue + full_path = join(path, file_name) + if full_path in EXPECTED_FAILURES: + continue + yield full_path, encoding + + +def benchmark(chardet_mod=chardet, verbose=False, num_iters=10): + print('Benchmarking {} {}'.format(chardet_mod.__name__, + chardet_mod.__version__)) + print('-' * 80) + glocals = dict(globals()) + total_time = 0 + num_files = 0 + encoding_times = defaultdict(float) + encoding_num_files = defaultdict(int) + for full_path, encoding in get_test_files(): + num_files += 1 + with open(full_path, 'rb') as f: + input_bytes = f.read() + glocals.update(locals()) + bench_time = timeit.timeit('chardet_mod.detect(input_bytes)', + globals=glocals, + number=num_iters) + if verbose: + print('Average time for {}: {}s'.format(full_path, + bench_time / num_iters)) + else: + print('.', end='') + sys.stdout.flush() + total_time += bench_time + encoding_times[encoding] += bench_time + encoding_num_files[encoding] += 1 + + print('\nCalls per second for each encoding:') + for encoding in sorted(encoding_times.keys()): + print('{}: {}'.format(encoding, + num_iters * encoding_times[encoding] / + encoding_num_files[encoding])) + + print('\nTotal time: {}s ({} calls per second)'.format(total_time, + num_iters * num_files / + total_time)) + + +def main(): + parser = argparse.ArgumentParser( + description='Times how long it takes to process each file in test set ' + 'multiple times.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('-v', '--verbose', + help='Prints out the timing for each individual file.', + action='store_true') + parser.add_argument('-i', '--iterations', + help='Number of times to process each file', + type=int, + default=10) + args = parser.parse_args() + + benchmark(verbose=args.verbose, num_iters=args.iterations) + + if HAVE_CCHARDET: + print('\n') + benchmark(cchardet, verbose=args.verbose, num_iters=args.iterations) + + +if __name__ == '__main__': + main()