add benchmark script

chardet · Oct 19, 2017 · d702545 · d702545
1 parent b3d867a
commit d702545
Showing 1 changed file with 125 additions and 0 deletions.
diff --git a/bench.py b/bench.py
@@ -0,0 +1,125 @@
+"""
+Run chardet (and cchardet, if it is installed) on a bunch of test documents
+and report how long encoding detection takes.
+
+:author: Dan Blanchard
+:author: Ian Cordasco
+"""
+
+from __future__ import print_function, with_statement
+
+import argparse
+import sys
+import timeit
+from collections import defaultdict
+from io import open
+from os import listdir
+from os.path import dirname, isdir, join, realpath, relpath, splitext
+
+import chardet
+
+# cchardet is an optional dependency: HAVE_CCHARDET records whether it could
+# be imported so the script can decide whether to benchmark it as well.
+try:
+    import cchardet
+    HAVE_CCHARDET = True
+except ImportError:
+    # Only a failed import means "not available"; a bare ``except`` would
+    # also swallow KeyboardInterrupt and SystemExit.
+    HAVE_CCHARDET = False
+
+
+# Encodings whose test directories are left out of the benchmark.
+# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
+#       retrain model.
+MISSING_ENCODINGS = {
+    'iso-8859-2',
+    'iso-8859-6',
+    'windows-1250',
+    'windows-1254',
+    'windows-1256',
+}
+# Individual test files that are left out of the benchmark.
+EXPECTED_FAILURES = {
+    'tests/iso-8859-7-greek/disabled.gr.xml',
+    'tests/iso-8859-9-turkish/divxplanet.com.xml',
+    'tests/iso-8859-9-turkish/subtitle.srt',
+    'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt',
+}
+
+def get_test_files():
+    """Yield ``(path, encoding)`` pairs for the files to time.
+
+    Every sub-directory of the ``tests`` directory next to this script is
+    treated as one encoding; its name, lower-cased and with a trailing
+    language suffix such as ``-greek`` removed, is the encoding reported for
+    the files inside it.  Directories whose encoding is in
+    ``MISSING_ENCODINGS``, files listed in ``EXPECTED_FAILURES`` and files
+    whose extension is not ``.html``, ``.txt``, ``.xml`` or ``.srt`` are
+    skipped.
+
+    :returns: generator of ``(full_path, encoding)`` tuples; ``full_path`` is
+              relative to the current working directory.
+    """
+    base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
+    for dir_name in listdir(base_path):
+        path = join(base_path, dir_name)
+        # Skip files in tests directory
+        if not isdir(path):
+            continue
+        # Remove language suffixes from encoding if present
+        encoding = dir_name.lower()
+        for postfix in ('-arabic', '-bulgarian', '-cyrillic', '-greek',
+                        '-hebrew', '-hungarian', '-turkish'):
+            if encoding.endswith(postfix):
+                encoding = encoding.rpartition(postfix)[0]
+                break
+        # Skip directories for encodings we don't handle yet.
+        if encoding in MISSING_ENCODINGS:
+            continue
+        # Yield every benchmarkable file we have for this encoding
+        for file_name in listdir(path):
+            ext = splitext(file_name)[1].lower()
+            if ext not in ('.html', '.txt', '.xml', '.srt'):
+                continue
+            # EXPECTED_FAILURES entries are spelled 'tests/<dir>/<file>' with
+            # forward slashes.  Build the lookup key in exactly that form
+            # instead of using the joined path, which depends on the current
+            # working directory and on the OS path separator.
+            failure_key = '/'.join(('tests', dir_name, file_name))
+            if failure_key in EXPECTED_FAILURES:
+                continue
+            yield join(path, file_name), encoding
+
+
+def _calls_per_second(num_calls, elapsed):
+    """Return ``num_calls / elapsed``, or 0.0 when no time was recorded."""
+    if not elapsed:
+        return 0.0
+    return num_calls / float(elapsed)
+
+
+def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
+    """Time ``chardet_mod.detect`` on every test file and print a report.
+
+    :param chardet_mod: module to benchmark; it must provide ``detect``,
+                        ``__name__`` and ``__version__``.
+    :param verbose: when true, print the average time for each file;
+                    otherwise print one ``.`` per file as progress.
+    :param num_iters: number of times ``detect`` is called on each file.
+    :returns: ``None``; all results are written to standard output.
+    """
+    print('Benchmarking {} {}'.format(chardet_mod.__name__,
+                                      chardet_mod.__version__))
+    print('-' * 80)
+    total_time = 0
+    num_files = 0
+    encoding_times = defaultdict(float)
+    encoding_num_files = defaultdict(int)
+    for full_path, encoding in get_test_files():
+        num_files += 1
+        with open(full_path, 'rb') as f:
+            input_bytes = f.read()
+        # Pass a callable rather than a source string plus ``globals=``: the
+        # ``globals`` argument only exists on Python 3.5+, while a callable
+        # statement is accepted by timeit on Python 2 as well.
+        bench_time = timeit.timeit(
+            lambda: chardet_mod.detect(input_bytes),
+            number=num_iters)
+        if verbose:
+            print('Average time for {}: {}s'.format(full_path,
+                                                    bench_time / num_iters))
+        else:
+            print('.', end='')
+            sys.stdout.flush()
+        total_time += bench_time
+        encoding_times[encoding] += bench_time
+        encoding_num_files[encoding] += 1
+
+    print('\nCalls per second for each encoding:')
+    for encoding in sorted(encoding_times.keys()):
+        # Rate = number of detect() calls made / time spent making them,
+        # the same formula that is used for the overall total below.
+        print('{}: {}'.format(
+            encoding,
+            _calls_per_second(num_iters * encoding_num_files[encoding],
+                              encoding_times[encoding])))
+
+    # The helper also keeps this from dividing by zero when no test files
+    # were found and total_time is therefore still 0.
+    print('\nTotal time: {}s ({} calls per second)'.format(
+        total_time,
+        _calls_per_second(num_iters * num_files, total_time)))
+
+
+def main():
+    """Parse the command line and run the benchmark.
+
+    chardet is always benchmarked; cchardet is benchmarked afterwards when
+    ``HAVE_CCHARDET`` is true.  Exits with a usage error when the requested
+    iteration count is smaller than one.
+    """
+    description = ('Times how long it takes to process each file in test set '
+                   'multiple times.')
+    parser = argparse.ArgumentParser(
+        description=description,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-v', '--verbose',
+                        help='Prints out the timing for each individual file.',
+                        action='store_true')
+    parser.add_argument('-i', '--iterations',
+                        help='Number of times to process each file',
+                        type=int,
+                        default=10)
+    args = parser.parse_args()
+
+    # Zero iterations makes the per-file average divide by zero, and zero or
+    # negative counts time nothing, so reject them up front with a usage
+    # message.
+    if args.iterations < 1:
+        parser.error('--iterations must be a positive integer')
+
+    benchmark(verbose=args.verbose, num_iters=args.iterations)
+
+    if HAVE_CCHARDET:
+        print('\n')
+        benchmark(cchardet, verbose=args.verbose, num_iters=args.iterations)
+
+
+# Run the benchmark only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == '__main__':
+    main()