add benchmark script
dan-blanchard committed Oct 19, 2017
1 parent b3d867a commit d702545
Showing 1 changed file with 125 additions and 0 deletions.
bench.py: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
"""
Run chardet on a bunch of documents and time how long encoding detection takes.
:author: Dan Blanchard
:author: Ian Cordasco
"""

from __future__ import print_function, with_statement

import argparse
import sys
import timeit
from collections import defaultdict
from io import open
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext

import chardet

# cchardet is an optional C-accelerated detector; benchmark it too when available.
try:
    import cchardet
    HAVE_CCHARDET = True
except ImportError:
    HAVE_CCHARDET = False


# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
MISSING_ENCODINGS = set(['iso-8859-2', 'iso-8859-6', 'windows-1250',
                         'windows-1254', 'windows-1256'])
EXPECTED_FAILURES = set(['tests/iso-8859-7-greek/disabled.gr.xml',
                         'tests/iso-8859-9-turkish/divxplanet.com.xml',
                         'tests/iso-8859-9-turkish/subtitle.srt',
                         'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt'])

def get_test_files():
"""Yields filenames to use for timing chardet.detect"""
base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
for encoding in listdir(base_path):
path = join(base_path, encoding)
# Skip files in tests directory
if not isdir(path):
continue
# Remove language suffixes from encoding if pressent
encoding = encoding.lower()
for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
'-hebrew', '-hungarian', '-turkish']:
if encoding.endswith(postfix):
encoding = encoding.rpartition(postfix)[0]
break
# Skip directories for encodings we don't handle yet.
if encoding in MISSING_ENCODINGS:
continue
# Test encoding detection for each file we have of encoding for
for file_name in listdir(path):
ext = splitext(file_name)[1].lower()
if ext not in ['.html', '.txt', '.xml', '.srt']:
continue
full_path = join(path, file_name)
if full_path in EXPECTED_FAILURES:
continue
yield full_path, encoding


def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
    print('Benchmarking {} {}'.format(chardet_mod.__name__,
                                      chardet_mod.__version__))
    print('-' * 80)
    glocals = dict(globals())
    total_time = 0
    num_files = 0
    encoding_times = defaultdict(float)
    encoding_num_files = defaultdict(int)
    for full_path, encoding in get_test_files():
        num_files += 1
        with open(full_path, 'rb') as f:
            input_bytes = f.read()
        glocals.update(locals())
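        # NOTE: timeit's globals= keyword only exists on Python 3.5+, so this
        # timing call will not run on Python 2 despite the __future__ imports.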
        bench_time = timeit.timeit('chardet_mod.detect(input_bytes)',
                                   globals=glocals,
                                   number=num_iters)
        if verbose:
            print('Average time for {}: {}s'.format(full_path,
                                                    bench_time / num_iters))
        else:
            print('.', end='')
            sys.stdout.flush()
        total_time += bench_time
        encoding_times[encoding] += bench_time
        encoding_num_files[encoding] += 1

    print('\nCalls per second for each encoding:')
    for encoding in sorted(encoding_times.keys()):
        # calls per second = (files * iterations) / total seconds for this encoding
        print('{}: {}'.format(encoding,
                              encoding_num_files[encoding] * num_iters /
                              encoding_times[encoding]))

    print('\nTotal time: {}s ({} calls per second)'.format(
        total_time, num_iters * num_files / total_time))


def main():
    parser = argparse.ArgumentParser(
        description='Times how long it takes to process each file in the test '
                    'set multiple times.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-v', '--verbose',
                        help='Prints out the timing for each individual file.',
                        action='store_true')
    parser.add_argument('-i', '--iterations',
                        help='Number of times to process each file',
                        type=int,
                        default=10)
    args = parser.parse_args()

    benchmark(verbose=args.verbose, num_iters=args.iterations)

    if HAVE_CCHARDET:
        print('\n')
        benchmark(cchardet, verbose=args.verbose, num_iters=args.iterations)


if __name__ == '__main__':
    main()
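
A minimal usage sketch (not part of the commit), assuming the script is run from a chardet checkout so the tests/ directory it scans is present:

    python bench.py --verbose --iterations 5

The -v/--verbose and -i/--iterations flags correspond to the argparse options defined in main() above.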
