Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b3d867a
commit d702545
Showing
1 changed file
with
125 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
""" | ||
Benchmark chardet (and cchardet, when it is installed) on the test documents and report how long detection takes.
:author: Dan Blanchard | ||
:author: Ian Cordasco | ||
""" | ||
|
||
from __future__ import print_function, with_statement | ||
|
||
import argparse | ||
import sys | ||
import timeit | ||
from collections import defaultdict | ||
from io import open | ||
from os import listdir | ||
from os.path import dirname, isdir, join, realpath, relpath, splitext | ||
|
||
import chardet | ||
|
||
# cchardet is optional: HAVE_CCHARDET records whether it could be imported so
# that callers can decide whether to use it.
try:
    import cchardet
    HAVE_CCHARDET = True
except ImportError:
    # Catch only a failed import; a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors.
    HAVE_CCHARDET = False
|
||
|
||
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
#       retrain model.
# Encodings whose test directories are skipped entirely.
MISSING_ENCODINGS = {
    'iso-8859-2',
    'iso-8859-6',
    'windows-1250',
    'windows-1254',
    'windows-1256',
}

# Individual test documents that are left out of the run.
EXPECTED_FAILURES = {
    'tests/iso-8859-7-greek/disabled.gr.xml',
    'tests/iso-8859-9-turkish/divxplanet.com.xml',
    'tests/iso-8859-9-turkish/subtitle.srt',
    'tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt',
}
|
||
def get_test_files():
    """Yield ``(full_path, encoding)`` pairs to use for timing chardet.detect.

    Walks the ``tests`` directory that sits next to this file.  Each
    sub-directory name is lower-cased and stripped of a trailing language
    suffix (e.g. ``-greek``) to obtain the encoding name.  Directories whose
    encoding is in ``MISSING_ENCODINGS`` and files listed in
    ``EXPECTED_FAILURES`` are skipped, as are files whose extension is not
    one of ``.html``, ``.txt``, ``.xml`` or ``.srt``.

    Yields:
        tuple: ``(full_path, encoding)`` where ``full_path`` is the path of
        the document (relative to the current working directory) and
        ``encoding`` is the normalized encoding name of its directory.
    """
    language_suffixes = ('-arabic', '-bulgarian', '-cyrillic', '-greek',
                         '-hebrew', '-hungarian', '-turkish')
    known_extensions = ('.html', '.txt', '.xml', '.srt')

    base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
    for dir_name in listdir(base_path):
        dir_path = join(base_path, dir_name)
        # Skip files in tests directory
        if not isdir(dir_path):
            continue
        # Remove language suffixes from encoding if present
        encoding = dir_name.lower()
        for suffix in language_suffixes:
            if encoding.endswith(suffix):
                encoding = encoding[:-len(suffix)]
                break
        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue
        # Yield every document we have for this encoding
        for file_name in listdir(dir_path):
            if splitext(file_name)[1].lower() not in known_extensions:
                continue
            # EXPECTED_FAILURES entries are written as 'tests/<dir>/<file>'.
            # Build the lookup key in that exact form instead of using the
            # cwd-relative, OS-separator path, which only matched when the
            # script was run from its own directory on a POSIX system.
            failure_key = '/'.join(('tests', dir_name, file_name))
            if failure_key in EXPECTED_FAILURES:
                continue
            yield join(dir_path, file_name), encoding
|
||
|
||
def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
    """Time ``chardet_mod.detect`` on every test file and print a report.

    Each file yielded by ``get_test_files()`` is read once as bytes and
    passed to ``chardet_mod.detect`` ``num_iters`` times under ``timeit``.
    Afterwards the calls-per-second rate is printed for each encoding and
    for the run as a whole.

    Args:
        chardet_mod: Module to benchmark; must provide ``detect``,
            ``__name__`` and ``__version__``.
        verbose: If true, print the average time per call for every file;
            otherwise print one progress dot per file.
        num_iters: Number of ``detect`` calls timed for each file.
    """
    def calls_per_second(num_calls, seconds):
        # Avoid ZeroDivisionError when nothing was timed (e.g. no test
        # files were found) or the elapsed time rounds to zero.
        if seconds:
            return num_calls / seconds
        return float('inf') if num_calls else 0.0

    print('Benchmarking {} {}'.format(chardet_mod.__name__,
                                      chardet_mod.__version__))
    print('-' * 80)
    total_time = 0
    num_files = 0
    encoding_times = defaultdict(float)
    encoding_num_files = defaultdict(int)
    for full_path, encoding in get_test_files():
        num_files += 1
        with open(full_path, 'rb') as f:
            input_bytes = f.read()
        # Give timeit only the two names the timed statement uses, rather
        # than a growing copy of every module global and function local.
        bench_time = timeit.timeit('chardet_mod.detect(input_bytes)',
                                   globals={'chardet_mod': chardet_mod,
                                            'input_bytes': input_bytes},
                                   number=num_iters)
        if verbose:
            print('Average time for {}: {}s'.format(full_path,
                                                    bench_time / num_iters))
        else:
            print('.', end='')
            sys.stdout.flush()
        total_time += bench_time
        encoding_times[encoding] += bench_time
        encoding_num_files[encoding] += 1

    print('\nCalls per second for each encoding:')
    for encoding in sorted(encoding_times):
        # Rate = calls made for this encoding / seconds spent on them.
        # (The previous formula multiplied by the elapsed time instead of
        # dividing by it, so the printed value was not a rate.)
        encoding_calls = num_iters * encoding_num_files[encoding]
        print('{}: {}'.format(encoding,
                              calls_per_second(encoding_calls,
                                               encoding_times[encoding])))

    print('\nTotal time: {}s ({} calls per second)'.format(
        total_time, calls_per_second(num_iters * num_files, total_time)))
|
||
|
||
def main():
    """Parse the command-line options and run the benchmark(s).

    chardet is always benchmarked; cchardet is benchmarked afterwards when
    ``HAVE_CCHARDET`` is true.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Times how long it takes to process each file in test set '
                    'multiple times.')
    arg_parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Prints out the timing for each individual file.')
    arg_parser.add_argument(
        '-i', '--iterations',
        default=10,
        type=int,
        help='Number of times to process each file')
    options = arg_parser.parse_args()

    # The same settings are used for every module that gets benchmarked.
    bench_kwargs = {'verbose': options.verbose,
                    'num_iters': options.iterations}

    benchmark(**bench_kwargs)

    if not HAVE_CCHARDET:
        return

    print('\n')
    benchmark(cchardet, **bench_kwargs)


if __name__ == '__main__':
    main()