Skip to content

Commit

Permalink
Merge pull request #205 from chardet/feature/remove_python2
Browse files Browse the repository at this point in the history
Remove support for Python < 3.6
  • Loading branch information
dan-blanchard committed Dec 10, 2020
2 parents a808ed1 + 56051dd commit 9a4e8b6
Show file tree
Hide file tree
Showing 49 changed files with 5,993 additions and 5,773 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, pypy2, pypy3]
python-version: [3.6, 3.7, 3.8, 3.9, pypy3]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Detects
Our ISO-8859-2 and windows-1250 (Hungarian) probers have been temporarily
disabled until we can retrain the models.

Requires Python 2.7 or 3.5+.
Requires Python 3.6+.

Installation
------------
Expand Down
134 changes: 78 additions & 56 deletions bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,59 +5,73 @@
:author: Ian Cordasco
"""

from __future__ import print_function, with_statement

import argparse
import sys
import time
from collections import defaultdict
from io import open
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext

import chardet

try:
    import cchardet

    HAVE_CCHARDET = True
# Only the absence of the optional cchardet package should be tolerated;
# a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
except ImportError:
    HAVE_CCHARDET = False


# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
MISSING_ENCODINGS = {
    "iso-8859-2",
    "iso-8859-6",
    "windows-1250",
    "windows-1254",
    "windows-1256",
}
# Test files that chardet is currently known to misdetect; the benchmark
# skips them so timings reflect the supported set.
EXPECTED_FAILURES = {
    "tests/iso-8859-7-greek/disabled.gr.xml",
    "tests/iso-8859-9-turkish/divxplanet.com.xml",
    "tests/iso-8859-9-turkish/subtitle.srt",
    "tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt",
}


def get_py_impl():
    """Return the name of the Python implementation running this script.

    Detection order: PyPy (via ``sys.pypy_version_info``), Jython (platform
    starts with ``java``), IronPython (platform ``cli``), else CPython.
    """
    if hasattr(sys, "pypy_version_info"):
        pyimpl = "PyPy"
    elif sys.platform.startswith("java"):
        pyimpl = "Jython"
    elif sys.platform == "cli":
        pyimpl = "IronPython"
    else:
        pyimpl = "CPython"
    return pyimpl


def get_test_files():
"""Yields filenames to use for timing chardet.detect"""
base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
base_path = relpath(join(dirname(realpath(__file__)), "tests"))
for encoding in listdir(base_path):
path = join(base_path, encoding)
# Skip files in tests directory
if not isdir(path):
continue
# Remove language suffixes from encoding if present
encoding = encoding.lower()
for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
'-hebrew', '-hungarian', '-turkish']:
for postfix in [
"-arabic",
"-bulgarian",
"-cyrillic",
"-greek",
"-hebrew",
"-hungarian",
"-turkish",
]:
if encoding.endswith(postfix):
encoding = encoding.rpartition(postfix)[0]
break
Expand All @@ -67,7 +81,7 @@ def get_test_files():
# Test encoding detection for each file we have of encoding for
for file_name in listdir(path):
ext = splitext(file_name)[1].lower()
if ext not in ['.html', '.txt', '.xml', '.srt']:
if ext not in [".html", ".txt", ".xml", ".srt"]:
continue
full_path = join(path, file_name)
if full_path in EXPECTED_FAILURES:
Expand All @@ -76,71 +90,79 @@ def get_test_files():


def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
    """Time ``chardet_mod.detect`` over every test file and print statistics.

    :param chardet_mod: module exposing ``detect``, ``__name__`` and
        ``__version__`` (``chardet`` by default, optionally ``cchardet``)
    :param verbose: print per-file average times instead of progress dots
    :param num_iters: number of ``detect`` calls per file
    """
    print(
        f"Benchmarking {chardet_mod.__name__} {chardet_mod.__version__} "
        f"on {get_py_impl()} {sys.version}"
    )
    print("-" * 80)
    total_time = 0
    num_files = 0
    encoding_times = defaultdict(float)
    encoding_num_files = defaultdict(int)
    for full_path, encoding in get_test_files():
        num_files += 1
        with open(full_path, "rb") as f:
            input_bytes = f.read()
        # Time the whole batch of iterations; report the per-call average.
        start = time.time()
        for _ in range(num_iters):
            chardet_mod.detect(input_bytes)
        bench_time = time.time() - start
        if verbose:
            print(f"Average time for {full_path}: {bench_time / num_iters}s")
        else:
            print(".", end="")
            sys.stdout.flush()
        total_time += bench_time
        encoding_times[encoding] += bench_time
        encoding_num_files[encoding] += 1

    print("\nCalls per second for each encoding:")
    for encoding in sorted(encoding_times.keys()):
        calls_per_sec = (
            num_iters * encoding_num_files[encoding] / encoding_times[encoding]
        )
        print(f"{encoding}: {calls_per_sec}")
    calls_per_sec = num_iters * num_files / total_time
    print(f"\nTotal time: {total_time}s ({calls_per_sec} calls per second)")


def main():
    """Parse command-line options and run the benchmark.

    Exits with status 1 if ``--cchardet`` is requested but cchardet is not
    installed.
    """
    parser = argparse.ArgumentParser(
        description="Times how long it takes to process each file in test set "
        "multiple times.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-c",
        "--cchardet",
        action="store_true",
        help="Run benchmarks for cChardet instead of chardet, if it is installed.",
    )
    parser.add_argument(
        "-i",
        "--iterations",
        help="Number of times to process each file",
        type=int,
        default=10,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Prints out the timing for each individual file.",
        action="store_true",
    )
    args = parser.parse_args()

    if args.cchardet and not HAVE_CCHARDET:
        print("You must pip install cchardet if you want to benchmark it.")
        sys.exit(1)

    benchmark(
        chardet_mod=cchardet if args.cchardet else chardet,
        verbose=args.verbose,
        num_iters=args.iterations,
    )


# Script entry point: run the benchmark CLI when executed directly.
if __name__ == "__main__":
    main()
38 changes: 21 additions & 17 deletions chardet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
######################### END LICENSE BLOCK #########################


from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION

from .universaldetector import UniversalDetector
from .version import VERSION, __version__

# Public API of the chardet package.
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]


def detect(byte_str):
Expand All @@ -33,8 +32,9 @@ def detect(byte_str):
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{}'.format(type(byte_str)))
raise TypeError(
"Expected object of type bytes or bytearray, got: " f"{type(byte_str)}"
)
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
Expand All @@ -51,8 +51,9 @@ def detect_all(byte_str):
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{}'.format(type(byte_str)))
raise TypeError(
"Expected object of type bytes or bytearray, got: " f"{type(byte_str)}"
)
else:
byte_str = bytearray(byte_str)

Expand All @@ -68,16 +69,19 @@ def detect_all(byte_str):
lower_charset_name = prober.charset_name.lower()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
if lower_charset_name.startswith('iso-8859'):
if lower_charset_name.startswith("iso-8859"):
if detector._has_win_bytes:
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
charset_name)
results.append({
'encoding': charset_name,
'confidence': prober.get_confidence(),
'language': prober.language,
})
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
results.append(
{
"encoding": charset_name,
"confidence": prober.get_confidence(),
"language": prober.language,
}
)
if len(results) > 0:
return sorted(results, key=lambda result: -result['confidence'])
return sorted(results, key=lambda result: -result["confidence"])

return [detector.result]
6 changes: 3 additions & 3 deletions chardet/big5freq.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@

BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75

# Char to FreqOrder table
BIG5_TABLE_SIZE = 5376

# fmt: off
BIG5_CHAR_TO_FREQ_ORDER = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
Expand Down Expand Up @@ -383,4 +383,4 @@
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
)

# fmt: on
6 changes: 3 additions & 3 deletions chardet/big5prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import BIG5_SM_MODEL


class Big5Prober(MultiByteCharSetProber):
def __init__(self):
    """Configure the generic multi-byte prober machinery for Big5."""
    super().__init__()
    self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
    self.distribution_analyzer = Big5DistributionAnalysis()
    self.reset()
Expand Down

0 comments on commit 9a4e8b6

Please sign in to comment.