Skip to content

Commit

Permalink
Merge pull request #205 from chardet/feature/remove_python2
Browse files Browse the repository at this point in the history
Remove support for Python < 3.6
  • Loading branch information
dan-blanchard committed Dec 10, 2020
2 parents a808ed1 + 56051dd commit 9a4e8b6
Show file tree
Hide file tree
Showing 49 changed files with 5,993 additions and 5,773 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, pypy2, pypy3]
python-version: [3.6, 3.7, 3.8, 3.9, pypy3]

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Detects
Our ISO-8859-2 and windows-1250 (Hungarian) probers have been temporarily
disabled until we can retrain the models.

Requires Python 2.7 or 3.5+.
Requires Python 3.6+.

Installation
------------
Expand Down
134 changes: 78 additions & 56 deletions bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,59 +5,73 @@
:author: Ian Cordasco
"""

from __future__ import print_function, with_statement

import argparse
import sys
import time
from collections import defaultdict
from io import open
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext

import chardet

try:
    import cchardet

    HAVE_CCHARDET = True
# Only the absence of the optional cchardet package should be tolerated;
# a bare `except:` would also swallow KeyboardInterrupt/SystemExit.
except ImportError:
    HAVE_CCHARDET = False


# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
MISSING_ENCODINGS = {
    "iso-8859-2",
    "iso-8859-6",
    "windows-1250",
    "windows-1254",
    "windows-1256",
}
# Test files that chardet is currently known to misdetect; the benchmark
# skips them so timings reflect the supported set.
EXPECTED_FAILURES = {
    "tests/iso-8859-7-greek/disabled.gr.xml",
    "tests/iso-8859-9-turkish/divxplanet.com.xml",
    "tests/iso-8859-9-turkish/subtitle.srt",
    "tests/iso-8859-9-turkish/wikitop_tr_ISO-8859-9.txt",
}


def get_py_impl():
    """Return the name of the Python implementation running this script.

    Detection order: PyPy (via ``sys.pypy_version_info``), Jython (platform
    starts with ``java``), IronPython (platform ``cli``), else CPython.
    """
    if hasattr(sys, "pypy_version_info"):
        pyimpl = "PyPy"
    elif sys.platform.startswith("java"):
        pyimpl = "Jython"
    elif sys.platform == "cli":
        pyimpl = "IronPython"
    else:
        pyimpl = "CPython"
    return pyimpl


def get_test_files():
"""Yields filenames to use for timing chardet.detect"""
base_path = relpath(join(dirname(realpath(__file__)), 'tests'))
base_path = relpath(join(dirname(realpath(__file__)), "tests"))
for encoding in listdir(base_path):
path = join(base_path, encoding)
# Skip files in tests directory
if not isdir(path):
continue
# Remove language suffixes from encoding if present
encoding = encoding.lower()
for postfix in ['-arabic', '-bulgarian', '-cyrillic', '-greek',
'-hebrew', '-hungarian', '-turkish']:
for postfix in [
"-arabic",
"-bulgarian",
"-cyrillic",
"-greek",
"-hebrew",
"-hungarian",
"-turkish",
]:
if encoding.endswith(postfix):
encoding = encoding.rpartition(postfix)[0]
break
Expand All @@ -67,7 +81,7 @@ def get_test_files():
# Test encoding detection for each file we have of encoding for
for file_name in listdir(path):
ext = splitext(file_name)[1].lower()
if ext not in ['.html', '.txt', '.xml', '.srt']:
if ext not in [".html", ".txt", ".xml", ".srt"]:
continue
full_path = join(path, file_name)
if full_path in EXPECTED_FAILURES:
Expand All @@ -76,71 +90,79 @@ def get_test_files():


def benchmark(chardet_mod=chardet, verbose=False, num_iters=10):
    """Time ``chardet_mod.detect`` over every test file and print statistics.

    :param chardet_mod: module exposing ``detect``, ``__name__`` and
        ``__version__`` (``chardet`` by default, optionally ``cchardet``)
    :param verbose: print per-file average times instead of progress dots
    :param num_iters: number of ``detect`` calls per file
    """
    print(
        f"Benchmarking {chardet_mod.__name__} {chardet_mod.__version__} "
        f"on {get_py_impl()} {sys.version}"
    )
    print("-" * 80)
    total_time = 0
    num_files = 0
    encoding_times = defaultdict(float)
    encoding_num_files = defaultdict(int)
    for full_path, encoding in get_test_files():
        num_files += 1
        with open(full_path, "rb") as f:
            input_bytes = f.read()
        # Time the whole batch of iterations; report the per-call average.
        start = time.time()
        for _ in range(num_iters):
            chardet_mod.detect(input_bytes)
        bench_time = time.time() - start
        if verbose:
            print(f"Average time for {full_path}: {bench_time / num_iters}s")
        else:
            print(".", end="")
            sys.stdout.flush()
        total_time += bench_time
        encoding_times[encoding] += bench_time
        encoding_num_files[encoding] += 1

    print("\nCalls per second for each encoding:")
    for encoding in sorted(encoding_times.keys()):
        calls_per_sec = (
            num_iters * encoding_num_files[encoding] / encoding_times[encoding]
        )
        print(f"{encoding}: {calls_per_sec}")
    calls_per_sec = num_iters * num_files / total_time
    print(f"\nTotal time: {total_time}s ({calls_per_sec} calls per second)")


def main():
    """Parse command-line options and run the benchmark.

    Exits with status 1 if ``--cchardet`` is requested but cchardet is not
    installed.
    """
    parser = argparse.ArgumentParser(
        description="Times how long it takes to process each file in test set "
        "multiple times.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-c",
        "--cchardet",
        action="store_true",
        help="Run benchmarks for cChardet instead of chardet, if it is installed.",
    )
    parser.add_argument(
        "-i",
        "--iterations",
        help="Number of times to process each file",
        type=int,
        default=10,
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Prints out the timing for each individual file.",
        action="store_true",
    )
    args = parser.parse_args()

    if args.cchardet and not HAVE_CCHARDET:
        print("You must pip install cchardet if you want to benchmark it.")
        sys.exit(1)

    benchmark(
        chardet_mod=cchardet if args.cchardet else chardet,
        verbose=args.verbose,
        num_iters=args.iterations,
    )


# Script entry point: run the benchmark CLI when executed directly.
if __name__ == "__main__":
    main()
38 changes: 21 additions & 17 deletions chardet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@
######################### END LICENSE BLOCK #########################


from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION

from .universaldetector import UniversalDetector
from .version import VERSION, __version__

# Public API of the chardet package.
__all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"]


def detect(byte_str):
Expand All @@ -33,8 +32,9 @@ def detect(byte_str):
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{}'.format(type(byte_str)))
raise TypeError(
"Expected object of type bytes or bytearray, got: " f"{type(byte_str)}"
)
else:
byte_str = bytearray(byte_str)
detector = UniversalDetector()
Expand All @@ -51,8 +51,9 @@ def detect_all(byte_str):
"""
if not isinstance(byte_str, bytearray):
if not isinstance(byte_str, bytes):
raise TypeError('Expected object of type bytes or bytearray, got: '
'{}'.format(type(byte_str)))
raise TypeError(
"Expected object of type bytes or bytearray, got: " f"{type(byte_str)}"
)
else:
byte_str = bytearray(byte_str)

Expand All @@ -68,16 +69,19 @@ def detect_all(byte_str):
lower_charset_name = prober.charset_name.lower()
# Use Windows encoding name instead of ISO-8859 if we saw any
# extra Windows-specific bytes
if lower_charset_name.startswith('iso-8859'):
if lower_charset_name.startswith("iso-8859"):
if detector._has_win_bytes:
charset_name = detector.ISO_WIN_MAP.get(lower_charset_name,
charset_name)
results.append({
'encoding': charset_name,
'confidence': prober.get_confidence(),
'language': prober.language,
})
charset_name = detector.ISO_WIN_MAP.get(
lower_charset_name, charset_name
)
results.append(
{
"encoding": charset_name,
"confidence": prober.get_confidence(),
"language": prober.language,
}
)
if len(results) > 0:
return sorted(results, key=lambda result: -result['confidence'])
return sorted(results, key=lambda result: -result["confidence"])

return [detector.result]
6 changes: 3 additions & 3 deletions chardet/big5freq.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@

BIG5_TYPICAL_DISTRIBUTION_RATIO = 0.75

# Char to FreqOrder table
BIG5_TABLE_SIZE = 5376

# fmt: off
BIG5_CHAR_TO_FREQ_ORDER = (
1,1801,1506, 255,1431, 198, 9, 82, 6,5008, 177, 202,3681,1256,2821, 110, # 16
3814, 33,3274, 261, 76, 44,2114, 16,2946,2187,1176, 659,3971, 26,3451,2653, # 32
Expand Down Expand Up @@ -383,4 +383,4 @@
890,3669,3943,5791,1878,3798,3439,5792,2186,2358,3440,1652,5793,5794,5795, 941, # 5360
2299, 208,3546,4161,2020, 330,4438,3944,2906,2499,3799,4439,4811,5796,5797,5798, # 5376
)

# fmt: on
6 changes: 3 additions & 3 deletions chardet/big5prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import Big5DistributionAnalysis
from .codingstatemachine import CodingStateMachine
from .mbcharsetprober import MultiByteCharSetProber
from .mbcssm import BIG5_SM_MODEL


class Big5Prober(MultiByteCharSetProber):
def __init__(self):
    """Configure the generic multi-byte prober machinery for Big5."""
    super().__init__()
    self.coding_sm = CodingStateMachine(BIG5_SM_MODEL)
    self.distribution_analyzer = Big5DistributionAnalysis()
    self.reset()
Expand Down

0 comments on commit 9a4e8b6

Please sign in to comment.