Skip to content

Commit

Permalink
Remove need for wrap_ord (#106)
Browse files Browse the repository at this point in the history
We now process bytearray objects all over the place, which greatly
simplifies compatibility between Python 2 and 3, as both treat
bytearray objects the same (as mutable lists of integers).
  • Loading branch information
dan-blanchard committed Apr 10, 2017
1 parent 2979943 commit cec00af
Show file tree
Hide file tree
Showing 11 changed files with 43 additions and 51 deletions.
24 changes: 16 additions & 8 deletions chardet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,25 @@
######################### END LICENSE BLOCK #########################


from .compat import PY2, PY3, bin_type as _bin_type
from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .version import __version__, VERSION


def detect(byte_str):
    """
    Detect the encoding of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: the ``result`` dict produced by ``UniversalDetector``
    :raises TypeError: if *byte_str* is neither ``bytes`` nor ``bytearray``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{0}'.format(type(byte_str)))
        else:
            # Convert to bytearray so the rest of the library can treat the
            # input uniformly as a mutable sequence of integers on both
            # Python 2 and Python 3.
            byte_str = bytearray(byte_str)
    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()
    return detector.result
19 changes: 9 additions & 10 deletions chardet/chardistribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord


class CharDistributionAnalysis(object):
Expand Down Expand Up @@ -123,9 +122,9 @@ def get_order(self, byte_str):
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
else:
return -1

Expand All @@ -142,9 +141,9 @@ def get_order(self, byte_str):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
else:
return -1

Expand All @@ -161,7 +160,7 @@ def get_order(self, byte_str):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
Expand All @@ -180,7 +179,7 @@ def get_order(self, byte_str):
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
Expand All @@ -202,7 +201,7 @@ def get_order(self, byte_str):
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
Expand All @@ -227,8 +226,8 @@ def get_order(self, byte_str):
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
char = wrap_ord(byte_str[0])
char = byte_str[0]
if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
else:
return -1
4 changes: 1 addition & 3 deletions chardet/cli/chardetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,12 @@

import argparse
import sys
from io import open

from chardet import __version__
from chardet.compat import PY2
from chardet.universaldetector import UniversalDetector




def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
Expand All @@ -38,6 +35,7 @@ def description_of(lines, name='stdin'):
"""
u = UniversalDetector()
for line in lines:
line = bytearray(line)
u.feed(line)
# shortcut out of the loop to save reading further - particularly useful if we read a BOM.
if u.done:
Expand Down
3 changes: 1 addition & 2 deletions chardet/codingstatemachine.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import logging

from .enums import MachineState
from .compat import wrap_ord


class CodingStateMachine(object):
Expand Down Expand Up @@ -67,7 +66,7 @@ def reset(self):
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
byte_class = self._model['class_table'][wrap_ord(c)]
byte_class = self._model['class_table'][c]
if self._curr_state == MachineState.start:
self._curr_byte_pos = 0
self._curr_char_len = self._model['char_len_table'][byte_class]
Expand Down
9 changes: 0 additions & 9 deletions chardet/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,8 @@
PY3 = False
base_str = (str, unicode)
text_type = unicode
bin_type = str
else:
PY2 = False
PY3 = True
base_str = (bytes, str)
text_type = str
bin_type = (bytes, bytearray)


def wrap_ord(a):
    """Return the integer value of *a*.

    On Python 2, single characters of byte/unicode strings need an
    explicit ``ord``; on Python 3 (or for non-string inputs) the value
    is already an integer and is passed through unchanged.
    """
    needs_ord = PY2 and isinstance(a, base_str)
    return ord(a) if needs_ord else a
3 changes: 1 addition & 2 deletions chardet/escprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL)
Expand Down Expand Up @@ -80,7 +79,7 @@ def feed(self, byte_str):
for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active:
continue
coding_state = coding_sm.next_state(wrap_ord(c))
coding_state = coding_sm.next_state(c)
if coding_state == MachineState.error:
coding_sm.active = False
self.active_sm_count -= 1
Expand Down
9 changes: 4 additions & 5 deletions chardet/hebrewprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

from .charsetprober import CharSetProber
from .enums import ProbingState
from .compat import wrap_ord

# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
Expand Down Expand Up @@ -177,8 +176,8 @@ def set_model_probers(self, logicalProber, visualProber):
self._visual_prober = visualProber

def is_final(self, c):
    """Return True if byte ``c`` is one of the five Hebrew final-letter
    codes tracked by this prober.

    :param c: a single byte value (an ``int``, as produced by indexing a
        ``bytearray``)
    """
    return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
                 self.FINAL_PE, self.FINAL_TSADI]

def is_non_final(self, c):
    """Return True if byte ``c`` is one of the non-final letter codes
    tracked by this prober.

    :param c: a single byte value (an ``int``, as produced by indexing a
        ``bytearray``)
    """
    # The normal Tsadi is deliberately NOT in this list (see the original
    # comments in the full file). Pe and Kaf are kept even though some
    # loanwords legally end with a Non-Final Pe or Kaf; the benefit of
    # these letters as Non-Final letters outweighs the damage since such
    # words are quite rare.
    return c in [self.NORMAL_KAF, self.NORMAL_MEM,
                 self.NORMAL_NUN, self.NORMAL_PE]

def feed(self, byte_str):
# Final letter analysis for logical-visual decision.
Expand Down
9 changes: 4 additions & 5 deletions chardet/jpcntx.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .compat import wrap_ord

# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = (
Expand Down Expand Up @@ -194,7 +193,7 @@ def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
char_len = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
Expand All @@ -204,7 +203,7 @@ def get_order(self, byte_str):

# return its order if it is hiragana
if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1])
second_char = byte_str[1]
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, char_len

Expand All @@ -215,7 +214,7 @@ def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
char_len = 2
elif first_char == 0x8F:
Expand All @@ -225,7 +224,7 @@ def get_order(self, byte_str):

# return its order if it is hiragana
if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1])
second_char = byte_str[1]
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, char_len

Expand Down
3 changes: 1 addition & 2 deletions chardet/latin1prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .compat import wrap_ord
from .enums import ProbingState

FREQ_CAT_NUM = 4
Expand Down Expand Up @@ -113,7 +112,7 @@ def charset_name(self):
def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[wrap_ord(c)]
char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
+ char_class]
if freq == 0:
Expand Down
9 changes: 4 additions & 5 deletions chardet/sbcharsetprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .compat import wrap_ord
from .enums import ProbingState


Expand Down Expand Up @@ -74,11 +73,11 @@ def charset_name(self):
def feed(self, byte_str):
if not self._model['keep_english_letter']:
byte_str = self.filter_international_words(byte_str)
num_bytes = len(byte_str)
if not num_bytes:
if not byte_str:
return self.state
for c in byte_str:
order = self._model['char_to_order_map'][wrap_ord(c)]
char_to_order_map = self._model['char_to_order_map']
for i, c in enumerate(byte_str):
order = char_to_order_map[c] - 1
if order < self.SYMBOL_CAT_ORDER:
self._total_char += 1
if order < self.SAMPLE_SIZE:
Expand Down
2 changes: 2 additions & 0 deletions chardet/universaldetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ def feed(self, byte_str):
if not len(byte_str):
return

if not isinstance(byte_str, bytearray):
byte_str = bytearray(byte_str)
# First check for known BOMs, since these are guaranteed to be correct
if not self._got_data:
# If the data starts with BOM, we know it is UTF
Expand Down

0 comments on commit cec00af

Please sign in to comment.