Skip to content

Commit

Permalink
Remove need for wrap_ord (#106)
Browse files Browse the repository at this point in the history
We now process bytearray objects all over the place, which greatly
simplifies compatibility between Python 2 and 3, as both treat
bytearray objects the same (as mutable lists of integers).
  • Loading branch information
dan-blanchard committed Apr 10, 2017
1 parent 2979943 commit cec00af
Show file tree
Hide file tree
Showing 11 changed files with 43 additions and 51 deletions.
24 changes: 16 additions & 8 deletions chardet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,25 @@
######################### END LICENSE BLOCK #########################


from .compat import PY2, PY3, bin_type as _bin_type
from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .version import __version__, VERSION


def detect(byte_str):
    """
    Detect the encoding of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: the ``result`` dict produced by ``UniversalDetector``
    :raises TypeError: if *byte_str* is neither ``bytes`` nor ``bytearray``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{0}'.format(type(byte_str)))
        else:
            # Convert to bytearray so the rest of the library can treat the
            # input uniformly as a mutable sequence of integers on both
            # Python 2 and Python 3.
            byte_str = bytearray(byte_str)
    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()
    return detector.result
19 changes: 9 additions & 10 deletions chardet/chardistribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
BIG5_TYPICAL_DISTRIBUTION_RATIO)
from .jisfreq import (JIS_CHAR_TO_FREQ_ORDER, JIS_TABLE_SIZE,
JIS_TYPICAL_DISTRIBUTION_RATIO)
from .compat import wrap_ord


class CharDistributionAnalysis(object):
Expand Down Expand Up @@ -123,9 +122,9 @@ def get_order(self, byte_str):
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if first_char >= 0xC4:
return 94 * (first_char - 0xC4) + wrap_ord(byte_str[1]) - 0xA1
return 94 * (first_char - 0xC4) + byte_str[1] - 0xA1
else:
return -1

Expand All @@ -142,9 +141,9 @@ def get_order(self, byte_str):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if first_char >= 0xB0:
return 94 * (first_char - 0xB0) + wrap_ord(byte_str[1]) - 0xA1
return 94 * (first_char - 0xB0) + byte_str[1] - 0xA1
else:
return -1

Expand All @@ -161,7 +160,7 @@ def get_order(self, byte_str):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0xB0) and (second_char >= 0xA1):
return 94 * (first_char - 0xB0) + second_char - 0xA1
else:
Expand All @@ -180,7 +179,7 @@ def get_order(self, byte_str):
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if first_char >= 0xA4:
if second_char >= 0xA1:
return 157 * (first_char - 0xA4) + second_char - 0xA1 + 63
Expand All @@ -202,7 +201,7 @@ def get_order(self, byte_str):
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that
first_char, second_char = wrap_ord(byte_str[0]), wrap_ord(byte_str[1])
first_char, second_char = byte_str[0], byte_str[1]
if (first_char >= 0x81) and (first_char <= 0x9F):
order = 188 * (first_char - 0x81)
elif (first_char >= 0xE0) and (first_char <= 0xEF):
Expand All @@ -227,8 +226,8 @@ def get_order(self, byte_str):
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
char = wrap_ord(byte_str[0])
char = byte_str[0]
if char >= 0xA0:
return 94 * (char - 0xA1) + wrap_ord(byte_str[1]) - 0xa1
return 94 * (char - 0xA1) + byte_str[1] - 0xa1
else:
return -1
4 changes: 1 addition & 3 deletions chardet/cli/chardetect.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,12 @@

import argparse
import sys
from io import open

from chardet import __version__
from chardet.compat import PY2
from chardet.universaldetector import UniversalDetector




def description_of(lines, name='stdin'):
"""
Return a string describing the probable encoding of a file or
Expand All @@ -38,6 +35,7 @@ def description_of(lines, name='stdin'):
"""
u = UniversalDetector()
for line in lines:
line = bytearray(line)
u.feed(line)
# shortcut out of the loop to save reading further - particularly useful if we read a BOM.
if u.done:
Expand Down
3 changes: 1 addition & 2 deletions chardet/codingstatemachine.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import logging

from .enums import MachineState
from .compat import wrap_ord


class CodingStateMachine(object):
Expand Down Expand Up @@ -67,7 +66,7 @@ def reset(self):
def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
byte_class = self._model['class_table'][wrap_ord(c)]
byte_class = self._model['class_table'][c]
if self._curr_state == MachineState.start:
self._curr_byte_pos = 0
self._curr_char_len = self._model['char_len_table'][byte_class]
Expand Down
9 changes: 0 additions & 9 deletions chardet/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,8 @@
PY3 = False
base_str = (str, unicode)
text_type = unicode
bin_type = str
else:
PY2 = False
PY3 = True
base_str = (bytes, str)
text_type = str
bin_type = (bytes, bytearray)


def wrap_ord(a):
    """Return the integer value of *a*.

    On Python 2, single characters of byte/unicode strings need an
    explicit ``ord``; on Python 3 (or for non-string inputs) the value
    is already an integer and is passed through unchanged.
    """
    needs_ord = PY2 and isinstance(a, base_str)
    return ord(a) if needs_ord else a
3 changes: 1 addition & 2 deletions chardet/escprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .compat import wrap_ord
from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
ISO2022KR_SM_MODEL)
Expand Down Expand Up @@ -80,7 +79,7 @@ def feed(self, byte_str):
for coding_sm in self.coding_sm:
if not coding_sm or not coding_sm.active:
continue
coding_state = coding_sm.next_state(wrap_ord(c))
coding_state = coding_sm.next_state(c)
if coding_state == MachineState.error:
coding_sm.active = False
self.active_sm_count -= 1
Expand Down
9 changes: 4 additions & 5 deletions chardet/hebrewprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@

from .charsetprober import CharSetProber
from .enums import ProbingState
from .compat import wrap_ord

# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
Expand Down Expand Up @@ -177,8 +176,8 @@ def set_model_probers(self, logicalProber, visualProber):
self._visual_prober = visualProber

def is_final(self, c):
    """Return True if byte ``c`` is one of the five Hebrew final-letter
    codes tracked by this prober.

    :param c: a single byte value (an ``int``, as produced by indexing a
        ``bytearray``)
    """
    return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
                 self.FINAL_PE, self.FINAL_TSADI]

def is_non_final(self, c):
    """Return True if byte ``c`` is one of the non-final letter codes
    tracked by this prober.

    :param c: a single byte value (an ``int``, as produced by indexing a
        ``bytearray``)
    """
    # The normal Tsadi is deliberately NOT in this list (see the original
    # comments in the full file). Pe and Kaf are kept even though some
    # loanwords legally end with a Non-Final Pe or Kaf; the benefit of
    # these letters as Non-Final letters outweighs the damage since such
    # words are quite rare.
    return c in [self.NORMAL_KAF, self.NORMAL_MEM,
                 self.NORMAL_NUN, self.NORMAL_PE]

def feed(self, byte_str):
# Final letter analysis for logical-visual decision.
Expand Down
9 changes: 4 additions & 5 deletions chardet/jpcntx.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from .compat import wrap_ord

# This is hiragana 2-char sequence table, the number in each cell represents its frequency category
jp2CharContext = (
Expand Down Expand Up @@ -194,7 +193,7 @@ def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
char_len = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
Expand All @@ -204,7 +203,7 @@ def get_order(self, byte_str):

# return its order if it is hiragana
if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1])
second_char = byte_str[1]
if (first_char == 202) and (0x9F <= second_char <= 0xF1):
return second_char - 0x9F, char_len

Expand All @@ -215,7 +214,7 @@ def get_order(self, byte_str):
if not byte_str:
return -1, 1
# find out current char's byte length
first_char = wrap_ord(byte_str[0])
first_char = byte_str[0]
if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
char_len = 2
elif first_char == 0x8F:
Expand All @@ -225,7 +224,7 @@ def get_order(self, byte_str):

# return its order if it is hiragana
if len(byte_str) > 1:
second_char = wrap_ord(byte_str[1])
second_char = byte_str[1]
if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
return second_char - 0xA1, char_len

Expand Down
3 changes: 1 addition & 2 deletions chardet/latin1prober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .compat import wrap_ord
from .enums import ProbingState

FREQ_CAT_NUM = 4
Expand Down Expand Up @@ -113,7 +112,7 @@ def charset_name(self):
def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[wrap_ord(c)]
char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM)
+ char_class]
if freq == 0:
Expand Down
9 changes: 4 additions & 5 deletions chardet/sbcharsetprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .compat import wrap_ord
from .enums import ProbingState


Expand Down Expand Up @@ -74,11 +73,11 @@ def charset_name(self):
def feed(self, byte_str):
if not self._model['keep_english_letter']:
byte_str = self.filter_international_words(byte_str)
num_bytes = len(byte_str)
if not num_bytes:
if not byte_str:
return self.state
for c in byte_str:
order = self._model['char_to_order_map'][wrap_ord(c)]
char_to_order_map = self._model['char_to_order_map']
for i, c in enumerate(byte_str):
order = char_to_order_map[c] - 1
if order < self.SYMBOL_CAT_ORDER:
self._total_char += 1
if order < self.SAMPLE_SIZE:
Expand Down
2 changes: 2 additions & 0 deletions chardet/universaldetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ def feed(self, byte_str):
if not len(byte_str):
return

if not isinstance(byte_str, bytearray):
byte_str = bytearray(byte_str)
# First check for known BOMs, since these are guaranteed to be correct
if not self._got_data:
# If the data starts with BOM, we know it is UTF
Expand Down

0 comments on commit cec00af

Please sign in to comment.