Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

treat spaces as single-char bytes instead of strings for python 2/3 #92

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 8 additions & 5 deletions chardet/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@
bin_type = (bytes, bytearray)


def wrap_ord(a):
    """Return ``ord(a)`` for Python 2 string input, else *a* unchanged.

    ``PY2`` and ``base_str`` are module-level names defined earlier in
    this file (outside the portion shown here).
    """
    needs_ord = PY2 and isinstance(a, base_str)
    return ord(a) if needs_ord else a
# ``PY2`` and ``base_str`` are module-level names defined earlier in this
# file (outside the portion shown here).
if PY2:
    def wrap_ord(a):
        """Return ``ord(a)`` when *a* is a ``base_str``, else *a* unchanged.

        Indexing or iterating a Python 2 byte string yields 1-character
        strings, which callers need as integer byte values.
        """
        if isinstance(a, base_str):
            return ord(a)
        return a
else:
    def wrap_ord(a):
        """Return *a* unchanged.

        A plain identity function is used rather than ``int``: ``int``
        raises ``ValueError`` for inputs such as ``' '`` and parses digit
        strings (``int('5') == 5``), so it does not leave every input
        untouched the way the non-PY2 path is meant to.
        """
        return a
14 changes: 8 additions & 6 deletions chardet/hebrewprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@
# model probers scores. The answer is returned in the form of the name of the
# charset identified, either "windows-1255" or "ISO-8859-8".

# One element of a byte string holding a space: the 1-character str ' ' on
# Python 2, the int 32 on Python 3 -- the same kind of value that iterating
# a str (PY2) / bytes (PY3) object yields, so the two compare equal.
# NOTE(review): a bytearray iterates as ints even on Python 2 and would not
# compare equal to ' ' there -- confirm what callers pass in.
SPACE_BYTE = b' '[0]

class HebrewProber(CharSetProber):
# windows-1255 / ISO-8859-8 code points of interest
FINAL_KAF = 0xea
Expand Down Expand Up @@ -168,8 +170,8 @@ def reset(self):
# The last two characters seen in the previous buffer, self._prev and
# self._before_prev, are initialized to space in order to simulate
# a word delimiter at the beginning of the data
self._prev = ' '
self._before_prev = ' '
self._prev = SPACE_BYTE
self._before_prev = SPACE_BYTE
# These probers are owned by the group prober.

def set_model_probers(self, logicalProber, visualProber):
Expand Down Expand Up @@ -228,9 +230,9 @@ def feed(self, byte_str):
byte_str = self.filter_high_byte_only(byte_str)

for cur in byte_str:
if cur == ' ':
if cur == SPACE_BYTE:
# We stand on a space - a word just ended
if self._before_prev != ' ':
if self._before_prev != SPACE_BYTE:
# next-to-last char was not a space so self._prev is not a
# 1 letter word
if self.is_final(self._prev):
Expand All @@ -242,8 +244,8 @@ def feed(self, byte_str):
self._final_char_visual_score += 1
else:
# Not standing on a space
if ((self._before_prev == ' ') and
(self.is_final(self._prev)) and (cur != ' ')):
if ((self._before_prev == SPACE_BYTE) and
(self.is_final(self._prev)) and (cur != SPACE_BYTE)):
# case (3) [-2:space][-1:final letter][cur:not space]
self._final_char_visual_score += 1
self._before_prev = self._prev
Expand Down