Skip to content

Commit

Permalink
Merge pull request #208 from chardet/feature/xml_tag_cleanup
Browse files (browse the repository at this point in the history)
Always remove XML tags when detecting single-byte charset encodings that use ASCII letters
  • Loading branch information
dan-blanchard committed Dec 12, 2020
2 parents 0f11e55 + 7cf5243 commit dc8fe6c
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 11 deletions.
14 changes: 4 additions & 10 deletions chardet/charsetprober.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -100,12 +100,10 @@ def filter_international_words(buf):
return filtered

@staticmethod
def filter_with_english_letters(buf):
def remove_xml_tags(buf):
"""
Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters.
Also retains English alphabet and high byte characters immediately
before occurrences of >.
This filter can be applied to all scripts which contain both English
characters and extended ASCII characters, but is currently only used by
Expand All @@ -118,22 +116,18 @@ def filter_with_english_letters(buf):
for curr in range(len(buf)):
# Slice here to get bytes instead of an int with Python 3
buf_char = buf[curr : curr + 1]
# Check if we're coming out of or entering an HTML tag
# Check if we're coming out of or entering an XML tag
if buf_char == b">":
prev = curr + 1
in_tag = False
elif buf_char == b"<":
in_tag = True

# If current character is not extended-ASCII and not alphabetic...
if buf_char < b"\x80" and not buf_char.isalpha():
# ...and we're not in a tag
if curr > prev and not in_tag:
# Keep everything after last non-extended-ASCII,
# non-alphabetic character
filtered.extend(buf[prev:curr])
# Output a space to delimit stretch we kept
filtered.extend(b" ")
prev = curr + 1
in_tag = True

# If we're not in a tag...
if not in_tag:
Expand Down
2 changes: 1 addition & 1 deletion chardet/latin1prober.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -116,7 +116,7 @@ def language(self):
return ""

def feed(self, byte_str):
byte_str = self.filter_with_english_letters(byte_str)
byte_str = self.remove_xml_tags(byte_str)
for c in byte_str:
char_class = Latin1_CharToClass[c]
freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]
Expand Down
2 changes: 2 additions & 0 deletions chardet/sbcharsetprober.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -93,6 +93,8 @@ def feed(self, byte_str):
# TODO: Make filter_international_words keep things in self.alphabet
if not self._model.keep_ascii_letters:
byte_str = self.filter_international_words(byte_str)
else:
byte_str = self.remove_xml_tags(byte_str)
if not byte_str:
return self.state
char_to_order_map = self._model.char_to_order_map
Expand Down

0 comments on commit dc8fe6c

Please sign in to comment.