Merge pull request #208 from chardet/feature/xml_tag_cleanup

Always remove XML tags when detecting single-byte charset encodings that use ASCII letters
chardet · Dec 12, 2020 · dc8fe6c · dc8fe6c
2 parents 0f11e55 + 7cf5243
commit dc8fe6c
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 11 deletions.
diff --git a/chardet/charsetprober.py b/chardet/charsetprober.py
@@ -100,12 +100,10 @@ def filter_international_words(buf):
         return filtered
 
     @staticmethod
-    def filter_with_english_letters(buf):
+    def remove_xml_tags(buf):
         """
         Returns a copy of ``buf`` that retains only the sequences of English
         alphabet and high byte characters that are not between <> characters.
-        Also retains English alphabet and high byte characters immediately
-        before occurrences of >.
 
         This filter can be applied to all scripts which contain both English
         characters and extended ASCII characters, but is currently only used by
@@ -118,22 +116,18 @@ def filter_with_english_letters(buf):
         for curr in range(len(buf)):
             # Slice here to get bytes instead of an int with Python 3
             buf_char = buf[curr : curr + 1]
-            # Check if we're coming out of or entering an HTML tag
+            # Check if we're coming out of or entering an XML tag
             if buf_char == b">":
+                prev = curr + 1
                 in_tag = False
             elif buf_char == b"<":
-                in_tag = True
-
-            # If current character is not extended-ASCII and not alphabetic...
-            if buf_char < b"\x80" and not buf_char.isalpha():
-                # ...and we're not in a tag
                 if curr > prev and not in_tag:
                     # Keep everything after last non-extended-ASCII,
                     # non-alphabetic character
                     filtered.extend(buf[prev:curr])
                     # Output a space to delimit stretch we kept
                     filtered.extend(b" ")
-                prev = curr + 1
+                in_tag = True
 
         # If we're not in a tag...
         if not in_tag:

diff --git a/chardet/latin1prober.py b/chardet/latin1prober.py
@@ -116,7 +116,7 @@ def language(self):
         return ""
 
     def feed(self, byte_str):
-        byte_str = self.filter_with_english_letters(byte_str)
+        byte_str = self.remove_xml_tags(byte_str)
         for c in byte_str:
             char_class = Latin1_CharToClass[c]
             freq = Latin1ClassModel[(self._last_char_class * CLASS_NUM) + char_class]

diff --git a/chardet/sbcharsetprober.py b/chardet/sbcharsetprober.py
@@ -93,6 +93,8 @@ def feed(self, byte_str):
         # TODO: Make filter_international_words keep things in self.alphabet
         if not self._model.keep_ascii_letters:
             byte_str = self.filter_international_words(byte_str)
+        else:
+            byte_str = self.remove_xml_tags(byte_str)
         if not byte_str:
             return self.state
         char_to_order_map = self._model.char_to_order_map