Skip to content

Commit

Permalink
slight increase in performance
Browse files Browse the repository at this point in the history
  • Loading branch information
deedy5 committed May 11, 2022
1 parent f9ef56c commit 8fd120b
Showing 1 changed file with 7 additions and 9 deletions.
16 changes: 7 additions & 9 deletions chardet/charsetprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@

from .enums import ProbingState

# Matches one "international word": optional ASCII letters, then one or more
# high bytes (\x80-\xFF), then optional ASCII letters, followed by at most one
# trailing byte that is neither an ASCII letter nor a high byte (a marker).
# Compiled once at module level and used by ``filter_international_words``.
INTERNATIONAL_WORDS_PATTERN = re.compile(b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?")


class CharSetProber:

Expand Down Expand Up @@ -70,20 +72,18 @@ def filter_international_words(buf):
alphabet: English letters [a-zA-Z]
international: international characters [\x80-\xFF]
marker: everything else [^a-zA-Z\x80-\xFF]
The input buffer can be thought of as a series of words delimited
by markers. This function keeps only the words that contain at
least one international character. All contiguous sequences of markers
are replaced by a single ASCII space character.
This filter applies to all scripts which do not use English characters.
"""
filtered = bytearray()

# This regular expression matches only the words that have at least one
# international character. The word may include one marker character at
# the end.
words = re.findall(b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?", buf)
words = INTERNATIONAL_WORDS_PATTERN.findall(buf)

for word in words:
filtered.extend(word[:-1])
Expand All @@ -104,18 +104,16 @@ def remove_xml_tags(buf):
"""
Returns a copy of ``buf`` that retains only the sequences of English
alphabet and high byte characters that are not between <> characters.
This filter can be applied to all scripts which contain both English
characters and extended ASCII characters, but is currently only used by
``Latin1Prober``.
"""
filtered = bytearray()
in_tag = False
prev = 0

for curr in range(len(buf)):
# Slice here to get bytes instead of an int with Python 3
buf_char = buf[curr : curr + 1]
buf = memoryview(buf).cast('c')

for curr, buf_char in enumerate(buf):
# Check if we're coming out of or entering an XML tag
if buf_char == b">":
prev = curr + 1
Expand All @@ -134,5 +132,5 @@ def remove_xml_tags(buf):
# Keep everything after last non-extended-ASCII, non-alphabetic
# character
filtered.extend(buf[prev:])

return filtered

0 comments on commit 8fd120b

Please sign in to comment.