Skip to content

Commit

Permalink
Tweak SingleByteCharSetProber confidence (#209)
Browse files Browse the repository at this point in the history
Sequences in the LIKELY category now count toward confidence at 25% weight, and
too many control characters decrease confidence, just like in uchardet.
  • Loading branch information
dan-blanchard committed Dec 12, 2020
1 parent dc8fe6c commit 8ce9fcf
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions chardet/sbcharsetprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def __init__(self, model, reversed=False, name_prober=None):
self._seq_counters = None
self._total_seqs = None
self._total_char = None
self._control_char = None
self._freq_char = None
self.reset()

Expand All @@ -72,6 +73,7 @@ def reset(self):
self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
self._total_seqs = 0
self._total_char = 0
self._control_char = 0
# characters that fall in our sampling range
self._freq_char = 0

Expand Down Expand Up @@ -108,9 +110,6 @@ def feed(self, byte_str):
# _total_char purposes.
if order < CharacterCategory.CONTROL:
self._total_char += 1
# TODO: Follow uchardet's lead and discount confidence for frequent
# control characters.
# See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
if order < self.SAMPLE_SIZE:
self._freq_char += 1
if self._last_order < self.SAMPLE_SIZE:
Expand Down Expand Up @@ -146,10 +145,17 @@ def get_confidence(self):
r = 0.01
if self._total_seqs > 0:
r = (
(1.0 * self._seq_counters[SequenceLikelihood.POSITIVE])
(
self._seq_counters[SequenceLikelihood.POSITIVE]
+ 0.25 * self._seq_counters[SequenceLikelihood.LIKELY]
)
/ self._total_seqs
/ self._model.typical_positive_ratio
)
# The more control characters (proportionally to the size
# of the text), the less confident we become in the current
# charset.
r = r * (self._total_char - self._control_char) / self._total_char
r = r * self._freq_char / self._total_char
if r >= 1.0:
r = 0.99
Expand Down

0 comments on commit 8ce9fcf

Please sign in to comment.