Skip to content

Commit

Permalink
Add API option to get all the encodings confidence #96 (#111)
Browse files Browse the repository at this point in the history
* Add API option to get all the encodings confidence #96

* make code more straightforward

by treating the self.done = True as a real finish point of the analysis

* use detect_all instead of detect(.., all=True)

* fix corner case of when there is no good prober
  • Loading branch information
mdamien authored and dan-blanchard committed Oct 3, 2017
1 parent ec3bce7 commit c68f120
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 1 deletion.
45 changes: 44 additions & 1 deletion chardet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@
######################### END LICENSE BLOCK #########################


from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION


__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']


def detect(byte_str):
"""
Detect the encoding of the given byte string.
Expand All @@ -37,3 +40,43 @@ def detect(byte_str):
detector = UniversalDetector()
detector.feed(byte_str)
return detector.close()


def detect_all(byte_str):
    """
    Detect all the possible encodings of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: a list of dicts, each with ``'encoding'`` and ``'confidence'``
        keys, sorted by descending confidence.  Falls back to
        ``[detector.result]`` when no individual prober beat the minimum
        threshold (or the input never entered the high-byte state).
    :raises TypeError: if ``byte_str`` is neither ``bytes`` nor ``bytearray``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{0}'.format(type(byte_str)))
        else:
            byte_str = bytearray(byte_str)

    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()

    if detector._input_state == InputState.HIGH_BYTE:
        results = []
        for prober in detector._charset_probers:
            # Call get_confidence() once per prober; it is used both for the
            # threshold test and for the reported value.
            confidence = prober.get_confidence()
            if confidence > detector.MINIMUM_THRESHOLD:
                charset_name = prober.charset_name
                # A prober that never reached a decision reports None for its
                # charset name; skip it rather than crash on .lower() below.
                if charset_name is None:
                    continue
                lower_charset_name = charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if detector._has_win_bytes:
                        charset_name = detector.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name)
                results.append({
                    'encoding': charset_name,
                    'confidence': confidence,
                })
        if results:
            return sorted(results, key=lambda result: -result['confidence'])

    return [detector.result]
18 changes: 18 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,21 @@ def string_poisons_following_text(suffix):
result = chardet.detect(extended)
if result and result['encoding'] is not None:
raise JustALengthIssue()


@given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                             'utf-32', 'iso-8859-7',
                                             'iso-8859-8', 'windows-1255']),
       st.randoms())
@settings(max_examples=200)
def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
    """detect() and detect_all() must agree on the top encoding.

    ``rnd`` is unused by the body; it only varies hypothesis examples.
    """
    try:
        data = txt.encode(enc)
    except UnicodeEncodeError:
        # Text not representable in this codec -- reject the example.
        assume(False)
    result = chardet.detect(data)
    results = chardet.detect_all(data)
    # A plain assert with a message keeps the real traceback and values.
    # The old try/except re-raise masked the AssertionError and could hit
    # a NameError in the handler if detect() raised before `result` was
    # bound.
    assert result['encoding'] == results[0]['encoding'], \
        '%s != %s' % (result, results)

0 comments on commit c68f120

Please sign in to comment.