Skip to content

Commit

Permalink
Add API option to get all the encodings confidence #96 (#111)
Browse files Browse the repository at this point in the history
* Add API option to get all the encodings confidence #96

* make code more straightforward

by treating the self.done = True as a real finish point of the analysis

* use detect_all instead of detect(.., all=True)

* fix corner case of when there is no good prober
  • Loading branch information
mdamien authored and dan-blanchard committed Oct 3, 2017
1 parent ec3bce7 commit c68f120
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 1 deletion.
45 changes: 44 additions & 1 deletion chardet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@
######################### END LICENSE BLOCK #########################


from .compat import PY2, PY3
from .universaldetector import UniversalDetector
from .enums import InputState
from .version import __version__, VERSION


__all__ = ['UniversalDetector', 'detect', 'detect_all', '__version__', 'VERSION']


def detect(byte_str):
"""
Detect the encoding of the given byte string.
Expand All @@ -37,3 +40,43 @@ def detect(byte_str):
detector = UniversalDetector()
detector.feed(byte_str)
return detector.close()


def detect_all(byte_str):
    """
    Detect all the possible encodings of the given byte string.

    :param byte_str: The byte sequence to examine.
    :type byte_str: ``bytes`` or ``bytearray``
    :returns: a list of dicts, each with ``'encoding'`` and ``'confidence'``
        keys, sorted by descending confidence.  Falls back to
        ``[detector.result]`` when no individual prober beat the minimum
        threshold (or the input never entered the high-byte state).
    :raises TypeError: if ``byte_str`` is neither ``bytes`` nor ``bytearray``
    """
    if not isinstance(byte_str, bytearray):
        if not isinstance(byte_str, bytes):
            raise TypeError('Expected object of type bytes or bytearray, got: '
                            '{0}'.format(type(byte_str)))
        else:
            byte_str = bytearray(byte_str)

    detector = UniversalDetector()
    detector.feed(byte_str)
    detector.close()

    if detector._input_state == InputState.HIGH_BYTE:
        results = []
        for prober in detector._charset_probers:
            # Call get_confidence() once per prober; it is used both for the
            # threshold test and for the reported value.
            confidence = prober.get_confidence()
            if confidence > detector.MINIMUM_THRESHOLD:
                charset_name = prober.charset_name
                # A prober that never reached a decision reports None for its
                # charset name; skip it rather than crash on .lower() below.
                if charset_name is None:
                    continue
                lower_charset_name = charset_name.lower()
                # Use Windows encoding name instead of ISO-8859 if we saw any
                # extra Windows-specific bytes
                if lower_charset_name.startswith('iso-8859'):
                    if detector._has_win_bytes:
                        charset_name = detector.ISO_WIN_MAP.get(
                            lower_charset_name, charset_name)
                results.append({
                    'encoding': charset_name,
                    'confidence': confidence,
                })
        if results:
            return sorted(results, key=lambda result: -result['confidence'])

    return [detector.result]
18 changes: 18 additions & 0 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,21 @@ def string_poisons_following_text(suffix):
result = chardet.detect(extended)
if result and result['encoding'] is not None:
raise JustALengthIssue()


@given(st.text(min_size=1), st.sampled_from(['ascii', 'utf-8', 'utf-16',
                                             'utf-32', 'iso-8859-7',
                                             'iso-8859-8', 'windows-1255']),
       st.randoms())
@settings(max_examples=200)
def test_detect_all_and_detect_one_should_agree(txt, enc, rnd):
    """detect() and detect_all() must agree on the top encoding.

    ``rnd`` is unused by the body; it only varies hypothesis examples.
    """
    try:
        data = txt.encode(enc)
    except UnicodeEncodeError:
        # Text not representable in this codec -- reject the example.
        assume(False)
    result = chardet.detect(data)
    results = chardet.detect_all(data)
    # A plain assert with a message keeps the real traceback and values.
    # The old try/except re-raise masked the AssertionError and could hit
    # a NameError in the handler if detect() raised before `result` was
    # bound.
    assert result['encoding'] == results[0]['encoding'], \
        '%s != %s' % (result, results)

0 comments on commit c68f120

Please sign in to comment.