- move BOM handling into decode_text

Signed-off-by: SSE4 <tomskside@gmail.com>
conan-io · Jul 18, 2019 · 7c2b1b2 · 7c2b1b2
1 parent f00cc3b
commit 7c2b1b2
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 15 deletions.
diff --git a/conans/test/unittests/client/util/files/decode_text_test.py b/conans/test/unittests/client/util/files/decode_text_test.py
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+
+import unittest
+
+from parameterized import parameterized
+
+from conans.util.files import decode_text
+
+
+class DecodeTextTest(unittest.TestCase):
+
+    @parameterized.expand([(b'\x41',),
+                           (b'\xef\xbb\xbf\x41',),
+                           (b'\xfe\xff\x00\x41',),
+                           (b'\xff\xfe\x41\x00',),
+                           (b'\x00\x00\xfe\xff\x00\x00\x00\x41',),
+                           (b'\xff\xfe\x00\x00\x41\x00\x00\x00',),
+                           (b'\x2b\x2f\x76\x38\x41',),
+                           (b'\x2b\x2f\x76\x39\x41',),
+                           (b'\x2b\x2f\x76\x2B\x41',),
+                           (b'\x2b\x2f\x76\x2F\x41',),
+                           (b'\x2b\x2f\x76\x38\x2d\x41',)])
+    def test_encodings(self, text):
+        self.assertEqual('A', decode_text(text))
diff --git a/conans/util/files.py b/conans/util/files.py
@@ -56,6 +56,23 @@ def is_dirty(folder):
 
 
 def decode_text(text):
+    import codecs
+    encodings = {codecs.BOM_UTF8: "utf_8_sig",
+                 codecs.BOM_UTF16_BE: "utf_16_be",
+                 codecs.BOM_UTF16_LE: "utf_16_le",
+                 codecs.BOM_UTF32_BE: "utf_32_be",
+                 codecs.BOM_UTF32_LE: "utf_32_le",
+                 b'\x2b\x2f\x76\x38': "utf_7",
+                 b'\x2b\x2f\x76\x39': "utf_7",
+                 b'\x2b\x2f\x76\x2b': "utf_7",
+                 b'\x2b\x2f\x76\x2f': "utf_7",
+                 b'\x2b\x2f\x76\x38\x2d': "utf_7"}
+    for bom in sorted(encodings, key=len, reverse=True):
+        if text.startswith(bom):
+            try:
+                return text[len(bom):].decode(encodings[bom])
+            except UnicodeDecodeError:
+                continue
     decoders = ["utf-8", "Windows-1252"]
     for decoder in decoders:
         try:
@@ -176,21 +193,6 @@ def load(path, binary=False):
     """ Loads a file content """
     with open(path, 'rb') as handle:
         tmp = handle.read()
-        if not binary:
-            import codecs
-            encodings = {codecs.BOM_UTF8: "utf_8_sig",
-                         codecs.BOM_UTF16_BE: "utf_16_be",
-                         codecs.BOM_UTF16_LE: "utf_16_le",
-                         codecs.BOM_UTF32_BE: "utf_32_be",
-                         codecs.BOM_UTF32_LE: "utf_32_le",
-                         b'\x2b\x2f\x76\x38': "utf_7",
-                         b'\x2b\x2f\x76\x39': "utf_7",
-                         b'\x2b\x2f\x76\x2b': "utf_7",
-                         b'\x2b\x2f\x76\x2f': "utf_7"}
-            for bom in encodings:
-                if tmp.startswith(bom):
-                    return tmp[len(bom):].decode(encodings[bom])
-
         return tmp if binary else decode_text(tmp)