diff --git a/conans/test/unittests/client/util/files/decode_text_test.py b/conans/test/unittests/client/util/files/decode_text_test.py new file mode 100644 index 00000000000..7227d635b47 --- /dev/null +++ b/conans/test/unittests/client/util/files/decode_text_test.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +import unittest + +from parameterized import parameterized + +from conans.util.files import decode_text + + +class DecodeTextTest(unittest.TestCase): + + @parameterized.expand([(b'\x41',), + (b'\xef\xbb\xbf\x41',), + (b'\xfe\xff\x00\x41',), + (b'\xff\xfe\x41\x00',), + (b'\x00\x00\xfe\xff\x00\x00\x00\x41',), + (b'\xff\xfe\x00\x00\x41\x00\x00\x00',), + (b'\x2b\x2f\x76\x38\x41',), + (b'\x2b\x2f\x76\x39\x41',), + (b'\x2b\x2f\x76\x2B\x41',), + (b'\x2b\x2f\x76\x2F\x41',), + (b'\x2b\x2f\x76\x38\x2d\x41',)]) + def test_encodings(self, text): + self.assertEqual('A', decode_text(text)) diff --git a/conans/util/files.py b/conans/util/files.py index 518d577cf84..40adb40bcd1 100644 --- a/conans/util/files.py +++ b/conans/util/files.py @@ -56,6 +56,23 @@ def is_dirty(folder): def decode_text(text): + import codecs + encodings = {codecs.BOM_UTF8: "utf_8_sig", + codecs.BOM_UTF16_BE: "utf_16_be", + codecs.BOM_UTF16_LE: "utf_16_le", + codecs.BOM_UTF32_BE: "utf_32_be", + codecs.BOM_UTF32_LE: "utf_32_le", + b'\x2b\x2f\x76\x38': "utf_7", + b'\x2b\x2f\x76\x39': "utf_7", + b'\x2b\x2f\x76\x2b': "utf_7", + b'\x2b\x2f\x76\x2f': "utf_7", + b'\x2b\x2f\x76\x38\x2d': "utf_7"} + for bom in sorted(encodings, key=len, reverse=True): + if text.startswith(bom): + try: + return text[len(bom):].decode(encodings[bom]) + except UnicodeDecodeError: + continue decoders = ["utf-8", "Windows-1252"] for decoder in decoders: try: @@ -176,20 +193,6 @@ def load(path, binary=False): """ Loads a file content """ with open(path, 'rb') as handle: tmp = handle.read() - if not binary: - import codecs - encodings = {codecs.BOM_UTF8: "utf_8_sig", - codecs.BOM_UTF16_BE: "utf_16_be", - codecs.BOM_UTF16_LE: "utf_16_le", - codecs.BOM_UTF32_BE: "utf_32_be", - codecs.BOM_UTF32_LE: "utf_32_le", - b'\x2b\x2f\x76\x38': "utf_7", - b'\x2b\x2f\x76\x39': "utf_7", - b'\x2b\x2f\x76\x2b': "utf_7", - b'\x2b\x2f\x76\x2f': "utf_7"} - for bom in encodings: - if tmp.startswith(bom): - return tmp[len(bom):].decode(encodings[bom]) return tmp if binary else decode_text(tmp)