Skip to content

Commit

Permalink
- move BOM handling to the decode_text
Browse files Browse the repository at this point in the history
Signed-off-by: SSE4 <tomskside@gmail.com>
  • Loading branch information
SSE4 committed Jul 18, 2019
1 parent f00cc3b commit 7c2b1b2
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 15 deletions.
24 changes: 24 additions & 0 deletions conans/test/unittests/client/util/files/decode_text_test.py
@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

import unittest

from parameterized import parameterized

from conans.util.files import decode_text


class DecodeTextTest(unittest.TestCase):
    """Exercise decode_text with the letter "A" behind various byte prefixes."""

    @parameterized.expand([
        # Plain single byte, no prefix at all.
        (b'\x41',),
        # UTF-8 byte-order mark.
        (b'\xef\xbb\xbf\x41',),
        # UTF-16 byte-order marks: big-endian, then little-endian.
        (b'\xfe\xff\x00\x41',),
        (b'\xff\xfe\x41\x00',),
        # UTF-32 byte-order marks: big-endian, then little-endian.
        (b'\x00\x00\xfe\xff\x00\x00\x00\x41',),
        (b'\xff\xfe\x00\x00\x41\x00\x00\x00',),
        # The "+/v" family of prefixes that decode_text maps to utf_7,
        # including the five-byte form ending in "-".
        (b'\x2b\x2f\x76\x38\x41',),
        (b'\x2b\x2f\x76\x39\x41',),
        (b'\x2b\x2f\x76\x2B\x41',),
        (b'\x2b\x2f\x76\x2F\x41',),
        (b'\x2b\x2f\x76\x38\x2d\x41',),
    ])
    def test_encodings(self, text):
        """Every sample must decode to exactly the one-character string 'A'."""
        decoded = decode_text(text)
        self.assertEqual('A', decoded)
32 changes: 17 additions & 15 deletions conans/util/files.py
Expand Up @@ -56,6 +56,23 @@ def is_dirty(folder):


def decode_text(text):
import codecs
encodings = {codecs.BOM_UTF8: "utf_8_sig",
codecs.BOM_UTF16_BE: "utf_16_be",
codecs.BOM_UTF16_LE: "utf_16_le",
codecs.BOM_UTF32_BE: "utf_32_be",
codecs.BOM_UTF32_LE: "utf_32_le",
b'\x2b\x2f\x76\x38': "utf_7",
b'\x2b\x2f\x76\x39': "utf_7",
b'\x2b\x2f\x76\x2b': "utf_7",
b'\x2b\x2f\x76\x2f': "utf_7",
b'\x2b\x2f\x76\x38\x2d': "utf_7"}
for bom in sorted(encodings, key=len, reverse=True):
if text.startswith(bom):
try:
return text[len(bom):].decode(encodings[bom])
except UnicodeDecodeError:
continue
decoders = ["utf-8", "Windows-1252"]
for decoder in decoders:
try:
Expand Down Expand Up @@ -176,21 +193,6 @@ def load(path, binary=False):
""" Loads a file content """
with open(path, 'rb') as handle:
tmp = handle.read()
if not binary:
import codecs
encodings = {codecs.BOM_UTF8: "utf_8_sig",
codecs.BOM_UTF16_BE: "utf_16_be",
codecs.BOM_UTF16_LE: "utf_16_le",
codecs.BOM_UTF32_BE: "utf_32_be",
codecs.BOM_UTF32_LE: "utf_32_le",
b'\x2b\x2f\x76\x38': "utf_7",
b'\x2b\x2f\x76\x39': "utf_7",
b'\x2b\x2f\x76\x2b': "utf_7",
b'\x2b\x2f\x76\x2f': "utf_7"}
for bom in encodings:
if tmp.startswith(bom):
return tmp[len(bom):].decode(encodings[bom])

return tmp if binary else decode_text(tmp)


Expand Down

0 comments on commit 7c2b1b2

Please sign in to comment.