Skip to content

Commit

Permalink
fix(utils): Fix UnicodeDecodeError on Python 2 (#2657)
Browse files Browse the repository at this point in the history
  • Loading branch information
sentrivana committed Jan 29, 2024
1 parent 704d259 commit e373e35
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 29 deletions.
70 changes: 60 additions & 10 deletions sentry_sdk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,13 @@ def __init__(self, value, metadata):
self.value = value
self.metadata = metadata

def __eq__(self, other):
    # type: (Any) -> bool
    """
    Compare two annotated values.

    Anything that is not an AnnotatedValue compares unequal; otherwise both
    the wrapped value and the metadata have to match.
    """
    return (
        isinstance(other, AnnotatedValue)
        and self.value == other.value
        and self.metadata == other.metadata
    )

@classmethod
def removed_because_raw_data(cls):
# type: () -> AnnotatedValue
Expand Down Expand Up @@ -1119,6 +1126,39 @@ def _is_in_project_root(abs_path, project_root):
return False


def _truncate_by_bytes(string, max_bytes):
    # type: (str, int) -> str
    """
    Truncate a UTF-8-encodable string to the last full codepoint so that it
    fits in max_bytes, and append a "..." marker.

    Three bytes of the budget are reserved for the marker. Text input is cut
    on the encoded form and any partial trailing codepoint is dropped. Bytes
    input (supported only for Python 2 compat) is cut at the raw byte offset
    and gets a bytes marker appended, so the result has the same type as the
    input.

    :param string: The text (or, for Python 2 compat, bytes) to truncate.
    :param max_bytes: Size budget in bytes, including the "..." marker. If it
        is smaller than 3, nothing of the original is kept and only the
        marker is returned.
    :return: The truncated value followed by "...".
    """
    # Clamp at zero: a negative slice bound would trim bytes off the *end*
    # of the input instead of capping its length.
    keep = max(max_bytes - 3, 0)

    # This function technically supports bytes, but only for Python 2 compat.
    # XXX remove support for bytes when we drop Python 2
    if isinstance(string, bytes):
        # b"..." is a plain str on Python 2 and bytes on Python 3, so this
        # concatenation is valid on both (bytes + "..." raises on Python 3).
        return string[:keep] + b"..."

    # errors="ignore" discards the incomplete multi-byte sequence that the
    # byte slice may have left at the end.
    truncated = string.encode("utf-8")[:keep].decode("utf-8", errors="ignore")

    return truncated + "..."


def _get_size_in_bytes(value):
    # type: (str) -> Optional[int]
    """
    Return the size of value in bytes, or None if it cannot be determined.

    Bytes are measured directly, text is measured by its UTF-8 encoding, and
    any other type yields None. Text that fails to encode also yields None.
    """
    # This function technically supports bytes, but only for Python 2 compat.
    # XXX remove support for bytes when we drop Python 2
    if isinstance(value, bytes):
        return len(value)

    if not isinstance(value, text_type):
        return None

    try:
        encoded = value.encode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return None

    return len(encoded)


def strip_string(value, max_length=None):
# type: (str, Optional[int]) -> Union[AnnotatedValue, str]
if not value:
Expand All @@ -1127,17 +1167,27 @@ def strip_string(value, max_length=None):
if max_length is None:
max_length = DEFAULT_MAX_VALUE_LENGTH

length = len(value.encode("utf-8"))
byte_size = _get_size_in_bytes(value)
text_size = None
if isinstance(value, text_type):
text_size = len(value)

if byte_size is not None and byte_size > max_length:
# truncate to max_length bytes, preserving code points
truncated_value = _truncate_by_bytes(value, max_length)
elif text_size is not None and text_size > max_length:
# fallback to truncating by string length
truncated_value = value[: max_length - 3] + "..."
else:
return value

if length > max_length:
return AnnotatedValue(
value=value[: max_length - 3] + "...",
metadata={
"len": length,
"rem": [["!limit", "x", max_length - 3, max_length]],
},
)
return value
return AnnotatedValue(
value=truncated_value,
metadata={
"len": byte_size or text_size,
"rem": [["!limit", "x", max_length - 3, max_length]],
},
)


def parse_version(version):
Expand Down
48 changes: 29 additions & 19 deletions tests/utils/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,22 +572,32 @@ def test_failed_base64_conversion(input):
assert to_base64(input) is None


def test_strip_string():
# Exercises strip_string with a None value, with the default limit, and with
# a non-ASCII value under an explicit limit.
# If value is None returns None.
assert strip_string(None) is None

# If max_length is not passed, returns the full text (up to 1024 bytes).
text_1024_long = "a" * 1024
assert strip_string(text_1024_long).count("a") == 1024

# If value exceeds the max_length, returns an AnnotatedValue.
text_1025_long = "a" * 1025
stripped_text = strip_string(text_1025_long)
assert isinstance(stripped_text, AnnotatedValue)
assert stripped_text.value.count("a") == 1021 # + '...' is 1024

# If text has unicode characters, it counts bytes and not number of characters.
# NOTE(review): with max_length=2 the slice bound max_length - 3 is -1, so the
# expected u"é..." presumably comes from dropping only the last character
# rather than from a real 2-byte cap -- confirm against strip_string.
# fmt: off
text_with_unicode_character = u"éê"
assert strip_string(text_with_unicode_character, max_length=2).value == u"é..."
# fmt: on
# Each case is [input, max_length, expected result]; a max_length of None
# lets strip_string fall back to its default limit.
@pytest.mark.parametrize(
"input,max_length,result",
[
# A None value is handed back unchanged.
[None, None, None],
# 256 ASCII characters under the default limit are returned as-is.
["a" * 256, None, "a" * 256],
# One byte over the limit: 253 characters are kept and "..." is appended,
# while "len" records the original size.
[
"a" * 257,
256,
AnnotatedValue(
value="a" * 253 + "...",
metadata={"len": 257, "rem": [["!limit", "x", 253, 256]]},
),
],
# Explicit unicode literals: each "é" is two bytes in UTF-8, so the value is
# 8 bytes long and a 5-byte limit leaves room for a single "é" plus "...".
# fmt: off
[u"éééé", None, u"éééé"],
[u"éééé", 5, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]})],
# fmt: on
# Same data as plain literals -- presumably byte strings on Python 2
# (TODO confirm the test module does not enable unicode_literals).
["éééé", None, "éééé"],
[
"éééé",
5,
AnnotatedValue(
value="é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]}
),
],
],
)
def test_strip_string(input, max_length, result):
# The comparison uses ==, so AnnotatedValue results are matched on both their
# value and their metadata.
assert strip_string(input, max_length) == result

0 comments on commit e373e35

Please sign in to comment.