diff --git a/sentry_sdk/utils.py b/sentry_sdk/utils.py
index 25399cd908..b25dd4bbd5 100644
--- a/sentry_sdk/utils.py
+++ b/sentry_sdk/utils.py
@@ -383,6 +383,18 @@ def __init__(self, value, metadata):
         self.value = value
         self.metadata = metadata
 
+    def __eq__(self, other):
+        # type: (Any) -> bool
+        if not isinstance(other, AnnotatedValue):
+            return False
+
+        return self.value == other.value and self.metadata == other.metadata
+
+    def __ne__(self, other):
+        # type: (Any) -> bool
+        # Python 2 does not derive `!=` from `__eq__`, so define it explicitly.
+        return not self.__eq__(other)
+
     @classmethod
     def removed_because_raw_data(cls):
         # type: () -> AnnotatedValue
@@ -1119,6 +1131,46 @@ def _is_in_project_root(abs_path, project_root):
     return False
 
 
+def _truncate_by_bytes(string, max_bytes):
+    # type: (str, int) -> str
+    """
+    Truncate a UTF-8-encodable string to the last full codepoint so that it fits in max_bytes.
+
+    The trailing ellipsis counts towards max_bytes. If max_bytes is smaller than
+    the ellipsis itself, only the ellipsis is returned.
+    """
+    # This function technically supports bytes, but only for Python 2 compat.
+    # XXX remove support for bytes when we drop Python 2
+    # Clamp at 0: a negative end index would slice relative to the end of the
+    # input and return (almost) all of it instead of a short prefix.
+    end = max(max_bytes - 3, 0)
+    if isinstance(string, bytes):
+        # b"..." is a plain str on Python 2 and avoids bytes + str on Python 3.
+        return string[:end] + b"..."
+
+    truncated = string.encode("utf-8")[:end].decode("utf-8", errors="ignore")
+    return truncated + "..."
+
+
+def _get_size_in_bytes(value):
+    # type: (str) -> Optional[int]
+    """
+    Return the size of value in UTF-8 encoded bytes, or None if it cannot be determined.
+    """
+    # This function technically supports bytes, but only for Python 2 compat.
+    # XXX remove support for bytes when we drop Python 2
+    if not isinstance(value, (bytes, text_type)):
+        return None
+
+    if isinstance(value, bytes):
+        return len(value)
+
+    try:
+        return len(value.encode("utf-8"))
+    except (UnicodeEncodeError, UnicodeDecodeError):
+        return None
+
+
 def strip_string(value, max_length=None):
     # type: (str, Optional[int]) -> Union[AnnotatedValue, str]
     if not value:
@@ -1127,17 +1179,27 @@ def strip_string(value, max_length=None):
     if max_length is None:
         max_length = DEFAULT_MAX_VALUE_LENGTH
 
-    length = len(value.encode("utf-8"))
+    byte_size = _get_size_in_bytes(value)
+    text_size = None
+    if isinstance(value, text_type):
+        text_size = len(value)
+
+    if byte_size is not None and byte_size > max_length:
+        # truncate to max_length bytes, preserving code points
+        truncated_value = _truncate_by_bytes(value, max_length)
+    elif text_size is not None and text_size > max_length:
+        # fallback to truncating by string length
+        truncated_value = value[: max(max_length - 3, 0)] + "..."
+    else:
+        return value
 
-    if length > max_length:
-        return AnnotatedValue(
-            value=value[: max_length - 3] + "...",
-            metadata={
-                "len": length,
-                "rem": [["!limit", "x", max_length - 3, max_length]],
-            },
-        )
-    return value
+    return AnnotatedValue(
+        value=truncated_value,
+        metadata={
+            "len": byte_size or text_size,
+            "rem": [["!limit", "x", max(max_length - 3, 0), max_length]],
+        },
+    )
 
 
 def parse_version(version):
diff --git a/tests/utils/test_general.py b/tests/utils/test_general.py
index 6f53de32c3..d4067bd5c6 100644
--- a/tests/utils/test_general.py
+++ b/tests/utils/test_general.py
@@ -572,22 +572,40 @@ def test_failed_base64_conversion(input):
     assert to_base64(input) is None
 
 
-def test_strip_string():
-    # If value is None returns None.
-    assert strip_string(None) is None
-
-    # If max_length is not passed, returns the full text (up to 1024 bytes).
-    text_1024_long = "a" * 1024
-    assert strip_string(text_1024_long).count("a") == 1024
-
-    # If value exceeds the max_length, returns an AnnotatedValue.
-    text_1025_long = "a" * 1025
-    stripped_text = strip_string(text_1025_long)
-    assert isinstance(stripped_text, AnnotatedValue)
-    assert stripped_text.value.count("a") == 1021  # + '...' is 1024
-
-    # If text has unicode characters, it counts bytes and not number of characters.
-    # fmt: off
-    text_with_unicode_character = u"éê"
-    assert strip_string(text_with_unicode_character, max_length=2).value == u"é..."
-    # fmt: on
+@pytest.mark.parametrize(
+    "input,max_length,result",
+    [
+        [None, None, None],
+        ["a" * 256, None, "a" * 256],
+        [
+            "a" * 257,
+            256,
+            AnnotatedValue(
+                value="a" * 253 + "...",
+                metadata={"len": 257, "rem": [["!limit", "x", 253, 256]]},
+            ),
+        ],
+        # max_length smaller than the ellipsis: only the ellipsis is kept
+        [
+            "a" * 10,
+            2,
+            AnnotatedValue(
+                value="...", metadata={"len": 10, "rem": [["!limit", "x", 0, 2]]}
+            ),
+        ],
+        # fmt: off
+        [u"éééé", None, u"éééé"],
+        [u"éééé", 5, AnnotatedValue(value=u"é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]})],
+        # fmt: on
+        ["éééé", None, "éééé"],
+        [
+            "éééé",
+            5,
+            AnnotatedValue(
+                value="é...", metadata={"len": 8, "rem": [["!limit", "x", 2, 5]]}
+            ),
+        ],
+    ],
+)
+def test_strip_string(input, max_length, result):
+    assert strip_string(input, max_length) == result