diff --git a/sentry_sdk/_types.py b/sentry_sdk/_types.py index 814b90c440..fbb9a166b8 100644 --- a/sentry_sdk/_types.py +++ b/sentry_sdk/_types.py @@ -12,7 +12,6 @@ SENSITIVE_DATA_SUBSTITUTE = "[Filtered]" -BLOB_DATA_SUBSTITUTE = "[Blob substitute]" class AnnotatedValue: diff --git a/sentry_sdk/ai/utils.py b/sentry_sdk/ai/utils.py index 56079cd3d6..cb22db0add 100644 --- a/sentry_sdk/ai/utils.py +++ b/sentry_sdk/ai/utils.py @@ -3,8 +3,6 @@ from copy import deepcopy from typing import TYPE_CHECKING -from sentry_sdk._types import BLOB_DATA_SUBSTITUTE -from sentry_sdk.ai.consts import DATA_URL_BASE64_REGEX if TYPE_CHECKING: from typing import Any, Callable, Dict, List, Optional, Tuple @@ -198,104 +196,6 @@ def _find_truncation_index(messages: "List[Dict[str, Any]]", max_bytes: int) -> return 0 -def _is_image_type_with_blob_content(item: "Dict[str, Any]") -> bool: - """ - Some content blocks contain an image_url property with base64 content as its value. - This is used to identify those while not leading to unnecessary copying of data when the image URL does not contain base64 content. - """ - if item.get("type") != "image_url": - return False - - image_url = item.get("image_url", {}).get("url", "") - data_url_match = DATA_URL_BASE64_REGEX.match(image_url) - - return bool(data_url_match) - - -def redact_blob_message_parts( - messages: "List[Dict[str, Any]]", -) -> "List[Dict[str, Any]]": - """ - Redact blob message parts from the messages by replacing blob content with "[Filtered]". - - This function creates a deep copy of messages that contain blob content to avoid - mutating the original message dictionaries. Messages without blob content are - returned as-is to minimize copying overhead. - - e.g: - { - "role": "user", - "content": [ - { - "text": "How many ponies do you see in the image?", - "type": "text" - }, - { - "type": "blob", - "modality": "image", - "mime_type": "image/jpeg", - "content": "data:image/jpeg;base64,..." - } - ] - } - becomes: - { - "role": "user", - "content": [ - { - "text": "How many ponies do you see in the image?", - "type": "text" - }, - { - "type": "blob", - "modality": "image", - "mime_type": "image/jpeg", - "content": "[Filtered]" - } - ] - } - """ - - # First pass: check if any message contains blob content - has_blobs = False - for message in messages: - if not isinstance(message, dict): - continue - content = message.get("content") - if isinstance(content, list): - for item in content: - if isinstance(item, dict) and ( - item.get("type") == "blob" or _is_image_type_with_blob_content(item) - ): - has_blobs = True - break - if has_blobs: - break - - # If no blobs found, return original messages to avoid unnecessary copying - if not has_blobs: - return messages - - # Deep copy messages to avoid mutating the original - messages_copy = deepcopy(messages) - - # Second pass: redact blob content in the copy - for message in messages_copy: - if not isinstance(message, dict): - continue - - content = message.get("content") - if isinstance(content, list): - for item in content: - if isinstance(item, dict): - if item.get("type") == "blob": - item["content"] = BLOB_DATA_SUBSTITUTE - elif _is_image_type_with_blob_content(item): - item["image_url"]["url"] = BLOB_DATA_SUBSTITUTE - - return messages_copy - - def truncate_messages_by_size( messages: "List[Dict[str, Any]]", max_bytes: int = MAX_GEN_AI_MESSAGE_BYTES, @@ -341,8 +241,6 @@ def truncate_and_annotate_messages( if not messages: return None - messages = redact_blob_message_parts(messages) - truncated_message = _truncate_single_message_content_if_present( deepcopy(messages[-1]), max_chars=max_single_message_chars ) @@ -361,8 +259,6 @@ def truncate_and_annotate_embedding_inputs( if not messages: return None - messages = redact_blob_message_parts(messages) - truncated_messages, removed_count = truncate_messages_by_size(messages, max_bytes) if removed_count > 0: scope._gen_ai_original_message_count[span.span_id] = len(messages) diff --git a/tests/test_ai_monitoring.py b/tests/test_ai_monitoring.py index b2247a728e..a110ee9ad6 100644 --- a/tests/test_ai_monitoring.py +++ b/tests/test_ai_monitoring.py @@ -3,7 +3,6 @@ import sentry_sdk from sentry_sdk._types import ( AnnotatedValue, - BLOB_DATA_SUBSTITUTE, ) from sentry_sdk.ai.monitoring import ai_track from sentry_sdk.ai.utils import ( @@ -13,7 +12,6 @@ truncate_messages_by_size, _find_truncation_index, parse_data_uri, - redact_blob_message_parts, ) from sentry_sdk.utils import safe_serialize @@ -523,49 +521,6 @@ def __init__(self): assert isinstance(result, list) assert result[0] == large_messages[-len(result)] - def test_preserves_original_messages_with_blobs(self): - """Test that truncate_and_annotate_messages doesn't mutate the original messages""" - - class MockSpan: - def __init__(self): - self.span_id = "test_span_id" - self.data = {} - - def set_data(self, key, value): - self.data[key] = value - - class MockScope: - def __init__(self): - self._gen_ai_original_message_count = {} - - messages = [ - { - "role": "user", - "content": [ - {"text": "What's in this image?", "type": "text"}, - { - "type": "blob", - "modality": "image", - "content": "data:image/jpeg;base64,original_content", - }, - ], - } - ] - - original_blob_content = messages[0]["content"][1]["content"] - - span = MockSpan() - scope = MockScope() - - # This should NOT mutate the original messages - result = truncate_and_annotate_messages(messages, span, scope) - - # Verify original is unchanged - assert messages[0]["content"][1]["content"] == original_blob_content - - # Verify result has redacted content - assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE - class TestClientAnnotation: def test_client_wraps_truncated_messages_in_annotated_value(self, large_messages): @@ -685,238 +640,6 @@ def __init__(self): assert len(messages_value.value) == len(truncated_messages) -class TestRedactBlobMessageParts: - def test_redacts_single_blob_content(self): - """Test that blob content is redacted without mutating original messages""" - messages = [ - { - "role": "user", - "content": [ - { - "text": "How many ponies do you see in the image?", - "type": "text", - }, - { - "type": "blob", - "modality": "image", - "mime_type": "image/jpeg", - "content": "data:image/jpeg;base64,/9j/4AAQSkZJRg==", - }, - ], - } - ] - - # Save original blob content for comparison - original_blob_content = messages[0]["content"][1]["content"] - - result = redact_blob_message_parts(messages) - - # Original messages should be UNCHANGED - assert messages[0]["content"][1]["content"] == original_blob_content - - # Result should have redacted content - assert ( - result[0]["content"][0]["text"] - == "How many ponies do you see in the image?" - ) - assert result[0]["content"][0]["type"] == "text" - assert result[0]["content"][1]["type"] == "blob" - assert result[0]["content"][1]["modality"] == "image" - assert result[0]["content"][1]["mime_type"] == "image/jpeg" - assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE - - def test_redacts_multiple_blob_parts(self): - """Test that multiple blob parts are redacted without mutation""" - messages = [ - { - "role": "user", - "content": [ - {"text": "Compare these images", "type": "text"}, - { - "type": "blob", - "modality": "image", - "mime_type": "image/jpeg", - "content": "data:image/jpeg;base64,first_image", - }, - { - "type": "blob", - "modality": "image", - "mime_type": "image/png", - "content": "data:image/png;base64,second_image", - }, - ], - } - ] - - original_first = messages[0]["content"][1]["content"] - original_second = messages[0]["content"][2]["content"] - - result = redact_blob_message_parts(messages) - - # Original should be unchanged - assert messages[0]["content"][1]["content"] == original_first - assert messages[0]["content"][2]["content"] == original_second - - # Result should be redacted - assert result[0]["content"][0]["text"] == "Compare these images" - assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE - assert result[0]["content"][2]["content"] == BLOB_DATA_SUBSTITUTE - - def test_redacts_blobs_in_multiple_messages(self): - """Test that blob parts are redacted across multiple messages without mutation""" - messages = [ - { - "role": "user", - "content": [ - {"text": "First message", "type": "text"}, - { - "type": "blob", - "modality": "image", - "content": "data:image/jpeg;base64,first", - }, - ], - }, - { - "role": "assistant", - "content": "I see the image.", - }, - { - "role": "user", - "content": [ - {"text": "Second message", "type": "text"}, - { - "type": "blob", - "modality": "image", - "content": "data:image/jpeg;base64,second", - }, - ], - }, - ] - - original_first = messages[0]["content"][1]["content"] - original_second = messages[2]["content"][1]["content"] - - result = redact_blob_message_parts(messages) - - # Original should be unchanged - assert messages[0]["content"][1]["content"] == original_first - assert messages[2]["content"][1]["content"] == original_second - - # Result should be redacted - assert result[0]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE - assert result[1]["content"] == "I see the image." # Unchanged - assert result[2]["content"][1]["content"] == BLOB_DATA_SUBSTITUTE - - def test_redacts_single_blob_within_image_url_content(self): - messages = [ - { - "role": "user", - "content": [ - { - "text": "How many ponies do you see in the image?", - "type": "text", - }, - { - "type": "image_url", - "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQSkZJRg=="}, - }, - ], - } - ] - - original_blob_content = messages[0]["content"][1] - - result = redact_blob_message_parts(messages) - - assert messages[0]["content"][1] == original_blob_content - - assert ( - result[0]["content"][0]["text"] - == "How many ponies do you see in the image?" - ) - assert result[0]["content"][0]["type"] == "text" - assert result[0]["content"][1]["type"] == "image_url" - assert result[0]["content"][1]["image_url"]["url"] == BLOB_DATA_SUBSTITUTE - - def test_does_not_redact_image_url_content_with_non_blobs(self): - messages = [ - { - "role": "user", - "content": [ - { - "text": "How many ponies do you see in the image?", - "type": "text", - }, - { - "type": "image_url", - "image_url": {"url": "https://example.com/image.jpg"}, - }, - ], - } - ] - - original_blob_content = messages[0]["content"][1] - - result = redact_blob_message_parts(messages) - - assert messages[0]["content"][1] == original_blob_content - - assert ( - result[0]["content"][0]["text"] - == "How many ponies do you see in the image?" - ) - assert result[0]["content"][0]["type"] == "text" - assert result[0]["content"][1]["type"] == "image_url" - assert ( - result[0]["content"][1]["image_url"]["url"] - == "https://example.com/image.jpg" - ) - - def test_no_blobs_returns_original_list(self): - """Test that messages without blobs are returned as-is (performance optimization)""" - messages = [ - {"role": "user", "content": "Simple text message"}, - {"role": "assistant", "content": "Simple response"}, - ] - - result = redact_blob_message_parts(messages) - - # Should return the same list object when no blobs present - assert result is messages - - def test_handles_non_dict_messages(self): - """Test that non-dict messages are handled gracefully""" - messages = [ - "string message", - {"role": "user", "content": "text"}, - None, - 123, - ] - - result = redact_blob_message_parts(messages) - - # Should return same list since no blobs - assert result is messages - - def test_handles_non_dict_content_items(self): - """Test that non-dict content items in arrays are handled""" - messages = [ - { - "role": "user", - "content": [ - "string item", - {"text": "text item", "type": "text"}, - None, - ], - } - ] - - result = redact_blob_message_parts(messages) - - # Should return same list since no blobs - assert result is messages - - class TestParseDataUri: def test_parses_base64_image_data_uri(self): """Test parsing a standard base64-encoded image data URI"""