Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 28 additions & 28 deletions src/sentry/seer/similarity/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sentry.constants import DATA_ROOT
from sentry.grouping.api import get_contributing_variant_and_component
from sentry.grouping.grouping_info import get_grouping_info_from_variants_legacy
from sentry.grouping.variants import BaseVariant, ComponentVariant
from sentry.grouping.variants import BaseVariant
from sentry.killswitches import killswitch_matches_context
from sentry.models.organization import Organization
from sentry.models.project import Project
Expand All @@ -31,10 +31,9 @@
# platforms getting sent to Seer during ingest.
SEER_INELIGIBLE_EVENT_PLATFORMS = frozenset(["other"]) # We don't know what's in the event
# Event platforms corresponding to project platforms which were backfilled before we started
# blocking events with more than `MAX_FRAME_COUNT` frames from being sent to Seer (which we do to
# prevent possible over-grouping). Ultimately we want a more unified solution, but for now, we're
# just not going to apply the filter to events from these platforms.
EVENT_PLATFORMS_BYPASSING_FRAME_COUNT_CHECK = frozenset(
# filtering stacktraces by length. To keep new events matching with existing data, we bypass
# length checks for these platforms (their stacktraces will be truncated instead).
EVENT_PLATFORMS_BYPASSING_STACKTRACE_LENGTH_CHECK = frozenset(
[
"go",
"javascript",
Expand Down Expand Up @@ -337,10 +336,10 @@ def stacktrace_exceeds_limits(
"""
Check if a stacktrace exceeds length limits for Seer similarity analysis.

This checks both frame count and token count limits to determine if the stacktrace
is too long to send to Seer. Different platforms have different filtering behaviors:
- Platforms in EVENT_PLATFORMS_BYPASSING_FRAME_COUNT_CHECK bypass all checks
- Other platforms are checked against MAX_FRAME_COUNT and max_token_count limits
For platforms that bypass length checks (to maintain consistency with backfilled data),
all stacktraces pass through. For other platforms, we use a two-step approach:
1. First check raw string length - if shorter than token limit, pass immediately
2. Only if string is long enough to potentially exceed limit, run expensive token count
"""
platform: str = event.platform or "unknown"
shared_tags = {"referrer": referrer.value, "platform": platform}
Expand All @@ -351,23 +350,18 @@ def stacktrace_exceeds_limits(
# is using it for grouping (in which case none of the below conditions should apply), but still
# worth checking that we have enough information to answer the question just in case
if (
# Fingerprint, checksum, fallback variants
not isinstance(contributing_variant, ComponentVariant)
# Security violations, log-message-based grouping
or contributing_variant.variant_name == "default"
# Any ComponentVariant will have this, but this reassures mypy
or not contributing_component
# Exception-message-based grouping
or not hasattr(contributing_component, "frame_counts")
Comment on lines -360 to -361
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove this fail-fast option?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it seemed like I no longer need it to have frame_counts to be able to do this check, which just needs the raw stacktrace — but maybe if it doesn't have it, that's indicative of something more important?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, now that variants all have a key property, you could just do something like if 'stacktrace' not in contributing_variant.key or not contributing_component: ... and that'd catch all the cases mentioned in the current version of the check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice, refactored the conditions there to this 👍

Copy link
Member

@lobsterkatie lobsterkatie Dec 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that there's no type-check ahead of the mypy-appeasment check, the comment about it doesn't make sense - I'd just remove it (the comment that is, not the check).

# Should always have it, but this reassures mypy
not contributing_component
# Filter out events that don't use stacktrace-based grouping
or "stacktrace" not in contributing_variant.key
):
# We don't bother to collect a metric on this outcome, because we shouldn't have called the
# function in the first place
return False

# Certain platforms were backfilled before we added this filter, so to keep new events matching
# with the existing data, we turn off the filter for them (instead their stacktraces will be
# truncated)
if platform in EVENT_PLATFORMS_BYPASSING_FRAME_COUNT_CHECK:
# Certain platforms were backfilled before we added length filtering, so to keep new events
# matching with existing data, we bypass the filter for them (their stacktraces will be truncated)
if platform in EVENT_PLATFORMS_BYPASSING_STACKTRACE_LENGTH_CHECK:
metrics.incr(
"grouping.similarity.stacktrace_length_filter",
sample_rate=options.get("seer.similarity.metrics_sample_rate"),
Expand All @@ -376,22 +370,28 @@ def stacktrace_exceeds_limits(
report_token_count_metric(event, variants, "bypass")
return False

max_token_count = options.get("seer.similarity.max_token_count")

stacktrace_type = "in_app" if contributing_variant.variant_name == "app" else "system"
key = f"{stacktrace_type}_contributing_frames"
shared_tags["stacktrace_type"] = stacktrace_type

if contributing_component.frame_counts[key] > MAX_FRAME_COUNT:
# raw string length check
stacktrace_text = event.data.get("stacktrace_string")
if stacktrace_text is None:
stacktrace_text = get_stacktrace_string(get_grouping_info_from_variants_legacy(variants))

string_length = len(stacktrace_text)
if string_length < max_token_count:
metrics.incr(
"grouping.similarity.stacktrace_length_filter",
sample_rate=options.get("seer.similarity.metrics_sample_rate"),
tags={**shared_tags, "outcome": "block_frames"},
tags={**shared_tags, "outcome": "pass_string_length"},
)
report_token_count_metric(event, variants, "block_frames")
return True
report_token_count_metric(event, variants, "pass_string_length")
return False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: String length compared to token count without conversion

The code compares string_length (measured in characters) directly against max_token_count (measured in tokens) at line 383. These are different units and cannot be meaningfully compared. Since one token typically represents ~4 characters, a stacktrace with several thousand characters could have far fewer tokens. This comparison will almost always be true, causing most stacktraces to skip the expensive token counting and pass immediately, defeating the token-based filtering logic.

Fix in Cursor Fix in Web

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong analysis here: we are comparing exactly because tokens are ~4 characters each, which means that if the string length is even less than the max token count, it can never exceed the token count limit. We could even have made the threshold 4 times that according to this analysis, so what we are doing is actually very conservative.


# For platforms that filter by frame count, also check token count
# String is long enough that it might exceed token limit - run actual token count
token_count = get_token_count(event, variants, platform)
max_token_count = options.get("seer.similarity.max_token_count")

if token_count > max_token_count:
metrics.incr(
Expand Down
190 changes: 63 additions & 127 deletions tests/sentry/grouping/seer_similarity/test_seer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from sentry.grouping.ingest.seer import maybe_check_seer_for_matching_grouphash
from sentry.models.grouphash import GroupHash
from sentry.seer.similarity.types import GroupingVersion
from sentry.seer.similarity.utils import MAX_FRAME_COUNT
from sentry.services.eventstore.models import Event
from sentry.testutils.cases import TestCase

Expand Down Expand Up @@ -72,72 +71,6 @@ def test_simple(self, mock_get_similarity_data: MagicMock) -> None:
},
)

@patch("sentry.grouping.ingest.seer.record_did_call_seer_metric")
@patch("sentry.grouping.ingest.seer.get_seer_similar_issues")
@patch("sentry.seer.similarity.utils.metrics")
def test_too_many_frames(
self,
mock_metrics: MagicMock,
mock_get_similar_issues: MagicMock,
mock_record_did_call_seer: MagicMock,
) -> None:
self.project.update_option("sentry:similarity_backfill_completed", int(time()))

error_type = "FailedToFetchError"
error_value = "Charlie didn't bring the ball back"
context_line = f"raise {error_type}('{error_value}')"
new_event = Event(
project_id=self.project.id,
event_id="22312012112120120908201304152013",
data={
"title": f"{error_type}('{error_value}')",
"exception": {
"values": [
{
"type": error_type,
"value": error_value,
"stacktrace": {
"frames": [
{
"function": f"play_fetch_{i}",
"filename": f"dogpark{i}.py",
"context_line": context_line,
}
for i in range(MAX_FRAME_COUNT + 1)
]
},
}
]
},
"platform": "java",
},
)

new_grouphash = GroupHash.objects.create(
project=self.project, group=new_event.group, hash=new_event.get_primary_hash()
)
group_hashes = list(GroupHash.objects.filter(project_id=self.project.id))
maybe_check_seer_for_matching_grouphash(
new_event, new_grouphash, new_event.get_grouping_variants(), group_hashes
)

sample_rate = options.get("seer.similarity.metrics_sample_rate")
mock_metrics.incr.assert_any_call(
"grouping.similarity.stacktrace_length_filter",
sample_rate=sample_rate,
tags={
"platform": "java",
"referrer": "ingest",
"stacktrace_type": "system",
"outcome": "block_frames",
},
)
mock_record_did_call_seer.assert_any_call(
new_event, call_made=False, blocker="stacktrace-too-long"
)

mock_get_similar_issues.assert_not_called()

@patch("sentry.grouping.ingest.seer.record_did_call_seer_metric")
@patch("sentry.grouping.ingest.seer.get_seer_similar_issues")
@patch("sentry.seer.similarity.utils.metrics")
Expand Down Expand Up @@ -172,9 +105,7 @@ def test_too_many_tokens(
"filename": f"dogpark{i}.py",
"context_line": context_line,
}
for i in range(
3
) # Just 3 frames, well under MAX_FRAME_COUNT
for i in range(3) # Just 3 frames
]
},
}
Expand All @@ -199,8 +130,8 @@ def test_too_many_tokens(
tags={
"platform": "java",
"referrer": "ingest",
"stacktrace_type": "system",
"outcome": "block_tokens",
"stacktrace_type": "system",
},
)
mock_record_did_call_seer.assert_any_call(
Expand All @@ -210,64 +141,69 @@ def test_too_many_tokens(
mock_get_similar_issues.assert_not_called()

@patch("sentry.grouping.ingest.seer.get_similarity_data_from_seer", return_value=[])
def test_too_many_frames_bypassed_platform(self, mock_get_similarity_data: MagicMock) -> None:
def test_bypassed_platform_calls_seer_regardless_of_length(
self, mock_get_similarity_data: MagicMock
) -> None:
self.project.update_option("sentry:similarity_backfill_completed", int(time()))

error_type = "FailedToFetchError"
error_value = "Charlie didn't bring the ball back"
context_line = f"raise {error_type}('{error_value}')"
new_event = Event(
project_id=self.project.id,
event_id="22312012112120120908201304152013",
data={
"title": f"{error_type}('{error_value}')",
"exception": {
"values": [
{
"type": error_type,
"value": error_value,
"stacktrace": {
"frames": [
{
"function": f"play_fetch_{i}",
"filename": f"dogpark{i}.py",
"context_line": context_line,
}
for i in range(MAX_FRAME_COUNT + 1)
]
},
}
]
# Set a low token limit to ensure the stacktrace would be blocked if not bypassed
with self.options({"seer.similarity.max_token_count": 100}):
error_type = "FailedToFetchError"
error_value = "Charlie didn't bring the ball back"
context_line = f"raise {error_type}('{error_value}')"
# Create a stacktrace that would exceed the token limit if not bypassed
new_event = Event(
project_id=self.project.id,
event_id="22312012112120120908201304152013",
data={
"title": f"{error_type}('{error_value}')",
"exception": {
"values": [
{
"type": error_type,
"value": error_value,
"stacktrace": {
"frames": [
{
"function": f"play_fetch_{i}",
"filename": f"dogpark{i}.py",
"context_line": context_line,
}
for i in range(20)
]
},
}
]
},
"platform": "python",
},
"platform": "python",
},
)
)

new_grouphash = GroupHash.objects.create(
project=self.project, group=new_event.group, hash=new_event.get_primary_hash()
)
group_hashes = list(GroupHash.objects.filter(project_id=self.project.id))
maybe_check_seer_for_matching_grouphash(
new_event, new_grouphash, new_event.get_grouping_variants(), group_hashes
)
new_grouphash = GroupHash.objects.create(
project=self.project, group=new_event.group, hash=new_event.get_primary_hash()
)
group_hashes = list(GroupHash.objects.filter(project_id=self.project.id))
maybe_check_seer_for_matching_grouphash(
new_event, new_grouphash, new_event.get_grouping_variants(), group_hashes
)

mock_get_similarity_data.assert_called_with(
{
"event_id": new_event.event_id,
"hash": new_event.get_primary_hash(),
"project_id": self.project.id,
"stacktrace": ANY,
"exception_type": "FailedToFetchError",
"k": 1,
"referrer": "ingest",
"use_reranking": True,
"model": GroupingVersion.V1,
"training_mode": False,
},
{
"platform": "python",
"model_version": "v1",
"training_mode": False,
"hybrid_fingerprint": False,
},
)
mock_get_similarity_data.assert_called_with(
{
"event_id": new_event.event_id,
"hash": new_event.get_primary_hash(),
"project_id": self.project.id,
"stacktrace": ANY,
"exception_type": "FailedToFetchError",
"k": 1,
"referrer": "ingest",
"use_reranking": True,
"model": GroupingVersion.V1,
"training_mode": False,
},
{
"platform": "python",
"model_version": "v1",
"training_mode": False,
"hybrid_fingerprint": False,
},
)
Loading
Loading