-
-
Notifications
You must be signed in to change notification settings - Fork 4.5k
feat(seer_grouping): Filter Seer grouping requests by token count instead of frame count #103997
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3708585
2ed0bfb
e036cd6
7df4650
af1c991
8cbfab1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,7 @@ | |
| from sentry.constants import DATA_ROOT | ||
| from sentry.grouping.api import get_contributing_variant_and_component | ||
| from sentry.grouping.grouping_info import get_grouping_info_from_variants_legacy | ||
| from sentry.grouping.variants import BaseVariant, ComponentVariant | ||
| from sentry.grouping.variants import BaseVariant | ||
| from sentry.killswitches import killswitch_matches_context | ||
| from sentry.models.organization import Organization | ||
| from sentry.models.project import Project | ||
|
|
@@ -31,10 +31,9 @@ | |
| # platforms getting sent to Seer during ingest. | ||
| SEER_INELIGIBLE_EVENT_PLATFORMS = frozenset(["other"]) # We don't know what's in the event | ||
| # Event platforms corresponding to project platforms which were backfilled before we started | ||
| # blocking events with more than `MAX_FRAME_COUNT` frames from being sent to Seer (which we do to | ||
| # prevent possible over-grouping). Ultimately we want a more unified solution, but for now, we're | ||
| # just not going to apply the filter to events from these platforms. | ||
| EVENT_PLATFORMS_BYPASSING_FRAME_COUNT_CHECK = frozenset( | ||
| # filtering stacktraces by length. To keep new events matching with existing data, we bypass | ||
| # length checks for these platforms (their stacktraces will be truncated instead). | ||
| EVENT_PLATFORMS_BYPASSING_STACKTRACE_LENGTH_CHECK = frozenset( | ||
| [ | ||
| "go", | ||
| "javascript", | ||
|
|
@@ -337,10 +336,10 @@ def stacktrace_exceeds_limits( | |
| """ | ||
| Check if a stacktrace exceeds length limits for Seer similarity analysis. | ||
|
|
||
| This checks both frame count and token count limits to determine if the stacktrace | ||
| is too long to send to Seer. Different platforms have different filtering behaviors: | ||
| - Platforms in EVENT_PLATFORMS_BYPASSING_FRAME_COUNT_CHECK bypass all checks | ||
| - Other platforms are checked against MAX_FRAME_COUNT and max_token_count limits | ||
| For platforms that bypass length checks (to maintain consistency with backfilled data), | ||
| all stacktraces pass through. For other platforms, we use a two-step approach: | ||
| 1. First check raw string length - if shorter than token limit, pass immediately | ||
| 2. Only if string is long enough to potentially exceed limit, run expensive token count | ||
| """ | ||
| platform: str = event.platform or "unknown" | ||
| shared_tags = {"referrer": referrer.value, "platform": platform} | ||
|
|
@@ -351,23 +350,18 @@ def stacktrace_exceeds_limits( | |
| # is using it for grouping (in which case none of the below conditions should apply), but still | ||
| # worth checking that we have enough information to answer the question just in case | ||
| if ( | ||
| # Fingerprint, checksum, fallback variants | ||
| not isinstance(contributing_variant, ComponentVariant) | ||
| # Security violations, log-message-based grouping | ||
| or contributing_variant.variant_name == "default" | ||
| # Any ComponentVariant will have this, but this reassures mypy | ||
| or not contributing_component | ||
| # Exception-message-based grouping | ||
| or not hasattr(contributing_component, "frame_counts") | ||
| # Should always have it, but this reassures mypy | ||
| not contributing_component | ||
| # Filter out events that don't use stacktrace-based grouping | ||
| or "stacktrace" not in contributing_variant.key | ||
| ): | ||
| # We don't bother to collect a metric on this outcome, because we shouldn't have called the | ||
| # function in the first place | ||
| return False | ||
|
|
||
| # Certain platforms were backfilled before we added this filter, so to keep new events matching | ||
| # with the existing data, we turn off the filter for them (instead their stacktraces will be | ||
| # truncated) | ||
| if platform in EVENT_PLATFORMS_BYPASSING_FRAME_COUNT_CHECK: | ||
| # Certain platforms were backfilled before we added length filtering, so to keep new events | ||
| # matching with existing data, we bypass the filter for them (their stacktraces will be truncated) | ||
| if platform in EVENT_PLATFORMS_BYPASSING_STACKTRACE_LENGTH_CHECK: | ||
| metrics.incr( | ||
| "grouping.similarity.stacktrace_length_filter", | ||
| sample_rate=options.get("seer.similarity.metrics_sample_rate"), | ||
|
|
@@ -376,22 +370,28 @@ def stacktrace_exceeds_limits( | |
| report_token_count_metric(event, variants, "bypass") | ||
| return False | ||
|
|
||
| max_token_count = options.get("seer.similarity.max_token_count") | ||
|
|
||
| stacktrace_type = "in_app" if contributing_variant.variant_name == "app" else "system" | ||
| key = f"{stacktrace_type}_contributing_frames" | ||
| shared_tags["stacktrace_type"] = stacktrace_type | ||
lobsterkatie marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if contributing_component.frame_counts[key] > MAX_FRAME_COUNT: | ||
| # raw string length check | ||
| stacktrace_text = event.data.get("stacktrace_string") | ||
| if stacktrace_text is None: | ||
| stacktrace_text = get_stacktrace_string(get_grouping_info_from_variants_legacy(variants)) | ||
|
|
||
| string_length = len(stacktrace_text) | ||
| if string_length < max_token_count: | ||
| metrics.incr( | ||
| "grouping.similarity.stacktrace_length_filter", | ||
| sample_rate=options.get("seer.similarity.metrics_sample_rate"), | ||
| tags={**shared_tags, "outcome": "block_frames"}, | ||
| tags={**shared_tags, "outcome": "pass_string_length"}, | ||
| ) | ||
| report_token_count_metric(event, variants, "block_frames") | ||
| return True | ||
| report_token_count_metric(event, variants, "pass_string_length") | ||
| return False | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: String length compared to token count without conversionThe code compares
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wrong analysis here, we are comparing exactly because tokens are 4~ characters, which means if a string length is even less than the max token count it will never exceed the token count limit. We could have even made it 4 times that according to this anaylsis, so what we are doing is very conservative actually. |
||
|
|
||
| # For platforms that filter by frame count, also check token count | ||
| # String is long enough that it might exceed token limit - run actual token count | ||
| token_count = get_token_count(event, variants, platform) | ||
| max_token_count = options.get("seer.similarity.max_token_count") | ||
|
|
||
| if token_count > max_token_count: | ||
| metrics.incr( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why remove this fail-fast option?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
well it seemed like I no longer need it to have
frame_countsto be able to do this check, which just needs the raw stacktrace, but maybe if it doesnt have it its indicative of something more important?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, now that variants all have a
keyproperty, you could just do something likeif 'stacktrace' not in contributing_variant.key or not contributing_component: ...and that'd catch all the cases mentioned in the current version of the check.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice, refactored the conditions there to this 👍
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that there's no type-check ahead of the mypy-appeasment check, the comment about it doesn't make sense - I'd just remove it (the comment that is, not the check).