From 46134e716a7695f47753b0d05a2613f0ac0f5248 Mon Sep 17 00:00:00 2001 From: Matt Quinn Date: Mon, 1 Dec 2025 15:17:33 -0500 Subject: [PATCH 1/3] feat(segment-enrichment): Apply clusterer to segment names --- .../transaction_clusterer/normalization.py | 80 +++++++- .../consumers/process_segments/message.py | 2 +- .../test_normalization.py | 173 +++++++++++++++++- 3 files changed, 242 insertions(+), 13 deletions(-) diff --git a/src/sentry/ingest/transaction_clusterer/normalization.py b/src/sentry/ingest/transaction_clusterer/normalization.py index 0214f78506e00b..8e8e82b6f242e2 100644 --- a/src/sentry/ingest/transaction_clusterer/normalization.py +++ b/src/sentry/ingest/transaction_clusterer/normalization.py @@ -4,7 +4,10 @@ import orjson from sentry_conventions.attributes import ATTRIBUTE_NAMES +from sentry.ingest.transaction_clusterer import ClustererNamespace from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED +from sentry.ingest.transaction_clusterer.rules import get_sorted_rules +from sentry.models.project import Project from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value # Ported from Relay: @@ -54,22 +57,26 @@ ) -def normalize_segment_name(segment_span: CompatibleSpan): +def normalize_segment_name(project: Project, segment_span: CompatibleSpan): segment_name = attribute_value( segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME ) or segment_span.get("name") if segment_name: _scrub_identifiers(segment_span, segment_name) + _apply_clustering_rule(project, segment_span, segment_name) @dataclass(frozen=True) class Remark: ty: str rule_id: str - range: tuple[int, int] + range: tuple[int, int] | None def serialize(self) -> list: - return [self.rule_id, self.ty, self.range[0], self.range[1]] + serialized: list[str | int] = [self.rule_id, self.ty] + if self.range: + serialized.extend([self.range[0], self.range[1]]) + return serialized # Ported from Relay: @@ -88,10 +95,11 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str): if len(remarks) == 0: return - remarks.sort(key=lambda remark: remark.range[1]) + remarks.sort(key=lambda remark: remark.range[1]) # type: ignore[index] str_parts: list[str] = [] last_end = 0 for remark in remarks: + assert remark.range is not None # for typing - always set above start, end = remark.range str_parts.append(segment_name[last_end:start]) str_parts.append("*") @@ -114,3 +122,67 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str): "value": orjson.dumps({"meta": {"": {"rem": [r.serialize() for r in remarks]}}}).decode(), } segment_span["attributes"] = attributes + + +def _apply_clustering_rule(project: Project, segment_span: CompatibleSpan, segment_name: str): + segment_name_parts = segment_name.split("/") + + def apply_rule(rule: str) -> str | None: + output = [] + rule_parts = rule.split("/") + for i, rule_part in enumerate(rule_parts): + if i >= len(segment_name_parts): + # A segment name only matches a longer rule if the remainder of + # the rule is a multi-part wildcard (`**`). + if rule_part == "**": + break + else: + return None + + segment_name_part = segment_name_parts[i] + if rule_part == "**": + # `**`` matches the remainder of the segment name but does not replace it. + output.extend([part for part in segment_name_parts[i:]]) + break + + if rule_part == "*" and segment_name_part != "": + # `*` matches a single part and replaces it in the output with `*`. + output.append("*") + elif rule_part == segment_name_part: + # The segment name part and rule part match, so keep applying this rule. + output.append(segment_name_part) + else: + # If the segment name part and rule part didn't match, then this + # whole rule doesn't match. + return None + + return "/".join(output) + + rules = get_sorted_rules(ClustererNamespace.TRANSACTIONS, project) + for rule, _ in rules: + if clustered_name := apply_rule(rule): + segment_span["name"] = clustered_name + attributes = segment_span.get("attributes") or {} + attributes[ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME] = { + "type": "string", + "value": clustered_name, + } + attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = { + "type": "string", + "value": TRANSACTION_SOURCE_SANITIZED, + } + attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = { + "type": "string", + "value": orjson.dumps( + { + "meta": { + "": { + "val": segment_name, + "rem": [Remark(ty="s", rule_id=rule, range=None).serialize()], + } + } + }, + ).decode(), + } + segment_span["attributes"] = attributes + return diff --git a/src/sentry/spans/consumers/process_segments/message.py b/src/sentry/spans/consumers/process_segments/message.py index 43fa8757e23146..978fda1cd5a41f 100644 --- a/src/sentry/spans/consumers/process_segments/message.py +++ b/src/sentry/spans/consumers/process_segments/message.py @@ -163,7 +163,7 @@ def _normalize_segment_name(segment_span: CompatibleSpan, project: Project) -> N unknown_if_parameterized = not source known_to_be_unparameterized = source == TRANSACTION_SOURCE_URL if unknown_if_parameterized or known_to_be_unparameterized: - normalize_segment_name(segment_span) + normalize_segment_name(project, segment_span) record_segment_name(project, segment_span) diff --git a/tests/sentry/ingest/transaction_clusterer/test_normalization.py b/tests/sentry/ingest/transaction_clusterer/test_normalization.py index 01c9b9d0cf1962..65a5c4688afa04 100644 --- a/tests/sentry/ingest/transaction_clusterer/test_normalization.py +++ b/tests/sentry/ingest/transaction_clusterer/test_normalization.py @@ -1,8 +1,13 @@ +from unittest import mock + import orjson +import pytest from sentry_conventions.attributes import ATTRIBUTE_NAMES +from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name -from sentry.spans.consumers.process_segments.types import CompatibleSpan +from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value +from sentry.testutils.pytest.fixtures import django_db_all def _segment_span(**kwargs) -> CompatibleSpan: @@ -27,10 +32,13 @@ def _segment_span(**kwargs) -> CompatibleSpan: # Ported from Relay: # https://github.com/getsentry/relay/blob/aad4b6099d12422e88dd5df49abae11247efdd99/relay-event-normalization/src/transactions/processor.rs#L789 -def test_identifiers_scrubbed(): - segment_span = _segment_span(name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0") +@django_db_all +def test_identifiers_scrubbed(default_project: mock.MagicMock): + segment_span = _segment_span( + name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0", + ) - normalize_segment_name(segment_span) + normalize_segment_name(default_project, segment_span) assert segment_span["name"] == "/foo/*/user/*/0" attributes = segment_span.get("attributes") or {} @@ -50,7 +58,8 @@ def test_identifiers_scrubbed(): } -def test_name_attribute_takes_precedence_over_name(): +@django_db_all +def test_name_attribute_takes_precedence_over_name(default_project: mock.MagicMock): segment_span = _segment_span( name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0", attributes={ @@ -61,7 +70,7 @@ def test_name_attribute_takes_precedence_over_name(): }, ) - normalize_segment_name(segment_span) + normalize_segment_name(default_project, segment_span) assert segment_span["name"] == "/bar/*" attributes = segment_span.get("attributes") or {} @@ -79,11 +88,159 @@ def test_name_attribute_takes_precedence_over_name(): } -def test_no_meta_changes_if_no_name_changes(): +@django_db_all +def test_no_meta_changes_if_no_name_changes(default_project: mock.MagicMock): segment_span = _segment_span(name="/foo") - normalize_segment_name(segment_span) + normalize_segment_name(default_project, segment_span) assert segment_span["name"] == "/foo" attributes = segment_span.get("attributes") or {} assert len(attributes) == 0 + + +@django_db_all +@mock.patch( + "sentry.ingest.transaction_clusterer.normalization.get_sorted_rules", + return_value=[("/users/*/posts/*/**", 0), ("/users/*/**", 0), ("GET /users/*/**", 0)], +) +@pytest.mark.parametrize( + ("segment_name", "expected", "expected_rule"), + [ + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something", + "/users/*/posts/*", + "/users/*/posts/*/**", + id="matches both wildcards", + ), + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/", + "/users/*/posts/*/", + "/users/*/posts/*/**", + id="matches both wildcards (trailing slash)", + ), + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/comments", + "/users/*/posts/*/comments", + "/users/*/posts/*/**", + id="matches both wildcards with suffix", + ), + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/comments/", + "/users/*/posts/*/comments/", + "/users/*/posts/*/**", + id="matches both wildcards with suffix (trailing slash)", + ), + pytest.param( + "/users/my-user-name/settings", + "/users/*/settings", + "/users/*/**", + id="matches first wildcards with suffix", + ), + pytest.param( + "/users/my-user-name/settings", + "/users/*/settings", + "/users/*/**", + id="matches first wildcards with suffix (trailing slash)", + ), + pytest.param( + "/users/my-user-name", + "/users/*", + "/users/*/**", + id="matches first wildcard", + ), + pytest.param( + "/users/my-user-name/", + "/users/*/", + "/users/*/**", + id="matches first wildcard (trailing slash)", + ), + pytest.param( + "/users", + "/users", + None, + id="ends before first wildcard", + ), + pytest.param( + "/users/", + "/users/", + None, + id="ends before first wildcard (trailing slash)", + ), + pytest.param( + "/user", + "/user", + None, + id="matching text prefix", + ), + pytest.param( + "/user/", + "/user/", + None, + id="matching text prefix (trailing slash)", + ), + pytest.param( + "/foo", + "/foo", + None, + id="unrelated text prefix", + ), + pytest.param( + "/foo/", + "/foo/", + None, + id="unrelated text prefix (trailing slash)", + ), + pytest.param( + "GET /users/my-user-name", + "GET /users/*", + "GET /users/*/**", + id="method prefix: matches wildcard", + ), + pytest.param( + "GET /users/my-user-name/", + "GET /users/*/", + "GET /users/*/**", + id="method prefix: matches wildcard (trailing slash)", + ), + pytest.param( + "GET /users", + "GET /users", + None, + id="method prefix: ends before wildcard", + ), + pytest.param( + "GET /users/", + "GET /users/", + None, + id="method prefix: ends before wildcard (trailing slash)", + ), + ], +) +def test_clusterer_applies_rules( + mock_get_sorted_rules: mock.MagicMock, + segment_name: str, + expected: str, + expected_rule: str | None, + default_project: mock.MagicMock, +): + segment_span = _segment_span(name=segment_name) + + normalize_segment_name(default_project, segment_span) + + assert segment_span["name"] == expected + if segment_name != expected: + assert attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME) == expected + assert ( + attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE) + == TRANSACTION_SOURCE_SANITIZED + ) + assert ( + attribute_value( + segment_span, + f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}", + ) + == orjson.dumps( + {"meta": {"": {"val": segment_name, "rem": [[expected_rule, "s"]]}}} + ).decode() + ) From e902c4e9d0cd42cbe32092a707d0b69ed7245528 Mon Sep 17 00:00:00 2001 From: Matt Quinn Date: Mon, 1 Dec 2025 15:50:07 -0500 Subject: [PATCH 2/3] fix + test chaining scrubbing and clustering --- .../transaction_clusterer/normalization.py | 10 ++++-- .../test_normalization.py | 32 +++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/sentry/ingest/transaction_clusterer/normalization.py b/src/sentry/ingest/transaction_clusterer/normalization.py index 8e8e82b6f242e2..0a708d8b245014 100644 --- a/src/sentry/ingest/transaction_clusterer/normalization.py +++ b/src/sentry/ingest/transaction_clusterer/normalization.py @@ -124,7 +124,13 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str): segment_span["attributes"] = attributes -def _apply_clustering_rule(project: Project, segment_span: CompatibleSpan, segment_name: str): +def _apply_clustering_rule( + project: Project, segment_span: CompatibleSpan, original_segment_name: str +): + segment_name = attribute_value( + segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME + ) or segment_span.get("name") + assert segment_name is not None segment_name_parts = segment_name.split("/") def apply_rule(rule: str) -> str | None: @@ -177,7 +183,7 @@ def apply_rule(rule: str) -> str | None: { "meta": { "": { - "val": segment_name, + "val": original_segment_name, "rem": [Remark(ty="s", rule_id=rule, range=None).serialize()], } } diff --git a/tests/sentry/ingest/transaction_clusterer/test_normalization.py b/tests/sentry/ingest/transaction_clusterer/test_normalization.py index 65a5c4688afa04..d9ff86cdcbb4ed 100644 --- a/tests/sentry/ingest/transaction_clusterer/test_normalization.py +++ b/tests/sentry/ingest/transaction_clusterer/test_normalization.py @@ -244,3 +244,35 @@ def test_clusterer_applies_rules( {"meta": {"": {"val": segment_name, "rem": [[expected_rule, "s"]]}}} ).decode() ) + + +@django_db_all +@mock.patch( + "sentry.ingest.transaction_clusterer.normalization.get_sorted_rules", + return_value=[("/users/*/**", 0)], +) +def test_clusterer_works_with_scrubbing( + mock_get_sorted_rules: mock.MagicMock, + default_project: mock.MagicMock, +): + segment_name = "/users/my-user-name/94576097f3a64b68b85a59c7d4e3ee2a" + segment_span = _segment_span(name=segment_name) + + normalize_segment_name(default_project, segment_span) + + expected = "/users/*/*" + assert segment_span["name"] == expected + assert attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME) == expected + assert ( + attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE) + == TRANSACTION_SOURCE_SANITIZED + ) + assert ( + attribute_value( + segment_span, + f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}", + ) + == orjson.dumps( + {"meta": {"": {"val": segment_name, "rem": [["/users/*/**", "s"]]}}} + ).decode() + ) From b86b207366637632b75a525c99ea37be201953c7 Mon Sep 17 00:00:00 2001 From: Matt Quinn Date: Tue, 2 Dec 2025 11:52:52 -0500 Subject: [PATCH 3/3] review feedback: extract inline function also fixes a test case copy/paste error, and marks unused params in tests. --- .../transaction_clusterer/normalization.py | 75 ++++++++++--------- .../test_normalization.py | 8 +- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/src/sentry/ingest/transaction_clusterer/normalization.py b/src/sentry/ingest/transaction_clusterer/normalization.py index 0a708d8b245014..a0c046ac6360c0 100644 --- a/src/sentry/ingest/transaction_clusterer/normalization.py +++ b/src/sentry/ingest/transaction_clusterer/normalization.py @@ -1,4 +1,5 @@ import re +from collections.abc import Sequence from dataclasses import dataclass import orjson @@ -63,7 +64,7 @@ def normalize_segment_name(project: Project, segment_span: CompatibleSpan): ) or segment_span.get("name") if segment_name: _scrub_identifiers(segment_span, segment_name) - _apply_clustering_rule(project, segment_span, segment_name) + _apply_clustering_rules(project, segment_span, segment_name) @dataclass(frozen=True) @@ -124,7 +125,7 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str): segment_span["attributes"] = attributes -def _apply_clustering_rule( +def _apply_clustering_rules( project: Project, segment_span: CompatibleSpan, original_segment_name: str ): segment_name = attribute_value( @@ -133,40 +134,9 @@ def _apply_clustering_rule( assert segment_name is not None segment_name_parts = segment_name.split("/") - def apply_rule(rule: str) -> str | None: - output = [] - rule_parts = rule.split("/") - for i, rule_part in enumerate(rule_parts): - if i >= len(segment_name_parts): - # A segment name only matches a longer rule if the remainder of - # the rule is a multi-part wildcard (`**`). - if rule_part == "**": - break - else: - return None - - segment_name_part = segment_name_parts[i] - if rule_part == "**": - # `**`` matches the remainder of the segment name but does not replace it. - output.extend([part for part in segment_name_parts[i:]]) - break - - if rule_part == "*" and segment_name_part != "": - # `*` matches a single part and replaces it in the output with `*`. - output.append("*") - elif rule_part == segment_name_part: - # The segment name part and rule part match, so keep applying this rule. - output.append(segment_name_part) - else: - # If the segment name part and rule part didn't match, then this - # whole rule doesn't match. - return None - - return "/".join(output) - rules = get_sorted_rules(ClustererNamespace.TRANSACTIONS, project) for rule, _ in rules: - if clustered_name := apply_rule(rule): + if clustered_name := _apply_clustering_rule_to_segment_name(segment_name_parts, rule): segment_span["name"] = clustered_name attributes = segment_span.get("attributes") or {} attributes[ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME] = { @@ -192,3 +162,40 @@ def apply_rule(rule: str) -> str | None: } segment_span["attributes"] = attributes return + + +def _apply_clustering_rule_to_segment_name( + segment_name_parts: Sequence[str], rule: str +) -> str | None: + """Tries to apply the given `rule` to the segment name given as a + `/`-separated sequence in `segment_name_parts`. Returns a clustered segment + name if the rule matches, or `None` if it does not.""" + output = [] + rule_parts = rule.split("/") + for i, rule_part in enumerate(rule_parts): + if i >= len(segment_name_parts): + # A segment name only matches a longer rule if the remainder of + # the rule is a multi-part wildcard (`**`). + if rule_part == "**": + break + else: + return None + + segment_name_part = segment_name_parts[i] + if rule_part == "**": + # `**`` matches the remainder of the segment name but does not replace it. + output.extend([part for part in segment_name_parts[i:]]) + break + + if rule_part == "*" and segment_name_part != "": + # `*` matches a single part and replaces it in the output with `*`. + output.append("*") + elif rule_part == segment_name_part: + # The segment name part and rule part match, so keep applying this rule. + output.append(segment_name_part) + else: + # If the segment name part and rule part didn't match, then this + # whole rule doesn't match. + return None + + return "/".join(output) diff --git a/tests/sentry/ingest/transaction_clusterer/test_normalization.py b/tests/sentry/ingest/transaction_clusterer/test_normalization.py index d9ff86cdcbb4ed..700a6a4441eef8 100644 --- a/tests/sentry/ingest/transaction_clusterer/test_normalization.py +++ b/tests/sentry/ingest/transaction_clusterer/test_normalization.py @@ -138,8 +138,8 @@ def test_no_meta_changes_if_no_name_changes(default_project: mock.MagicMock): id="matches first wildcards with suffix", ), pytest.param( - "/users/my-user-name/settings", - "/users/*/settings", + "/users/my-user-name/settings/", + "/users/*/settings/", "/users/*/**", id="matches first wildcards with suffix (trailing slash)", ), @@ -218,7 +218,7 @@ def test_no_meta_changes_if_no_name_changes(default_project: mock.MagicMock): ], ) def test_clusterer_applies_rules( - mock_get_sorted_rules: mock.MagicMock, + _mock_get_sorted_rules: mock.MagicMock, segment_name: str, expected: str, expected_rule: str | None, @@ -252,7 +252,7 @@ def test_clusterer_applies_rules( return_value=[("/users/*/**", 0)], ) def test_clusterer_works_with_scrubbing( - mock_get_sorted_rules: mock.MagicMock, + _mock_get_sorted_rules: mock.MagicMock, default_project: mock.MagicMock, ): segment_name = "/users/my-user-name/94576097f3a64b68b85a59c7d4e3ee2a"