diff --git a/src/sentry/ingest/transaction_clusterer/normalization.py b/src/sentry/ingest/transaction_clusterer/normalization.py index 0214f78506e00b..a0c046ac6360c0 100644 --- a/src/sentry/ingest/transaction_clusterer/normalization.py +++ b/src/sentry/ingest/transaction_clusterer/normalization.py @@ -1,10 +1,14 @@ import re +from collections.abc import Sequence from dataclasses import dataclass import orjson from sentry_conventions.attributes import ATTRIBUTE_NAMES +from sentry.ingest.transaction_clusterer import ClustererNamespace from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED +from sentry.ingest.transaction_clusterer.rules import get_sorted_rules +from sentry.models.project import Project from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value # Ported from Relay: @@ -54,22 +58,26 @@ ) -def normalize_segment_name(segment_span: CompatibleSpan): +def normalize_segment_name(project: Project, segment_span: CompatibleSpan): segment_name = attribute_value( segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME ) or segment_span.get("name") if segment_name: _scrub_identifiers(segment_span, segment_name) + _apply_clustering_rules(project, segment_span, segment_name) @dataclass(frozen=True) class Remark: ty: str rule_id: str - range: tuple[int, int] + range: tuple[int, int] | None def serialize(self) -> list: - return [self.rule_id, self.ty, self.range[0], self.range[1]] + serialized: list[str | int] = [self.rule_id, self.ty] + if self.range: + serialized.extend([self.range[0], self.range[1]]) + return serialized # Ported from Relay: @@ -88,10 +96,11 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str): if len(remarks) == 0: return - remarks.sort(key=lambda remark: remark.range[1]) + remarks.sort(key=lambda remark: remark.range[1]) # type: ignore[index] str_parts: list[str] = [] last_end = 0 for remark in remarks: + assert remark.range is not None # for typing - always set above start, end = remark.range str_parts.append(segment_name[last_end:start]) str_parts.append("*") @@ -114,3 +123,79 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str): "value": orjson.dumps({"meta": {"": {"rem": [r.serialize() for r in remarks]}}}).decode(), } segment_span["attributes"] = attributes + + +def _apply_clustering_rules( + project: Project, segment_span: CompatibleSpan, original_segment_name: str +): + segment_name = attribute_value( + segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME + ) or segment_span.get("name") + assert segment_name is not None + segment_name_parts = segment_name.split("/") + + rules = get_sorted_rules(ClustererNamespace.TRANSACTIONS, project) + for rule, _ in rules: + if clustered_name := _apply_clustering_rule_to_segment_name(segment_name_parts, rule): + segment_span["name"] = clustered_name + attributes = segment_span.get("attributes") or {} + attributes[ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME] = { + "type": "string", + "value": clustered_name, + } + attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = { + "type": "string", + "value": TRANSACTION_SOURCE_SANITIZED, + } + attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = { + "type": "string", + "value": orjson.dumps( + { + "meta": { + "": { + "val": original_segment_name, + "rem": [Remark(ty="s", rule_id=rule, range=None).serialize()], + } + } + }, + ).decode(), + } + segment_span["attributes"] = attributes + return + + +def _apply_clustering_rule_to_segment_name( + segment_name_parts: Sequence[str], rule: str +) -> str | None: + """Tries to apply the given `rule` to the segment name given as a + `/`-separated sequence in `segment_name_parts`. Returns a clustered segment + name if the rule matches, or `None` if it does not.""" + output = [] + rule_parts = rule.split("/") + for i, rule_part in enumerate(rule_parts): + if i >= len(segment_name_parts): + # A segment name only matches a longer rule if the remainder of + # the rule is a multi-part wildcard (`**`). + if rule_part == "**": + break + else: + return None + + segment_name_part = segment_name_parts[i] + if rule_part == "**": + # `**`` matches the remainder of the segment name but does not replace it. + output.extend([part for part in segment_name_parts[i:]]) + break + + if rule_part == "*" and segment_name_part != "": + # `*` matches a single part and replaces it in the output with `*`. + output.append("*") + elif rule_part == segment_name_part: + # The segment name part and rule part match, so keep applying this rule. + output.append(segment_name_part) + else: + # If the segment name part and rule part didn't match, then this + # whole rule doesn't match. + return None + + return "/".join(output) diff --git a/src/sentry/spans/consumers/process_segments/message.py b/src/sentry/spans/consumers/process_segments/message.py index 43fa8757e23146..978fda1cd5a41f 100644 --- a/src/sentry/spans/consumers/process_segments/message.py +++ b/src/sentry/spans/consumers/process_segments/message.py @@ -163,7 +163,7 @@ def _normalize_segment_name(segment_span: CompatibleSpan, project: Project) -> N unknown_if_parameterized = not source known_to_be_unparameterized = source == TRANSACTION_SOURCE_URL if unknown_if_parameterized or known_to_be_unparameterized: - normalize_segment_name(segment_span) + normalize_segment_name(project, segment_span) record_segment_name(project, segment_span) diff --git a/tests/sentry/ingest/transaction_clusterer/test_normalization.py b/tests/sentry/ingest/transaction_clusterer/test_normalization.py index 01c9b9d0cf1962..700a6a4441eef8 100644 --- a/tests/sentry/ingest/transaction_clusterer/test_normalization.py +++ b/tests/sentry/ingest/transaction_clusterer/test_normalization.py @@ -1,8 +1,13 @@ +from unittest import mock + import orjson +import pytest from sentry_conventions.attributes import ATTRIBUTE_NAMES +from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name -from sentry.spans.consumers.process_segments.types import CompatibleSpan +from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value +from sentry.testutils.pytest.fixtures import django_db_all def _segment_span(**kwargs) -> CompatibleSpan: @@ -27,10 +32,13 @@ def _segment_span(**kwargs) -> CompatibleSpan: # Ported from Relay: # https://github.com/getsentry/relay/blob/aad4b6099d12422e88dd5df49abae11247efdd99/relay-event-normalization/src/transactions/processor.rs#L789 -def test_identifiers_scrubbed(): - segment_span = _segment_span(name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0") +@django_db_all +def test_identifiers_scrubbed(default_project: mock.MagicMock): + segment_span = _segment_span( + name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0", + ) - normalize_segment_name(segment_span) + normalize_segment_name(default_project, segment_span) assert segment_span["name"] == "/foo/*/user/*/0" attributes = segment_span.get("attributes") or {} @@ -50,7 +58,8 @@ def test_identifiers_scrubbed(): } -def test_name_attribute_takes_precedence_over_name(): +@django_db_all +def test_name_attribute_takes_precedence_over_name(default_project: mock.MagicMock): segment_span = _segment_span( name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0", attributes={ @@ -61,7 +70,7 @@ def test_name_attribute_takes_precedence_over_name(): }, ) - normalize_segment_name(segment_span) + normalize_segment_name(default_project, segment_span) assert segment_span["name"] == "/bar/*" attributes = segment_span.get("attributes") or {} @@ -79,11 +88,191 @@ def test_name_attribute_takes_precedence_over_name(): } -def test_no_meta_changes_if_no_name_changes(): +@django_db_all +def test_no_meta_changes_if_no_name_changes(default_project: mock.MagicMock): segment_span = _segment_span(name="/foo") - normalize_segment_name(segment_span) + normalize_segment_name(default_project, segment_span) assert segment_span["name"] == "/foo" attributes = segment_span.get("attributes") or {} assert len(attributes) == 0 + + +@django_db_all +@mock.patch( + "sentry.ingest.transaction_clusterer.normalization.get_sorted_rules", + return_value=[("/users/*/posts/*/**", 0), ("/users/*/**", 0), ("GET /users/*/**", 0)], +) +@pytest.mark.parametrize( + ("segment_name", "expected", "expected_rule"), + [ + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something", + "/users/*/posts/*", + "/users/*/posts/*/**", + id="matches both wildcards", + ), + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/", + "/users/*/posts/*/", + "/users/*/posts/*/**", + id="matches both wildcards (trailing slash)", + ), + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/comments", + "/users/*/posts/*/comments", + "/users/*/posts/*/**", + id="matches both wildcards with suffix", + ), + pytest.param( + "/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/comments/", + "/users/*/posts/*/comments/", + "/users/*/posts/*/**", + id="matches both wildcards with suffix (trailing slash)", + ), + pytest.param( + "/users/my-user-name/settings", + "/users/*/settings", + "/users/*/**", + id="matches first wildcards with suffix", + ), + pytest.param( + "/users/my-user-name/settings/", + "/users/*/settings/", + "/users/*/**", + id="matches first wildcards with suffix (trailing slash)", + ), + pytest.param( + "/users/my-user-name", + "/users/*", + "/users/*/**", + id="matches first wildcard", + ), + pytest.param( + "/users/my-user-name/", + "/users/*/", + "/users/*/**", + id="matches first wildcard (trailing slash)", + ), + pytest.param( + "/users", + "/users", + None, + id="ends before first wildcard", + ), + pytest.param( + "/users/", + "/users/", + None, + id="ends before first wildcard (trailing slash)", + ), + pytest.param( + "/user", + "/user", + None, + id="matching text prefix", + ), + pytest.param( + "/user/", + "/user/", + None, + id="matching text prefix (trailing slash)", + ), + pytest.param( + "/foo", + "/foo", + None, + id="unrelated text prefix", + ), + pytest.param( + "/foo/", + "/foo/", + None, + id="unrelated text prefix (trailing slash)", + ), + pytest.param( + "GET /users/my-user-name", + "GET /users/*", + "GET /users/*/**", + id="method prefix: matches wildcard", + ), + pytest.param( + "GET /users/my-user-name/", + "GET /users/*/", + "GET /users/*/**", + id="method prefix: matches wildcard (trailing slash)", + ), + pytest.param( + "GET /users", + "GET /users", + None, + id="method prefix: ends before wildcard", + ), + pytest.param( + "GET /users/", + "GET /users/", + None, + id="method prefix: ends before wildcard (trailing slash)", + ), + ], +) +def test_clusterer_applies_rules( + _mock_get_sorted_rules: mock.MagicMock, + segment_name: str, + expected: str, + expected_rule: str | None, + default_project: mock.MagicMock, +): + segment_span = _segment_span(name=segment_name) + + normalize_segment_name(default_project, segment_span) + + assert segment_span["name"] == expected + if segment_name != expected: + assert attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME) == expected + assert ( + attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE) + == TRANSACTION_SOURCE_SANITIZED + ) + assert ( + attribute_value( + segment_span, + f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}", + ) + == orjson.dumps( + {"meta": {"": {"val": segment_name, "rem": [[expected_rule, "s"]]}}} + ).decode() + ) + + +@django_db_all +@mock.patch( + "sentry.ingest.transaction_clusterer.normalization.get_sorted_rules", + return_value=[("/users/*/**", 0)], +) +def test_clusterer_works_with_scrubbing( + _mock_get_sorted_rules: mock.MagicMock, + default_project: mock.MagicMock, +): + segment_name = "/users/my-user-name/94576097f3a64b68b85a59c7d4e3ee2a" + segment_span = _segment_span(name=segment_name) + + normalize_segment_name(default_project, segment_span) + + expected = "/users/*/*" + assert segment_span["name"] == expected + assert attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME) == expected + assert ( + attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE) + == TRANSACTION_SOURCE_SANITIZED + ) + assert ( + attribute_value( + segment_span, + f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}", + ) + == orjson.dumps( + {"meta": {"": {"val": segment_name, "rem": [["/users/*/**", "s"]]}}} + ).decode() + )