Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 89 additions & 4 deletions src/sentry/ingest/transaction_clusterer/normalization.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import re
from collections.abc import Sequence
from dataclasses import dataclass

import orjson
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.ingest.transaction_clusterer import ClustererNamespace
from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED
from sentry.ingest.transaction_clusterer.rules import get_sorted_rules
from sentry.models.project import Project
from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value

# Ported from Relay:
Expand Down Expand Up @@ -54,22 +58,26 @@
)


def normalize_segment_name(segment_span: CompatibleSpan):
def normalize_segment_name(project: Project, segment_span: CompatibleSpan):
segment_name = attribute_value(
segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME
) or segment_span.get("name")
if segment_name:
_scrub_identifiers(segment_span, segment_name)
_apply_clustering_rules(project, segment_span, segment_name)


@dataclass(frozen=True)
class Remark:
ty: str
rule_id: str
range: tuple[int, int]
range: tuple[int, int] | None

def serialize(self) -> list:
return [self.rule_id, self.ty, self.range[0], self.range[1]]
serialized: list[str | int] = [self.rule_id, self.ty]
if self.range:
serialized.extend([self.range[0], self.range[1]])
return serialized


# Ported from Relay:
Expand All @@ -88,10 +96,11 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str):
if len(remarks) == 0:
return

remarks.sort(key=lambda remark: remark.range[1])
remarks.sort(key=lambda remark: remark.range[1]) # type: ignore[index]
str_parts: list[str] = []
last_end = 0
for remark in remarks:
assert remark.range is not None # for typing - always set above
start, end = remark.range
str_parts.append(segment_name[last_end:start])
str_parts.append("*")
Expand All @@ -114,3 +123,79 @@ def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str):
"value": orjson.dumps({"meta": {"": {"rem": [r.serialize() for r in remarks]}}}).decode(),
}
segment_span["attributes"] = attributes


def _apply_clustering_rules(
project: Project, segment_span: CompatibleSpan, original_segment_name: str
):
segment_name = attribute_value(
segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME
) or segment_span.get("name")
assert segment_name is not None
segment_name_parts = segment_name.split("/")

rules = get_sorted_rules(ClustererNamespace.TRANSACTIONS, project)
for rule, _ in rules:
if clustered_name := _apply_clustering_rule_to_segment_name(segment_name_parts, rule):
segment_span["name"] = clustered_name
attributes = segment_span.get("attributes") or {}
attributes[ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME] = {
"type": "string",
"value": clustered_name,
}
attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = {
"type": "string",
"value": TRANSACTION_SOURCE_SANITIZED,
}
attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = {
"type": "string",
"value": orjson.dumps(
{
"meta": {
"": {
"val": original_segment_name,
"rem": [Remark(ty="s", rule_id=rule, range=None).serialize()],
}
}
},
).decode(),
}
segment_span["attributes"] = attributes
return


def _apply_clustering_rule_to_segment_name(
segment_name_parts: Sequence[str], rule: str
) -> str | None:
"""Tries to apply the given `rule` to the segment name given as a
`/`-separated sequence in `segment_name_parts`. Returns a clustered segment
name if the rule matches, or `None` if it does not."""
output = []
rule_parts = rule.split("/")
for i, rule_part in enumerate(rule_parts):
if i >= len(segment_name_parts):
# A segment name only matches a longer rule if the remainder of
# the rule is a multi-part wildcard (`**`).
if rule_part == "**":
break
else:
return None

segment_name_part = segment_name_parts[i]
if rule_part == "**":
# `**`` matches the remainder of the segment name but does not replace it.
output.extend([part for part in segment_name_parts[i:]])
break

if rule_part == "*" and segment_name_part != "":
# `*` matches a single part and replaces it in the output with `*`.
output.append("*")
elif rule_part == segment_name_part:
# The segment name part and rule part match, so keep applying this rule.
output.append(segment_name_part)
else:
# If the segment name part and rule part didn't match, then this
# whole rule doesn't match.
return None

return "/".join(output)
2 changes: 1 addition & 1 deletion src/sentry/spans/consumers/process_segments/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def _normalize_segment_name(segment_span: CompatibleSpan, project: Project) -> N
unknown_if_parameterized = not source
known_to_be_unparameterized = source == TRANSACTION_SOURCE_URL
if unknown_if_parameterized or known_to_be_unparameterized:
normalize_segment_name(segment_span)
normalize_segment_name(project, segment_span)

record_segment_name(project, segment_span)

Expand Down
205 changes: 197 additions & 8 deletions tests/sentry/ingest/transaction_clusterer/test_normalization.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from unittest import mock

import orjson
import pytest
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.ingest.transaction_clusterer.datasource import TRANSACTION_SOURCE_SANITIZED
from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name
from sentry.spans.consumers.process_segments.types import CompatibleSpan
from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value
from sentry.testutils.pytest.fixtures import django_db_all


def _segment_span(**kwargs) -> CompatibleSpan:
Expand All @@ -27,10 +32,13 @@ def _segment_span(**kwargs) -> CompatibleSpan:

# Ported from Relay:
# https://github.com/getsentry/relay/blob/aad4b6099d12422e88dd5df49abae11247efdd99/relay-event-normalization/src/transactions/processor.rs#L789
def test_identifiers_scrubbed():
segment_span = _segment_span(name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0")
@django_db_all
def test_identifiers_scrubbed(default_project: mock.MagicMock):
segment_span = _segment_span(
name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0",
)

normalize_segment_name(segment_span)
normalize_segment_name(default_project, segment_span)

assert segment_span["name"] == "/foo/*/user/*/0"
attributes = segment_span.get("attributes") or {}
Expand All @@ -50,7 +58,8 @@ def test_identifiers_scrubbed():
}


def test_name_attribute_takes_precedence_over_name():
@django_db_all
def test_name_attribute_takes_precedence_over_name(default_project: mock.MagicMock):
segment_span = _segment_span(
name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0",
attributes={
Expand All @@ -61,7 +70,7 @@ def test_name_attribute_takes_precedence_over_name():
},
)

normalize_segment_name(segment_span)
normalize_segment_name(default_project, segment_span)

assert segment_span["name"] == "/bar/*"
attributes = segment_span.get("attributes") or {}
Expand All @@ -79,11 +88,191 @@ def test_name_attribute_takes_precedence_over_name():
}


def test_no_meta_changes_if_no_name_changes():
@django_db_all
def test_no_meta_changes_if_no_name_changes(default_project: mock.MagicMock):
segment_span = _segment_span(name="/foo")

normalize_segment_name(segment_span)
normalize_segment_name(default_project, segment_span)

assert segment_span["name"] == "/foo"
attributes = segment_span.get("attributes") or {}
assert len(attributes) == 0


@django_db_all
@mock.patch(
"sentry.ingest.transaction_clusterer.normalization.get_sorted_rules",
return_value=[("/users/*/posts/*/**", 0), ("/users/*/**", 0), ("GET /users/*/**", 0)],
)
@pytest.mark.parametrize(
("segment_name", "expected", "expected_rule"),
[
pytest.param(
"/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something",
"/users/*/posts/*",
"/users/*/posts/*/**",
id="matches both wildcards",
),
pytest.param(
"/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/",
"/users/*/posts/*/",
"/users/*/posts/*/**",
id="matches both wildcards (trailing slash)",
),
pytest.param(
"/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/comments",
"/users/*/posts/*/comments",
"/users/*/posts/*/**",
id="matches both wildcards with suffix",
),
pytest.param(
"/users/my-user-name/posts/2022-01-01-that-one-time-i-did-something/comments/",
"/users/*/posts/*/comments/",
"/users/*/posts/*/**",
id="matches both wildcards with suffix (trailing slash)",
),
pytest.param(
"/users/my-user-name/settings",
"/users/*/settings",
"/users/*/**",
id="matches first wildcards with suffix",
),
pytest.param(
"/users/my-user-name/settings/",
"/users/*/settings/",
"/users/*/**",
id="matches first wildcards with suffix (trailing slash)",
),
pytest.param(
"/users/my-user-name",
"/users/*",
"/users/*/**",
id="matches first wildcard",
),
pytest.param(
"/users/my-user-name/",
"/users/*/",
"/users/*/**",
id="matches first wildcard (trailing slash)",
),
pytest.param(
"/users",
"/users",
None,
id="ends before first wildcard",
),
pytest.param(
"/users/",
"/users/",
None,
id="ends before first wildcard (trailing slash)",
),
pytest.param(
"/user",
"/user",
None,
id="matching text prefix",
),
pytest.param(
"/user/",
"/user/",
None,
id="matching text prefix (trailing slash)",
),
pytest.param(
"/foo",
"/foo",
None,
id="unrelated text prefix",
),
pytest.param(
"/foo/",
"/foo/",
None,
id="unrelated text prefix (trailing slash)",
),
pytest.param(
"GET /users/my-user-name",
"GET /users/*",
"GET /users/*/**",
id="method prefix: matches wildcard",
),
pytest.param(
"GET /users/my-user-name/",
"GET /users/*/",
"GET /users/*/**",
id="method prefix: matches wildcard (trailing slash)",
),
pytest.param(
"GET /users",
"GET /users",
None,
id="method prefix: ends before wildcard",
),
pytest.param(
"GET /users/",
"GET /users/",
None,
id="method prefix: ends before wildcard (trailing slash)",
),
],
)
def test_clusterer_applies_rules(
_mock_get_sorted_rules: mock.MagicMock,
segment_name: str,
expected: str,
expected_rule: str | None,
default_project: mock.MagicMock,
):
segment_span = _segment_span(name=segment_name)

normalize_segment_name(default_project, segment_span)

assert segment_span["name"] == expected
if segment_name != expected:
assert attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME) == expected
assert (
attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
== TRANSACTION_SOURCE_SANITIZED
)
assert (
attribute_value(
segment_span,
f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}",
)
== orjson.dumps(
{"meta": {"": {"val": segment_name, "rem": [[expected_rule, "s"]]}}}
).decode()
)


@django_db_all
@mock.patch(
"sentry.ingest.transaction_clusterer.normalization.get_sorted_rules",
return_value=[("/users/*/**", 0)],
)
def test_clusterer_works_with_scrubbing(
_mock_get_sorted_rules: mock.MagicMock,
default_project: mock.MagicMock,
):
segment_name = "/users/my-user-name/94576097f3a64b68b85a59c7d4e3ee2a"
segment_span = _segment_span(name=segment_name)

normalize_segment_name(default_project, segment_span)

expected = "/users/*/*"
assert segment_span["name"] == expected
assert attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME) == expected
assert (
attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
== TRANSACTION_SOURCE_SANITIZED
)
assert (
attribute_value(
segment_span,
f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}",
)
== orjson.dumps(
{"meta": {"": {"val": segment_name, "rem": [["/users/*/**", "s"]]}}}
).decode()
)
Loading