1 change: 1 addition & 0 deletions pyproject.toml
@@ -80,6 +80,7 @@ dependencies = [
"rfc3986-validator>=0.1.1",
# [end] jsonschema format validators
"sentry-arroyo>=2.33.1",
"sentry-conventions>=0.3.0",
"sentry-forked-email-reply-parser>=0.5.12.post1",
"sentry-kafka-schemas>=2.1.15",
"sentry-ophio>=1.1.3",
2 changes: 2 additions & 0 deletions src/sentry/features/temporary.py
@@ -219,6 +219,8 @@ def register_temporary_features(manager: FeatureManager) -> None:
    manager.add("organizations:more-workflows", OrganizationFeature, FeatureHandlerStrategy.INTERNAL, api_expose=False)
    # Generate charts using detector/open period payload
    manager.add("organizations:new-metric-issue-charts", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
    # Normalize segment names during span enrichment
    manager.add("organizations:normalize_segment_names_in_span_enrichment", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=False)
    # Extract on demand metrics
    manager.add("organizations:on-demand-metrics-extraction", OrganizationFeature, FeatureHandlerStrategy.FLAGPOLE, api_expose=True)
    # Extract on demand metrics (experimental features)
115 changes: 115 additions & 0 deletions src/sentry/ingest/transaction_clusterer/normalization.py
@@ -0,0 +1,115 @@
import re
from dataclasses import dataclass

import orjson
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.spans.consumers.process_segments.types import CompatibleSpan, attribute_value

# Ported from Relay:
# https://github.com/getsentry/relay/blob/aad4b6099d12422e88dd5df49abae11247efdd99/relay-event-normalization/src/regexes.rs#L9
TRANSACTION_NAME_NORMALIZER_REGEX = re.compile(
r"""(?x)
(?P<uuid>[^/\\]*
(?a:\b)[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}(?a:\b)
[^/\\]*) |
(?P<sha1>[^/\\]*
(?a:\b)[0-9a-fA-F]{40}(?a:\b)
[^/\\]*) |
(?P<md5>[^/\\]*
(?a:\b)[0-9a-fA-F]{32}(?a:\b)
[^/\\]*) |
(?P<date>[^/\\]*
(?:
(?:[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]\.[0-9]+([+-][0-2][0-9]:[0-5][0-9]|Z))|
(?:[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]:[0-5][0-9]([+-][0-2][0-9]:[0-5][0-9]|Z))|
(?:[0-9]{4}-[01][0-9]-[0-3][0-9]T[0-2][0-9]:[0-5][0-9]([+-][0-2][0-9]:[0-5][0-9]|Z))
) |
(?:
(?a:\b)(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat)(?a:\s)+)?
(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?a:\s)+
(?:[0-9]{1,2})(?a:\s)+
(?:[0-9]{2}:[0-9]{2}:[0-9]{2})(?a:\s)+
[0-9]{4}
) |
(?:
(?a:\b)(?:(Sun|Mon|Tue|Wed|Thu|Fri|Sat),(?a:\s)+)?
(?:0[1-9]|[1-2]?[0-9]|3[01])(?a:\s)+
(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?a:\s)+
(?:19[0-9]{2}|[2-9][0-9]{3})(?a:\s)+
(?:2[0-3]|[0-1][0-9]):([0-5][0-9])
(?::(60|[0-5][0-9]))?(?a:\s)+
(?:[-\+][0-9]{2}[0-5][0-9]|(?:UT|GMT|(?:E|C|M|P)(?:ST|DT)|[A-IK-Z]))
)
[^/\\]*) |
(?P<hex>[^/\\]*
(?a:\b)0[xX][0-9a-fA-F]+(?a:\b)
[^/\\]*) |
(?:^|[/\\])
(?P<int>
(:?[^%/\\]|%[0-9a-fA-F]{2})*[0-9]{2,}
[^/\\]*)""",
re.UNICODE,
)


def normalize_segment_name(segment_span: CompatibleSpan):
    segment_name = attribute_value(
        segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME
    ) or segment_span.get("name")
    if segment_name:
        _scrub_identifiers(segment_span, segment_name)


@dataclass(frozen=True)
class Remark:
    ty: str
    rule_id: str
    range: tuple[int, int]

    def serialize(self) -> list:
        return [self.rule_id, self.ty, self.range[0], self.range[1]]


# Ported from Relay:
# https://github.com/getsentry/relay/blob/aad4b6099d12422e88dd5df49abae11247efdd99/relay-event-normalization/src/transactions/processor.rs#L350
def _scrub_identifiers(segment_span: CompatibleSpan, segment_name: str):
    matches = TRANSACTION_NAME_NORMALIZER_REGEX.finditer(segment_name)
    remarks = []
    for m in matches:
        remarks.extend(
            [
                Remark(ty="s", rule_id=group_name, range=(m.start(group_name), m.end(group_name)))
                for group_name in m.groupdict().keys()
                if m.start(group_name) > -1
            ]
        )
    if len(remarks) == 0:
        return

    remarks.sort(key=lambda remark: remark.range[1])
    str_parts: list[str] = []
    last_end = 0
    for remark in remarks:
        start, end = remark.range
        str_parts.append(segment_name[last_end:start])
        str_parts.append("*")
        last_end = end
    str_parts.append(segment_name[last_end:])
    normalized_segment_name = "".join(str_parts)

    segment_span["name"] = normalized_segment_name
    attributes = segment_span.get("attributes") or {}
    attributes[ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME] = {
        "type": "string",
        "value": normalized_segment_name,
    }
    attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = {
        "type": "string",
        "value": "sanitized",
    }
    attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] = {
        "type": "string",
        "value": orjson.dumps({"meta": {"": {"rem": [r.serialize() for r in remarks]}}}).decode(),
    }
    segment_span["attributes"] = attributes
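As a quick illustration of how the scrubbing above behaves (a sketch, not code from this PR; the import path is the one added in this diff): each named group of TRANSACTION_NAME_NORMALIZER_REGEX marks one identifier-like chunk of the name, and _scrub_identifiers records a Remark per match before collapsing the matched range to "*". For a path like the one used in the tests below, the int alternative wins over sha1 because its (?:^|[/\\]) prefix lets the match start at the preceding slash, which is why the tests expect "int" remarks rather than "sha1":

    from sentry.ingest.transaction_clusterer.normalization import (
        TRANSACTION_NAME_NORMALIZER_REGEX,
        normalize_segment_name,
    )

    name = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"

    # Show which named group matched each identifier-like chunk, with offsets.
    for m in TRANSACTION_NAME_NORMALIZER_REGEX.finditer(name):
        print({g: m.span(g) for g in m.groupdict() if m.start(g) > -1})
    # -> {'int': (5, 45)} then {'int': (51, 54)}

    # Running the normalizer rewrites the name in place and records each remark
    # as [rule_id, "s", start, end] under the sentry._meta.* attribute.
    # (A bare dict is used here purely for illustration; real callers pass a
    # full CompatibleSpan payload.)
    span = {"name": name}
    normalize_segment_name(span)
    print(span["name"])  # -> "/foo/*/user/*/0"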
24 changes: 23 additions & 1 deletion src/sentry/spans/consumers/process_segments/message.py
@@ -6,12 +6,14 @@

import sentry_sdk
from django.core.exceptions import ValidationError
from sentry_conventions.attributes import ATTRIBUTE_NAMES
from sentry_kafka_schemas.schema_types.ingest_spans_v1 import SpanEvent

from sentry import options
from sentry import features, options
from sentry.constants import DataCategory
from sentry.dynamic_sampling.rules.helpers.latest_releases import record_latest_release
from sentry.event_manager import INSIGHT_MODULE_TO_PROJECT_FLAG_NAME
from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name
from sentry.insights import FilterSpan
from sentry.insights import modules as insights_modules
from sentry.issue_detection.performance_detection import detect_performance_problems
@@ -35,6 +37,7 @@
from sentry.utils.dates import to_datetime
from sentry.utils.outcomes import Outcome, OutcomeAggregator
from sentry.utils.projectflags import set_project_flag_and_signal
from sentry.utils.safe import safe_execute

logger = logging.getLogger(__name__)

@@ -61,6 +64,7 @@ def process_segment(
        # If the project does not exist then it might have been deleted during ingestion.
        return []

    safe_execute(_normalize_segment_name, segment_span, project.organization)
    _add_segment_name(segment_span, spans)
    _compute_breakdowns(segment_span, spans, project)
    _create_models(segment_span, project)
@@ -140,6 +144,24 @@ def _enrich_spans(
    return segment, spans


@metrics.wraps("spans.consumers.process_segments.normalize_segment_name")
def _normalize_segment_name(segment_span: CompatibleSpan, organization: Organization) -> None:
    if not features.has("organizations:normalize_segment_names_in_span_enrichment", organization):
        return

    segment_name = attribute_value(
        segment_span, ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME
    ) or segment_span.get("name")
    if not segment_name:
        return

    source = attribute_value(segment_span, ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE)
    unknown_if_parameterized = not source
    known_to_be_unparameterized = source == "url"
    if unknown_if_parameterized or known_to_be_unparameterized:
        normalize_segment_name(segment_span)


@metrics.wraps("spans.consumers.process_segments.add_segment_name")
def _add_segment_name(segment: CompatibleSpan, spans: Sequence[CompatibleSpan]) -> None:
    segment_name = segment.get("name")
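For context on the gating in _normalize_segment_name (again a sketch, not part of the change; should_normalize is a hypothetical helper used only to illustrate the condition): only names whose parameterization status is unknown, or that are known to be raw URLs, get rewritten; an explicit source such as "route" is trusted and left alone:

    def should_normalize(source: str | None) -> bool:
        unknown_if_parameterized = not source          # attribute missing or empty
        known_to_be_unparameterized = source == "url"  # raw URL, never parameterized
        return unknown_if_parameterized or known_to_be_unparameterized

    assert should_normalize(None) is True       # no sentry.span.source attribute
    assert should_normalize("url") is True      # raw URL -> scrub identifiers
    assert should_normalize("route") is False   # route template, already parameterized
    assert should_normalize("custom") is False  # explicitly named -> leave untouched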
Empty file.
89 changes: 89 additions & 0 deletions tests/sentry/ingest/transaction_clusterer/test_normalization.py
@@ -0,0 +1,89 @@
import orjson
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.ingest.transaction_clusterer.normalization import normalize_segment_name
from sentry.spans.consumers.process_segments.types import CompatibleSpan


def _segment_span(**kwargs) -> CompatibleSpan:
    segment_span: CompatibleSpan = {
        "organization_id": 1,
        "project_id": 1,
        "trace_id": "94576097f3a64b68b85a59c7d4e3ee2a",
        "span_id": "a49b42af9fb69da0",
        "start_timestamp": 1707953018.865,
        "end_timestamp": 1707953018.972,
        "retention_days": 90,
        "received": 1707953019.044972,
        "status": "ok",
        "exclusive_time": 0.1,
        "op": "default",
        "sentry_tags": {},
        "name": "default",
    }
    segment_span.update(**kwargs)  # type:ignore[call-arg]
    return segment_span


# Ported from Relay:
# https://github.com/getsentry/relay/blob/aad4b6099d12422e88dd5df49abae11247efdd99/relay-event-normalization/src/transactions/processor.rs#L789
def test_identifiers_scrubbed():
    segment_span = _segment_span(name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0")

    normalize_segment_name(segment_span)

    assert segment_span["name"] == "/foo/*/user/*/0"
    attributes = segment_span.get("attributes") or {}
    assert attributes[ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME] == {
        "type": "string",
        "value": "/foo/*/user/*/0",
    }
    assert attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] == {
        "type": "string",
        "value": "sanitized",
    }
    assert attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] == {
        "type": "string",
        "value": orjson.dumps(
            {"meta": {"": {"rem": [["int", "s", 5, 45], ["int", "s", 51, 54]]}}}
        ).decode(),
    }


def test_name_attribute_takes_precedence_over_name():
    segment_span = _segment_span(
        name="/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0",
        attributes={
            ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME: {
                "type": "string",
                "value": "/bar/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
            }
        },
    )

    normalize_segment_name(segment_span)

    assert segment_span["name"] == "/bar/*"
    attributes = segment_span.get("attributes") or {}
    assert attributes[ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME] == {
        "type": "string",
        "value": "/bar/*",
    }
    assert attributes[ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] == {
        "type": "string",
        "value": "sanitized",
    }
    assert attributes[f"sentry._meta.fields.attributes.{ATTRIBUTE_NAMES.SENTRY_SEGMENT_NAME}"] == {
        "type": "string",
        "value": orjson.dumps({"meta": {"": {"rem": [["int", "s", 5, 45]]}}}).decode(),
    }


def test_no_meta_changes_if_no_name_changes():
    segment_span = _segment_span(name="/foo")

    normalize_segment_name(segment_span)

    assert segment_span["name"] == "/foo"
    attributes = segment_span.get("attributes") or {}
    assert len(attributes) == 0
37 changes: 37 additions & 0 deletions tests/sentry/spans/consumers/process_segments/test_message.py
@@ -4,12 +4,14 @@
from unittest import mock

import pytest
from sentry_conventions.attributes import ATTRIBUTE_NAMES

from sentry.issues.grouptype import PerformanceStreamedSpansGroupTypeExperimental
from sentry.models.environment import Environment
from sentry.models.release import Release
from sentry.spans.consumers.process_segments.message import _verify_compatibility, process_segment
from sentry.testutils.cases import TestCase
from sentry.testutils.helpers.features import Feature
from sentry.testutils.helpers.options import override_options
from sentry.testutils.issue_detection.experiments import exclude_experimental_detectors
from tests.sentry.spans.consumers.process import build_mock_span
@@ -272,6 +274,41 @@ def test_segment_name_propagation_when_name_missing(self):
        child_attributes = child_span["attributes"] or {}
        assert child_attributes.get("sentry.segment.name") is None

    def test_segment_name_normalization_with_feature(self):
        _, segment_span = self.generate_basic_spans()
        segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"

        with self.feature("organizations:normalize_segment_names_in_span_enrichment"):
            processed_spans = process_segment([segment_span])

        assert processed_spans[0]["name"] == "/foo/*/user/*/0"

    def test_segment_name_normalization_without_feature(self):
        _, segment_span = self.generate_basic_spans()
        segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"

        with Feature({"organizations:normalize_segment_names_in_span_enrichment": False}):
            processed_spans = process_segment([segment_span])

        assert (
            processed_spans[0]["name"] == "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
        )

    def test_segment_name_normalization_checks_source(self):
        _, segment_span = self.generate_basic_spans()
        segment_span["name"] = "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
        segment_span["attributes"][ATTRIBUTE_NAMES.SENTRY_SPAN_SOURCE] = {
            "type": "string",
            "value": "route",
        }

        with self.feature("organizations:normalize_segment_names_in_span_enrichment"):
            processed_spans = process_segment([segment_span])

        assert (
            processed_spans[0]["name"] == "/foo/2fd4e1c67a2d28fced849ee1bb76e7391b93eb12/user/123/0"
        )


def test_verify_compatibility():
    spans: list[dict[str, Any]] = [
10 changes: 10 additions & 0 deletions uv.lock
