Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions src/sentry/replays/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,75 @@ def query_replay_instance(
)["data"]


def query_replay_id_by_prefix(
    project_ids: list[int],
    replay_id_prefix: str,
    start: datetime,
    end: datetime,
    organization: Organization | None = None,
) -> str | None:
    """
    Resolve a replay ID prefix to the first full replay ID found in the given
    time range and project list.

    The date range is chunked into 14 day windows, scanned newest to oldest,
    to avoid query timeouts.

    TODO: This query ignores the replay_id column index and can do large scans. At the moment it's only used for the Seer Explorer replay details tool.
    This is a good candidate for optimization, which can be done with a materialized string column for the first 8 chars, and a secondary index.
    Alternatively we can try more consistent ways of passing the full ID to Explorer.

    Args:
        project_ids: Project IDs to search.
        replay_id_prefix: Hex prefix of the replay ID, 8-31 characters.
            Case-insensitive; normalized to lowercase before querying.
        start: Inclusive lower bound of the timestamp range.
        end: Exclusive upper bound of the timestamp range.
        organization: If provided, used only to populate the Snuba tenant ID.

    Returns:
        The first matching full replay ID, or None if the prefix is invalid
        or no match is found in the range.
    """

    # Enforce length of 8-31 characters. A 32-character string is a full ID
    # and should be queried directly rather than via a prefix scan.
    if not (8 <= len(replay_id_prefix) < 32):
        return None

    # Enforce valid hex chars. Note: int(s, 16) is too lenient for this --
    # it also accepts surrounding whitespace, a leading sign, a "0x" prefix,
    # and digit-group underscores -- so check each character explicitly.
    replay_id_prefix = replay_id_prefix.lower()
    if not all(c in "0123456789abcdef" for c in replay_id_prefix):
        return None

    window_size = timedelta(days=14)
    window_end = end
    while window_end > start:
        window_start = max(window_end - window_size, start)

        query = Query(
            match=Entity("replays"),
            select=[Column("replay_id")],
            where=[
                Condition(Column("project_id"), Op.IN, project_ids),
                Condition(
                    Function(
                        "startsWith",
                        parameters=[
                            # replay_id is a UUID column; stringify so the
                            # prefix comparison is done on the hex text form.
                            Function("toString", parameters=[Column("replay_id")]),
                            replay_id_prefix,
                        ],
                    ),
                    Op.EQ,
                    1,
                ),
                Condition(Column("timestamp"), Op.GTE, window_start),
                Condition(Column("timestamp"), Op.LT, window_end),
            ],
            granularity=Granularity(3600),
            limit=Limit(1),
        )

        snuba_response = execute_query(
            query=query,
            tenant_id={"organization_id": organization.id} if organization else {},
            referrer="replays.query.short_id_details_query",
        )["data"]

        if snuba_response:
            return str(snuba_response[0]["replay_id"])

        # Slide the window back in time; windows are half-open [start, end).
        window_end = window_start

    return None


def query_replay_viewed_by_ids(
project_id: int | list[int],
replay_id: str,
Expand Down Expand Up @@ -973,6 +1042,7 @@ def get_replay_range(
project_id: int,
replay_id: str,
) -> tuple[datetime, datetime] | None:
"""Get the min and max timestamps for a replay. This query is redundant if you're already using query_replay_instance - use the started_at and finished_at fields instead."""
query = Query(
match=Entity("replays"),
select=[
Expand Down
101 changes: 73 additions & 28 deletions src/sentry/seer/explorer/tools.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import logging
import uuid
from datetime import UTC, datetime, timedelta, timezone
from typing import Any, Literal
from typing import Any, Literal, cast

from django.urls import reverse

from sentry import eventstore
from sentry import eventstore, features
from sentry.api import client
from sentry.api.serializers.base import serialize
from sentry.api.serializers.models.event import EventSerializer, IssueEventSerializerResponse
Expand All @@ -16,6 +15,8 @@
from sentry.models.organization import Organization
from sentry.models.project import Project
from sentry.models.repository import Repository
from sentry.replays.post_process import process_raw_response
from sentry.replays.query import query_replay_id_by_prefix, query_replay_instance
from sentry.search.eap.types import SearchResolverConfig
from sentry.search.events.types import SnubaParams
from sentry.seer.autofix.autofix import get_all_tags_overview
Expand Down Expand Up @@ -199,6 +200,9 @@ def get_trace_waterfall(trace_id: str, organization_id: int) -> EAPTrace | None:

# Get full trace id if a short id is provided. Queries EAP for a single span.
# Use sliding 14-day windows starting from most recent, up to 90 days in the past, to avoid timeouts.
# TODO: This query ignores the trace_id column index and can do large scans, and is a good candidate for optimization.
# This can be done with a materialized string column for the first 8 chars and a secondary index.
# Alternatively we can try more consistent ways of passing the full ID to Explorer.
if len(trace_id) < 32:
full_trace_id = None
now = datetime.now(timezone.utc)
Expand Down Expand Up @@ -515,15 +519,15 @@ def get_replay_metadata(
*,
replay_id: str,
organization_id: int,
project_id: int | None = None,
project_slug: str | None = None,
) -> dict[str, Any] | None:
"""
Get the metadata for a replay through an aggregate replay event query.

Args:
replay_id: The ID of the replay.
        replay_id: The ID of the replay. Either a valid UUID or an 8-character hex string prefix. If known, the full ID is recommended for performance.
organization_id: The ID of the organization the replay belongs to.
project_id: The projects to query. If not provided, all projects in the organization will be queried.
project_slug: The slug of the project to query. If not provided, all projects in the organization will be queried.

Returns:
A dict containing the metadata for the replay, or None if it's not found.
Expand All @@ -538,38 +542,79 @@ def get_replay_metadata(
)
return None

path = reverse(
"sentry-api-0-organization-replay-details",
args=(organization.slug, replay_id),
if not features.has("organizations:session-replay", organization):
return None

p_ids_and_slugs = list(
Project.objects.filter(
organization_id=organization.id,
status=ObjectStatus.ACTIVE,
**({"slug": project_slug} if project_slug else {}),
).values_list("id", "slug")
)
path = path.strip("/")[len("api/0") :] + "/"

params = {}
if project_id:
params["project"] = project_id
if not p_ids_and_slugs:
logger.warning(
"No projects found for given organization and project slug",
extra={"organization_id": organization_id, "project_slug": project_slug},
)
return None

resp = client.get(
auth=ApiKey(organization_id=organization.id, scope_list=["org:read", "project:read"]),
user=None,
path=path,
params=params,
)
start, end = default_start_end_dates()

if resp.status_code != 200 or not (resp.data or {}).get("data"):
if len(replay_id) < 32:
# Subquery for the full replay ID.
full_replay_id = query_replay_id_by_prefix(
project_ids=[id for id, _ in p_ids_and_slugs],
replay_id_prefix=replay_id,
start=start,
end=end,
organization=organization,
)
if not full_replay_id:
logger.warning(
"Replay short ID lookup failed",
extra={"replay_id": replay_id, "organization_id": organization_id},
)
return None

replay_id = full_replay_id

try:
replay_id = str(
uuid.UUID(replay_id)
) # Normalizing with dashes is recommended for the query.
except ValueError:
logger.warning(
"Failed to get replay metadata",
"Invalid replay ID", extra={"replay_id": replay_id, "organization_id": organization_id}
)
return None

snuba_response = query_replay_instance(
project_id=[id for id, _ in p_ids_and_slugs],
replay_id=replay_id,
start=start,
end=end,
organization=organization,
request_user_id=None,
)
response = process_raw_response(
snuba_response,
fields=[],
)
if not response:
logger.warning(
"Replay instance not found - no data returned from query",
extra={
"replay_id": replay_id,
"organization_id": organization_id,
"project_id": project_id,
"status_code": resp.status_code,
},
)
return None

# Add project_slug field.
result = resp.data["data"]
project = Project.objects.get(id=result["project_id"])
result["project_slug"] = project.slug

result = cast(dict[str, Any], response[0])
result["project_slug"] = next(
filter(lambda x: x[0] == int(result["project_id"]), p_ids_and_slugs)
)[1]
return result
Loading
Loading