Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
270 changes: 210 additions & 60 deletions snuba/web/rpc/v1/endpoint_export_trace_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
from datetime import datetime
from typing import Any, Dict, Iterable, NamedTuple, Type, cast

import sentry_sdk
from google.protobuf.json_format import MessageToDict
from google.protobuf.timestamp_pb2 import Timestamp
from sentry_protos.snuba.v1.downsampled_storage_pb2 import DownsampledStorageConfig
from sentry_protos.snuba.v1.endpoint_trace_items_pb2 import (
ExportTraceItemsRequest,
ExportTraceItemsResponse,
)
from sentry_protos.snuba.v1.request_common_pb2 import PageToken, TraceItemType
from sentry_protos.snuba.v1.request_common_pb2 import PageToken, RequestMeta, TraceItemType
from sentry_protos.snuba.v1.trace_item_pb2 import AnyValue, ArrayValue, TraceItem

from snuba import state
Expand Down Expand Up @@ -36,6 +38,7 @@
)
from snuba.web.rpc.common.debug_info import setup_trace_query_settings
from snuba.web.rpc.common.exceptions import BadSnubaRPCRequestException
from snuba.web.rpc.storage_routing.routing_strategies.storage_routing import RoutingDecision

_DEFAULT_PAGE_SIZE = 10_000

Expand All @@ -48,76 +51,164 @@
TraceItemFilter,
)

# Attribute keys used to embed the flex-routing time window (unix seconds,
# half-open [start, end)) inside an export page token's filter list.
FLEX_WIN_START = "sentry__time_window.start_timestamp"
FLEX_WIN_END = "sentry__time_window.end_timestamp"


def _flex_time_window_filters(start_sec: int, end_sec: int) -> list[TraceItemFilter]:
    """Build the pair of comparison filters that encode the half-open flex
    time window [start_sec, end_sec) inside a page token.

    The inverse of this encoding is _parse_flex_window_from_filters.
    """
    lower_bound = TraceItemFilter(
        comparison_filter=ComparisonFilter(
            key=AttributeKey(name=FLEX_WIN_START),
            op=ComparisonFilter.OP_GREATER_THAN_OR_EQUALS,
            value=AttributeValue(val_int=start_sec),
        )
    )
    upper_bound = TraceItemFilter(
        comparison_filter=ComparisonFilter(
            key=AttributeKey(name=FLEX_WIN_END),
            op=ComparisonFilter.OP_LESS_THAN,
            value=AttributeValue(val_int=end_sec),
        )
    )
    return [lower_bound, upper_bound]


def _parse_flex_window_from_filters(filters: list[TraceItemFilter]) -> tuple[int, int] | None:
    """Extract the flex time window bounds encoded by _flex_time_window_filters.

    Returns (start_sec, end_sec) when both window fields are present,
    None when neither is, and raises ValueError when only one of the two
    bounds was encoded (a malformed page token). Non-comparison filters and
    non-integer values are ignored; a repeated key keeps the last value seen.
    """
    bounds: dict[str, int] = {}
    for item in filters:
        if not item.HasField("comparison_filter"):
            continue
        cmp_filter = item.comparison_filter
        if cmp_filter.key.name in (FLEX_WIN_START, FLEX_WIN_END) and cmp_filter.value.HasField(
            "val_int"
        ):
            bounds[cmp_filter.key.name] = cmp_filter.value.val_int
    if FLEX_WIN_START in bounds and FLEX_WIN_END in bounds:
        return (bounds[FLEX_WIN_START], bounds[FLEX_WIN_END])
    if bounds:
        raise ValueError("Invalid flex time window in page token")
    return None


def _parse_last_seen_tuple(
    filters: list[TraceItemFilter],
) -> tuple[int, TraceItemType.ValueType, float, str, str]:
    """Parse the five 'last seen' keyset-cursor equality filters of a page token.

    The filters must appear in exactly this order, with exactly these key
    names and key types:
        last_seen_project_id (int), last_seen_item_type (int),
        last_seen_timestamp (double), last_seen_trace_id (str),
        last_seen_item_id (str)

    Returns the decoded (project_id, item_type, timestamp, trace_id, item_id)
    tuple, raising ValueError with a field-specific message on any mismatch.
    """
    # Single source of truth for the cursor layout instead of hard-coded
    # indices: (key name, expected key type, proto value field, error message).
    expected_fields = [
        ("last_seen_project_id", AttributeKey.Type.TYPE_INT, "val_int", "Invalid project id"),
        ("last_seen_item_type", AttributeKey.Type.TYPE_INT, "val_int", "Invalid item type"),
        ("last_seen_timestamp", AttributeKey.Type.TYPE_DOUBLE, "val_double", "Invalid timestamp"),
        ("last_seen_trace_id", AttributeKey.Type.TYPE_STRING, "val_str", "Invalid trace id"),
        ("last_seen_item_id", AttributeKey.Type.TYPE_STRING, "val_str", "Invalid item id"),
    ]
    if len(filters) != len(expected_fields):
        raise ValueError("Invalid last_seen in page token")
    values: list[Any] = []
    for filt, (name, key_type, value_field, error_msg) in zip(filters, expected_fields):
        cmp_filter = filt.comparison_filter
        if not (cmp_filter.key.name == name and cmp_filter.key.type == key_type):
            raise ValueError(error_msg)
        values.append(getattr(cmp_filter.value, value_field))
    return (
        values[0],
        cast(TraceItemType.ValueType, values[1]),
        values[2],
        values[3],
        values[4],
    )


class ExportTraceItemsPageToken:
"""Page token: always encodes the active [start,end) in unix seconds, shared with flex routing.

2 filters: window only; move to the next time slice (flex) with no keyset cursor.
7 filters: same 2 time-window fields plus 5 equality fields; continue after the
last row within that window (keyset / tuple seek).
"""

def __init__(
self,
last_seen_project_id: int,
last_seen_item_type: TraceItemType.ValueType,
last_seen_timestamp: float,
last_seen_trace_id: str,
last_seen_item_id: str,
*,
window_start_sec: int,
window_end_sec: int,
last_seen_project_id: int = 0,
last_seen_item_type: TraceItemType.ValueType = TraceItemType.TRACE_ITEM_TYPE_UNSPECIFIED,
last_seen_timestamp: float = 0.0,
last_seen_trace_id: str = "",
last_seen_item_id: str = "",
):
self.window_start_sec = window_start_sec
self.window_end_sec = window_end_sec
self.last_seen_project_id = last_seen_project_id
self.last_seen_item_type = last_seen_item_type
self.last_seen_timestamp = last_seen_timestamp
self.last_seen_trace_id = last_seen_trace_id
self.last_seen_item_id = last_seen_item_id

@property
def has_last_seen(self) -> bool:
Comment thread
manessaraj marked this conversation as resolved.
Copy link
Copy Markdown
Member

@xurui-c xurui-c Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I feel like what has_last_seen represents is whether we should resume within the same flex window (correct me if I'm wrong). Could we have a more accurate name to represent this?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, name can be better

return self.last_seen_item_id != ""

@classmethod
def from_protobuf(cls, page_token: PageToken) -> Optional["ExportTraceItemsPageToken"]:
if page_token == PageToken():
return None
filters = page_token.filter_offset.and_filter.filters
if len(filters) != 5:
if not page_token.filter_offset.HasField("and_filter"):
raise ValueError("Invalid page token")

if not (
filters[0].comparison_filter.key.name == "last_seen_project_id"
and filters[0].comparison_filter.key.type == AttributeKey.Type.TYPE_INT
):
raise ValueError("Invalid project id")
last_seen_project_id = filters[0].comparison_filter.value.val_int
if not (
filters[1].comparison_filter.key.name == "last_seen_item_type"
and filters[1].comparison_filter.key.type == AttributeKey.Type.TYPE_INT
):
raise ValueError("Invalid item type")
last_seen_item_type = filters[1].comparison_filter.value.val_int

if not (
filters[2].comparison_filter.key.name == "last_seen_timestamp"
and filters[2].comparison_filter.key.type == AttributeKey.Type.TYPE_DOUBLE
):
raise ValueError("Invalid timestamp")
last_seen_timestamp = filters[2].comparison_filter.value.val_double

if not (
filters[3].comparison_filter.key.name == "last_seen_trace_id"
and filters[3].comparison_filter.key.type == AttributeKey.Type.TYPE_STRING
):
raise ValueError("Invalid trace id")
last_seen_trace_id = filters[3].comparison_filter.value.val_str

if not (
filters[4].comparison_filter.key.name == "last_seen_item_id"
and filters[4].comparison_filter.key.type == AttributeKey.Type.TYPE_STRING
):
raise ValueError("Invalid item id")
last_seen_item_id = filters[4].comparison_filter.value.val_str

return cls(
last_seen_project_id,
cast(TraceItemType.ValueType, last_seen_item_type),
last_seen_timestamp,
last_seen_trace_id,
last_seen_item_id,
)
filters = list(page_token.filter_offset.and_filter.filters)
n = len(filters)
if n == 2:
win = _parse_flex_window_from_filters(filters)
if win is None:
raise ValueError("Invalid page token")
w0, w1 = win
return cls(
window_start_sec=w0,
window_end_sec=w1,
)
if n == 7:
win = _parse_flex_window_from_filters(filters[:2])
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would _parse_flex_window_from_filters return incorrect result if you pass in win = _parse_flex_window_from_filters(filters)?

If not, this part seems like it could be consolidated to just be written once:

flex_window = _parse_flex_window_from_filters(filters)
if flex_window is None:
    raise ValueError("Invalid page token")
flex_window_start, flex_window_end = flex_window

if win is None:
raise ValueError("Invalid page token")
w0, w1 = win
pid, ity, lsts, ltr, lid = _parse_last_seen_tuple(filters[2:])
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: just spell the variables out

return cls(
window_start_sec=w0,
window_end_sec=w1,
last_seen_project_id=pid,
last_seen_item_type=ity,
last_seen_timestamp=lsts,
last_seen_trace_id=ltr,
last_seen_item_id=lid,
)
raise ValueError("Invalid page token: expected 2 or 7 filter clauses")

def to_protobuf(self) -> PageToken:
filters = TraceItemFilter(
and_filter=AndFilter(
filters=[
and_filters: list[TraceItemFilter] = list(
_flex_time_window_filters(self.window_start_sec, self.window_end_sec)
)
if self.has_last_seen:
and_filters.extend(
[
TraceItemFilter(
comparison_filter=ComparisonFilter(
key=AttributeKey(
Expand Down Expand Up @@ -165,12 +256,41 @@ def to_protobuf(self) -> PageToken:
),
]
)
if not and_filters:
raise ValueError("empty export page token")
return PageToken(
filter_offset=TraceItemFilter(
and_filter=AndFilter(filters=and_filters),
)
)
return PageToken(filter_offset=filters)


def _is_flextime_export(in_msg: ExportTraceItemsRequest) -> bool:
    """Return True when the request opts into highest-accuracy flextime routing
    via its downsampled storage config."""
    meta = in_msg.meta
    if not meta.HasField("downsampled_storage_config"):
        return False
    mode = meta.downsampled_storage_config.mode
    return mode == DownsampledStorageConfig.Mode.MODE_HIGHEST_ACCURACY_FLEXTIME


def _export_query_meta(
    in_msg: ExportTraceItemsRequest, routing_decision: RoutingDecision
) -> RequestMeta:
    """Return request meta whose time window is narrowed to the routed slice.

    When the routing decision carries no time window, the original request
    meta is returned unchanged; otherwise a copy of it is made with the
    start/end timestamps replaced by the routed window.

    NOTE(review): only the routing decision's window is consulted here — the
    window encoded in an incoming page token is not. Reviewers flagged this
    as a possible cause of re-querying the same time slice; confirm against
    the pagination flow.
    """
    if routing_decision.time_window is None:
        return in_msg.meta
    narrowed = RequestMeta()
    narrowed.CopyFrom(in_msg.meta)
    narrowed.start_timestamp.CopyFrom(routing_decision.time_window.start_timestamp)
    narrowed.end_timestamp.CopyFrom(routing_decision.time_window.end_timestamp)
    return narrowed
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Page token's time window is never applied to query

High Severity

The ExportTraceItemsPageToken encodes window_start_sec/window_end_sec but these values are never used when building the next query. _export_query_meta always derives the time window from routing_decision.time_window (or original in_msg.meta), completely ignoring the incoming page token's window. Similarly, w_start/w_end in _execute are derived from the routing decision, not from the page token. When flex routing emits a 2-filter token to advance to the earlier time slice [orig_start, routed_start), the subsequent request re-queries [routed_start, orig_end) instead, causing an infinite loop.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit b46663e. Configure here.



def _build_query(
in_msg: ExportTraceItemsRequest, limit: int, page_token: ExportTraceItemsPageToken | None = None
in_msg: ExportTraceItemsRequest,
limit: int,
page_token: ExportTraceItemsPageToken | None = None,
query_meta: RequestMeta | None = None,
) -> Query:
selected_columns = [
SelectedExpression("timestamp", f.toUnixTimestamp(column("timestamp"), alias="timestamp")),
Expand Down Expand Up @@ -244,13 +364,14 @@ def _build_query(
),
)
]
if page_token is not None
if page_token is not None and page_token.has_last_seen
else []
)
meta = query_meta if query_meta is not None else in_msg.meta
query = Query(
from_clause=entity,
selected_columns=selected_columns,
condition=base_conditions_and(in_msg.meta, *(page_token_filter)),
condition=base_conditions_and(meta, *(page_token_filter)),
order_by=[
# we add organization_id and project_id to the order by to optimize data reading
# https://clickhouse.com/docs/sql-reference/statements/select/order-by#optimization-of-data-reading
Expand All @@ -269,14 +390,28 @@ def _build_query(


def _build_snuba_request(
in_msg: ExportTraceItemsRequest, limit: int, page_token: ExportTraceItemsPageToken | None = None
in_msg: ExportTraceItemsRequest,
routing_decision: RoutingDecision,
limit: int,
page_token: ExportTraceItemsPageToken | None = None,
) -> SnubaRequest:
query_settings = setup_trace_query_settings() if in_msg.meta.debug else HTTPQuerySettings()
try:
routing_decision.strategy.merge_clickhouse_settings(routing_decision, query_settings)
query_settings.set_sampling_tier(routing_decision.tier)
except Exception as e:
sentry_sdk.capture_message(f"Error merging clickhouse settings: {e}")

query_settings.set_skip_transform_order_by(True)
return SnubaRequest(
id=uuid.UUID(in_msg.meta.request_id),
original_body=MessageToDict(in_msg),
query=_build_query(in_msg, limit, page_token),
query=_build_query(
in_msg,
limit,
page_token,
query_meta=_export_query_meta(in_msg, routing_decision),
),
query_settings=query_settings,
attribution_info=AttributionInfo(
referrer=in_msg.meta.referrer,
Expand Down Expand Up @@ -425,22 +560,37 @@ def _execute(self, in_msg: ExportTraceItemsRequest) -> ExportTraceItemsResponse:
page_token = ExportTraceItemsPageToken.from_protobuf(in_msg.page_token)
results = run_query(
dataset=PluggableDataset(name="eap", all_entities=[]),
request=_build_snuba_request(in_msg, limit, page_token),
request=_build_snuba_request(in_msg, self.routing_decision, limit, page_token),
timer=self._timer,
)

rows = results.result.get("data", [])
processed_results = _convert_rows(rows)
is_flex = _is_flextime_export(in_msg)
orig_start = in_msg.meta.start_timestamp.seconds
routed = self.routing_decision.time_window

w_start = in_msg.meta.start_timestamp.seconds
w_end = in_msg.meta.end_timestamp.seconds
if routed is not None:
w_start, w_end = routed.start_timestamp.seconds, routed.end_timestamp.seconds

next_token: PageToken | None = None
if len(processed_results.items) >= limit:
next_token = ExportTraceItemsPageToken(
window_start_sec=w_start,
window_end_sec=w_end,
last_seen_project_id=processed_results.last_seen_project_id,
last_seen_item_type=processed_results.last_seen_item_type,
last_seen_trace_id=processed_results.last_seen_trace_id,
last_seen_timestamp=processed_results.last_seen_timestamp,
last_seen_item_id=processed_results.last_seen_item_id,
).to_protobuf()
elif is_flex and routed is not None and routed.start_timestamp.seconds > orig_start:
next_token = ExportTraceItemsPageToken(
window_start_sec=orig_start,
window_end_sec=routed.start_timestamp.seconds,
).to_protobuf()
Comment on lines +589 to +593
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: The pagination token's time window (window_start_sec, window_end_sec) is ignored on subsequent requests. The query re-uses the original time window, breaking pagination logic.
Severity: HIGH

Suggested Fix

When a page_token is present and contains window_start_sec and window_end_sec, use these values to constrain the query's time window. This ensures the query respects the time slice specified by the pagination token, rather than re-calculating it from the original request.

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent. Verify if this is a real issue. If it is, propose a fix; if not, explain why it's
not valid.

Location: snuba/web/rpc/v1/endpoint_export_trace_items.py#L589-L593

Potential issue: The code generates a page token containing a specific time window
(`window_start_sec`, `window_end_sec`) for the next page of results. However, when a
subsequent request uses this token, the time window values are deserialized but never
used to constrain the new query. Instead, the query's time window is re-calculated from
the original request metadata. This causes the pagination logic to fail, as it ignores
the intended time slice from the token, leading to re-scanning the same data or skipping
data entirely. The fields `window_start_sec` and `window_end_sec` are effectively dead
data after being read from the token.

Did we get this right? 👍 / 👎 to inform future reviews.

else:
next_token = PageToken(end_pagination=True)

Expand Down
5 changes: 3 additions & 2 deletions tests/web/rpc/v1/test_endpoint_export_trace_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,10 @@ def test_with_pagination(self, setup_teardown: Any) -> None:
response = EndpointExportTraceItems().execute(message)
items.extend(response.trace_items)
if len(response.trace_items) == 20:
assert response.page_token.end_pagination == False
assert response.page_token.end_pagination is False
assert len(response.page_token.filter_offset.and_filter.filters) == 7
else:
assert response.page_token.end_pagination == True
assert response.page_token.end_pagination, "End Pagination token mismatch"
break
message.page_token.CopyFrom(response.page_token)

Expand Down
Loading