feat(suspect-spans): Add new endpoint for suspect spans #29357
Conversation
This introduces a new endpoint for suspect spans.
```python
self.min_ago = before_now(minutes=1).replace(microsecond=0)
self.update_snuba_config_ensure({"write_span_columns_projects": f"[{self.project.id}]"})
```
This adds nearly 10s per test. If there's not an easy way to make this faster in CI, we may want to xfail these tests for now.
wmak left a comment:
LGTM;
- Small question around moving some of the classes and functions out of the endpoint to make it smaller.
- Also, if these tests do end up taking 10s each, we likely want to skip instead of xfail, because I think pytest still runs xfailed tests (see the sketch below).
- Also, we should leave 1 in as a general smoke test IMO.
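For context on that second point, here is a minimal sketch of the skip vs. xfail difference (the test names and reasons are hypothetical, not from this PR): pytest still executes xfailed tests and only reports the failure differently, whereas skipped tests never run, so only skipping avoids the ~10s setup cost.

```python
import pytest


@pytest.mark.skip(reason="snuba span-column config adds ~10s per test in CI")
def test_suspect_spans_smoke_skipped():
    # never executed, so the slow setup cost is avoided entirely
    ...


@pytest.mark.xfail(reason="known slow until CI is faster")
def test_suspect_spans_smoke_xfailed():
    # still executed (and still pays the setup cost); a failure is just
    # reported as "xfail" instead of an error
    ...
```

(`@pytest.mark.xfail(run=False)` is a middle ground: it keeps the xfail report but does not execute the test.)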
```python
@dataclasses.dataclass(frozen=True)
class ExampleSpan:
    id: str
    start_timestamp: float
    finish_timestamp: float
    exclusive_time: float

    def serialize(self) -> Any:
        return {
            "id": self.id,
            "startTimestamp": self.start_timestamp,
            "finishTimestamp": self.finish_timestamp,
            "exclusiveTime": self.exclusive_time,
        }


@dataclasses.dataclass(frozen=True)
class ExampleTransaction:
    id: str
    description: Optional[str]
    start_timestamp: float
    finish_timestamp: float
    non_overlapping_exclusive_time: float
    spans: List[ExampleSpan]

    def serialize(self) -> Any:
        return {
            "id": self.id,
            "description": self.description,
            "startTimestamp": self.start_timestamp,
            "finishTimestamp": self.finish_timestamp,
            "nonOverlappingExclusiveTime": self.non_overlapping_exclusive_time,
            "spans": [span.serialize() for span in self.spans],
        }


@dataclasses.dataclass(frozen=True)
class SuspectSpan:
    project_id: int
    project: str
    transaction: str
    op: str
    group: str
    frequency: int
    count: int
    sum_exclusive_time: float
    p50_exclusive_time: float
    p75_exclusive_time: float
    p95_exclusive_time: float
    p99_exclusive_time: float

    def serialize(self) -> Any:
        return {
            "projectId": self.project_id,
            "project": self.project,
            "transaction": self.transaction,
            "op": self.op,
            "group": self.group,
            "frequency": self.frequency,
            "count": self.count,
            "sumExclusiveTime": self.sum_exclusive_time,
            "p50ExclusiveTime": self.p50_exclusive_time,
            "p75ExclusiveTime": self.p75_exclusive_time,
            "p95ExclusiveTime": self.p95_exclusive_time,
            "p99ExclusiveTime": self.p99_exclusive_time,
        }


@dataclasses.dataclass(frozen=True)
class SuspectSpanWithExamples(SuspectSpan):
    examples: Optional[List[ExampleTransaction]] = None

    def serialize(self) -> Any:
        serialized = super().serialize()
        serialized["examples"] = (
            [] if self.examples is None else [ex.serialize() for ex in self.examples]
        )
        return serialized


def query_suspect_span_groups(
    params: ParamsType,
    query: Optional[str],
    order_column: str,
    limit: int,
    offset: int,
) -> List[SuspectSpan]:
    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=[
            "project.id",
            "project",
            "transaction",
            "array_join(spans_op)",
            "array_join(spans_group)",
            "count_unique(id)",
            *(column.suspect_op_group_column for column in SPAN_PERFORMANCE_COLUMNS.values()),
        ],
        query=query,
        orderby=order_column,
        auto_aggregations=True,
        use_aggregate_conditions=True,
        limit=limit,
        offset=offset,
        functions_acl=["array_join", "sumArray", "percentileArray", "maxArray"],
    )

    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-spans-performance-suspects")

    return [
        SuspectSpan(
            project_id=suspect["project.id"],
            project=suspect["project"],
            transaction=suspect["transaction"],
            op=suspect["array_join_spans_op"],
            group=suspect["array_join_spans_group"],
            frequency=suspect["count_unique_id"],
            count=suspect["count"],
            sum_exclusive_time=suspect["sumArray_spans_exclusive_time"],
            p50_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_50"),
            p75_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_75"),
            p95_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_95"),
            p99_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_99"),
        )
        for suspect in results["data"]
    ]


def query_example_transactions(
    params: ParamsType,
    query: Optional[str],
    order_column: str,
    suspects: List[SuspectSpan],
    per_suspect: int = 3,
) -> Dict[Tuple[str, str], List[str]]:
    # there aren't any suspects, early return to save an empty query
    if not suspects:
        return {}

    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=[
            "id",
            "array_join(spans_op)",
            "array_join(spans_group)",
            *(column.suspect_example_column for column in SPAN_PERFORMANCE_COLUMNS.values()),
        ],
        query=query,
        orderby=get_function_alias(order_column),
        auto_aggregations=True,
        use_aggregate_conditions=True,
        # we want only `per_suspect` examples for each suspect
        limit=len(suspects) * per_suspect,
        functions_acl=["array_join", "sumArray", "percentileArray", "maxArray"],
    )

    # we are only interested in the specific op, group pairs from the suspects
    builder.add_conditions(
        [
            Condition(
                Function(
                    "tuple",
                    [
                        builder.resolve_function("array_join(spans_op)"),
                        builder.resolve_function("array_join(spans_group)"),
                    ],
                ),
                Op.IN,
                Function(
                    "tuple",
                    [Function("tuple", [suspect.op, suspect.group]) for suspect in suspects],
                ),
            ),
        ]
    )

    # Hack: the limit by clause only allows columns but here we want to
    # do a limitby on the two array joins. For the time being, directly
    # do the limitby on the internal snuba name for the span group column
    # but this should not be relied upon in production, and if two spans
    # differ only by the span op, this will result in a incorrect query
    builder.limitby = LimitBy(Column("_snuba_array_join_spans_group"), per_suspect)

    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-spans-performance-examples")

    examples: Dict[Tuple[str, str], List[str]] = {
        (suspect.op, suspect.group): [] for suspect in suspects
    }

    for example in results["data"]:
        key = example["array_join_spans_op"], example["array_join_spans_group"]
        examples[key].append(example["id"])

    return examples


def get_example_transaction(
    project_id: int,
    transaction_id: str,
    span_op: str,
    span_group: str,
) -> ExampleTransaction:
    event = eventstore.get_event_by_id(project_id, transaction_id)
    data = event.data

    # the transaction itself is a span as well but we need to reconstruct
    # it from the event as it's not present in the spans array
    trace_context = data.get("contexts", {}).get("trace", {})
    root_span = {
        "span_id": trace_context["span_id"],
        "op": trace_context["op"],
        "hash": trace_context["hash"],
        "exclusive_time": trace_context["exclusive_time"],
        "description": data["transaction"],
        "start_timestamp": data["start_timestamp"],
        "timestamp": data["timestamp"],
    }

    matching_spans = [
        span
        for span in chain([root_span], data.get("spans", []))
        if span["op"] == span_op and span["hash"] == span_group
    ]

    # get the first non-None description
    # use None if all descriptions are None
    description = None
    for span in matching_spans:
        if span.get("description") is None:
            continue
        description = span["description"]
        break  # stop at the first non-None description; without this the loop keeps the last one

    spans: List[ExampleSpan] = [
        ExampleSpan(
            id=span["span_id"],
            start_timestamp=span["start_timestamp"],
            finish_timestamp=span["timestamp"],
            exclusive_time=span["exclusive_time"],
        )
        for span in matching_spans
    ]

    non_overlapping_exclusive_time_windows = union_time_windows(
        [
            window
            for span in spans
            for window in get_exclusive_time_windows(
                span,
                # don't need to check the root span here because its parent
                # will never be one of the spans in this transaction
                data.get("spans", []),
            )
        ]
    )

    return ExampleTransaction(
        id=transaction_id,
        description=description,
        start_timestamp=data["start_timestamp"],
        finish_timestamp=data["timestamp"],
        non_overlapping_exclusive_time=sum(
            window.duration_ms for window in non_overlapping_exclusive_time_windows
        ),
        spans=spans,
    )


@dataclasses.dataclass(frozen=True)
class TimeWindow:
    # Timestamps are in seconds
    start: float
    end: float

    def as_tuple(self) -> Tuple[float, float]:
        return (self.start, self.end)

    @property
    def duration_ms(self) -> float:
        return (self.end - self.start) * 1000

    def __add__(self, other: "TimeWindow") -> Tuple[Optional["TimeWindow"], "TimeWindow"]:
        if self.start < other.start:
            if self.end < other.start:
                return self, other
            return None, TimeWindow(start=self.start, end=max(self.end, other.end))
        else:
            if self.start > other.end:
                return other, self
            return None, TimeWindow(start=other.start, end=max(self.end, other.end))

    def __sub__(self, other: "TimeWindow") -> Tuple[Optional["TimeWindow"], "TimeWindow"]:
        if self.start < other.start:
            if self.end > other.end:
                return (
                    TimeWindow(start=self.start, end=other.start),
                    TimeWindow(start=other.end, end=self.end),
                )
            return None, TimeWindow(start=self.start, end=min(self.end, other.start))
        else:
            if self.end < other.end:
                return None, TimeWindow(start=self.end, end=self.end)
            return None, TimeWindow(start=max(self.start, other.end), end=self.end)


def union_time_windows(time_windows: List[TimeWindow]) -> List[TimeWindow]:
    if not time_windows:
        return []

    previous, *time_windows = sorted(time_windows, key=lambda window: window.as_tuple())

    unioned: List[TimeWindow] = []

    for current in time_windows:
        window, previous = previous + current
        if window:
            unioned.append(window)

    unioned.append(previous)

    return unioned


def remove_time_windows(source: TimeWindow, time_windows: List[TimeWindow]) -> List[TimeWindow]:
    if not time_windows:
        return [source]

    removed: List[TimeWindow] = []

    for current in time_windows:
        window, source = source - current
        if window:
            removed.append(window)

    removed.append(source)

    return removed


def get_exclusive_time_windows(span: ExampleSpan, spans: List[Any]) -> List[TimeWindow]:
    non_overlapping_children_time_windows = union_time_windows(
        [
            TimeWindow(start=child["start_timestamp"], end=child["timestamp"])
            for child in spans
            if child["parent_span_id"] == span.id
        ]
    )
    return remove_time_windows(
        TimeWindow(start=span.start_timestamp, end=span.finish_timestamp),
        non_overlapping_children_time_windows,
    )
```
Thoughts on moving these to utils.py?
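For anyone reading along, a quick illustration (made-up numbers) of what these helpers compute: `union_time_windows` merges overlapping windows, and `remove_time_windows` subtracts them from a parent window, which is how the non-overlapping exclusive time above is derived.

```python
parent = TimeWindow(start=0.0, end=1.0)  # a 1s span
children = [
    TimeWindow(start=0.1, end=0.4),
    TimeWindow(start=0.3, end=0.5),  # overlaps the previous child
    TimeWindow(start=0.7, end=0.8),
]

# overlapping children collapse to [0.1, 0.5] and [0.7, 0.8]
merged = union_time_windows(children)

# what's left of the parent: [0.0, 0.1], [0.5, 0.7], [0.8, 1.0]
exclusive = remove_time_windows(parent, merged)

# ~500ms of exclusive time (modulo floating-point noise)
total_ms = sum(window.duration_ms for window in exclusive)
```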
```python
# pagination will try to get 1 extra result to decide if there is a next page,
# drop the extra result, if present, to fetch less events
alias = get_function_alias(
    SPAN_PERFORMANCE_COLUMNS[orderby_column].suspect_example_column
)
orderby = direction + alias
transaction_ids = query_example_transactions(
    params, query, orderby, suspects[: limit - 1]
)
```
🤔 Won't this break pagination?
If we don't want pagination, shouldn't we disable it entirely like we do with the noPagination param?
Or is this for another reason?
So this is done intentionally. Pagination works by checking for the existence of one extra result. What this does is skip the nodestore calls, and any calculation done on the event, for that extra result. When serializing, the extra result will still be present, just without any example events attached. That way pagination still works as expected, but we don't make the extra nodestore calls for it.
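A rough sketch of that flow (this serializer-side helper is hypothetical, not code from the PR): the suspects query still returns the paginator's extra row, but because that row has no entry in `transaction_ids`, no nodestore lookups happen for it and it serializes with an empty `examples` list.

```python
# Hypothetical helper illustrating the explanation above: the paginator's
# extra suspect never appears in `transaction_ids`, so no nodestore call is
# made for it and it serializes with examples=[].
def attach_examples(suspects, transaction_ids):
    results = []
    for suspect in suspects:
        ids = transaction_ids.get((suspect.op, suspect.group), [])
        results.append(
            SuspectSpanWithExamples(
                **dataclasses.asdict(suspect),
                examples=[
                    get_example_transaction(
                        suspect.project_id, event_id, suspect.op, suspect.group
                    )
                    for event_id in ids  # empty for the extra pagination row
                ],
            )
        )
    return results
```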