Conversation

@Zylphrex (Member)

This introduces a new endpoint for suspect spans.


self.min_ago = before_now(minutes=1).replace(microsecond=0)

self.update_snuba_config_ensure({"write_span_columns_projects": f"[{self.project.id}]"})
@Zylphrex (Member, Author) commented on Oct 19, 2021

This adds nearly 10s per test. If there's not an easy way to make this faster in CI, we may want to xfail these tests for now.

@Zylphrex marked this pull request as ready for review October 19, 2021 18:39
@Zylphrex requested a review from a team as a code owner October 19, 2021 18:39
@Zylphrex requested a review from a team October 19, 2021 18:40
@wmak (Member) left a comment

LGTM;

  • small question around moving some of the classes and functions out of the endpoint to make it smaller
  • Also, if these tests do end up taking 10s each, we likely want to skip instead of xfail, because pytest still runs xfailed tests (see the sketch below)
    • Also should leave 1 in as a general smoke test IMO
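
For reference, a minimal sketch of the skip vs. xfail difference (test names are illustrative, and the slow helper is a hypothetical stand-in for the snuba config setup):

import time

import pytest

def expensive_snuba_setup() -> bool:
    # hypothetical stand-in for the slow write_span_columns_projects update
    time.sleep(10)
    return False

@pytest.mark.skip(reason="snuba setup adds ~10s per test in CI")
def test_suspect_spans_skipped():
    # never executed: pytest reports SKIPPED without paying the setup cost
    assert expensive_snuba_setup()

@pytest.mark.xfail(reason="snuba setup adds ~10s per test in CI")
def test_suspect_spans_xfailed():
    # still executed: pytest runs the body and reports XFAIL on failure,
    # so the ~10s setup cost is paid anyway
    assert expensive_snuba_setup()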

Comment on lines 134 to 487
@dataclasses.dataclass(frozen=True)
class ExampleSpan:
    id: str
    start_timestamp: float
    finish_timestamp: float
    exclusive_time: float

    def serialize(self) -> Any:
        return {
            "id": self.id,
            "startTimestamp": self.start_timestamp,
            "finishTimestamp": self.finish_timestamp,
            "exclusiveTime": self.exclusive_time,
        }


@dataclasses.dataclass(frozen=True)
class ExampleTransaction:
    id: str
    description: Optional[str]
    start_timestamp: float
    finish_timestamp: float
    non_overlapping_exclusive_time: float
    spans: List[ExampleSpan]

    def serialize(self) -> Any:
        return {
            "id": self.id,
            "description": self.description,
            "startTimestamp": self.start_timestamp,
            "finishTimestamp": self.finish_timestamp,
            "nonOverlappingExclusiveTime": self.non_overlapping_exclusive_time,
            "spans": [span.serialize() for span in self.spans],
        }


@dataclasses.dataclass(frozen=True)
class SuspectSpan:
    project_id: int
    project: str
    transaction: str
    op: str
    group: str
    frequency: int
    count: int
    sum_exclusive_time: float
    p50_exclusive_time: float
    p75_exclusive_time: float
    p95_exclusive_time: float
    p99_exclusive_time: float

    def serialize(self) -> Any:
        return {
            "projectId": self.project_id,
            "project": self.project,
            "transaction": self.transaction,
            "op": self.op,
            "group": self.group,
            "frequency": self.frequency,
            "count": self.count,
            "sumExclusiveTime": self.sum_exclusive_time,
            "p50ExclusiveTime": self.p50_exclusive_time,
            "p75ExclusiveTime": self.p75_exclusive_time,
            "p95ExclusiveTime": self.p95_exclusive_time,
            "p99ExclusiveTime": self.p99_exclusive_time,
        }


@dataclasses.dataclass(frozen=True)
class SuspectSpanWithExamples(SuspectSpan):
    examples: Optional[List[ExampleTransaction]] = None

    def serialize(self) -> Any:
        serialized = super().serialize()
        serialized["examples"] = (
            [] if self.examples is None else [ex.serialize() for ex in self.examples]
        )
        return serialized


def query_suspect_span_groups(
    params: ParamsType,
    query: Optional[str],
    order_column: str,
    limit: int,
    offset: int,
) -> List[SuspectSpan]:
    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=[
            "project.id",
            "project",
            "transaction",
            "array_join(spans_op)",
            "array_join(spans_group)",
            "count_unique(id)",
            *(column.suspect_op_group_column for column in SPAN_PERFORMANCE_COLUMNS.values()),
        ],
        query=query,
        orderby=order_column,
        auto_aggregations=True,
        use_aggregate_conditions=True,
        limit=limit,
        offset=offset,
        functions_acl=["array_join", "sumArray", "percentileArray", "maxArray"],
    )

    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-spans-performance-suspects")

    return [
        SuspectSpan(
            project_id=suspect["project.id"],
            project=suspect["project"],
            transaction=suspect["transaction"],
            op=suspect["array_join_spans_op"],
            group=suspect["array_join_spans_group"],
            frequency=suspect["count_unique_id"],
            count=suspect["count"],
            sum_exclusive_time=suspect["sumArray_spans_exclusive_time"],
            p50_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_50"),
            p75_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_75"),
            p95_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_95"),
            p99_exclusive_time=suspect.get("percentileArray_spans_exclusive_time_0_99"),
        )
        for suspect in results["data"]
    ]


def query_example_transactions(
    params: ParamsType,
    query: Optional[str],
    order_column: str,
    suspects: List[SuspectSpan],
    per_suspect: int = 3,
) -> Dict[Tuple[str, str], List[str]]:
    # there aren't any suspects, early return to save an empty query
    if not suspects:
        return {}

    builder = QueryBuilder(
        dataset=Dataset.Discover,
        params=params,
        selected_columns=[
            "id",
            "array_join(spans_op)",
            "array_join(spans_group)",
            *(column.suspect_example_column for column in SPAN_PERFORMANCE_COLUMNS.values()),
        ],
        query=query,
        orderby=get_function_alias(order_column),
        auto_aggregations=True,
        use_aggregate_conditions=True,
        # we want only `per_suspect` examples for each suspect
        limit=len(suspects) * per_suspect,
        functions_acl=["array_join", "sumArray", "percentileArray", "maxArray"],
    )

    # we are only interested in the specific op, group pairs from the suspects
    builder.add_conditions(
        [
            Condition(
                Function(
                    "tuple",
                    [
                        builder.resolve_function("array_join(spans_op)"),
                        builder.resolve_function("array_join(spans_group)"),
                    ],
                ),
                Op.IN,
                Function(
                    "tuple",
                    [Function("tuple", [suspect.op, suspect.group]) for suspect in suspects],
                ),
            ),
        ]
    )

    # Hack: the limit by clause only allows columns, but here we want to
    # do a limit by on the two array joins. For the time being, directly
    # do the limit by on the internal snuba name for the span group column.
    # This should not be relied upon in production, and if two spans
    # differ only by the span op, this will result in an incorrect query.
    builder.limitby = LimitBy(Column("_snuba_array_join_spans_group"), per_suspect)

    snql_query = builder.get_snql_query()
    results = raw_snql_query(snql_query, "api.organization-events-spans-performance-examples")

    examples: Dict[Tuple[str, str], List[str]] = {
        (suspect.op, suspect.group): [] for suspect in suspects
    }

    for example in results["data"]:
        key = example["array_join_spans_op"], example["array_join_spans_group"]
        examples[key].append(example["id"])

    return examples


def get_example_transaction(
    project_id: int,
    transaction_id: str,
    span_op: str,
    span_group: str,
) -> ExampleTransaction:
    event = eventstore.get_event_by_id(project_id, transaction_id)
    data = event.data

    # the transaction itself is a span as well, but we need to reconstruct
    # it from the event as it's not present in the spans array
    trace_context = data.get("contexts", {}).get("trace", {})
    root_span = {
        "span_id": trace_context["span_id"],
        "op": trace_context["op"],
        "hash": trace_context["hash"],
        "exclusive_time": trace_context["exclusive_time"],
        "description": data["transaction"],
        "start_timestamp": data["start_timestamp"],
        "timestamp": data["timestamp"],
    }

    matching_spans = [
        span
        for span in chain([root_span], data.get("spans", []))
        if span["op"] == span_op and span["hash"] == span_group
    ]

    # use the first non-None description,
    # or None if all descriptions are None
    description = None
    for span in matching_spans:
        if span.get("description") is None:
            continue
        description = span["description"]
        break

    spans: List[ExampleSpan] = [
        ExampleSpan(
            id=span["span_id"],
            start_timestamp=span["start_timestamp"],
            finish_timestamp=span["timestamp"],
            exclusive_time=span["exclusive_time"],
        )
        for span in matching_spans
    ]

    non_overlapping_exclusive_time_windows = union_time_windows(
        [
            window
            for span in spans
            for window in get_exclusive_time_windows(
                span,
                # don't need to check the root span here because its parent
                # will never be one of the spans in this transaction
                data.get("spans", []),
            )
        ]
    )

    return ExampleTransaction(
        id=transaction_id,
        description=description,
        start_timestamp=data["start_timestamp"],
        finish_timestamp=data["timestamp"],
        non_overlapping_exclusive_time=sum(
            window.duration_ms for window in non_overlapping_exclusive_time_windows
        ),
        spans=spans,
    )


@dataclasses.dataclass(frozen=True)
class TimeWindow:
    # Timestamps are in seconds
    start: float
    end: float

    def as_tuple(self) -> Tuple[float, float]:
        return (self.start, self.end)

    @property
    def duration_ms(self) -> float:
        return (self.end - self.start) * 1000

    def __add__(self, other: "TimeWindow") -> Tuple[Optional["TimeWindow"], "TimeWindow"]:
        # Merge two windows. Returns (completed, ongoing): `completed` is a
        # window disjoint from the other (or None if the two overlapped and
        # were merged), `ongoing` is the window to carry forward.
        if self.start < other.start:
            if self.end < other.start:
                return self, other
            return None, TimeWindow(start=self.start, end=max(self.end, other.end))
        else:
            if self.start > other.end:
                return other, self
            return None, TimeWindow(start=other.start, end=max(self.end, other.end))

    def __sub__(self, other: "TimeWindow") -> Tuple[Optional["TimeWindow"], "TimeWindow"]:
        # Remove `other` from `self`. Returns (completed, remaining): `completed`
        # is the piece of `self` that ends before `other` (or None), `remaining`
        # is the piece that later windows still need to be subtracted from.
        if self.start < other.start:
            if self.end > other.end:
                return (
                    TimeWindow(start=self.start, end=other.start),
                    TimeWindow(start=other.end, end=self.end),
                )
            return None, TimeWindow(start=self.start, end=min(self.end, other.start))
        else:
            if self.end < other.end:
                # `self` is entirely covered by `other`: collapse to an
                # empty (zero duration) window
                return None, TimeWindow(start=self.end, end=self.end)
            return None, TimeWindow(start=max(self.start, other.end), end=self.end)


def union_time_windows(time_windows: List[TimeWindow]) -> List[TimeWindow]:
    if not time_windows:
        return []

    previous, *time_windows = sorted(time_windows, key=lambda window: window.as_tuple())

    unioned: List[TimeWindow] = []

    for current in time_windows:
        window, previous = previous + current
        if window:
            unioned.append(window)

    unioned.append(previous)

    return unioned


def remove_time_windows(source: TimeWindow, time_windows: List[TimeWindow]) -> List[TimeWindow]:
    if not time_windows:
        return [source]

    removed: List[TimeWindow] = []

    for current in time_windows:
        window, source = source - current
        if window:
            removed.append(window)

    removed.append(source)

    return removed


def get_exclusive_time_windows(span: ExampleSpan, spans: List[Any]) -> List[TimeWindow]:
    non_overlapping_children_time_windows = union_time_windows(
        [
            TimeWindow(start=child["start_timestamp"], end=child["timestamp"])
            for child in spans
            if child["parent_span_id"] == span.id
        ]
    )
    return remove_time_windows(
        TimeWindow(start=span.start_timestamp, end=span.finish_timestamp),
        non_overlapping_children_time_windows,
    )
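
A quick, illustrative sanity check of the interval helpers above (made-up numbers; assumes the TimeWindow dataclass and functions exactly as in the diff):

# two overlapping child windows merge into one; the disjoint one survives
children = [
    TimeWindow(start=0.0, end=1.0),
    TimeWindow(start=0.5, end=2.0),
    TimeWindow(start=3.0, end=4.0),
]
assert union_time_windows(children) == [
    TimeWindow(start=0.0, end=2.0),
    TimeWindow(start=3.0, end=4.0),
]

# the exclusive time of a parent spanning [0, 5] is whatever the
# children leave uncovered
parent = TimeWindow(start=0.0, end=5.0)
exclusive = remove_time_windows(parent, union_time_windows(children))
assert exclusive == [TimeWindow(start=2.0, end=3.0), TimeWindow(start=4.0, end=5.0)]
assert sum(window.duration_ms for window in exclusive) == 2000.0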
Member

Thoughts on moving these to utils.py?

Comment on lines 79 to 87
# pagination will try to get 1 extra result to decide if there is a next page,
# drop the extra result, if present, to fetch fewer events
alias = get_function_alias(
    SPAN_PERFORMANCE_COLUMNS[orderby_column].suspect_example_column
)
orderby = direction + alias
transaction_ids = query_example_transactions(
    params, query, orderby, suspects[: limit - 1]
)
Member

🤔 won't this break pagination?
If we don't want pagination, shouldn't we disable it entirely like we do with the noPagination param?
Or is this for another reason?

@Zylphrex (Member, Author)

So this is done intentionally. Pagination works by checking for the existence of an extra result, and what this does is skip the nodestore calls (plus any calculation done on the event) for that extra result. When serializing, the extra result will still be present, just without any example events attached. This way pagination still works as expected, but we don't need to make the extra nodestore calls for a result we won't fully return; see the sketch below.
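
For reference, a minimal sketch of that pattern (the serialize_suspects and attach_examples names are hypothetical, standing in for the endpoint's serializer and the nodestore lookups; not the actual code):

from typing import Any, Callable, Dict, List

def serialize_suspects(
    suspects: List[Dict[str, Any]],
    attach_examples: Callable[[Dict[str, Any]], Dict[str, Any]],
    limit: int,
) -> List[Dict[str, Any]]:
    # the paginator fetched `limit` results where the last one is only a
    # "does a next page exist?" probe: attach examples (the expensive
    # nodestore work) to everything except the probe, and pass the probe
    # through untouched so pagination still sees the extra result
    return [attach_examples(s) for s in suspects[: limit - 1]] + suspects[limit - 1 :]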

@Zylphrex merged commit 36b1637 into master Oct 20, 2021
@Zylphrex deleted the feat/add-new-suspect-span-endpoint branch October 20, 2021 15:52
@github-actions bot locked and limited conversation to collaborators Nov 5, 2021