Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions src/sentry/api/endpoints/organization_ai_conversations.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ def _build_aggregations_query(
"failure_count()",
"count_if(gen_ai.operation.type,equals,ai_client)",
"count_if(gen_ai.operation.type,equals,tool)",
"sum(gen_ai.usage.total_tokens)",
"sum(gen_ai.cost.total_tokens)",
"sum_if(gen_ai.usage.total_tokens,gen_ai.operation.type,equals,ai_client)",
"sum_if(gen_ai.cost.total_tokens,gen_ai.operation.type,equals,ai_client)",
"min(precise.start_ts)",
"max(precise.finish_ts)",
],
Expand Down Expand Up @@ -425,8 +425,18 @@ def _build_conversations_from_aggregations(
errors=int(row.get("failure_count()") or 0),
llm_calls=int(row.get("count_if(gen_ai.operation.type,equals,ai_client)") or 0),
tool_calls=int(row.get("count_if(gen_ai.operation.type,equals,tool)") or 0),
total_tokens=int(row.get("sum(gen_ai.usage.total_tokens)") or 0),
total_cost=float(row.get("sum(gen_ai.cost.total_tokens)") or 0),
total_tokens=int(
row.get(
"sum_if(gen_ai.usage.total_tokens,gen_ai.operation.type,equals,ai_client)"
)
or 0
),
total_cost=float(
row.get(
"sum_if(gen_ai.cost.total_tokens,gen_ai.operation.type,equals,ai_client)"
)
or 0
),
trace_ids=[],
flow=[],
first_input=None,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -415,3 +415,63 @@ def test_stats_period_is_ignored(self) -> None:
assert response.status_code == 200, response.data
assert len(response.data) == 1
assert response.data[0]["gen_ai.conversation.id"] == conversation_id

def test_tokens_on_multiple_span_types(self) -> None:
    """Verify each raw span carries its own individual token/cost values.

    The endpoint returns unaggregated span rows: an agent span and its
    ai_client child are both present, each with its own
    gen_ai.usage.total_tokens / gen_ai.cost.total_tokens. Consumers that
    sum these fields must restrict to gen_ai.operation.type:ai_client to
    avoid double counting token data that also appears on agent spans.
    """
    base_time = before_now(days=91).replace(microsecond=0)
    shared_trace = uuid4().hex
    convo_id = uuid4().hex

    # Agent span carrying token/cost data.
    self.store_ai_span(
        conversation_id=convo_id,
        timestamp=base_time - timedelta(seconds=2),
        op="gen_ai.invoke_agent",
        operation_type="invoke_agent",
        description="Test Agent",
        agent_name="Test Agent",
        trace_id=shared_trace,
        tokens=500,
        cost=0.05,
    )
    # ai_client span carrying its own token/cost data.
    self.store_ai_span(
        conversation_id=convo_id,
        timestamp=base_time - timedelta(seconds=1),
        op="gen_ai.chat",
        operation_type="ai_client",
        trace_id=shared_trace,
        tokens=100,
        cost=0.01,
    )

    response = self.do_request(
        convo_id,
        {
            "project": [self.project.id],
            "start": (base_time - timedelta(hours=1)).isoformat(),
            "end": (base_time + timedelta(hours=1)).isoformat(),
        },
    )
    assert response.status_code == 200, response.data
    assert len(response.data) == 2

    # Oldest-first ordering puts the agent span ahead of the client span.
    agent_span, ai_client_span = sorted(
        response.data, key=lambda span: span["precise.start_ts"]
    )

    assert agent_span["gen_ai.operation.type"] == "invoke_agent"
    assert agent_span["gen_ai.usage.total_tokens"] == 500
    assert agent_span["gen_ai.cost.total_tokens"] == 0.05

    assert ai_client_span["gen_ai.operation.type"] == "ai_client"
    assert ai_client_span["gen_ai.usage.total_tokens"] == 100
    assert ai_client_span["gen_ai.cost.total_tokens"] == 0.01
Original file line number Diff line number Diff line change
Expand Up @@ -1193,3 +1193,56 @@ def test_empty_tool_names_when_no_tool_calls(self) -> None:
assert conversation["toolNames"] == []
assert conversation["toolCalls"] == 0
assert conversation["toolErrors"] == 0

def test_tokens_only_counted_from_ai_client_spans(self) -> None:
    """Verify token/cost aggregation excludes non-ai_client spans.

    Both an invoke_agent span and its ai_client child carry token data;
    only the ai_client span may contribute to totalTokens/totalCost,
    otherwise the same usage would be double counted.
    """
    base_time = before_now(days=109).replace(microsecond=0)
    convo_id = uuid4().hex
    shared_trace = uuid4().hex

    # Agent span with tokens/cost — must be excluded from the totals.
    self.store_ai_span(
        conversation_id=convo_id,
        timestamp=base_time - timedelta(seconds=2),
        op="gen_ai.invoke_agent",
        operation_type="invoke_agent",
        description="Test Agent",
        agent_name="Test Agent",
        trace_id=shared_trace,
        tokens=500,
        cost=0.05,
    )
    # ai_client span with tokens/cost — the only one that should count.
    self.store_ai_span(
        conversation_id=convo_id,
        timestamp=base_time - timedelta(seconds=1),
        op="gen_ai.chat",
        operation_type="ai_client",
        trace_id=shared_trace,
        tokens=100,
        cost=0.01,
    )

    response = self.do_request(
        {
            "project": [self.project.id],
            "start": (base_time - timedelta(hours=1)).isoformat(),
            "end": (base_time + timedelta(hours=1)).isoformat(),
        }
    )
    assert response.status_code == 200
    assert len(response.data) == 1

    (conversation,) = response.data
    # 100 / 0.01 come solely from the ai_client span; a naive sum over
    # both spans would have reported 600 / 0.06 (double counting).
    assert conversation["totalTokens"] == 100
    assert conversation["totalCost"] == 0.01
    assert conversation["llmCalls"] == 1
    assert conversation["flow"] == ["Test Agent"]
Loading