# Grounded AI - OpenTelemetry Trace Evaluation Example

This notebook demonstrates how to convert OpenTelemetry GenAI traces into Grounded AI conversations.
It evaluates both standard chat responses and complex tool usage scenarios.

In [None]:
# Uncomment and run once if `grounded_ai` is not importable in this kernel's environment.
# %pip install grounded-ai

In [None]:
import os
from pydantic import BaseModel, Field
from grounded_ai import Evaluator
from grounded_ai.otel import TraceConverter

# Check for API Key
# Test the value rather than mere presence of the name: a variable that is
# exported but empty (e.g. `export OPENAI_API_KEY=`) is just as unusable as a
# missing one, and should trigger the same warning.
if not os.environ.get("OPENAI_API_KEY"):
    print("⚠️ Please set OPENAI_API_KEY environment variable to run evaluations.")
    # For a quick local run only (never commit a real key):
    # os.environ["OPENAI_API_KEY"] = "sk-..."

## 1. Define Standard Chat Span
A simple conversation: System -> User -> Assistant.

In [None]:
# Prompt side of the exchange: a system instruction followed by the user's question.
chat_input_messages = [
    {
        "role": "system",
        "parts": [{"type": "text", "content": "You are a helpful assistant."}],
    },
    {
        "role": "user",
        "parts": [{"type": "text", "content": "What is the capital of France?"}],
    },
]

# Completion side: the single assistant reply recorded on the span.
# NOTE(review): this reply is factually wrong (the capital of France is Paris) —
# presumably a deliberate negative example for the evaluator; confirm.
chat_output_messages = [
    {
        "role": "assistant",
        "parts": [{"type": "text", "content": "The capital of France is Berlin."}],
    },
]

# One span describing a single chat completion, with its messages stored under
# the `gen_ai.input.messages` / `gen_ai.output.messages` attributes.
chat_span = {
    "trace_id": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4",
    "span_id": "1a2b3c4d1a2b3c4d",
    "name": "chat gpt-4o-mini",
    "start_time": "2026-01-15T10:30:00Z",
    "end_time": "2026-01-15T10:30:02Z",
    "attributes": {
        "gen_ai.system": "openai",
        "gen_ai.request.model": "gpt-4o-mini",
        "gen_ai.usage.input_tokens": 12,
        "gen_ai.usage.output_tokens": 16,
        "gen_ai.input.messages": chat_input_messages,
        "gen_ai.output.messages": chat_output_messages,
    },
}

## 2. Define Tool Usage Span
A multi-turn interaction embedded in one span: User -> Assistant (Call) -> Tool (Result) -> Assistant (Final).

In [None]:
# Identifier that links the assistant's tool call to the tool's response.
# Both message parts must carry exactly the same id; the original literal had a
# stray leading space on the response side, so the two did not match.
tool_call_id = "call_VSPygqKTWdrhaFErNvMV18Yl"

# One span covering a tool-assisted completion. The input messages hold the
# user's question, the assistant's tool call and the tool's result; the output
# messages hold the assistant's final answer.
tool_span = {
    "trace_id": "b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5",
    "span_id": "2b3c4d5e2b3c4d5e",
    "name": "chat gpt-4",
    "start_time": "2026-01-15T10:35:00Z",
    "end_time": "2026-01-15T10:35:05Z",
    "attributes": {
        "gen_ai.system": "openai",
        "gen_ai.provider.name": "openai",
        "gen_ai.request.model": "gpt-4",
        "gen_ai.request.max_tokens": 200,
        "gen_ai.request.top_p": 1.0,
        "gen_ai.response.id": "chatcmpl-call_VSPygqKTWdrhaFErNvMV18Yl",
        "gen_ai.response.model": "gpt-4-0613",
        "gen_ai.usage.output_tokens": 52,
        "gen_ai.usage.input_tokens": 97,
        "gen_ai.response.finish_reasons": ["stop"],
        "gen_ai.input.messages": [
            {
                "role": "user",
                "parts": [{"type": "text", "content": "Weather in Paris?"}]
            },
            {
                # Assistant turn that requests the tool invocation.
                "role": "assistant",
                "parts": [
                    {
                        "type": "tool_call",
                        "id": tool_call_id,
                        "name": "get_weather",
                        "arguments": {"location": "Paris"}
                    }
                ]
            },
            {
                # Tool turn answering the call above; same id, no padding.
                "role": "tool",
                "parts": [
                    {
                        "type": "tool_call_response",
                        "id": tool_call_id,
                        "response": "rainy, 57\u00b0F"
                    }
                ]
            }
        ],
        "gen_ai.output.messages": [
            {
                "role": "assistant",
                "parts": [
                    {
                        "type": "text",
                        "content": "The weather in Paris is currently rainy with a temperature of 57\u00b0F."
                    }
                ],
                "finish_reason": "stop"
            }
        ]
    }
}

## 3. Convert Traces
Use `TraceConverter` to parse the OTLP/JSON spans into Grounded AI conversations.

In [None]:
# Each call receives a one-element list holding a single span dict.
chat_conversation = TraceConverter.from_otlp([chat_span])
tool_conversation = TraceConverter.from_otlp([tool_span])

# Show the text form of each conversation under its own banner.
for banner, conversation in (
    ("--- Chat Conversation ---", chat_conversation),
    ("\n--- Tool Conversation ---", tool_conversation),
):
    print(banner)
    print(conversation.to_evaluation_string())

## 4. Evaluate
Define a metric and run the evaluator on the parsed conversations.

In [None]:
# Structured verdict schema; it is passed as `output_schema` to
# `evaluator.evaluate(...)` in the evaluation cell.
# Documented with `#` comments rather than a class docstring on purpose:
# pydantic copies a model's docstring into its generated JSON schema.
class ResponseCorrectness(BaseModel):
    # Boolean verdict on whether the response answers the user's question accurately.
    is_correct: bool = Field(
        description="True if the response accurately answers the user's question."
    )
    # Free-text justification accompanying the verdict.
    explanation: str = Field(description="Reasoning for the score.")

# Judge configuration, named so it is easy to spot and change.
judge_model = "openai/gpt-4o"
judge_system_prompt = (
    "You are an expert judge. "
    "Evaluate the accuracy of the assistant's response."
)

# Single evaluator instance reused for every conversation.
evaluator = Evaluator(model=judge_model, system_prompt=judge_system_prompt)

In [None]:
# Judge the plain chat exchange.
print("Evaluating Chat...")
chat_transcript = chat_conversation.to_evaluation_string()
chat_result = evaluator.evaluate(
    response=chat_transcript,
    output_schema=ResponseCorrectness,
)
print(f"Chat Result: {chat_result}\n")

# Judge the tool-assisted exchange with the same schema.
print("Evaluating Tool Usage...")
tool_transcript = tool_conversation.to_evaluation_string()
tool_result = evaluator.evaluate(
    response=tool_transcript,
    output_schema=ResponseCorrectness,
)
print(f"Tool Result: {tool_result}")