## LangSmith Trace Span Evaluation

In [15]:
import os
from pydantic import BaseModel
from grounded_ai import Evaluator
from grounded_ai.otel import TraceConverter

# 1. Define the Raw Trace Data (simulating a LangSmith export)
# Note: Input contains null content for tool calls, testing our recent fix.
raw_trace = {
  "id": "run-root",
  "name": "AgentExecutor",
  "run_type": "chain",
  "inputs": {"messages": [{"type": "user", "content": "Find the 2007 US PGA winner"}]},
  "outputs": {"output": "Tiger Woods won the 2007 US PGA Championship"},
  "start_time": "2024-01-27T10:30:00.000Z",
  "end_time": "2024-01-27T10:30:10.000Z",
  "child_runs": [
    # 1. First LLM Call: INCORRECTLY decides to check weather instead of golf
    {
      "id": "run-llm-1",
      "name": "gpt-4",
      "run_type": "llm",
      "inputs": {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Find the 2007 US PGA winner"}
        ]
      },
      "outputs": {
        "generations": [[{
            "text": "",
            "message": {
                "role": "assistant",
                "content": None,
                "tool_calls": [{
                    "id": "call-bad-1",
                    "function": {"name": "search", "arguments": "{\"query\": \"weather in Tokyo\"}"}
                }]
            }
        }]]
      },
      "start_time": "2024-01-27T10:30:01.000Z",
      "end_time": "2024-01-27T10:30:03.000Z",
      "extra": {"token_usage": {"prompt_tokens": 50, "completion_tokens": 20}}
    },
    
    # 2. Tool Execution: Returns irrelevant info
    {
      "id": "run-tool-1",
      "name": "search",
      "run_type": "tool",
      "inputs": {
        "messages": [
          {
            "role": "system",
            "content": "You are a helpful assistant"
          },
          {
            "role": "user",
            "content": "Find the 2007 US PGA winner"
          },
          {
            "role": "assistant",
            "tool_calls": [
                {
                    "id": "call_abc123",
                    "type": "function",
                    "function": {
                    "name": "search",
                    "arguments": "{\"query\": \"2007 US PGA winner\"}"
                    }
                },
                {
                    "id": "call_def456",
                    "type": "function",
                    "function": {
                    "name": "calculator",
                    "arguments": "{\"expression\": \"2007 + 17\"}"
                    }
                }
                ]
          },
          {
            "role": "tool",
            "content": "Tiger Woods won the 2007 PGA Championship at Southern Hills",
            "tool_call_id": "call_abc123"
          }
        ]
      },
      "outputs": {
        "choices": [
          {
            "message": {
              "role": "assistant",
              "content": "Tiger Woods won the 2007 US PGA Championship"
            }
          }
        ]
      },
      "start_time": "2024-01-27T10:30:05.100000Z",
      "end_time": "2024-01-27T10:30:07.000000Z",
      "parent_run_id": "parent-run-uuid",
      "trace_id": "parent-run-uuid",
      "dotted_order": "20240127T103000000000Z.parent.llm2"
    }
  ]
}

In [None]:
os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"

In [17]:
# 1. Convert Raw Dict -> AgentTrace Object
trace = TraceConverter.from_langsmith(raw_trace)

# 2. Define Output Schema
class TraceEval(BaseModel):
    tool_usage_correct: bool
    issues: list[str]
    reasoning: str

# 3. Serialize the entire trace to a JSON string
reasoning = trace.get_reasoning_chain()
tool_calls = trace.get_tool_spans()

# 4. Initialize Evaluator
evaluator = Evaluator(
    model="openai/gpt-4o",
    system_prompt="Analyze this agent execution trace for correctness."
)

agent_trace_response  = f'Reasoning Chain: {reasoning}' + f'Tool Calls: {tool_calls}'
# 5. Evaluate the Full Trace
result = evaluator.evaluate(
    response=agent_trace_response,
    output_schema=TraceEval
)

In [18]:
from pprint import pprint
pprint(result.model_dump_json(indent=2))

('{\n'
 '  "tool_usage_correct": false,\n'
 '  "issues": [\n'
 '    "The response includes tool calls that contain a search with the query '
 "for the '2007 US PGA winner' which returned the correct answer. However, it "
 "also includes an unrelated tool call for a 'calculator' to add 2007 + 17 "
 'which is unnecessary and irrelevant to the user\'s query."\n'
 '  ],\n'
 '  "reasoning": "The trace indicates that while the appropriate search '
 'function was called to find the winner of the 2007 US PGA Championship, an '
 'additional, unrelated tool operation was also conducted. The main task was '
 'to identify the 2007 US PGA winner, which was correctly accomplished using '
 "the 'search' tool. However, the inclusion of an unrelated 'calculator' "
 "function call that calculates '2007 + 17' suggests a lack of precise tool "
 'usage, as this calculation does not pertain to the task of identifying the '
 'tournament winner."\n'
 '}')
