# Cruise Booking AI Agent Evaluation
#### @author Karthik Kalahasthi https://www.linkedin.com/in/karthikkalahasthi/

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import sys
from pathlib import Path

# Get project root (notebook is in evals/notebooks/, so go up 2 levels)
current_dir = Path(os.getcwd())
if current_dir.name == 'notebooks':
    project_root = current_dir.parent.parent
else:
    project_root = current_dir.parent.parent

sys.path.insert(0, str(project_root))
print(f"Project root: {project_root}")

#### Install dependencies

In [None]:
! pip install -r ../../requirements.txt

In [9]:
from phoenix.client import Client
from phoenix.evals import OpenAIModel, llm_classify
from phoenix.experiments import run_experiment,evaluate_experiment
from phoenix.experiments.evaluators import create_evaluator
from phoenix.experiments.types import Example
import pandas as pd
from pathlib import Path
import sys
import os
import nest_asyncio
nest_asyncio.apply()

project_root = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..'))
sys.path.insert(0, project_root)
from agents.cruise_booking.agent import root_agent
from evals.eval_prompts import load_all_prompts
all_prompts = load_all_prompts()

Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


üîÑ Initializing Phoenix tracing...
üî≠ OpenTelemetry Tracing Details üî≠
|  Phoenix Project: cruise-booking-agent
|  Span Processor: BatchSpanProcessor
|  Collector Endpoint: localhost:4317
|  Transport: gRPC
|  Transport Headers: {'authorization': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.

‚úÖ Phoenix tracing initialized!
   Endpoint: http://localhost:6006
   Project: cruise-booking-agent
   Batch processing: ‚úÖ
   Auto-instrumentation: ‚úÖ
   Manual instrumentation: ‚úÖ (decorators available)
2026-01-27 22:28:19 - src.tools.vector_store - INFO - Initialized ChromaDB collection: cruises
{"timestamp": null, "level": null, "name": "src.tools.vector_store", "message": "Initialized ChromaDB collection: cruises"}
2026-01-27 22:28:19 - src.tools.semantic_search - INFO - Loading embedding model: BAAI/bge-small-en-v1.5
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Loading embedding model: BAAI/bge

#### Load/Export dataset from phoenix-arize. Make sure to upload dataset before exporting.

In [10]:
px_client = Client()
dataset_identifier = "golden_dataset_small"
dataset = px_client.datasets.get_dataset(dataset=dataset_identifier)
print(f"‚úÖ Retrieved dataset: {dataset_identifier}")
print(dataset[0])
print(dataset.examples[0]['input']['Query'])


‚úÖ Retrieved dataset: golden_dataset_small
{'id': 'RGF0YXNldEV4YW1wbGU6NDI=', 'input': {'Query': 'Find me a 7-day cruise from Miami to the Caribbean'}, 'output': {}, 'metadata': {}, 'updated_at': '2026-01-27T13:01:29.073161+00:00'}
Find me a 7-day cruise from Miami to the Caribbean


In [11]:
# Re-import duckdb after installation
import importlib
import sys

# Re-import duckdb module
try:
    import duckdb
    # Update the duckdb reference in the data_search module
    from src.tools import data_search
    data_search.duckdb = duckdb
    print(f"‚úÖ DuckDB reloaded successfully! Version: {duckdb.__version__}")
except ImportError as e:
    print(f"‚ùå Failed to import DuckDB: {e}")
    print("Please install DuckDB: pip install duckdb==1.4.4")

‚úÖ DuckDB reloaded successfully! Version: 1.4.4


#### Load Cruise data into vector store 

In [12]:
# Ensure cruise data is loaded
from src.tools.data_search import DataSearch
from agents.cruise_booking.tools import data_search_tools
import os

# Get the correct data directory path (relative to project root)
data_dir = os.path.join(project_root, 'data')
print(f"Loading data from: {data_dir}")

# Reload data search with correct path
data_search = DataSearch(data_dir=data_dir)
stats = data_search.get_stats()
print(f"‚úÖ Data loaded: {stats['total_cruises']} cruises, {stats['total_pricing_rows']} pricing rows")

# Update the module-level data search instance
data_search_tools._data_search = data_search
print("‚úÖ Data search tools reloaded with data")


Loading data from: /Users/karu/karthik/technology/code/python/my_repos/agent-evals-demo/data
2026-01-27 22:28:25 - src.tools.data_search - INFO - DataSearch loaded 25 cruises
{"timestamp": null, "level": null, "name": "src.tools.data_search", "message": "DataSearch loaded 25 cruises"}
‚úÖ Data loaded: 25 cruises, 0 pricing rows
‚úÖ Data search tools reloaded with data


#### Helper Functions to call ADK Agent

In [13]:
import asyncio
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.genai import types

agent_outputs = []

# Create session service and runner (outside the loop)
session_service = InMemorySessionService()
runner = Runner(
    app_name="cruise_booking_eval",
    agent=root_agent,
    session_service=session_service
)

async def run_agent_query(query: str, session_id: str):
    """Run a single query through the agent."""
    print("*****query*****", query)
    content = types.Content(parts=[types.Part(text=query)])
    response_parts = []
    sub_agents_used = set()
    tools_used = []
    
    async for event in runner.run_async(
        user_id="eval_user",
        session_id=session_id,
        new_message=content
    ):
        # Collect response text and track tool calls from event.content.parts
        if hasattr(event, 'content') and event.content:
            for part in event.content.parts:
                if hasattr(part, 'text') and part.text:
                    response_parts.append(part.text)
                # Tool calls in ADK are Part objects with function_call attribute
                if hasattr(part, 'function_call') and part.function_call:
                    fc = part.function_call
                    name = getattr(fc, 'name', None) or getattr(fc, 'function_name', None)
                    if name:
                        tools_used.append(name)
        
        # Track sub-agents
        if hasattr(event, 'author') and event.author:
            if event.author != 'CruiseBookingAgent':
                sub_agents_used.add(event.author)
    
    return {
        'output': ' '.join(response_parts),
        'sub_agents_used': list(sub_agents_used),
        'tools_used': tools_used
    }



In [14]:
def run_agent_with_example(example: Example) -> str:
    # Create a session for all queries
    session = asyncio.run(session_service.create_session(
        app_name="cruise_booking_eval",
        user_id="eval_user"
    ))
    result = asyncio.run(run_agent_query(example.input.get('Query'), session.id))
    return result

### Run Expirement 
##### Run examples calling agent for each exaple

In [15]:
dry_run = False
experiment = run_experiment(dataset,
                            run_agent_with_example,
                            dry_run=dry_run,
                            evaluators=[],
                            experiment_name="Cruise Booking Agent Eval",
                            experiment_description="Cruise Booking Agent Evaluation")

üß™ Experiment started.
üì∫ View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDoz/experiments
üîó View this experiment: http://localhost:6006/datasets/RGF0YXNldDoz/compare?experimentId=RXhwZXJpbWVudDoz


running tasks |          | 0/5 (0.0%) | ‚è≥ 00:00<? | ?it/s

*****query***** I'm looking for a romantic cruise with spa and fine dining
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:34 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final response using the required output schema.\n\n      Use this tool to provide your final structured answer instead\n      of outputting text directly.\n      ', 'parameters': {'type': 'object', 'properties': {'message': {'type': 'string'}, 'needFollowUpInfo': 

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:37 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:37 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:38 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:38 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:39 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:39 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:40 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:40 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:41 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:41 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:42 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:42 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:43 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:43 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:44 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:44 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:44 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:44 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:45 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:45 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:46 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:46 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:47 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:47 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:48 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:48 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:49 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:49 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:50 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:50 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:51 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:51 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:52 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:52 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:53 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:28:53 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:54 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:54 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:55 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:55 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:56 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:56 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:57 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:57 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:58 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:58 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:28:59 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:28:59 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:29:00 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:29:00 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:29:01 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:29:01 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:29:02 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:29:02 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:29:03 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:29:03 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:29:29 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
[92m22:29:29 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2026-01-27 22:29:30 - src.tools.semantic_search - INFO - Semantic search returned 0 results
{"timestamp": null, "level": null, "name": "src.tools.semantic_search", "message": "Semantic search returned 0 results"}
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "`litellm.set_verbose` is deprecated. Please set `os.environ['LITELLM_LOG'] = 'DEBUG'` for debug logs."}
ASYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache'): None
[92m22:29:30 - LiteLLM:INFO[0m: utils.py:3872 - 
LiteLLM completion() model= gpt-4.1-mini; provider = openai
Logging Details LiteLLM-Async Success Call, cache_hit=None
Async success callbacks: Got a complete streaming response
{"timestamp": null, "level": null, "name": "LiteLLM", "message": "\nLiteLLM completion() model= gpt-4.1-mini; provider = openai"}
Final returned optional params: {'temperature': 0.7, 'max_tokens': 1000, 'tools': [{'type': 'function', 'function': {'name': 'set_model_response', 'description': 'Set your final res

#### Create Code Based Evaluator

In [23]:
@create_evaluator(name="Tools Used Eval", kind="CODE")
def evaluate_tool_calls(input: dict,output: dict) -> float:
    try:
        print("output",output)
        if output and output.get("tools_used"):
            if(len(output)>0):
                return 1
            else:
                return 0
        else:
            return 0
    except Exception as e:
        print("Error in evaluate_tool_calls",e)
        return 0

In [35]:
phoenix_model = OpenAIModel(
        model="gpt-4.1-mini",
        base_url=os.getenv("OPENAI_API_BASE"),
        api_key=os.getenv("OPENAI_API_KEY")
)
RESPONSE_COMPLETENESS_TEMPLATE = all_prompts['response_completeness']['template']
@create_evaluator(name="Response Completeness Eval", kind="LLM")
def evaluate_response_completeness(input: dict, output: dict) -> bool:
    if output is None:
        return False
    
    query = input.get('Query', '')
    response = output.get('output', '')
    
    if not query or not response:
        return False
    
    
    df = pd.DataFrame({
        'input': [query],
        'output': [response],
        # 'expected_info': [expected_info]
    })
    
    result = llm_classify(
        data=df,
        template=RESPONSE_COMPLETENESS_TEMPLATE,
        rails=['1', '2', '3', '4', '5'],
        model=phoenix_model,
        provide_explanation=True
    )
    
    score = int(result['label'].iloc[0]) if 'label' in result.columns else 3
    return score >= 4

### Run evaluators against Agent outputs

- Run code evaluators : evaluate_tool_calls
- Run LLM-As-Judge Evaluator

In [36]:
experiment = evaluate_experiment(experiment,
                            evaluators=[evaluate_tool_calls,evaluate_response_completeness])

üß† Evaluation started.


running experiment evaluations |          | 0/10 (0.0%) | ‚è≥ 00:00<? | ?it/s

output {'output': '{"message": "Here are some 7-day cruises departing from Miami to the Caribbean:\\n\\n1. Wellness Retreat\\n- Departure Date: 2026-05-20\\n- Ports of Call: St Thomas, St Kitts, Antigua\\n- Cabin Type: Balcony\\n- Price per Person: $2100\\n- Description: Wellness-focused Caribbean cruise with yoga, meditation, and spa treatments.\\n- Amenities: Spa, yoga classes, healthy dining, meditation, fitness center, wellness workshops\\n\\n2. Caribbean Adventure\\n- Departure Date: 2026-06-10\\n- Ports of Call: Jamaica, Grand Cayman, Cozumel\\n- Cabin Type: Oceanview\\n- Price per Person: $800\\n- Description: Family-friendly Caribbean cruise with activities for all ages including water slides and kids programs.\\n- Amenities: Pool, entertainment, kids program, water slides, mini golf\\n\\n3. Ocean Explorer\\n- Departure Date: 2026-06-15\\n- Ports of Call: Nassau, Cozumel, Key West\\n- Cabin Type: Balcony\\n- Price per Person: $1200\\n- Description: A luxurious 7-day Caribbean c

llm_classify |          | 0/1 (0.0%) | ‚è≥ 00:00<? | ?it/s

llm_classify |          | 0/1 (0.0%) | ‚è≥ 00:00<? | ?it/s

output {'output': '{"message": "For a first-time cruiser, I recommend a variety of options depending on your interests and budget:\\n\\n1. Caribbean Adventure (7 days, Miami departure) - This is a family-friendly and budget-friendly cruise with lots of activities including water slides, kids programs, and mini-golf. It\'s great if you want a fun and active introduction to cruising.\\n\\n2. Ocean Explorer (7 days, Miami departure) - A luxurious Caribbean cruise with balcony cabins, spa, fine dining, pool, and wifi. Perfect if you want a more premium and relaxing experience.\\n\\n3. Mediterranean Dream (10 days, Barcelona departure) - A longer cruise with beautiful Mediterranean destinations and premium amenities like a spa, fine dining, and entertainment. Great for a first-timer who wants a romantic and culturally rich experience.\\n\\n4. Alaska Explorer (14 days, Seattle departure) - A nature-focused cruise with adventure excursions and wildlife viewing. Ideal if you want a scenic and 

llm_classify |          | 0/1 (0.0%) | ‚è≥ 00:00<? | ?it/s

output {'output': '{"message": "I found three 7-day cruises departing from Miami in June that might interest you:\\n\\n1. Ocean Explorer - Caribbean\\n   - Departure Date: June 15, 2026\\n   - Cabin Type: Balcony\\n   - Price: $1200 per person ($2400 total for two)\\n   - Amenities: spa, fine dining, pool, wifi\\n   - Description: A luxurious 7-day Caribbean cruise with world-class amenities and stunning destinations like Nassau, Cozumel, and Key West.\\n\\n2. Caribbean Adventure - Caribbean\\n   - Departure Date: June 10, 2026\\n   - Cabin Type: Oceanview\\n   - Price: $800 per person ($1600 total for two)\\n   - Amenities: pool, entertainment, kids program, water slides, mini golf\\n   - Description: Family-friendly Caribbean cruise with activities for all ages including water slides and kids programs. Ports include Jamaica, Grand Cayman, and Cozumel.\\n\\n3. Wellness Retreat - Caribbean\\n   - Departure Date: May 20, 2026\\n   - Cabin Type: Balcony\\n   - Price: $2100 per person ($4

llm_classify |          | 0/1 (0.0%) | ‚è≥ 00:00<? | ?it/s

output {'output': '{"message": "I searched extensively for cruises described as romantic with spa and fine dining amenities, but I couldn\'t find any matching options in the available cruise listings. It seems there are currently no cruises explicitly matching all these specific criteria together. If you would like, I can help you explore cruises with just spa amenities, or fine dining, or romantic themes separately, or assist you with a broader search. Would you like me to proceed with any of these options?", "needFollowUpInfo": true, "follow_up_questions": ["Would you like to see cruises with spa amenities only?", "Would you like to see cruises with fine dining only?", "Would you like to see romantic-themed cruises without specific spa or dining requirements?", "Would you like to broaden your search criteria?"]}', 'sub_agents_used': ['SemanticSearchAgent'], 'tools_used': ['transfer_to_agent', 'semantic_search_cruises', 'semantic_search_cruises', 'semantic_search_cruises', 'semantic_s

llm_classify |          | 0/1 (0.0%) | ‚è≥ 00:00<? | ?it/s


üîó View this experiment: http://localhost:6006/datasets/RGF0YXNldDoz/compare?experimentId=RXhwZXJpbWVudDoz

Experiment Summary (01/27/26 10:47 PM -0500)
--------------------------------------------
| evaluator                  |   n |   n_scores |   avg_score |   n_labels | top_2_labels   |
|:---------------------------|----:|-----------:|------------:|-----------:|:---------------|
| Response Completeness Eval |   5 |          5 |           1 |          5 | {'True': 5}    |
| Tools Used Eval            |   5 |          5 |           1 |          0 |                |

Experiment Summary (01/27/26 10:46 PM -0500)
--------------------------------------------
| evaluator                  |   n |   n_errors | top_error                                                                                          |   n_scores |   avg_score |
|:---------------------------|----:|-----------:|:---------------------------------------------------------------------------------------------------|----