# Cruise Booking AI Agent Evaluation
#### @author Karthik Kalahasthi https://www.linkedin.com/in/karthikkalahasthi/

In [None]:
import warnings
# Suppress all warnings so cell output stays readable.
# NOTE: this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import os
import sys
from pathlib import Path

# Resolve the project root. The notebook lives in evals/notebooks/, so the
# root is two directory levels above the working directory.
# NOTE: this assumes the kernel's working directory is the notebook's own
# directory; start Jupyter accordingly.
current_dir = Path(os.getcwd())
project_root = current_dir.parent.parent

# Put the project root first on sys.path so it is searched before other entries.
sys.path.insert(0, str(project_root))
print(f"Project root: {project_root}")

#### Install dependencies

In [None]:
! pip install -r ../../requirements.txt

In [None]:
from phoenix.client import Client
from phoenix.evals import OpenAIModel, llm_classify
from phoenix.experiments import run_experiment,evaluate_experiment
from phoenix.experiments.evaluators import create_evaluator
from phoenix.experiments.types import Example
import pandas as pd
from pathlib import Path
import sys
import os
import nest_asyncio
nest_asyncio.apply()

# Project root as an absolute string path: two levels above the working directory.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)
sys.path.insert(0, project_root)

# Project-local imports; placed after the sys.path update above so the project
# packages can be found from the project root.
from agents.cruise_booking.agent import root_agent
from evals.eval_prompts import load_all_prompts

# Prompt definitions; the 'response_completeness' entry is read by the LLM evaluator below.
all_prompts = load_all_prompts()

#### Load the dataset from Arize Phoenix. Make sure the dataset has been uploaded to Phoenix before running this cell.

In [None]:
# Create the Phoenix client; no endpoint or credentials are passed explicitly here.
px_client = Client()
# Name of the dataset to fetch; it must already have been uploaded to Phoenix.
dataset_identifier = "golden_dataset_small"
dataset = px_client.datasets.get_dataset(dataset=dataset_identifier)
print(f"✅ Retrieved dataset: {dataset_identifier}")
# Sanity check: show the first example and the 'Query' field of its input.
print(dataset[0])
print(dataset.examples[0]['input']['Query'])


In [None]:
# Re-import duckdb after installation
import importlib
import sys

# Import duckdb on its own first, so that a failure here is reported as a
# DuckDB problem and is not confused with a failure importing the project module.
try:
    import duckdb
except ImportError as e:
    print(f"❌ Failed to import DuckDB: {e}")
    print("Please install DuckDB: pip install duckdb==1.4.4")
else:
    try:
        from src.tools import data_search
    except ImportError as e:
        print(f"❌ Failed to import src.tools.data_search: {e}")
    else:
        # Point the data_search module's `duckdb` attribute at the module imported above.
        data_search.duckdb = duckdb
        print(f"✅ DuckDB reloaded successfully! Version: {duckdb.__version__}")

#### Load Cruise data into vector store 

In [None]:
# Ensure cruise data is loaded
from src.tools.data_search import DataSearch
from agents.cruise_booking.tools import data_search_tools
import os

# Get the correct data directory path (relative to project root)
data_dir = os.path.join(project_root, 'data')
print(f"Loading data from: {data_dir}")

# Build a DataSearch over that directory.
# NOTE(review): this rebinds the name `data_search`, which the DuckDB cell above
# bound to the src.tools.data_search module — confirm nothing later needs the
# module under that name.
data_search = DataSearch(data_dir=data_dir)
# The stats mapping is read below for its 'total_cruises' and 'total_pricing_rows' entries.
stats = data_search.get_stats()
print(f"✅ Data loaded: {stats['total_cruises']} cruises, {stats['total_pricing_rows']} pricing rows")

# Update the module-level data search instance by overwriting the private
# `_data_search` attribute of data_search_tools.
# NOTE(review): presumably the agent's tools read this attribute — verify against data_search_tools.
data_search_tools._data_search = data_search
print("✅ Data search tools reloaded with data")


#### Helper Functions to call ADK Agent

In [None]:
import asyncio
from google.adk.runners import Runner
from google.adk.sessions import InMemorySessionService
from google.genai import types

# NOTE(review): agent_outputs is not referenced by any other code visible in
# this notebook — confirm whether it is still needed.
agent_outputs = []

# Create the session service and runner once at module level; run_agent_query()
# below uses this runner for every query.
session_service = InMemorySessionService()
runner = Runner(
    app_name="cruise_booking_eval",
    agent=root_agent,
    session_service=session_service
)

async def run_agent_query(query: str, session_id: str):
    """Run a single query through the agent and summarise the resulting events.

    Args:
        query: Text of the user message sent to the agent.
        session_id: ID of an existing session belonging to user "eval_user".

    Returns:
        A dict with three keys:
          'output': the text of every text part of every event, joined with spaces.
          'sub_agents_used': sorted names of event authors other than
              'CruiseBookingAgent'.
          'tools_used': function-call names in the order they were seen
              (duplicates are kept).
    """
    print("*****query*****", query)
    # Mark the message as coming from the user, as in ADK's documented usage.
    content = types.Content(role="user", parts=[types.Part(text=query)])
    response_parts = []
    sub_agents_used = set()
    tools_used = []

    async for event in runner.run_async(
        user_id="eval_user",
        session_id=session_id,
        new_message=content
    ):
        # Collect response text and track tool calls from event.content.parts.
        event_content = getattr(event, 'content', None)
        if event_content:
            # `parts` may be missing or None on some events; treat that as empty.
            for part in getattr(event_content, 'parts', None) or []:
                text = getattr(part, 'text', None)
                if text:
                    response_parts.append(text)
                # Tool calls in ADK are Part objects with a function_call attribute.
                function_call = getattr(part, 'function_call', None)
                if function_call:
                    name = (getattr(function_call, 'name', None)
                            or getattr(function_call, 'function_name', None))
                    if name:
                        tools_used.append(name)

        # Track sub-agents: any author other than the root agent's name.
        author = getattr(event, 'author', None)
        if author and author != 'CruiseBookingAgent':
            sub_agents_used.add(author)

    return {
        'output': ' '.join(response_parts),
        # Sorted so the result is deterministic from run to run.
        'sub_agents_used': sorted(sub_agents_used),
        'tools_used': tools_used
    }



In [None]:
def run_agent_with_example(example: Example) -> dict:
    """Experiment task: run one dataset example through the agent.

    A new session is created for every example, so no conversation state is
    shared between examples.

    Args:
        example: Dataset example; the 'Query' entry of its input is sent to
            the agent.

    Returns:
        The dict returned by run_agent_query (keys 'output',
        'sub_agents_used' and 'tools_used').
    """
    async def _create_session_and_run() -> dict:
        session = await session_service.create_session(
            app_name="cruise_booking_eval",
            user_id="eval_user",
        )
        return await run_agent_query(example.input.get('Query'), session.id)

    # Create the session and run the query inside a single event loop.
    return asyncio.run(_create_session_and_run())

### Run Experiment
##### Run the agent on each example in the dataset

In [None]:
# NOTE(review): Phoenix documents dry_run as running the task without recording
# results to the server — confirm for the installed version.
dry_run = False
# Run the task over every dataset example. No evaluators are attached at this
# stage; they are applied afterwards with evaluate_experiment().
experiment = run_experiment(dataset,
                            run_agent_with_example,
                            dry_run=dry_run,
                            evaluators=[],
                            experiment_name="Cruise Booking Agent Eval",
                            experiment_description="Cruise Booking Agent Evaluation")

#### Create Code Based Evaluator

In [None]:
@create_evaluator(name="Tools Used Eval", kind="CODE")
def evaluate_tool_calls(output: dict) -> float:
    """Score whether the agent made at least one tool call.

    Args:
        output: Task output dict; its 'tools_used' entry is expected to be the
            list of tool names called while answering.

    Returns:
        1.0 if 'tools_used' is present and non-empty, otherwise 0.0 (including
        when the output is missing or is not a dict).
    """
    print("output", output)
    if not isinstance(output, dict):
        return 0.0
    # Check the tool list itself; a dict that merely has other keys does not count.
    return 1.0 if output.get("tools_used") else 0.0

In [None]:
RESPONSE_COMPLETENESS_TEMPLATE = all_prompts['response_completeness']['template']


@create_evaluator(name="Response Completeness Eval", kind="LLM")
def evaluate_response_completeness(input: dict, output: dict) -> bool:
    """Judge with an LLM whether the agent's response is complete.

    The judge is asked to label the response with one of the rails '1'..'5';
    a label of 4 or 5 counts as complete.

    Args:
        input: Example input dict. The question is read from 'Query', with
            'question' accepted as a fallback key.
        output: Task output dict. The response text is read from 'output',
            with 'final_output' accepted as a fallback key.

    Returns:
        True if the judge's label is 4 or higher. False if the question or
        response is missing, or if the judge returns no usable label.
    """
    if not input or not output:
        return False

    # The dataset stores the question under 'Query' and the agent task returns
    # its text under 'output'; the older key names are kept as fallbacks.
    query = input.get('Query') or input.get('question', '')
    response = output.get('output') or output.get('final_output', '')

    if not query or not response:
        return False

    expected_info = "cruise options with relevant details"

    df = pd.DataFrame({
        'input': [query],
        'output': [response],
        'expected_info': [expected_info]
    })

    result = llm_classify(
        data=df,
        template=RESPONSE_COMPLETENESS_TEMPLATE,
        rails=['1', '2', '3', '4', '5'],
        model=OpenAIModel(model="openai/gpt-4.1-mini"),
        provide_explanation=True
    )

    # Without a label column the score defaults to 3, which is below the threshold.
    if 'label' not in result.columns:
        return False

    try:
        score = int(result['label'].iloc[0])
    except (TypeError, ValueError):
        # The label was missing or not an integer, so the response cannot be
        # counted as complete.
        return False
    return score >= 4

### Run evaluators against Agent outputs

- Run code evaluators : evaluate_tool_calls
- Run LLM-as-judge evaluator : evaluate_response_completeness

In [None]:
# Apply both evaluators to the experiment produced by run_experiment() above;
# `experiment` is rebound to the value returned by evaluate_experiment().
experiment = evaluate_experiment(experiment,
                            evaluators=[evaluate_tool_calls,evaluate_response_completeness])