# Hotel Support Agent Tutorial

This notebook demonstrates the Agent Catalog hotel support agent with:
- Agent Catalog (agentc) for tool discovery, prompt management, and span logging
- Couchbase Capella as vector store
- Capella AI Services for embeddings and the LLM (with an OpenAI fallback for the LLM only; embeddings require Capella AI)
- LangChain for agent execution

## Setup and Imports

Import all necessary modules for the hotel support agent.

In [1]:
import base64
import getpass
import json
import logging
import os
import sys
import time
import requests
from datetime import timedelta

import agentc
import agentc_langchain
import dotenv
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.management.buckets import CreateBucketSettings
from couchbase.management.search import SearchIndex
from couchbase.options import ClusterOptions, ClusterTimeoutOptions
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.prompts import PromptTemplate
from langchain_core.tools import Tool
from langchain_couchbase.vectorstores import CouchbaseVectorStore
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Root logging: INFO level, timestamped "logger - level - message" records
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Third-party libraries are chatty at INFO; only surface their warnings and errors
for _noisy_logger_name in ("openai", "httpx", "httpcore", "agentc_core"):
    logging.getLogger(_noisy_logger_name).setLevel(logging.WARNING)
del _noisy_logger_name  # keep the notebook namespace free of the loop variable

# Pull settings from a .env file; values in the file win over existing variables
dotenv.load_dotenv(override=True)

print("✅ Setup complete")

✅ Setup complete


## Capella AI Configuration

Set up and test the Capella AI configuration: validate the required environment variables, verify connectivity, and detect the embedding dimensions from a live API response.

In [2]:
def setup_capella_ai_config():
    """Validate the Capella AI environment and return its model settings.

    Returns:
        dict: ``endpoint``, ``embedding_model`` and ``llm_model`` as read from
        ``CAPELLA_API_ENDPOINT``, ``CAPELLA_API_EMBEDDING_MODEL`` and
        ``CAPELLA_API_LLM_MODEL``. Embedding dimensions are not part of the
        result; they are detected from an API response elsewhere.

    Raises:
        ValueError: if any required variable (the three above plus
            ``CB_USERNAME`` and ``CB_PASSWORD``) is unset or empty.
    """
    # Credentials are required but are not part of the returned settings
    credential_vars = ("CB_USERNAME", "CB_PASSWORD")
    # Result key -> environment variable that supplies it
    setting_vars = {
        "endpoint": "CAPELLA_API_ENDPOINT",
        "embedding_model": "CAPELLA_API_EMBEDDING_MODEL",
        "llm_model": "CAPELLA_API_LLM_MODEL",
    }

    missing_vars = []
    for env_name in (*credential_vars, *setting_vars.values()):
        if not os.getenv(env_name):
            missing_vars.append(env_name)

    if missing_vars:
        raise ValueError(f"Missing required Capella AI environment variables: {missing_vars}")

    return {key: os.getenv(env_name) for key, env_name in setting_vars.items()}

def test_capella_connectivity():
    """Probe the Capella AI embeddings endpoint and detect the embedding size.

    Sends one small embedding request to ``<CAPELLA_API_ENDPOINT>/embeddings``
    using HTTP Basic credentials built from ``CB_USERNAME`` / ``CB_PASSWORD``.

    Returns:
        tuple: ``(True, dims)`` when the endpoint answers with HTTP 200, where
        ``dims`` is the length of the returned embedding vector; ``(False, None)``
        when the endpoint or credentials are not configured, the request fails,
        or the response cannot be read. This function never raises.
    """
    try:
        endpoint = os.getenv("CAPELLA_API_ENDPOINT")
        if not endpoint:
            logger.warning("CAPELLA_API_ENDPOINT not configured")
            return False, None

        username = os.getenv("CB_USERNAME")
        password = os.getenv("CB_PASSWORD")
        if not (username and password):
            # Nothing can be tested without credentials, so do not report success
            logger.warning(
                "CB_USERNAME/CB_PASSWORD not configured - skipping Capella AI connectivity test"
            )
            return False, None

        # Test embedding model (requires API key)
        api_key = base64.b64encode(f"{username}:{password}".encode()).decode()
        headers = {"Authorization": f"Basic {api_key}", "Content-Type": "application/json"}

        logger.info("Testing Capella AI connectivity...")
        embedding_data = {
            "model": os.getenv("CAPELLA_API_EMBEDDING_MODEL"),
            "input": "test connectivity",
        }

        # Strip a trailing slash so the URL never contains "//embeddings"
        embedding_response = requests.post(
            f"{endpoint.rstrip('/')}/embeddings",
            headers=headers,
            json=embedding_data,
            timeout=30,
        )

        if embedding_response.status_code == 200:
            embed_result = embedding_response.json()
            embed_dims = len(embed_result["data"][0]["embedding"])
            logger.info(f"✅ Capella AI embedding test successful - dimensions: {embed_dims}")
            return True, embed_dims

        logger.warning(f"Capella AI embedding test failed: {embedding_response.status_code}")
        logger.warning(f"Response: {embedding_response.text[:200]}...")
        return False, None

    except Exception as e:
        # Best-effort probe: network errors and malformed responses count as failure
        logger.warning(f"Capella AI connectivity test failed: {e}")
        return False, None

print("✅ Capella AI configuration functions defined")

✅ Capella AI configuration functions defined


## Environment Setup

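Set up the required environment variables: prompt for any that are missing, fall back to local-development defaults for blank Couchbase settings, and validate the Capella AI configuration.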

In [3]:
def _set_if_undefined(var: str):
    """Prompt (without echo) for ``var`` and export it, unless it is already set.

    A variable that exists but is empty counts as set and is left untouched.
    """
    if os.environ.get(var) is not None:
        return
    os.environ[var] = getpass.getpass(f"Please provide your {var}: ")

def setup_environment():
    """Populate and validate the environment variables the notebook relies on.

    Steps, in order:
      1. Validate the Capella AI variables (a failure is reported, not raised).
      2. Prompt for any of the required variables that are not defined.
      3. Offer local-development defaults for Couchbase settings left blank.
      4. Default ``CB_INDEX`` / ``CB_SCOPE`` / ``CB_COLLECTION`` when unset.
      5. If a Capella endpoint is configured, derive ``CAPELLA_API_KEY`` from the
         Couchbase credentials and normalize the endpoint to end in ``/v1``.
      6. Probe the endpoint and store the detected embedding size in the
         module-level ``EMBEDDING_DIMENSIONS`` (``None`` when unknown).
    """
    # Setup Capella AI configuration first
    try:
        setup_capella_ai_config()
        print("✅ Capella AI configuration validated")
    except ValueError as e:
        print(f"⚠️ Capella AI configuration issue: {e}")
        print("Will attempt to set required variables...")

    # Required variables
    required_vars = ["OPENAI_API_KEY", "CB_CONN_STRING", "CB_USERNAME", "CB_PASSWORD", "CB_BUCKET"]
    for var in required_vars:
        _set_if_undefined(var)

    defaults = {
        "CB_CONN_STRING": "couchbase://localhost",
        "CB_USERNAME": "Administrator",
        "CB_PASSWORD": "password",
        "CB_BUCKET": "vector-search-testing",
    }

    # Only reached for variables that are defined but blank
    for key, default_value in defaults.items():
        if not os.environ.get(key):
            os.environ[key] = input(f"Enter {key} (default: {default_value}): ") or default_value

    os.environ["CB_INDEX"] = os.getenv("CB_INDEX", "hotel_data_index")
    os.environ["CB_SCOPE"] = os.getenv("CB_SCOPE", "agentc_data")
    os.environ["CB_COLLECTION"] = os.getenv("CB_COLLECTION", "hotel_data")

    # Generate Capella AI API key from username and password if endpoint is provided
    endpoint = os.getenv("CAPELLA_API_ENDPOINT")
    if endpoint:
        os.environ["CAPELLA_API_KEY"] = base64.b64encode(
            f"{os.getenv('CB_USERNAME')}:{os.getenv('CB_PASSWORD')}".encode("utf-8")
        ).decode("utf-8")

        # Ensure endpoint has /v1 suffix for OpenAI compatibility.
        # Strip trailing slashes BEFORE checking, otherwise ".../v1/" would
        # be rewritten to ".../v1/v1".
        normalized_endpoint = endpoint.rstrip("/")
        if not normalized_endpoint.endswith("/v1"):
            normalized_endpoint += "/v1"
            logger.info(f"Added /v1 suffix to endpoint: {normalized_endpoint}")
        os.environ["CAPELLA_API_ENDPOINT"] = normalized_endpoint

    # Test Capella AI connectivity and get dimensions
    global EMBEDDING_DIMENSIONS
    EMBEDDING_DIMENSIONS = None

    if os.getenv("CAPELLA_API_ENDPOINT"):
        connectivity_ok, dims = test_capella_connectivity()
        if connectivity_ok:
            EMBEDDING_DIMENSIONS = dims
            logger.info(f"✅ Detected embedding dimensions: {dims}")
        else:
            logger.warning("❌ Capella AI connectivity test failed")
    else:
        print("ℹ️ Capella API not configured - will use OpenAI models")

# Configure the environment for this session; prompts for any value that is missing
setup_environment()
print("✅ Environment configured")

2025-07-15 15:17:47,219 - __main__ - INFO - Added /v1 suffix to endpoint: https://6u2le4tknje7yzkg.ai.cloud.couchbase.com/v1
2025-07-15 15:17:47,221 - __main__ - INFO - Testing Capella AI connectivity...


✅ Capella AI configuration validated


2025-07-15 15:17:48,391 - __main__ - INFO - ✅ Capella AI embedding test successful - dimensions: 1024
2025-07-15 15:17:48,393 - __main__ - INFO - ✅ Detected embedding dimensions: 1024


✅ Environment configured


## CouchbaseClient Class

Define the centralized CouchbaseClient class for all database operations with improved error handling and timeouts.

In [4]:
# Import hotel data from the data module
# NOTE: if this import fails, get_hotel_texts and load_hotel_data_to_couchbase stay
# undefined. CouchbaseClient.load_hotel_data calls both inside `except Exception`
# handlers, so the resulting NameError is caught there and the built-in sample
# data is loaded instead.
try:
    from data.hotel_data import get_hotel_texts, load_hotel_data_to_couchbase
    print("✅ Hotel data module imported successfully")
except ImportError as e:
    print(f"⚠️ Could not import hotel data module: {e}")
    print("Will use fallback data loading methods")

class CouchbaseClient:
    """Centralized Couchbase client for all database operations.

    Holds one cluster connection and one bucket handle, and provides helpers to
    create the scope/collection, the vector search index and the hotel vector
    store used by the agent.
    """

    def __init__(self, conn_string: str, username: str, password: str, bucket_name: str):
        """Store connection details; nothing is contacted until ``connect()``."""
        self.conn_string = conn_string
        self.username = username
        self.password = password
        self.bucket_name = bucket_name
        self.cluster = None
        self.bucket = None
        self._collections = {}  # cache of collection handles keyed by "scope.collection"

    def connect(self):
        """Establish connection to Couchbase cluster.

        Returns:
            The connected cluster object (also stored on ``self.cluster``).

        Raises:
            ConnectionError: if authentication, connection or the readiness
                wait fails; the original exception is chained as the cause.
        """
        try:
            auth = PasswordAuthenticator(self.username, self.password)
            options = ClusterOptions(auth)

            # Use WAN profile for better timeout handling with remote clusters
            options.apply_profile("wan_development")

            # Configure extended timeouts for cloud connectivity
            # NOTE(review): these are attached as an attribute after the options
            # object was built; confirm against the SDK docs that they take effect
            # (they may need to be passed to ClusterOptions(..., timeout_options=...)).
            timeout_options = ClusterTimeoutOptions(
                kv_timeout=timedelta(seconds=10),  # Key-value operations
                kv_durable_timeout=timedelta(seconds=15),  # Durable writes
                query_timeout=timedelta(seconds=30),  # N1QL queries
                search_timeout=timedelta(seconds=30),  # Search operations
                management_timeout=timedelta(seconds=30),  # Management operations
                bootstrap_timeout=timedelta(seconds=20),  # Initial connection
            )
            options.timeout_options = timeout_options

            self.cluster = Cluster(self.conn_string, options)
            # Increased wait time for cloud connections
            self.cluster.wait_until_ready(timedelta(seconds=20))
            logger.info("Successfully connected to Couchbase")
            return self.cluster
        except Exception as e:
            raise ConnectionError(f"Failed to connect to Couchbase: {e!s}") from e

    def setup_collection(self, scope_name: str, collection_name: str):
        """Setup bucket, scope and collection all in one function.

        Connects if needed, opens (or creates) the bucket, creates the scope and
        collection when missing, then creates a primary index on the collection.

        Returns:
            The collection handle for ``scope_name.collection_name``.

        Raises:
            RuntimeError: if any step other than primary-index creation fails
                (an index failure is only logged as a warning).
        """
        try:
            # Ensure cluster connection
            if not self.cluster:
                self.connect()

            # Setup bucket
            if not self.bucket:
                try:
                    self.bucket = self.cluster.bucket(self.bucket_name)
                    logger.info(f"Bucket '{self.bucket_name}' exists")
                except Exception:
                    logger.info(f"Creating bucket '{self.bucket_name}'...")
                    bucket_settings = CreateBucketSettings(
                        name=self.bucket_name,
                        bucket_type="couchbase",
                        ram_quota_mb=1024,
                        flush_enabled=True,
                        num_replicas=0,
                    )
                    self.cluster.buckets().create_bucket(bucket_settings)
                    time.sleep(5)  # give the new bucket time to become available
                    self.bucket = self.cluster.bucket(self.bucket_name)
                    logger.info(f"Bucket '{self.bucket_name}' created successfully")

            # Setup scope
            bucket_manager = self.bucket.collections()
            scopes = bucket_manager.get_all_scopes()
            scope_exists = any(scope.name == scope_name for scope in scopes)

            if not scope_exists and scope_name != "_default":
                logger.info(f"Creating scope '{scope_name}'...")
                bucket_manager.create_scope(scope_name)
                logger.info(f"Scope '{scope_name}' created successfully")

            # Setup collection (scopes are fetched again because one may just have been created)
            current_scopes = bucket_manager.get_all_scopes()
            collection_exists = any(
                scope.name == scope_name and collection_name in [col.name for col in scope.collections]
                for scope in current_scopes
            )

            if not collection_exists:
                logger.info(f"Creating collection '{collection_name}'...")
                bucket_manager.create_collection(scope_name, collection_name)
                logger.info(f"Collection '{collection_name}' created successfully")

            time.sleep(3)  # pause before indexing the (possibly new) collection

            # Create primary index
            try:
                self.cluster.query(
                    f"CREATE PRIMARY INDEX IF NOT EXISTS ON `{self.bucket_name}`.`{scope_name}`.`{collection_name}`"
                ).execute()
                logger.info("Primary index created successfully")
            except Exception as e:
                logger.warning(f"Error creating primary index: {e}")

            logger.info("Collection setup complete")
            return self.bucket.scope(scope_name).collection(collection_name)

        except Exception as e:
            raise RuntimeError(f"Error setting up collection: {e!s}") from e

    def get_collection(self, scope_name: str, collection_name: str):
        """Get a collection object, reusing a cached handle when available.

        Requires ``self.bucket`` to be set (``setup_collection`` does that).
        """
        key = f"{scope_name}.{collection_name}"
        if key not in self._collections:
            self._collections[key] = self.bucket.scope(scope_name).collection(collection_name)
        return self._collections[key]

    def setup_vector_search_index(self, index_definition: dict, scope_name: str):
        """Create the scope-level vector search index unless it already exists.

        Args:
            index_definition: index definition as a dict; its ``"name"`` entry is
                compared with the names of the existing indexes.
            scope_name: scope whose search index manager is used.

        Raises:
            RuntimeError: if listing or creating the index fails.
        """
        try:
            scope_index_manager = self.bucket.scope(scope_name).search_indexes()

            existing_indexes = scope_index_manager.get_all_indexes()
            index_name = index_definition["name"]

            if index_name not in [index.name for index in existing_indexes]:
                logger.info(f"Creating vector search index '{index_name}'...")
                search_index = SearchIndex.from_json(index_definition)
                scope_index_manager.upsert_index(search_index)
                logger.info(f"Vector search index '{index_name}' created successfully")
            else:
                logger.info(f"Vector search index '{index_name}' already exists")
        except Exception as e:
            raise RuntimeError(f"Error setting up vector search index: {e!s}") from e

    def load_hotel_data(self, scope_name, collection_name, index_name, embeddings):
        """Load hotel data into Couchbase.

        Clears the collection, then tries three sources in order: the data
        loading script, the raw hotel texts, and finally a small built-in sample.

        Raises:
            RuntimeError: if the vector store cannot be created or the last
                fallback fails as well.
        """
        try:
            # Clear existing data first
            self.clear_collection_data(scope_name, collection_name)

            # Setup vector store (used by the fallback paths below)
            vector_store = CouchbaseVectorStore(
                cluster=self.cluster,
                bucket_name=self.bucket_name,
                scope_name=scope_name,
                collection_name=collection_name,
                embedding=embeddings,
                index_name=index_name,
            )

            # Load hotel data using the data loading script
            try:
                load_hotel_data_to_couchbase(
                    cluster=self.cluster,
                    bucket_name=self.bucket_name,
                    scope_name=scope_name,
                    collection_name=collection_name,
                    embeddings=embeddings,
                    index_name=index_name
                )
                logger.info("Hotel data loaded into vector store successfully using data loading script")
            except Exception as e:
                logger.warning(f"Error loading hotel data with script: {e}. Falling back to direct method.")
                # Fallback to the original method
                try:
                    hotel_data = get_hotel_texts()
                    vector_store.add_texts(texts=hotel_data, batch_size=10)
                    logger.info("Hotel data loaded into vector store successfully using fallback method")
                except Exception as e2:
                    logger.warning(f"Fallback method also failed: {e2}")
                    # Create some sample hotel data for demo purposes
                    sample_data = [
                        "Luxury Hotel in New York with spa, pool, and concierge services. Located in Manhattan.",
                        "Budget hotel in San Francisco near airport with free breakfast and WiFi.",
                        "Ocean Breeze Resort in Miami Beach with ocean views, pool, spa, and beachfront location.",
                        "Business hotel in Chicago with conference facilities, fitness center, and restaurant.",
                        "Pet-friendly hotel in Seattle with gym, business center, and parking."
                    ]
                    vector_store.add_texts(texts=sample_data, batch_size=5)
                    logger.info("Sample hotel data loaded for demo purposes")

        except Exception as e:
            raise RuntimeError(f"Error loading hotel data: {e!s}") from e

    def setup_vector_store(
        self, scope_name: str, collection_name: str, index_name: str, embeddings
    ):
        """Setup vector store with hotel data.

        Loads the hotel data and returns a vector store bound to the collection.

        Raises:
            RuntimeError: if ``embeddings`` is falsy or any step fails.
        """
        try:
            # Use the embeddings parameter passed in - no fallbacks
            if not embeddings:
                raise RuntimeError("Embeddings parameter is required - no fallbacks available")

            logger.info("✅ Using provided embeddings for vector store setup")

            # Load hotel data
            self.load_hotel_data(scope_name, collection_name, index_name, embeddings)

            # Create vector store
            vector_store = CouchbaseVectorStore(
                cluster=self.cluster,
                bucket_name=self.bucket_name,
                scope_name=scope_name,
                collection_name=collection_name,
                embedding=embeddings,
                index_name=index_name,
            )

            logger.info("Vector store setup complete")
            return vector_store

        except Exception as e:
            raise RuntimeError(f"Error setting up vector store: {e!s}") from e

    def clear_collection_data(self, scope_name: str, collection_name: str):
        """Clear all documents from the collection to start fresh.

        Best effort: a failure is logged as a warning and not raised.
        """
        try:
            # Delete all documents in the collection
            delete_query = f"DELETE FROM `{self.bucket_name}`.`{scope_name}`.`{collection_name}`"
            # The statement is only sent once the result is consumed, so run it
            # explicitly with execute() (same pattern as the CREATE PRIMARY INDEX
            # call in setup_collection).
            self.cluster.query(delete_query).execute()

            logger.info(f"Cleared existing data from collection {scope_name}.{collection_name}")

        except Exception as e:
            logger.warning(f"Could not clear collection data: {e}. Continuing with existing data...")

    def clear_scope(self, scope_name: str):
        """Clear all collections in scope.

        Best effort: a failure is logged as a warning and not raised.
        """
        try:
            bucket_manager = self.bucket.collections()
            scopes = bucket_manager.get_all_scopes()

            for scope in scopes:
                if scope.name == scope_name:
                    for collection in scope.collections:
                        self.clear_collection_data(scope_name, collection.name)

            logger.info(f"Cleared all collections in scope: {scope_name}")

        except Exception as e:
            logger.warning(f"Could not clear scope: {e}")

print("✅ CouchbaseClient class defined")

✅ Hotel data module imported successfully
✅ CouchbaseClient class defined


## Agent Catalog and Span Setup

Initialize Agent Catalog and create application span for logging.

In [5]:
# Initialize agent catalog
catalog = agentc.Catalog()

# Create application span for logging
# (later cells open child spans from it with application_span.new(...))
application_span = catalog.Span(name="Hotel Support Agent")

print("✅ Agent Catalog initialized")
print(f"Application span created: {application_span.name}")

✅ Agent Catalog initialized
Application span created: Hotel Support Agent


## Hotel Search Agent Setup

Set up the complete hotel support agent: the Couchbase connection, collection, vector index and vector store, the LLM, and the tools and prompt loaded from Agent Catalog.

In [6]:
def setup_hotel_support_agent():
    """Setup the hotel support agent with all required components.

    Runs each setup stage inside its own child span of the module-level
    ``application_span``, in this order: environment check, Capella AI
    connectivity probe, Couchbase connection, collection setup, vector index,
    vector store, LLM, tool loading and agent creation.

    Returns:
        tuple: ``(agent_executor, application_span)``.

    Raises:
        Exception: any failure in a stage is logged with ``logger.exception``
            and re-raised unchanged.
    """
    try:
        with application_span.new("Environment Setup"):
            # Environment already setup above, but verify key variables
            required_vars = ["CB_CONN_STRING", "CB_USERNAME", "CB_PASSWORD", "CB_BUCKET"]
            for var in required_vars:
                if not os.getenv(var):
                    raise ValueError(f"Required environment variable {var} not set")
            print("✅ Environment variables verified")

        with application_span.new("Capella AI Test"):
            # Informational only: the outcome is logged and `dims` is not used below
            if os.getenv('CAPELLA_API_ENDPOINT'):
                connectivity_ok, dims = test_capella_connectivity()
                if not connectivity_ok:
                    logger.warning("❌ Capella AI connectivity test failed. Will use OpenAI fallback.")
            else:
                logger.info("ℹ️ Capella API not configured - will use OpenAI models")

        with application_span.new("Couchbase Connection"):
            couchbase_client = CouchbaseClient(
                conn_string=os.getenv("CB_CONN_STRING"),
                username=os.getenv("CB_USERNAME"),
                password=os.getenv("CB_PASSWORD"),
                bucket_name=os.getenv("CB_BUCKET")
            )
            
            couchbase_client.connect()

        with application_span.new("Couchbase Collection Setup"):
            couchbase_client.setup_collection(
                os.getenv("CB_SCOPE"),
                os.getenv("CB_COLLECTION")
            )

        with application_span.new("Vector Index Setup"):
            # The index definition is read from a file relative to the working directory
            try:
                with open('agentcatalog_index.json', 'r') as file:
                    index_definition = json.load(file)
                logger.info("Loaded vector search index definition from agentcatalog_index.json")
            except Exception as e:
                raise ValueError(f"Error loading index definition: {e!s}")
            
            couchbase_client.setup_vector_search_index(index_definition, os.getenv("CB_SCOPE"))

        with application_span.new("Vector Store Setup"):
            # Setup embeddings using CB_USERNAME/CB_PASSWORD like flight search agent
            # Unlike the LLM below, embeddings have no OpenAI fallback: missing
            # configuration or any other error here ends in a RuntimeError.
            try:
                if (
                    os.getenv("CB_USERNAME")
                    and os.getenv("CB_PASSWORD")
                    and os.getenv("CAPELLA_API_ENDPOINT")
                    and os.getenv("CAPELLA_API_EMBEDDING_MODEL")
                ):
                    # Create API key for Capella AI
                    api_key = base64.b64encode(
                        f"{os.getenv('CB_USERNAME')}:{os.getenv('CB_PASSWORD')}".encode()
                    ).decode()

                    # Use OpenAI embeddings client with Capella endpoint
                    embeddings = OpenAIEmbeddings(
                        model=os.getenv("CAPELLA_API_EMBEDDING_MODEL"),
                        api_key=api_key,
                        base_url=os.getenv("CAPELLA_API_ENDPOINT"),
                    )
                    logger.info("✅ Using Capella AI for embeddings")
                else:
                    raise ValueError("Capella AI credentials not available")
            except Exception as e:
                logger.error(f"❌ Capella AI embeddings failed: {e}")
                raise RuntimeError("Capella AI embeddings required for this configuration")
            
            # The returned vector store is not kept; this call is made for its
            # side effect of loading the hotel data.
            couchbase_client.setup_vector_store(
                os.getenv("CB_SCOPE"),
                os.getenv("CB_COLLECTION"),
                os.getenv("CB_INDEX"),
                embeddings
            )

        with application_span.new("LLM Setup"):
            # Setup LLM with Agent Catalog callback - try Capella AI first, fallback to OpenAI
            try:
                # Create API key for Capella AI using same pattern as embeddings
                api_key = base64.b64encode(
                    f"{os.getenv('CB_USERNAME')}:{os.getenv('CB_PASSWORD')}".encode()
                ).decode()

                llm = ChatOpenAI(
                    api_key=api_key,
                    base_url=os.getenv('CAPELLA_API_ENDPOINT'),
                    model=os.getenv('CAPELLA_API_LLM_MODEL'),
                    temperature=0,
                    callbacks=[agentc_langchain.chat.Callback(span=application_span)]
                )
                # Test the LLM works
                # (any exception raised here triggers the OpenAI fallback below)
                llm.invoke("Hello")
                logger.info("✅ Using Capella AI LLM")
            except Exception as e:
                logger.warning(f"⚠️ Capella AI LLM failed: {e}")
                logger.info("🔄 Falling back to OpenAI LLM...")
                if not os.getenv("OPENAI_API_KEY"):
                    os.environ["OPENAI_API_KEY"] = getpass.getpass("Please provide your OPENAI_API_KEY: ")
                llm = ChatOpenAI(
                    api_key=os.getenv("OPENAI_API_KEY"),
                    model="gpt-4o",
                    temperature=0,
                    callbacks=[agentc_langchain.chat.Callback(span=application_span)]
                )
                logger.info("✅ Using OpenAI LLM as fallback")

        with application_span.new("Tool Loading"):
            # Load tools from Agent Catalog - they are now properly decorated
            tool_search = catalog.find("tool", name="search_vector_database")
            tool_details = catalog.find("tool", name="get_hotel_details")
            
            if not tool_search:
                raise ValueError(
                    "Could not find search_vector_database tool. Make sure it's indexed with 'agentc index tools/'"
                )
            if not tool_details:
                raise ValueError(
                    "Could not find get_hotel_details tool. Make sure it's indexed with 'agentc index tools/'"
                )
            
            # Wrap each catalog tool as a LangChain Tool, reusing its catalog name and description
            tools = [
                Tool(
                    name=tool_search.meta.name,
                    description=tool_search.meta.description,
                    func=tool_search.func
                ),
                Tool(
                    name=tool_details.meta.name, 
                    description=tool_details.meta.description,
                    func=tool_details.func
                )
            ]

        with application_span.new("Agent Creation"):
            # Get prompt from Agent Catalog
            hotel_prompt = catalog.find("prompt", name="hotel_search_assistant")
            if not hotel_prompt:
                raise ValueError(
                    "Could not find hotel_search_assistant prompt in catalog. Make sure it's indexed with 'agentc index prompts/'"
                )
            
            # Create a custom prompt using the catalog prompt content
            prompt_content = hotel_prompt.content.strip()
            
            # NOTE(review): assumes the catalog prompt text contains the {tools},
            # {tool_names}, {input} and {agent_scratchpad} placeholders - confirm
            # against the prompt definition.
            custom_prompt = PromptTemplate(
                template=prompt_content,
                input_variables=["input", "agent_scratchpad"],
                partial_variables={
                    "tools": "\n".join([f"{tool.name}: {tool.description}" for tool in tools]),
                    "tool_names": ", ".join([tool.name for tool in tools])
                }
            )
            
            agent = create_react_agent(llm, tools, custom_prompt)
            agent_executor = AgentExecutor(
                agent=agent, 
                tools=tools, 
                verbose=True, 
                handle_parsing_errors=True,
                max_iterations=5,  # Increased from 3 to 5 for better performance
                max_execution_time=60,  # Increased timeout
                return_intermediate_steps=True
            )

        return agent_executor, application_span

    except Exception as e:
        # Log with traceback, then let the caller see the original exception
        logger.exception(f"Error setting up hotel support agent: {e}")
        raise

# Setup the agent
# (application_span is rebound to the value the function returns, which is the
# same module-level span it used)
agent_executor, application_span = setup_hotel_support_agent()
print("✅ Hotel search agent setup complete")

2025-07-15 15:17:48,658 - __main__ - INFO - Testing Capella AI connectivity...


✅ Environment variables verified


2025-07-15 15:17:49,728 - __main__ - INFO - ✅ Capella AI embedding test successful - dimensions: 1024
2025-07-15 15:17:51,812 - __main__ - INFO - Successfully connected to Couchbase
2025-07-15 15:17:53,092 - __main__ - INFO - Bucket 'vector-search-testing' exists
2025-07-15 15:17:59,090 - __main__ - INFO - Primary index created successfully
2025-07-15 15:17:59,091 - __main__ - INFO - Collection setup complete
2025-07-15 15:17:59,100 - __main__ - INFO - Loaded vector search index definition from agentcatalog_index.json
2025-07-15 15:17:59,991 - __main__ - INFO - Vector search index 'hotel_data_index' already exists
2025-07-15 15:18:00,334 - __main__ - INFO - ✅ Using Capella AI for embeddings
2025-07-15 15:18:00,335 - __main__ - INFO - ✅ Using provided embeddings for vector store setup
2025-07-15 15:18:00,335 - __main__ - INFO - Cleared existing data from collection agentc_data.hotel_data


Loading hotel data to vector-search-testing.agentc_data.hotel_data
Cluster: <couchbase.cluster.Cluster object at 0x105d08650>
Bucket name: vector-search-testing
Scope name: agentc_data
Collection name: hotel_data
Embeddings: client=<openai.resources.embeddings.Embeddings object at 0x105cecbc0> async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x128ef2c00> model='Snowflake/snowflake-arctic-embed-l-v2.0' dimensions=None deployment='text-embedding-ada-002' openai_api_version=None openai_api_base='https://6u2le4tknje7yzkg.ai.cloud.couchbase.com/v1' openai_api_type=None openai_proxy=None embedding_ctx_length=8191 openai_api_key=SecretStr('**********') openai_organization=None allowed_special=None disallowed_special=None chunk_size=1000 max_retries=2 request_timeout=None headers=None tiktoken_enabled=True tiktoken_model_name=None show_progress_bar=False model_kwargs={} skip_empty=False default_headers=None default_query=None retry_min_seconds=4 retry_max_seconds=20 http_cli

2025-07-15 15:18:09,496 - __main__ - INFO - Hotel data loaded into vector store successfully using data loading script


✓ Successfully added 11 documents
✅ Successfully loaded all 11 hotels into vector store


2025-07-15 15:18:12,786 - __main__ - INFO - Vector store setup complete
2025-07-15 15:18:15,232 - __main__ - INFO - ✅ Using Capella AI LLM


✅ Hotel search agent setup complete


## Automated Query Testing

Test the hotel search agent with comprehensive automated testing scenarios covering different types of hotel searches.

In [7]:
def test_hotel_queries(queries=None):
    """Run the hotel search agent against a list of queries and summarise the run.

    Each query is executed through the notebook-level ``agent_executor`` inside
    its own child span of ``application_span``; the query, the response and the
    wall-clock duration are attached to that span.

    Args:
        queries: Optional iterable of query strings. When omitted, the six
            built-in demo queries are used.

    Returns:
        list[dict]: One entry per query, in order, with the keys ``query``,
        ``response`` and ``execution_time`` (seconds). A failed query is
        recorded with ``response`` set to ``"Error: <message>"`` and
        ``execution_time`` set to 0.
    """
    print("\n🏨 Hotel Search Agent - Automated Query Testing")
    print("=" * 55)

    # Comprehensive test queries covering different scenarios
    if queries is None:
        test_queries = [
            "Find me a luxury hotel in New York with a spa and pool",
            "I need a budget hotel in San Francisco near the airport",
            "Show me business hotels in Chicago with conference facilities",
            "What hotels are available in Miami Beach with ocean views?",
            "Find pet-friendly hotels in Seattle with fitness centers",
            "I need a hotel in Los Angeles with free breakfast and parking"
        ]
    else:
        test_queries = list(queries)

    print(f"\n📋 Testing {len(test_queries)} hotel search scenarios:")
    for i, query in enumerate(test_queries, 1):
        print(f"   {i}. {query}")

    print("\n" + "=" * 55)

    results = []
    # Durations of successful queries only; failures are stored with a
    # placeholder time of 0 and must not be averaged in.
    successful_times = []

    for i, query in enumerate(test_queries, 1):
        print(f"\n🔍 Test {i}: {query}")
        print("-" * 50)

        try:
            # Create span for this query
            with application_span.new(f"Test Query {i}") as query_span:
                query_span["query"] = query

                # Execute the query
                start_time = time.time()
                result = agent_executor.invoke({"input": query})
                execution_time = time.time() - start_time

                response = result['output']
                print(f"🤖 Response: {response}")
                print(f"⏱️ Execution time: {execution_time:.2f} seconds")

                query_span["response"] = response
                query_span["execution_time"] = execution_time

                results.append({
                    'query': query,
                    'response': response,
                    'execution_time': execution_time
                })
                successful_times.append(execution_time)

        except Exception as e:
            # Keep going: one failing query should not abort the whole run.
            print(f"❌ Error processing query: {e}")
            results.append({
                'query': query,
                'response': f"Error: {e}",
                'execution_time': 0
            })

    # Print summary
    print("\n" + "=" * 55)
    print("📊 TESTING SUMMARY")
    print("=" * 55)
    print(f"Total queries tested: {len(test_queries)}")
    print(f"Successful: {len(successful_times)} | Failed: {len(results) - len(successful_times)}")

    if successful_times:
        avg_execution_time = sum(successful_times) / len(successful_times)
        print(f"Average execution time: {avg_execution_time:.2f} seconds")

    print("\n✅ Automated testing completed!")
    return results

# Run the automated test and keep the per-query result dicts in `test_results`
test_results = test_hotel_queries()


🏨 Hotel Search Agent - Automated Query Testing

📋 Testing 6 hotel search scenarios:
   1. Find me a luxury hotel in New York with a spa and pool
   2. I need a budget hotel in San Francisco near the airport
   3. Show me business hotels in Chicago with conference facilities
   4. What hotels are available in Miami Beach with ocean views?
   5. Find pet-friendly hotels in Seattle with fitness centers
   6. I need a hotel in Los Angeles with free breakfast and parking


🔍 Test 1: Find me a luxury hotel in New York with a spa and pool
--------------------------------------------------


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mOkay, so the user is asking for a luxury hotel in New York with both a spa and a pool. Let me break this down. First, they specified the location as New York, so I need to prioritize that. They also mentioned two key amenities: spa and pool. Since they're looking for luxury, I should focus on high-end properties.

I remember the tools available: s

## Arize Phoenix Evaluation Demo

This section demonstrates how to evaluate the hotel support agent using the Arize Phoenix observability platform. The evaluation includes:

- **Relevance Scoring**: Using Phoenix evaluators to score how relevant responses are to queries
- **QA Scoring**: Using Phoenix evaluators to score answer quality  
- **Hallucination Detection**: Detecting fabricated or incorrect information
- **Toxicity Detection**: Ensuring responses are appropriate and safe
- **Hotel-specific Evaluations**: Custom evaluations for hotel information, recommendations, and amenities
- **Phoenix UI**: Real-time observability dashboard at `http://localhost:6006/`

The evaluation system provides comprehensive metrics for hotel search agent performance monitoring.

In [8]:
# Import evaluation dependencies
import pandas as pd
from typing import Dict, List
from uuid import uuid4

# Phoenix, OpenInference and OpenTelemetry are optional. If any of them cannot
# be imported, ARIZE_AVAILABLE is set to False so later cells can skip the
# LLM-based evaluations instead of failing.
try:
    import phoenix as px
    from phoenix.otel import register
    from openinference.instrumentation.langchain import LangChainInstrumentor
    from openinference.instrumentation.openai import OpenAIInstrumentor
    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor
    from phoenix.evals import (
        QA_PROMPT_RAILS_MAP,
        QA_PROMPT_TEMPLATE,
        RAG_RELEVANCY_PROMPT_RAILS_MAP,
        RAG_RELEVANCY_PROMPT_TEMPLATE,
        HALLUCINATION_PROMPT_RAILS_MAP,
        HALLUCINATION_PROMPT_TEMPLATE,
        TOXICITY_PROMPT_RAILS_MAP,
        TOXICITY_PROMPT_TEMPLATE,
        HallucinationEvaluator,
        ToxicityEvaluator,
        RelevanceEvaluator,
        QAEvaluator,
        OpenAIModel,
        llm_classify,
        run_evals,
    )
    ARIZE_AVAILABLE = True
    print("✅ Phoenix/Arize dependencies loaded successfully")
except ImportError as e:
    print(f"⚠️ Phoenix/Arize not available: {e}")
    # The hint lists every package imported above, including the OpenAI
    # instrumentor, which ships separately from the LangChain one.
    print(
        "📦 Install with: pip install arize-phoenix "
        "openinference-instrumentation-langchain "
        "openinference-instrumentation-openai"
    )
    ARIZE_AVAILABLE = False

  from .autonotebook import tqdm as notebook_tqdm
2025-07-15 15:21:30,798 - phoenix.config - INFO - 📋 Ensuring phoenix working directory: /Users/kaustavghosh/.phoenix
2025-07-15 15:21:30,813 - phoenix.inferences.inferences - INFO - Dataset: phoenix_inferences_81bbaf10-e637-4a17-99e6-d7dc51cd9279 initialized


✅ Phoenix/Arize dependencies loaded successfully


In [9]:
class ArizeHotelSupportEvaluator:
    """
    Comprehensive evaluation system for hotel support agents using Arize AI.

    This class provides:
    - Hotel search performance evaluation with multiple metrics
    - Tool effectiveness monitoring (search, details, booking)
    - Response quality assessment and search accuracy tracking
    - Comparative analysis of different hotel search strategies
    """

    def __init__(self, catalog: agentc.Catalog, span: agentc.Span):
        """Initialize the Arize evaluator with Agent Catalog integration."""
        self.catalog = catalog
        self.span = span
        self.phoenix_session = None
        self.tracer_provider = None

        # Initialize Arize observability if available
        if ARIZE_AVAILABLE:
            self._setup_arize_observability()

            # Initialize evaluation models - don't hardcode model
            self.evaluator_llm = OpenAIModel(model="gpt-4o")

            # Define evaluation rails
            self.relevance_rails = list(RAG_RELEVANCY_PROMPT_RAILS_MAP.values())
            self.qa_rails = list(QA_PROMPT_RAILS_MAP.values())
            self.hallucination_rails = list(HALLUCINATION_PROMPT_RAILS_MAP.values())
            self.toxicity_rails = list(TOXICITY_PROMPT_RAILS_MAP.values())
        else:
            print("⚠️ Arize not available - running basic evaluation only")

    def _setup_arize_observability(self):
        """Configure Arize observability with OpenTelemetry instrumentation.

        Steps (failures are printed, never raised):
        1. Force-kill other processes that hold the Phoenix/OTLP ports.
        2. Launch Phoenix, trying ports 6006-6009 in turn through the
           ``PHOENIX_PORT`` / ``PHOENIX_GRPC_PORT`` environment variables.
        3. Register the Phoenix OTEL tracer provider, or fall back to a plain
           local ``TracerProvider`` when that is not possible.
        4. Instrument LangChain and OpenAI with the chosen tracer provider.

        Side effects: sets ``self.phoenix_session`` and ``self.tracer_provider``
        and leaves the two ``PHOENIX_*`` environment variables set.
        """
        try:
            print("🔧 Setting up Arize observability...")

            # Free the common Phoenix ports before launching.
            for port in [4317, 6006, 6007, 6008]:
                if self._kill_port_listeners(port):
                    print(f"🔄 Killed existing process on port {port}")

            # Try to start Phoenix with environment variables
            ports_to_try = [6006, 6007, 6008, 6009]
            self.phoenix_session = None

            for port in ports_to_try:
                try:
                    # Set environment variables for Phoenix
                    os.environ['PHOENIX_PORT'] = str(port)
                    # Shift the gRPC port by the same offset as the UI port.
                    os.environ['PHOENIX_GRPC_PORT'] = str(4317 + (port - 6006))

                    session = px.launch_app()
                    if session is None:
                        # No exception but no session either: treat as a failed
                        # attempt and move on to the next port.
                        print(f"⚠️ Phoenix failed on port {port}: launch_app returned no session")
                        continue

                    self.phoenix_session = session
                    print(f"🌐 Phoenix UI: http://localhost:{port}")
                    break
                except Exception as e:
                    print(f"⚠️ Phoenix failed on port {port}: {e}")

            if not self.phoenix_session:
                print("❌ Could not start Phoenix on any port, continuing without Phoenix UI")

            # Register Phoenix OTEL and get tracer provider
            if self.phoenix_session:
                try:
                    self.tracer_provider = register(
                        project_name="hotel-support-agent-evaluation",
                        endpoint=f"http://localhost:{self.phoenix_session.port}/v1/traces",
                    )
                    print("✅ Phoenix OTEL registered successfully")
                except Exception as e:
                    print(f"⚠️ Phoenix OTEL registration failed: {e}")
                    # Fallback to local tracer
                    self.tracer_provider = TracerProvider()
                    trace.set_tracer_provider(self.tracer_provider)
            else:
                # Fallback to local tracer
                self.tracer_provider = TracerProvider()
                trace.set_tracer_provider(self.tracer_provider)
                print("✅ Local tracing configured successfully")

            # Instrument LangChain and OpenAI
            instrumentors = [
                ("LangChain", LangChainInstrumentor),
                ("OpenAI", OpenAIInstrumentor),
            ]

            for name, instrumentor_class in instrumentors:
                try:
                    instrumentor = instrumentor_class()
                    if not instrumentor.is_instrumented_by_opentelemetry:
                        instrumentor.instrument(tracer_provider=self.tracer_provider)
                        print(f"✅ {name} instrumented successfully")
                    else:
                        print(f"ℹ️ {name} already instrumented, skipping")
                except Exception as e:
                    print(f"⚠️ {name} instrumentation failed: {e}")

            print("✅ Arize observability configured successfully")

        except Exception as e:
            print(f"⚠️ Error setting up Arize observability: {e}")

    def _kill_port_listeners(self, port: int) -> bool:
        """Force-kill other processes that have ``port`` open.

        Returns:
            bool: True when at least one other process was signalled; False
            when ``lsof`` reports nothing, only this process is listed, or
            ``lsof``/``kill`` cannot be executed.
        """
        import subprocess

        try:
            lookup = subprocess.run(
                ['lsof', '-ti', f':{port}'], capture_output=True, text=True
            )
            # Never signal ourselves. NOTE(review): launch_app appears to serve
            # in-process by default, so on a re-run this kernel's own PID can be
            # listed for the port and `kill -9` would take the kernel down.
            own_pid = str(os.getpid())
            other_pids = [pid for pid in lookup.stdout.split() if pid != own_pid]
            if lookup.returncode != 0 or not other_pids:
                return False

            # WARNING: SIGKILL hits whatever owns the port, Phoenix or not.
            subprocess.run(['kill', '-9', *other_pids], capture_output=True)
            return True
        except OSError:
            # lsof/kill are missing or not executable on this platform.
            return False

    def run_hotel_search_evaluation(self, test_inputs: List[str]) -> pd.DataFrame:
        """
        Run hotel search evaluation on a set of test inputs.

        Args:
            test_inputs: List of hotel search queries to evaluate

        Returns:
            DataFrame with evaluation results
        """
        print(f"🚀 Running evaluation on {len(test_inputs)} queries...")

        # Create agent (simplified for notebook)
        agent = agent_executor  # Use the already created agent

        results = []

        for i, query in enumerate(test_inputs, 1):
            print(f"  📝 Query {i}/{len(test_inputs)}: {query}")

            try:
                print(f"     🔄 Agent processing query...")
                response_dict = agent.invoke({"input": query})
                response = (
                    response_dict.get("output", str(response_dict))
                    if response_dict
                    else "No response"
                )
                print(f"     ✅ Response received")

                # Analyze response
                has_hotel_info = self._check_hotel_info(response)
                has_recommendations = self._check_recommendations(response)
                has_amenities = self._check_amenities(response)
                appropriate_response = self._check_appropriate_response(query, response)

                # Create response preview
                response_preview = (
                    (response[:150] + "...") if len(response) > 150 else response
                )
                print(f"     💬 Response preview: {response_preview}")

                results.append(
                    {
                        "example_id": f"hotel_query_{i}",
                        "query": query,
                        "output": response,
                        "has_hotel_info": has_hotel_info,
                        "has_recommendations": has_recommendations,
                        "has_amenities": has_amenities,
                        "appropriate_response": appropriate_response,
                        "response_length": len(response),
                    }
                )

            except Exception as e:
                print(f"     ❌ Error processing query: {e}")
                results.append(
                    {
                        "example_id": f"hotel_query_{i}",
                        "query": query,
                        "output": f"Error: {str(e)}",
                        "has_hotel_info": False,
                        "has_recommendations": False,
                        "has_amenities": False,
                        "appropriate_response": False,
                        "response_length": 0,
                    }
                )

        return pd.DataFrame(results)

    def _check_hotel_info(self, response_text: str) -> bool:
        """Check if response contains hotel information."""
        hotel_indicators = [
            "hotel",
            "room",
            "accommodation",
            "stay",
            "booking",
            "reservation",
            "check-in",
            "check-out",
        ]
        return any(indicator in response_text.lower() for indicator in hotel_indicators)

    def _check_recommendations(self, response_text: str) -> bool:
        """Check if response contains hotel recommendations."""
        recommendation_indicators = [
            "recommend",
            "suggest",
            "option",
            "choice",
            "available",
            "found",
            "located",
        ]
        return any(
            indicator in response_text.lower()
            for indicator in recommendation_indicators
        )

    def _check_amenities(self, response_text: str) -> bool:
        """Check if response mentions amenities."""
        amenity_indicators = [
            "amenities",
            "pool",
            "gym",
            "wifi",
            "breakfast",
            "parking",
            "spa",
            "restaurant",
            "bar",
        ]
        return any(
            indicator in response_text.lower() for indicator in amenity_indicators
        )

    def _check_appropriate_response(self, query: str, response: str) -> bool:
        """Check if response is appropriate for the query."""
        # Check for hallucination indicators
        hallucination_indicators = [
            "mars",
            "pluto",
            "atlantis",
            "fictional",
            "not possible",
            "not available",
        ]
        has_hallucination = any(
            indicator in response.lower() for indicator in hallucination_indicators
        )

        # Check for reasonable response length
        reasonable_length = 50 < len(response) < 2000

        # Check for hotel-related content
        has_hotel_content = self._check_hotel_info(response)

        return not has_hallucination and reasonable_length and has_hotel_content

    def run_arize_evaluations(self, results_df: pd.DataFrame) -> pd.DataFrame:
        """
        Run comprehensive Arize-based LLM evaluations on the results using Phoenix evaluators.

        Args:
            results_df: DataFrame with evaluation results

        Returns:
            DataFrame with additional Arize evaluation columns
        """
        if not ARIZE_AVAILABLE:
            print("⚠️ Arize not available - skipping LLM evaluations")
            return results_df

        print(f"🧠 Running Comprehensive Phoenix Evaluations on {len(results_df)} responses...")
        print("   📋 Evaluation criteria:")
        print("      🔍 Relevance: Does the response address the hotel search query?")
        print("      🎯 Correctness: Is the hotel information accurate and helpful?")
        print("      🚨 Hallucination: Does the response contain fabricated information?")
        print("      ☠️  Toxicity: Is the response harmful or inappropriate?")

        try:
            # Prepare data for evaluation with improved reference texts
            evaluation_data = []
            for _, row in results_df.iterrows():
                query = row["query"]
                output = row["output"]
                
                # Create more specific reference text based on query content
                if "luxury" in query.lower():
                    reference = "A relevant response listing luxury hotels with specific amenities like concierge, spa, restaurant, fitness center, and pricing details"
                elif "fitness center" in query.lower() or "gym" in query.lower():
                    reference = "A relevant response about hotels with fitness centers, including 24/7 fitness facilities and other health amenities"
                elif "miami" in query.lower() and "beach" in query.lower():
                    reference = "A relevant response about Miami beach resorts with ocean views, pools, spa amenities, and beachfront locations"
                elif "business" in query.lower():
                    reference = "A relevant response about business hotels with conference facilities, business centers, and executive amenities"
                else:
                    reference = f"A helpful and accurate response about {query.lower()} with specific hotel details, amenities, pricing, and locations"
                
                evaluation_data.append(
                    {
                        # Standard columns for all evaluations
                        "input": query,
                        "output": output,
                        "reference": reference,
                        
                        # Specific columns for different evaluations
                        "query": query,  # For hallucination evaluation
                        "response": output,  # For hallucination evaluation
                        "text": output,  # For toxicity evaluation
                    }
                )

            eval_df = pd.DataFrame(evaluation_data)

            # Initialize evaluators with the model
            evaluators = {
                'relevance': RelevanceEvaluator(self.evaluator_llm),
                'qa': QAEvaluator(self.evaluator_llm),
                'hallucination': HallucinationEvaluator(self.evaluator_llm), 
                'toxicity': ToxicityEvaluator(self.evaluator_llm),
            }

            # Run comprehensive evaluations using Phoenix evaluators
            print(f"\n   🧠 Running advanced Phoenix evaluations...")

            try:
                # Run all evaluations using run_evals for comprehensive analysis
                evaluation_results = run_evals(
                    dataframe=eval_df,
                    evaluators=list(evaluators.values()),
                    provide_explanation=True,
                )

                # Add evaluation results to our DataFrame
                if evaluation_results is not None:
                    # Handle both DataFrame and list results from run_evals
                    if isinstance(evaluation_results, pd.DataFrame) and not evaluation_results.empty:
                        # Extract evaluation results using flexible column matching
                        for col in evaluation_results.columns:
                            if col.endswith('_eval'):
                                # Extract evaluator name from column (e.g., 'relevance_eval' -> 'relevance')
                                eval_name = col.replace('_eval', '')
                                results_df[f"arize_{eval_name}"] = evaluation_results[col].tolist()
                                print(f"      ✅ Extracted {eval_name} evaluations")
                            elif col.endswith('_explanation'):
                                # Extract evaluator name from column (e.g., 'relevance_explanation' -> 'relevance')
                                eval_name = col.replace('_explanation', '')
                                results_df[f"arize_{eval_name}_explanation"] = evaluation_results[col].tolist()
                                print(f"      ✅ Extracted {eval_name} explanations")
                            elif col.endswith('_score'):
                                # Extract evaluator name from column (e.g., 'relevance_score' -> 'relevance')
                                eval_name = col.replace('_score', '')
                                results_df[f"arize_{eval_name}_score"] = evaluation_results[col].tolist()
                                print(f"      ✅ Extracted {eval_name} scores")

                        print(f"   ✅ Comprehensive evaluations completed successfully")
                    else:
                        print(f"   ⚠️ Phoenix evaluators returned unexpected format: {type(evaluation_results)}")
                        # Fall back to basic scoring
                        self._add_basic_evaluations(results_df)
                else:
                    print(f"   ⚠️ Phoenix evaluators returned None")
                    # Fall back to basic scoring
                    self._add_basic_evaluations(results_df)

            except Exception as e:
                print(f"   ⚠️ run_evals failed: {e}, falling back to basic evaluation")
                self._add_basic_evaluations(results_df)

            print(f"   ✅ All Phoenix evaluations completed")

        except Exception as e:
            print(f"   ❌ Error running Phoenix evaluations: {e}")
            # Add default values if evaluation fails
            self._add_basic_evaluations(results_df)

        return results_df

    def _add_basic_evaluations(self, results_df: pd.DataFrame):
        """Add basic evaluation scores as fallback."""
        for eval_type in ['relevance', 'qa', 'hallucination', 'toxicity']:
            results_df[f"arize_{eval_type}"] = "basic_evaluation"
            results_df[f"arize_{eval_type}_explanation"] = "Basic evaluation - full Phoenix evaluation not available"

    def cleanup(self):
        """Clean up resources and close Phoenix session."""
        try:
            # Clean up environment variables
            for var in ['PHOENIX_PORT', 'PHOENIX_GRPC_PORT']:
                if var in os.environ:
                    del os.environ[var]
                    
            print("🔒 Phoenix cleanup completed")
        except Exception as e:
            print(f"⚠️ Error during Phoenix cleanup: {e}")

# Confirmation message so the notebook output shows this cell executed
print("✅ ArizeHotelSupportEvaluator class defined")

✅ ArizeHotelSupportEvaluator class defined


In [10]:
# Setup Phoenix and run hotel search evaluation
def run_hotel_evaluation_demo(test_queries=None):
    """Run comprehensive hotel search evaluation with Phoenix.

    Builds an ``ArizeHotelSupportEvaluator`` from the notebook-level ``catalog``
    and ``application_span``, runs the keyword-based evaluation, adds the
    Phoenix LLM evaluations when ``ARIZE_AVAILABLE`` is true, and prints a
    summary.

    Args:
        test_queries: Optional list of query strings. When omitted, the six
            built-in demo queries are used.

    Returns:
        The results DataFrame, or ``None`` when the evaluation raised.
    """
    print("🚀 Starting Hotel Support Agent Evaluation with Phoenix")
    print("=" * 60)

    # Test queries for hotel search
    if test_queries is None:
        test_queries = [
            "Find me a hotel in New York City with a pool",
            "I need a budget hotel in San Francisco near the airport",
            "Show me luxury hotels in Miami Beach with ocean views",
            "Find hotels in Los Angeles with free breakfast and parking",
            "What are the best hotels in Chicago for business travelers?",
            "I need a pet-friendly hotel in Seattle with a gym"
        ]

    # Stays None if construction fails, so the cleanup step knows to skip it.
    evaluator = None

    try:
        # Initialize evaluator
        evaluator = ArizeHotelSupportEvaluator(catalog, application_span)

        # Run evaluation
        results_df = evaluator.run_hotel_search_evaluation(test_queries)

        # Run Phoenix AI evaluations if available
        if ARIZE_AVAILABLE:
            results_df = evaluator.run_arize_evaluations(results_df)

        # Print summary
        print("\n📊 Hotel Search Agent Evaluation Results:")
        print("=" * 50)

        total_queries = len(results_df)
        if total_queries == 0:
            # Avoid dividing by zero (and indexing missing columns) below.
            print("⚠️ No queries were evaluated")
            return results_df

        hotel_info_count = results_df['has_hotel_info'].sum()
        recommendations_count = results_df['has_recommendations'].sum()
        amenities_count = results_df['has_amenities'].sum()
        appropriate_count = results_df['appropriate_response'].sum()

        print(f"📊 Total queries processed: {total_queries}")
        print(f"🎯 Queries with hotel info: {hotel_info_count} ({hotel_info_count/total_queries*100:.1f}%)")
        print(f"📈 Queries with recommendations: {recommendations_count} ({recommendations_count/total_queries*100:.1f}%)")
        print(f"🛏️ Queries with amenities: {amenities_count} ({amenities_count/total_queries*100:.1f}%)")
        print(f"✅ Appropriate responses: {appropriate_count} ({appropriate_count/total_queries*100:.1f}%)")

        # Arize evaluation results
        if ARIZE_AVAILABLE:
            arize_results = {}

            # Check which Arize evaluation columns exist and process them
            for eval_type in ['relevance', 'qa', 'hallucination', 'toxicity']:
                col_name = f"arize_{eval_type}"
                if col_name in results_df.columns:
                    try:
                        scores = results_df[col_name].value_counts()
                        arize_results[eval_type] = {k: int(v) for k, v in scores.items()}
                    except Exception as e:
                        print(f"   ⚠️ Error processing {eval_type} results: {e}")

            if arize_results:
                print("\n🔍 Arize Evaluation Results:")
                for eval_type, scores in arize_results.items():
                    print(f"   📋 {eval_type.title()}: {scores}")
            else:
                print("\n⚠️ No Arize evaluation results available (evaluations may have failed)")

        phoenix_session = getattr(evaluator, 'phoenix_session', None)
        if ARIZE_AVAILABLE and phoenix_session:
            print(f"\n🌟 Phoenix UI is running at: http://localhost:{phoenix_session.port}/")
            print("💡 Visit the Phoenix UI to see detailed traces and evaluations")
            print("📊 The UI shows conversation flows, tool calls, and evaluation scores")

        print("\n💡 Note: Some errors are expected without full Couchbase setup")

        return results_df

    except Exception as e:
        print(f"❌ Evaluation failed: {e}")
        return None
    finally:
        # Cleanup is best-effort, but a failure is reported rather than hidden.
        if evaluator is not None:
            try:
                evaluator.cleanup()
            except Exception as cleanup_error:
                print(f"⚠️ Cleanup failed: {cleanup_error}")

# Run the evaluation demo; `evaluation_results` is a DataFrame, or None if the run failed
evaluation_results = run_hotel_evaluation_demo()

🚀 Starting Hotel Support Agent Evaluation with Phoenix
🔧 Setting up Arize observability...


2025-07-15 15:21:33,826 - phoenix.config - INFO - 📋 Ensuring phoenix working directory: /Users/kaustavghosh/.phoenix
2025-07-15 15:21:33,889 - alembic.runtime.migration - INFO - Context impl SQLiteImpl.
2025-07-15 15:21:33,889 - alembic.runtime.migration - INFO - Will assume transactional DDL.
2025-07-15 15:21:33,906 - alembic.runtime.migration - INFO - Running upgrade  -> cf03bd6bae1d, init
2025-07-15 15:21:33,927 - alembic.runtime.migration - INFO - Running upgrade cf03bd6bae1d -> 10460e46d750, datasets
2025-07-15 15:21:33,932 - alembic.runtime.migration - INFO - Running upgrade 10460e46d750 -> 3be8647b87d8, add token columns to spans table
2025-07-15 15:21:33,934 - alembic.runtime.migration - INFO - Running upgrade 3be8647b87d8 -> cd164e83824f, users and tokens
2025-07-15 15:21:33,939 - alembic.runtime.migration - INFO - Running upgrade cd164e83824f -> 4ded9e43755f, create project_session table
2025-07-15 15:21:33,948 - alembic.runtime.migration - INFO - Running upgrade 4ded9e43755f

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://arize.com/docs/phoenix
🌐 Phoenix UI: http://localhost:6006
🔭 OpenTelemetry Tracing Details 🔭
|  Phoenix Project: hotel-support-agent-evaluation
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: http://localhost:6006/v1/traces
|  Transport: HTTP + protobuf
|  Transport Headers: {}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.

✅ Phoenix OTEL registered successfully
✅ LangChain instrumented successfully
✅ OpenAI instrumented successfully
✅ Arize observability configured successfully
🚀 Running evaluation on 6 queries...
  📝 Query 1/6: Find me a hotel in New York City with a pool
     🔄 Agent processing query...


[1m> Entering new 

I0000 00:00:1752573328.769017 3958853 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


[32;1m[1;3mOkay, so the user is looking for a pet-friendly hotel in Seattle with a gym. I need to make sure I follow the rules and use only one tool. Since it's a general search, I'll use the search_vector_database tool. The query should include the location first, then the amenities. So, the search terms will be "Seattle pet-friendly hotel gym". This should prioritize exact location matches and include all the necessary amenities. Once I get the results, I can provide a helpful response with the details.

Final Answer:  
Here are some pet-friendly hotels in Seattle with a gym:  
1. Hotel 1: Mountain Lodge in Aspen, Colorado. Rustic mountain retreat perfect for skiing and outdoor adventures. Price range: $150-$300 per night. Rating: 4.5/5. Amenities: Ski-in/Ski-out Access, Stone Fireplace Lounge, Mountain View Restaurant, Outdoor Hot Tub, Hiking Trail Access, Free WiFi.  
2. Hotel 2: Boutique Inn in San Francisco, California. Charming boutique hotel with unique artistic decor and per

run_evals |██████████| 24/24 (100.0%) | ⏳ 01:35<00:00 |  3.96s/it

   ⚠️ Phoenix evaluators returned unexpected format: <class 'list'>
   ✅ All Phoenix evaluations completed

📊 Hotel Search Agent Evaluation Results:
📊 Total queries processed: 6
🎯 Queries with hotel info: 4 (66.7%)
📈 Queries with recommendations: 2 (33.3%)
🛏️ Queries with amenities: 4 (66.7%)
✅ Appropriate responses: 4 (66.7%)

🔍 Arize Evaluation Results:
   📋 Relevance: {'basic_evaluation': 6}
   📋 Qa: {'basic_evaluation': 6}
   📋 Hallucination: {'basic_evaluation': 6}
   📋 Toxicity: {'basic_evaluation': 6}

🌟 Phoenix UI is running at: http://localhost:6006/
💡 Visit the Phoenix UI to see detailed traces and evaluations
📊 The UI shows conversation flows, tool calls, and evaluation scores

💡 Note: Some errors are expected without full Couchbase setup
🔒 Phoenix cleanup completed





In [11]:
# Display detailed evaluation results
#
# Reads `evaluation_results` (a pandas DataFrame produced by the evaluation cell
# earlier in the notebook) and prints, for every evaluated query, a shortened
# response, the heuristic metric flags, and any Arize/Phoenix evaluation labels
# that are present. A summary of the aggregate rates is printed at the end.


def _truncate(value, limit):
    """Return ``value`` as text, cut to ``limit`` characters plus '...' when longer.

    ``value`` is coerced with ``str`` first so that a missing response
    (``None`` / ``NaN``) is displayed instead of raising ``TypeError``.
    """
    text = str(value)
    return text[:limit] + "..." if len(text) > limit else text


if evaluation_results is not None and not evaluation_results.empty:
    print("\n📋 Detailed Evaluation Results:")
    print("=" * 40)

    # (display label, DataFrame column) pairs for the boolean heuristic metrics.
    metric_columns = [
        ("Hotel Info", "has_hotel_info"),
        ("Recommendations", "has_recommendations"),
        ("Amenities", "has_amenities"),
        ("Appropriate", "appropriate_response"),
    ]
    # Evaluation types looked up as optional `arize_<type>` columns.
    arize_evals = ['relevance', 'qa', 'hallucination', 'toxicity']

    # Number queries by position rather than by index label, so the numbering
    # stays 1..N even if the frame was filtered or carries a non-integer index.
    for position, (_, row) in enumerate(evaluation_results.iterrows(), start=1):
        print(f"\n🔍 Query {position}: {row['query']}")
        print(f"🤖 Response: {_truncate(row['output'], 150)}")
        print("✅ Metrics:")
        for label, column in metric_columns:
            print(f"   {label}: {'✓' if row[column] else '✗'}")

        # Display Arize evaluation results if available
        for eval_type in arize_evals:
            col_name = f"arize_{eval_type}"
            if col_name in row and pd.notna(row[col_name]):
                print(f"   Arize {eval_type.title()}: {row[col_name]}")

                # Show explanation if available
                explanation_col = f"arize_{eval_type}_explanation"
                if explanation_col in row and pd.notna(row[explanation_col]):
                    print(f"      → {_truncate(row[explanation_col], 100)}")

        print("-" * 40)

    # Summary statistics
    print("\n📊 Summary Statistics:")
    print(f"   Average response length: {evaluation_results['response_length'].mean():.1f} characters")
    print(f"   Success rate (hotel info): {evaluation_results['has_hotel_info'].mean()*100:.1f}%")
    print(f"   Recommendation rate: {evaluation_results['has_recommendations'].mean()*100:.1f}%")
    print(f"   Amenities mention rate: {evaluation_results['has_amenities'].mean()*100:.1f}%")
    print(f"   Appropriate response rate: {evaluation_results['appropriate_response'].mean()*100:.1f}%")

else:
    # Nothing to show: the evaluation cell either failed or produced no rows.
    print("⚠️ No evaluation results to display")
    print("💡 This could be due to missing dependencies or configuration issues")
    print("📝 Try running a simple test query with the agent instead")


📋 Detailed Evaluation Results:

🔍 Query 1: Find me a hotel in New York City with a pool
🤖 Response: Based on your search, here are some hotels in New York City with pools:

1. **Hotel 1**: Boutique Inn in San Francisco, California. Charming boutique ...
✅ Metrics:
   Hotel Info: ✓
   Recommendations: ✗
   Amenities: ✓
   Appropriate: ✓
   Arize Relevance: basic_evaluation
      → Basic evaluation - full Phoenix evaluation not available
   Arize Qa: basic_evaluation
      → Basic evaluation - full Phoenix evaluation not available
   Arize Hallucination: basic_evaluation
      → Basic evaluation - full Phoenix evaluation not available
   Arize Toxicity: basic_evaluation
      → Basic evaluation - full Phoenix evaluation not available
----------------------------------------

🔍 Query 2: I need a budget hotel in San Francisco near the airport
🤖 Response: Here are the top budget hotels in San Francisco near the airport:  

1. **Budget Inn SF**  
   - **Location**: San Francisco Airport Are