In [3]:
import os
from typing import List, Dict, Any
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset
import pandas as pd
import json

In [4]:
def prepare_test_queries() -> List[Dict[str, Any]]:
    """
    Prepare a diverse set of test queries covering different travel scenarios.

    Returns:
        A fresh list of test-case dictionaries. Each one carries:
          - "query": the user request sent to the assistant
          - "expected_agent": the agent name the router is expected to pick
          - "category": label used when grouping results
          - "reference": ground-truth answer text; the evaluation code in this
            notebook reads this key for every case, so it must be present
    """
    # NOTE(review): the reference texts are starter ground truths written to
    # unblock the evaluation; have a domain owner review and refine them.
    return [
        {
            "query": "Plan a 5-day trip to Tokyo in March",
            "expected_agent": "itinerary_agent",
            "category": "itinerary",
            "reference": (
                "A 5-day Tokyo itinerary for March should group sights by area, "
                "for example Asakusa and Ueno, Shibuya and Harajuku, and Shinjuku, "
                "include a day trip such as Nikko or Hakone, and account for cool "
                "early-spring weather and the start of cherry blossom season in "
                "late March."
            )
        },
        {
            "query": "What are the best flight options from NYC to London next month?",
            "expected_agent": "flight_agent",
            "category": "flights",
            "reference": (
                "Flight options from New York City to London should cover nonstop "
                "and connecting services between the New York area airports and "
                "the London airports, compared by price, total travel time, and "
                "departure time for the requested month."
            )
        },
        {
            "query": "Recommend hotels in Paris near the Eiffel Tower",
            "expected_agent": "accommodation_agent",
            "category": "accommodation",
            "reference": (
                "Hotel recommendations should be in or near the 7th arrondissement "
                "within walking distance of the Eiffel Tower, and span several "
                "price ranges with notes on location and amenities."
            )
        },
        {
            "query": "What's the best time to visit Rome?",
            "expected_agent": "information_agent",
            "category": "information",
            "reference": (
                "The best times to visit Rome are generally spring (April to June) "
                "and early autumn (September to October), when the weather is mild "
                "and crowds are smaller than in the peak summer months."
            )
        },
        # Add more test cases covering edge cases and complex scenarios
        {
            "query": "Plan a budget-friendly European tour covering 3 countries in 10 days",
            "expected_agent": "itinerary_agent",
            "category": "complex_itinerary",
            "reference": (
                "A budget-friendly 10-day tour of three European countries should "
                "pick neighbouring countries linked by rail or low-cost flights, "
                "allocate roughly three days to each country, and suggest "
                "affordable lodging and transport options."
            )
        }
    ]

In [5]:
def get_agent_response(travel_assistant, query: str) -> Dict[str, Any]:
    """
    Get response from the travel assistant for a given query.

    Args:
        travel_assistant: object exposing ``invoke(payload)``; it is called with
            ``{"query": query}`` and is expected to return a mapping.
        query: the user request to send.

    Returns:
        A dict with the keys "response", "agent_used" and "context", read from
        the result keys "agent_response", "agent_executor" and "context".
        A key that is missing or explicitly ``None`` falls back to ``""``
        (``{}`` for "context"), so downstream code can always call ``len()``
        on the response and ``.values()`` on the context.

    Any exception raised while invoking the assistant or reading its result is
    reported on stdout and the default (empty) record is returned instead of
    propagating, so one failing query does not abort a whole evaluation run.
    """
    try:
        result = travel_assistant.invoke({"query": query})
        response = result.get("agent_response")
        agent_used = result.get("agent_executor")
        context = result.get("context")
        # dict.get(key, default) only covers *missing* keys; an explicit None
        # value would otherwise leak through to the consumers of this record.
        return {
            "response": "" if response is None else response,
            "agent_used": "" if agent_used is None else agent_used,
            "context": {} if context is None else context
        }
    except Exception as e:
        print(f"Error getting response for query: {query}")
        print(f"Error details: {str(e)}")
        return {"response": "", "agent_used": "", "context": {}}

In [6]:
def create_ragas_dataset(test_results: List[Dict[str, Any]]) -> Dataset:
    """
    Build the column-oriented dataset handed to the RAGAS evaluation.

    Every entry of ``test_results`` must provide the keys "query", "response",
    "context" (a mapping) and "reference"; a missing key raises ``KeyError``.
    The resulting columns are "question", "answer", "contexts" and "reference".
    """
    questions, answers, context_lists, references = [], [], [], []

    for row in test_results:
        questions.append(row["query"])
        answers.append(row["response"])
        # Only the mapping's values are kept, each one stringified, so that
        # every row contributes a plain list of strings as its contexts.
        context_lists.append(list(map(str, row["context"].values())))
        # Ground-truth answer for this row
        references.append(row["reference"])

    return Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": context_lists,
        "reference": references,
    })

In [7]:
def _mean_score(score: Any) -> float:
    """Collapse a metric value (a scalar or a per-sample sequence) into one float."""
    if isinstance(score, (list, tuple)):
        # `s == s` is False only for NaN, so unscored samples are left out
        usable = [float(s) for s in score if s is not None and s == s]
        return sum(usable) / len(usable) if usable else float("nan")
    return float(score)


def evaluate_pipeline(travel_assistant) -> Dict[str, float]:
    """
    Evaluate the multi-agent pipeline using RAGAS metrics.

    Runs every query from ``prepare_test_queries()`` through the assistant,
    scores the collected answers with faithfulness, answer relevancy, context
    precision and context recall, and adds the share of queries that were
    routed to the expected agent.

    Args:
        travel_assistant: the assistant passed on to ``get_agent_response``.

    Returns:
        A dict with the four RAGAS metric names plus "agent_routing_accuracy".
        If the RAGAS evaluation raises, the error is printed and an empty dict
        is returned.
    """
    # Prepare test queries
    test_cases = prepare_test_queries()

    # Collect responses for all test cases
    test_results = []
    for case in test_cases:
        response_data = get_agent_response(travel_assistant, case["query"])
        test_results.append({
            "query": case["query"],
            "expected_agent": case["expected_agent"],
            "category": case["category"],
            # A case without ground truth must not abort the whole run with a
            # KeyError; it is evaluated against an empty reference instead.
            "reference": case.get("reference", ""),
            **response_data
        })

    # Create RAGAS dataset
    evaluation_dataset = create_ragas_dataset(test_results)

    # Define evaluation metrics
    metrics = [
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall
    ]

    # Run RAGAS evaluation
    try:
        results = evaluate(evaluation_dataset, metrics)

        # NOTE(review): depending on the installed ragas version, indexing the
        # result by metric name may give one aggregate number or a list of
        # per-sample scores; _mean_score accepts both. TODO confirm against
        # the pinned ragas version.
        metric_names = (
            "faithfulness",
            "answer_relevancy",
            "context_precision",
            "context_recall",
        )
        metrics_dict = {name: _mean_score(results[name]) for name in metric_names}

        # Share of queries handled by the agent the test case expected;
        # guarded so an empty test set cannot divide by zero.
        correctly_routed = sum(
            1 for r in test_results if r["agent_used"] == r["expected_agent"]
        )
        metrics_dict["agent_routing_accuracy"] = (
            correctly_routed / len(test_results) if test_results else 0.0
        )

        return metrics_dict

    except Exception as e:
        print(f"Error during RAGAS evaluation: {str(e)}")
        return {}

In [8]:
def analyze_results_by_category(test_results: List[Dict[str, Any]], 
                              metrics: Dict[str, float]) -> Dict[str, Dict[str, float]]:
    """
    Summarise routing performance for each query category.

    Each result must carry "category", "agent_used" and "expected_agent".
    The returned mapping is keyed by category (in first-seen order) and holds
    "count", "routing_accuracy" (fraction of results whose agent matched the
    expected one) and an empty "response_quality" list. ``metrics`` is accepted
    for interface compatibility but is not read here.
    """
    per_category: Dict[str, Dict[str, Any]] = {}

    for entry in test_results:
        bucket = per_category.setdefault(
            entry["category"],
            {"count": 0, "routing_accuracy": 0, "response_quality": []},
        )
        bucket["count"] += 1
        # Until the final pass below, "routing_accuracy" holds the raw number
        # of correctly routed queries rather than a ratio.
        if entry["agent_used"] == entry["expected_agent"]:
            bucket["routing_accuracy"] += 1

    # Turn the hit counts into per-category ratios
    for bucket in per_category.values():
        bucket["routing_accuracy"] = bucket["routing_accuracy"] / bucket["count"]

    return per_category

In [9]:
def save_detailed_results(
    test_results: List[Dict[str, Any]],
    metrics: Dict[str, float],
    output_path: str = "detailed_evaluation_results.json",
    max_response_chars: int = 500,
) -> None:
    """
    Save detailed evaluation results to a JSON file.

    Args:
        test_results: one dict per query; the keys "query", "expected_agent",
            "agent_used", "category" and "response" are read from each.
        metrics: overall metric values, stored under "overall_metrics".
        output_path: destination file; defaults to the file name this notebook
            has always written, in the current working directory.
        max_response_chars: responses longer than this are cut to that many
            characters and suffixed with "..." to keep the report readable.

    Raises:
        KeyError: if a result lacks one of the keys listed above.
    """
    output = {
        "overall_metrics": metrics,
        "detailed_results": []
    }

    for result in test_results:
        response = result["response"]
        if len(response) > max_response_chars:
            response = response[:max_response_chars] + "..."
        output["detailed_results"].append({
            "query": result["query"],
            "expected_agent": result["expected_agent"],
            # Stored as "actual_agent" in the report, next to "expected_agent"
            "actual_agent": result["agent_used"],
            "category": result["category"],
            "response": response
        })

    # json.dump escapes non-ASCII characters by default, so pinning the
    # encoding does not change the bytes written; it only removes the
    # dependence on the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=4)

In [10]:
def main():
    """
    Main function to run the evaluation.

    Builds the travel assistant, prints the overall RAGAS metrics, prints the
    routing accuracy per query category and writes the detailed results file.
    Any exception is caught and reported on stdout rather than propagated.

    NOTE(review): ``create_travel_assistant_graph`` is not defined in the cells
    shown here, and the recorded output of this cell is a NameError for it. It
    must be defined or imported in an earlier cell before this cell is run.
    """
    try:
        # Create travel assistant instance
        travel_assistant = create_travel_assistant_graph()

        # Run evaluation
        metrics = evaluate_pipeline(travel_assistant)

        # Print results
        print("\nRAGAS Evaluation Results:")
        print("-" * 30)
        for metric, value in metrics.items():
            print(f"{metric}: {value:.3f}")

        # Get test results for detailed analysis. The response alone has no
        # "query", "expected_agent" or "category", which the analysis and the
        # report below both read, so each response is merged with its case.
        # NOTE(review): this queries the assistant a second time, because
        # evaluate_pipeline does not expose the responses it collected.
        test_results = [
            {**case, **get_agent_response(travel_assistant, case["query"])}
            for case in prepare_test_queries()
        ]

        # Analyze by category
        category_analysis = analyze_results_by_category(test_results, metrics)

        print("\nPerformance by Category:")
        print("-" * 30)
        for category, stats in category_analysis.items():
            print(f"{category}:")
            print(f"  Routing Accuracy: {stats['routing_accuracy']:.3f}")

        # Save detailed results
        save_detailed_results(test_results, metrics)

    except Exception as e:
        print(f"Error in evaluation: {str(e)}")

if __name__ == "__main__":
    main()


Error in evaluation: name 'create_travel_assistant_graph' is not defined
