In [1]:
!pip uninstall -y langchain langchain-core langchain-openai langchain-community langchain-text-splitters



In [2]:
!pip install langchain langchain-core langchain-openai langchain-community langchain-text-splitters

Defaulting to user installation because normal site-packages is not writeable
Collecting langchain
  Using cached langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting langchain-core
  Using cached langchain_core-0.3.39-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-openai
  Using cached langchain_openai-0.3.7-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-text-splitters
  Using cached langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Using cached langchain-0.3.19-py3-none-any.whl (1.0 MB)
Using cached langchain_core-0.3.39-py3-none-any.whl (414 kB)
Using cached langchain_openai-0.3.7-py3-none-any.whl (55 kB)
Downloading langchain_community-0.3.18-py3-none-any.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---------------------------------------- 2.5/2.5 MB 27.8 MB/s eta 0:00:00
Using cached langchain

In [3]:
# Report the installed version and metadata of each LangChain package.
!pip show langchain langchain-core langchain-openai langchain-community langchain-text-splitters

Name: langchain
Version: 0.3.19
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: C:\Users\dabra\AppData\Roaming\Python\Python313\site-packages
Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-community, ragas
---
Name: langchain-core
Version: 0.3.39
Summary: Building applications with LLMs through composability
Home-page: 
Author: 
Author-email: 
License: MIT
Location: C:\Users\dabra\AppData\Roaming\Python\Python313\site-packages
Requires: jsonpatch, langsmith, packaging, pydantic, PyYAML, tenacity, typing-extensions
Required-by: langchain, langchain-cohere, langchain-community, langchain-huggingface, langchain-openai, langchain-qdrant, langchain-text-splitters, langgraph, langgraph-checkpoint, ragas
---
Name: langchain-openai
Version: 0.3.7
Summary: An integration package connecting OpenAI and LangChain
Home-page

In [4]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [None]:
# Imported here because this cell is the first to use them; without these the
# cell raises NameError on a fresh kernel.
import getpass
import os

# Ask for the key only when the environment does not already provide one, so
# the secret is never typed into (or saved with) the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# Set the LLM Configuration
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0.7)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
# Execute agent_state.ipynb from this notebook.
# NOTE(review): later cells presumably rely on names it defines — confirm which ones.
%run "agent_state.ipynb"

In [None]:
# Execute multi_agent_definition.ipynb from this notebook.
# NOTE(review): later cells presumably rely on names it defines — confirm which ones.
%run "multi_agent_definition.ipynb"

In [6]:
def load_travel_assistant(filename='travel_assistant.pkl'):
    """Rebuild the travel assistant from a dill file under ``saved_models``.

    The file is read with ``dill`` and must contain a mapping whose
    ``'create_graph_function'`` entry is a zero-argument callable; the value
    that callable returns is what this function hands back.

    Args:
        filename: Name of the file inside the ``saved_models`` directory.

    Returns:
        The object produced by the stored factory, or ``None`` when anything
        goes wrong (the error is printed, not raised).
    """
    try:
        model_path = os.path.join('saved_models', filename)

        # NOTE: dill, like pickle, can run arbitrary code while loading, so
        # only open files that you produced yourself.
        with open(model_path, 'rb') as handle:
            payload = dill.load(handle)

        # The stored factory is called to build a fresh graph.
        graph_factory = payload['create_graph_function']
        assistant = graph_factory()

        print(f"Travel assistant loaded from {model_path}")
        return assistant
    except Exception as err:
        # Best effort by design: report the problem and signal failure with None.
        print(f"Error loading travel assistant: {err}")
        return None

In [None]:
# Loading the agent
travel_assistant = load_travel_assistant()

# load_travel_assistant() signals failure by returning None. Stop here in that
# case: otherwise every later .invoke() call fails, is caught, and the
# placeholder error text gets scored as if it were a real answer.
if travel_assistant is None:
    raise RuntimeError("Travel assistant could not be loaded; see the error printed above.")

In [12]:
def generate_comprehensive_test_set(
    travel_db,
    num_tests=30,
    output_path="comprehensive_travel_agent_test_set.csv",
    model="gpt-4o",
    min_context_chars=1000,
    seed=None,
):
    """
    Generate a comprehensive test set targeting different agents and scenarios.

    For each test case a context is drawn from ``travel_db``, an LLM writes a
    question for one of the three agents (Itinerary, Flight, Information), and
    the same LLM writes a reference answer from the full context.

    Args:
        travel_db: Vector store exposing ``index_to_docstore_id`` (a mapping
            whose values are document ids) and ``docstore.search(doc_id)``.
        num_tests: Number of test cases to generate.
        output_path: CSV file the test set is written to; ``None`` skips
            writing.
        model: Chat model name passed to ``ChatOpenAI``.
        min_context_chars: Minimum ``page_content`` length for a document to
            be usable as a context.
        seed: Optional seed for reproducible sampling. ``None`` uses the
            module-level ``random`` state.

    Returns:
        pandas.DataFrame with the columns ``question``, ``agent_target``,
        ``context`` and ``reference_answer``.

    Raises:
        ValueError: If fewer than ``num_tests`` usable contexts are available.
    """
    # Test scenario categories
    test_categories = [
        # Itinerary Agent Tests
        {
            "agent": "Itinerary",
            "scenarios": [
                "Create a 7-day itinerary for a solo traveler",
                "Plan a family vacation with children",
                "Design a romantic honeymoon trip",
                "Develop an adventure-focused travel plan",
                "Create a budget-friendly multi-city trip",
                "Plan a cultural immersion journey",
                "Design an eco-tourism adventure"
            ]
        },
        # Flight Agent Tests
        {
            "agent": "Flight",
            "scenarios": [
                "Find the most cost-effective flight options",
                "Compare flight routes with layovers",
                "Identify flights with best travel times",
                "Find flights with minimal connections",
                "Compare business class options",
                "Find flights with best baggage allowance",
                "Locate flights with most airline rewards"
            ]
        },
        # Information Agent Tests
        {
            "agent": "Information",
            "scenarios": [
                "Provide detailed travel visa requirements",
                "Give comprehensive destination safety information",
                "Explain local cultural customs and etiquette",
                "Provide detailed weather information",
                "Recommend best travel seasons",
                "Give insights on local transportation",
                "Provide health and vaccination guidance"
            ]
        }
    ]

    # Extract contexts from travel database. This happens before the LLM client
    # is created so that a too-small database fails without touching the API.
    contexts = []
    for doc_id in travel_db.index_to_docstore_id.values():
        doc = travel_db.docstore.search(doc_id)
        # A docstore lookup may hand back something that is not a document
        # (for example a "not found" message); skip anything without text.
        text = getattr(doc, "page_content", None)
        if isinstance(text, str) and len(text) >= min_context_chars:
            contexts.append(text)

    # Ensure we have enough contexts
    if len(contexts) < num_tests:
        raise ValueError(
            "Not enough contexts in the travel database "
            f"(need {num_tests}, found {len(contexts)} with at least "
            f"{min_context_chars} characters)"
        )

    # A private generator keeps seeded runs reproducible without disturbing the
    # global random state; with no seed the global state is used as before.
    rng = random if seed is None else random.Random(seed)

    # Randomly sample contexts (without replacement, so each test is distinct)
    sampled_contexts = rng.sample(contexts, num_tests)

    # Deal the categories out round-robin and then shuffle. Every agent is
    # covered as evenly as possible, whereas an independent random pick per
    # test could leave an agent with no tests at all.
    assigned_categories = [
        test_categories[position % len(test_categories)]
        for position in range(num_tests)
    ]
    rng.shuffle(assigned_categories)

    # Initialize LLM for test generation
    test_llm = ChatOpenAI(model=model, temperature=0.7)

    # Generate test cases
    records = []
    for category, context in zip(assigned_categories, sampled_contexts):
        scenario = rng.choice(category["scenarios"])

        # Generate context-aware question (only the first 1000 characters of
        # the context are shown to keep the question prompt short)
        question_prompt = f"""
        Based on the following travel context, create a {category['agent']} agent-specific question that tests:
        - Depth of knowledge
        - Ability to provide nuanced recommendations
        - Understanding of travel complexities

        Context Snippet: {context[:1000]}
        Scenario: {scenario}

        Generate a specific, challenging question that requires sophisticated reasoning.
        """

        question = test_llm.invoke(question_prompt).content

        # Generate reference answer from the full context
        answer_prompt = f"""
        Provide a comprehensive, detailed answer to the following question 
        using the given context. The answer should demonstrate:
        - Deep understanding of the travel scenario
        - Specific, actionable recommendations
        - Nuanced insights

        Context: {context}
        Question: {question}
        """

        reference_answer = test_llm.invoke(answer_prompt).content

        # Store test case
        records.append({
            "question": question,
            "agent_target": category["agent"],
            "context": context,
            "reference_answer": reference_answer,
        })

    # Convert to DataFrame; explicit columns keep the schema even when empty
    df = pd.DataFrame(
        records,
        columns=["question", "agent_target", "context", "reference_answer"],
    )

    # Save to CSV
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df

def validate_test_set(test_set_df, expected_count=30):
    """
    Validate the generated test set and print summary statistics.

    Args:
        test_set_df: DataFrame with at least the columns ``agent_target``,
            ``question`` and ``reference_answer``.
        expected_count: Number of rows the test set must contain. Defaults to
            30, the default size produced by the generator.

    Returns:
        True when every check passes.

    Raises:
        AssertionError: If the row count differs from ``expected_count`` or
            the set does not cover exactly three agents.
    """
    # Explicit raises (rather than assert statements) so the checks still run
    # when Python is started with -O.
    if len(test_set_df) != expected_count:
        raise AssertionError(f"Test set should have exactly {expected_count} tests")

    # Check agent distribution
    agent_distribution = test_set_df['agent_target'].value_counts()
    print("Agent Distribution:")
    print(agent_distribution)

    # Every one of the three agents must be represented
    if len(agent_distribution) != 3:
        raise AssertionError("Should have tests for all three agents")

    # Check question and answer lengths
    print("\nQuestion Length Statistics:")
    print(test_set_df['question'].str.len().describe())

    print("\nReference Answer Length Statistics:")
    print(test_set_df['reference_answer'].str.len().describe())

    return True

# Example usage
def main(travel_db):
    """Generate the default test set from ``travel_db``, validate it, and return it.

    Args:
        travel_db: Passed straight through to ``generate_comprehensive_test_set``.

    Returns:
        Whatever ``generate_comprehensive_test_set`` returned.
    """
    generated = generate_comprehensive_test_set(travel_db)

    # Raises if the generated set does not pass validation.
    validate_test_set(generated)

    print("\nComprehensive test set generated successfully!")
    return generated

In [None]:
# Generate and validate the test set; main() also writes it to
# comprehensive_travel_agent_test_set.csv.
# NOTE(review): `travel_db` is not assigned in any cell visible here —
# presumably it comes from one of the %run notebooks above; confirm.
test_set = main(travel_db)

In [None]:
# Load the test set from the CSV written by the generation step and preview
# the first rows. The cells below work from this DataFrame.
test_set = pd.read_csv("comprehensive_travel_agent_test_set.csv")
test_set.head()

In [None]:
# A bare `results` display used to sit here. `results` is only assigned by the
# "Run RAGAS evaluation" cell further down, so on a fresh kernel this cell
# raised NameError. The scores are displayed right after they are computed.

In [None]:
# Generate responses using the travel assistant
generated_answers = []

print("Generating responses for evaluation...")
for question in test_set['question']:
    try:
        # Invoke the travel assistant with the question
        response = travel_assistant.invoke({"query": question})

        # The string form of whatever invoke() returns is stored as the answer.
        # NOTE(review): if invoke() returns a state dict, this stores the whole
        # dict repr rather than just the answer text — confirm.
        generated_answers.append(str(response))
    except Exception as e:
        # Keep going so one failing question does not abort the whole run; the
        # placeholder keeps the answers aligned with the questions.
        print(f"Error generating response for question: {question}")
        print(f"Error: {e}")
        generated_answers.append("Error generating response")

# Add generated answers to the DataFrame
test_set['generated_answer'] = generated_answers

# index=False keeps the pandas row index out of the file, matching the other
# CSV files this notebook writes.
test_set.to_csv("test_set_updated_with_agent_answers.csv", index=False)

In [42]:
# Assemble the four columns handed to RAGAS, then build the dataset from them.
ragas_columns = {
    # The question put to the assistant
    'question': test_set['question'],
    # The assistant's generated answer
    'answer': test_set['generated_answer'],
    # The reference answer used for comparison
    'ground_truth': test_set['reference_answer'],
    # Each context is wrapped in a one-element list
    'contexts': test_set['context'].apply(lambda ctx: [ctx]),
}
eval_dataset = Dataset.from_dict(ragas_columns)

In [None]:
# Score the whole evaluation dataset on the four RAGAS metrics.
ragas_metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
]
print("Running RAGAS evaluation...")
results = evaluate(eval_dataset, metrics=ragas_metrics)

In [None]:
# Show the result object returned by evaluate() for the whole test set; as the
# last expression of the cell it is rendered by the notebook.
print("\nRAGAS Evaluation Results:")

results

In [None]:
# Evaluate each agent's questions on their own and collect the results by agent.
per_agent_metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
agent_results = {}

for agent in test_set['agent_target'].unique():
    # with_indices=True passes the row position to the predicate, so each
    # dataset row can be matched to the same row of test_set.
    agent_subset = eval_dataset.filter(
        lambda _row, row_idx: test_set.iloc[row_idx]['agent_target'] == agent,
        with_indices=True,
    )
    agent_results[agent] = evaluate(agent_subset, metrics=per_agent_metrics)

In [None]:
# Show the evaluation result for each agent. The heading names the per-agent
# breakdown so it is not confused with the overall results printed earlier.
print("\nAgent-Specific Evaluation:")

agent_results

In [67]:
# Combine the question/answer columns with the metric scores and save them.
identity_columns = ['question', 'agent_target', 'generated_answer', 'reference_answer']
metric_columns = ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall']

# Start from a copy so test_set itself is left untouched.
results_df = test_set[identity_columns].copy()
for metric_name in metric_columns:
    results_df[metric_name] = results[metric_name]

results_df.to_csv('ragas_evaluation_results.csv', index=False)

In [38]:
def run_comprehensive_ragas_evaluation(
    travel_assistant,
    test_set_path='comprehensive_travel_agent_test_set.csv',
    output_path='ragas_evaluation_results.csv',
):
    """Answer every test question with the assistant and score it with RAGAS.

    Args:
        travel_assistant: Object with an ``invoke`` method that accepts
            ``{"query": <question>}``.
        test_set_path: CSV with the columns ``question``, ``agent_target``,
            ``context`` and ``reference_answer``.
        output_path: CSV file the per-question scores are written to.

    Returns:
        dict with three entries:
            ``overall_results``: result of ``evaluate`` on the full dataset.
            ``agent_results``: mapping of agent name to the result of
                ``evaluate`` on that agent's questions.
            ``results_dataframe``: DataFrame with one row per question,
                holding the generated answer and the four metric scores.
    """
    metric_list = [faithfulness, answer_relevancy, context_precision, context_recall]
    metric_names = ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall']

    # Read the test set
    df = pd.read_csv(test_set_path)

    # Generate responses using the travel assistant
    generated_answers = []

    print("Generating responses for evaluation...")
    for question in df['question']:
        try:
            response = travel_assistant.invoke({"query": question})
            # The string form of whatever invoke() returns is used as the answer.
            generated_answers.append(str(response))
        except Exception as e:
            # Keep going so one failing question does not abort the run; the
            # placeholder keeps the answers aligned with the questions.
            print(f"Error generating response for question: {question}")
            print(f"Error: {e}")
            generated_answers.append("Error generating response")

    # Add generated answers to the DataFrame
    df['generated_answer'] = generated_answers

    # Prepare dataset for RAGAS evaluation (plain lists, one entry per question)
    eval_dataset = Dataset.from_dict({
        'question': df['question'].tolist(),
        'answer': df['generated_answer'].tolist(),
        'ground_truth': df['reference_answer'].tolist(),
        # RAGAS expects a list of contexts per question
        'contexts': [[context] for context in df['context']],
    })

    # Run RAGAS evaluation
    print("Running RAGAS evaluation...")
    results = evaluate(eval_dataset, metrics=metric_list)

    # to_pandas() yields one row of scores per question. Using it avoids
    # depending on dict-style access to the result object, which may not
    # support .items() and may return an aggregate instead of per-row scores.
    per_row_scores = results.to_pandas()[metric_names].reset_index(drop=True)

    # Print and analyze results
    print("\nRAGAS Evaluation Results:")
    for metric, score in per_row_scores.mean().items():
        print(f"{metric}: {score}")

    # Detailed breakdown by agent type
    agent_labels = df['agent_target'].tolist()
    agent_results = {}
    for agent in df['agent_target'].unique():
        # with_indices=True is required for the predicate to receive the row
        # position as its second argument.
        agent_subset = eval_dataset.filter(
            lambda _row, idx: agent_labels[idx] == agent,
            with_indices=True,
        )
        agent_results[agent] = evaluate(agent_subset, metrics=metric_list)

    # Print agent-specific results
    print("\nAgent-Specific Evaluation:")
    for agent, agent_metrics in agent_results.items():
        print(f"\n{agent} Agent:")
        for metric, score in agent_metrics.to_pandas()[metric_names].mean().items():
            print(f"  {metric}: {score}")

    # Save detailed results: question columns side by side with their scores
    results_df = pd.concat(
        [
            df[['question', 'agent_target', 'generated_answer', 'reference_answer']]
            .reset_index(drop=True),
            per_row_scores,
        ],
        axis=1,
    )

    results_df.to_csv(output_path, index=False)

    return {
        'overall_results': results,
        'agent_results': agent_results,
        'results_dataframe': results_df
    }

In [None]:
# Usage: run the full evaluation against the default test-set CSV. The returned
# dict holds 'overall_results', 'agent_results' and 'results_dataframe'.
evaluation_results = run_comprehensive_ragas_evaluation(travel_assistant)