# In [2]: (notebook cell prompt kept as a comment so the file parses as Python)
import os
import time
import pandas as pd
import numpy as np
from typing import List, Dict, Any
from datasets import load_dataset
import getpass

# Core LangChain imports
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ParentDocumentRetriever, EnsembleRetriever
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models

# Ragas imports
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    answer_relevancy,
    faithfulness,
    context_entity_recall,
    context_relevancy
)

import nltk
# Fetch NLTK resources at import time.
# NOTE(review): nothing in this file calls nltk directly; presumably one of the
# imported libraries needs these resources - confirm before removing.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Set up API keys
# Keys are read interactively with getpass (input is not echoed) and exported
# through os.environ before the model clients below are constructed.
print("Setting up API keys...")
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key:")
os.environ["COHERE_API_KEY"] = getpass.getpass("Enter your Cohere API Key:")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangChain API Key:")

# Initialize models
# Module-level globals: `embeddings` and `chat_model` are read by
# create_retrievers() and generate_test_dataset() further down.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

print("Setup complete!")

# Step 1: Load Open-Source Dataset

def load_open_source_dataset():
    """
    Load an open-source dataset suitable for RAG evaluation.

    Uses the first 200 rows of the SQuAD validation split, which provides
    questions, contexts, and answers. Each row is converted into a plain dict
    so callers can build LangChain ``Document`` objects from it.

    Returns:
        list[dict]: One entry per row, shaped as
        ``{'page_content': str, 'metadata': dict}``. ``page_content`` holds the
        title and context passage; ``metadata`` holds ``title``, ``context``,
        ``question``, ``answer`` (first reference answer, or ``""`` when the
        row has none) and ``source`` (always ``'squad'``).
    """
    print("Loading SQuAD dataset...")

    dataset = load_dataset("squad", split="validation[:200]")

    documents = []
    for item in dataset:
        # The passage lives under 'context' (the same key the metadata below
        # uses); reading 'content' raised KeyError on the first row.
        doc_content = f"Title: {item['title']}\n\nContext: {item['context']}"

        answer_texts = item['answers']['text']
        documents.append({
            'page_content': doc_content,
            'metadata': {
                'title': item['title'],
                'context': item['context'],
                'question': item['question'],
                # Keep only the first reference answer; fall back to "" if absent.
                'answer': answer_texts[0] if answer_texts else "",
                'source': 'squad',
            },
        })

    print(f"Loaded {len(documents)} documents from SQuAD dataset")
    return documents

# Step 2: Create Retrievers

def create_retrievers(documents):
    """Create all the different retriever types for evaluation.

    Args:
        documents: Iterable of dicts with ``'page_content'`` and ``'metadata'``
            keys (the shape returned by ``load_open_source_dataset``).

    Returns:
        tuple: ``(retrievers, langchain_docs)`` where ``retrievers`` maps a
        name (``'naive'``, ``'bm25'``, ``'rerank'``, ``'multi_query'``,
        ``'parent_document'``, ``'ensemble'``) to a retriever instance, and
        ``langchain_docs`` is the list of ``Document`` objects built from
        ``documents``.
    """

    from langchain.schema import Document
    langchain_docs = [
        Document(
            page_content=doc['page_content'],
            # Was `dpc['metadata']`, an undefined name that raised NameError.
            metadata=doc['metadata']
        ) for doc in documents
    ]

    print("Creating retrievers...")
    retrievers = {}

    # Naive Retriever: plain vector similarity over an in-memory Qdrant collection.
    print("Creating naive retriever...")
    vectorstore = Qdrant.from_documents(
        langchain_docs,
        embeddings,
        location=":memory:",
        collection_name="NaiveRetrieval"
    )
    retrievers['naive'] = vectorstore.as_retriever(search_kwargs={"k": 5})

    # BM25 Retriever: lexical retrieval built directly from the documents.
    print("Creating BM25 retriever...")
    retrievers['bm25'] = BM25Retriever.from_documents(langchain_docs, k=5)

    # Contextual Compression or Reranking: fetch 10 candidates from the vector
    # store, then let Cohere rerank keep the best 5.
    print("Creating rerank retriever...")
    # CohereRerank names its result-count parameter `top_n`; `top_k` is not a
    # field of the model.
    compressor = CohereRerank(model="rerank-v3.5", top_n=5)
    base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
    retrievers['rerank'] = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=base_retriever
    )

    # Multi-Query Retriever: the chat model produces query variants that are
    # each run against the same vector store.
    print("Creating multi-query retriever...")
    retrievers['multi_query'] = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
        llm=chat_model
    )

    # Parent Document Retriever: child chunks are indexed in a dedicated Qdrant
    # collection while the full parent documents are kept in an in-memory store.
    print("Creating parent document retriever...")
    client = QdrantClient(location=":memory:")
    client.create_collection(
        collection_name="parent_docs",
        # NOTE(review): size=1536 assumes the `embeddings` model emits
        # 1536-dimensional vectors - confirm if the embedding model changes.
        vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
    )
    parent_vectorstore = QdrantVectorStore(
        collection_name="parent_docs",
        embedding=embeddings,
        client=client
    )
    store = InMemoryStore()
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)

    parent_retriever = ParentDocumentRetriever(
        vectorstore=parent_vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        search_kwargs={"k": 5}
    )
    parent_retriever.add_documents(langchain_docs)
    retrievers['parent_document'] = parent_retriever

    # Ensemble Retriever: weighted combination of three retrievers built above.
    print("Creating ensemble retriever...")
    ensemble_retrievers = [
        retrievers['naive'],
        retrievers['bm25'],
        retrievers['rerank']
    ]
    weights = [0.4, 0.3, 0.3]
    retrievers['ensemble'] = EnsembleRetriever(
        retrievers=ensemble_retrievers,
        weights=weights,
        # NOTE(review): unclear whether EnsembleRetriever honours
        # `search_kwargs`; it may be silently ignored - confirm against the
        # LangChain API reference.
        search_kwargs={"k": 5}
    )

    print("All retrievers created!")
    return retrievers, langchain_docs

# Step 3: Generate Synthetic Test Dataset with Ragas

def generate_test_dataset(documents, num_samples=30):
    """Build a synthetic Ragas test set from the given document dicts.

    Args:
        documents: Iterable of dicts carrying ``'page_content'`` and
            ``'metadata'`` keys.
        num_samples: Number of synthetic test cases to request (default 30).

    Returns:
        The test set object returned by
        ``TestsetGenerator.generate_with_langchain_docs``.
    """

    print(f"Generating {num_samples} synthetic test cases...")

    from langchain.schema import Document

    # Wrap every raw dict in a LangChain Document for the generator.
    source_docs = []
    for entry in documents:
        source_docs.append(
            Document(page_content=entry['page_content'], metadata=entry['metadata'])
        )

    # The same chat model acts as both generator and critic.
    testset_builder = TestsetGenerator.from_langchain(
        generator_llm=chat_model,
        critic_llm=chat_model,
        embeddings=embeddings,
    )

    # Half simple questions, the rest split evenly between reasoning and
    # multi-context questions.
    question_mix = {simple: 0.5, reasoning: 0.25, multi_context: 0.25}
    synthetic_testset = testset_builder.generate_with_langchain_docs(
        source_docs,
        test_size=num_samples,
        distributions=question_mix,
    )

    print("Synthetic test dataset generated!")
    return synthetic_testset

# Step 4: Evaluation Framework

def evaluate_retriever(retriever, retriever_name, testset, documents):
    """Evaluate a single retriever using Ragas metrics.

    Every question in ``testset`` is sent through ``retriever.invoke``; the
    retrieved contexts are then scored with Ragas.

    Args:
        retriever: Object exposing ``invoke(question)`` that returns documents
            with a ``page_content`` attribute.
        retriever_name: Label used in log output and in the result dict.
        testset: Object exposing ``to_pandas()`` whose frame has ``question``
            and ``ground_truth`` columns.
        documents: Unused; kept so existing callers keep working.

    Returns:
        dict | None: Metric scores plus timing information, or ``None`` when
        no question could be processed or the Ragas evaluation fails.
        ``avg_retrieval_time`` covers only the time spent inside
        ``retriever.invoke`` for successfully processed questions;
        ``total_latency`` is the wall-clock time of the whole call, Ragas
        scoring included.
    """

    print(f"Evaluating {retriever_name} retriever...")

    start_time = time.perf_counter()
    retrieval_seconds = 0.0

    eval_data = {
        'question': [],
        'contexts': [],
        'answer': [],
        'ground_truth': []
    }

    for i, (_, data) in enumerate(testset.to_pandas().iterrows()):
        question = data['question']
        ground_truth = data['ground_truth']

        try:
            # Time only the retrieval call so the per-query latency is not
            # dominated by the (much slower) Ragas scoring further down.
            query_start = time.perf_counter()
            retrieved_docs = retriever.invoke(question)
            query_seconds = time.perf_counter() - query_start
            contexts = [doc.page_content for doc in retrieved_docs]
        except Exception as e:
            # Best effort: a failing question is reported and skipped.
            print(f"Error processing question {i}: {e}")
            continue

        retrieval_seconds += query_seconds

        # No LLM generation step here: the "answer" is simply the first two
        # retrieved contexts joined together.
        answer = f"Based on the retrieved contexts: {' '.join(contexts[:2])}"

        eval_data['question'].append(question)
        eval_data['contexts'].append(contexts)
        eval_data['answer'].append(answer)
        eval_data['ground_truth'].append(ground_truth)

    num_queries = len(eval_data['question'])
    if num_queries == 0:
        # Nothing to score; also avoids dividing by zero for the average.
        print(f"No questions could be processed for {retriever_name}; skipping Ragas evaluation.")
        return None

    # Ragas' evaluate() works on a Hugging Face `datasets.Dataset`, not on a
    # pandas DataFrame.
    from datasets import Dataset
    eval_dataset = Dataset.from_dict(eval_data)

    try:
        result = evaluate(
            eval_dataset,
            metrics=[
                context_precision,
                context_recall,
                context_relevancy,
                faithfulness,
                answer_relevancy
            ],
        )

        scores = {
            'context_precision': result['context_precision'],
            'context_recall': result['context_recall'],
            'context_relevancy': result['context_relevancy'],
            'faithfulness': result['faithfulness'],
            'answer_relevancy': result['answer_relevancy'],
        }
    except Exception as e:
        print(f"Error in Ragas evaluation: {e}")
        return None

    return {
        'retriever': retriever_name,
        **scores,
        'total_latency': time.perf_counter() - start_time,
        'avg_retrieval_time': retrieval_seconds / num_queries,
        'num_queries': num_queries
    }
    

def run_comprehensive_evaluation():
    """Drive the whole pipeline: load data, build retrievers, generate tests, evaluate.

    Returns:
        tuple: ``(results, testset)`` where ``results`` is the list of truthy
        result dicts returned by ``evaluate_retriever`` (one per retriever that
        evaluated successfully) and ``testset`` is the synthetic test set.
    """

    corpus = load_open_source_dataset()

    # The Document list returned alongside the retrievers is not needed here.
    retriever_map, _langchain_docs = create_retrievers(corpus)

    synthetic_testset = generate_test_dataset(corpus, num_samples=20)

    collected = []

    for name in retriever_map:
        print(f"\n{'='*50}")
        print(f"Evaluating {name.upper()} retriever")
        print(f"{'='*50}")

        try:
            outcome = evaluate_retriever(retriever_map[name], name, synthetic_testset, corpus)
        except Exception as e:
            # One failing retriever must not abort the remaining evaluations.
            print(f"Failed to evaluate {name}: {e}")
            continue

        if outcome:
            collected.append(outcome)

    return collected, synthetic_testset

# Step 5: Analysis and Reporting

def analyze_results(results):
    """Analyze and report on the evaluation results.

    Prints per-metric averages and winners, a latency ranking, rough cost
    notes, and an overall recommendation based on a weighted composite score.

    Args:
        results: List of per-retriever dicts containing ``retriever``,
            ``context_precision``, ``context_recall``, ``answer_relevancy``,
            ``faithfulness`` and ``avg_retrieval_time`` (``context_relevancy``
            is reported when present).

    Returns:
        pandas.DataFrame | None: One row per retriever with an added
        ``composite_score`` column, or ``None`` when ``results`` is empty.
    """

    if not results:
        print("No results to analyze!")
        return None

    # Create results DataFrame
    df = pd.DataFrame(results)

    print("\n" + "="*80)
    print("RETRIEVAL EVALUATION RESULTS")
    print("="*80)

    # Performance metrics
    print("\nPERFORMANCE METRICS:")
    print("-" * 40)
    metrics_cols = ['context_precision', 'context_recall', 'context_relevancy',
                    'faithfulness', 'answer_relevancy']

    for metric in metrics_cols:
        if metric not in df.columns:
            continue

        scores = df[metric]
        label = metric.replace('_', ' ').title()

        if not scores.notna().any():
            # idxmax() cannot pick a winner from an all-NaN column.
            print(f"{label:<20}: n/a (no valid scores)")
            print()
            continue

        print(f"{label:<20}: {scores.mean():.3f} (avg)")
        best_retriever = df.loc[scores.idxmax(), 'retriever']
        best_score = scores.max()
        print(f"{'Best':<20}: {best_retriever} ({best_score:.3f})")
        print()

    # Latency analysis
    print("\nLATENCY ANALYSIS:")
    print("-" * 40)
    df_sorted = df.sort_values('avg_retrieval_time')
    for _, row in df_sorted.iterrows():
        print(f"{row['retriever']:<15}: {row['avg_retrieval_time']:.3f}s per query")

    # Cost estimation (rough)
    print("\n💰 ESTIMATED COST ANALYSIS:")
    print("-" * 40)
    cost_estimates = {
        'naive': 'Low (embeddings only)',
        'bm25': 'Lowest (no API calls)',
        'rerank': 'Medium (embeddings + rerank)',
        'multi_query': 'High (multiple LLM calls)',
        'parent_document': 'Low-Medium (embeddings only)',
        'ensemble': 'Highest (combines multiple methods)'
    }

    for retriever_name in df['retriever']:
        cost = cost_estimates.get(retriever_name, 'Unknown')
        print(f"{retriever_name:<15}: {cost}")

    # Overall recommendation
    print("\nOVERALL ANALYSIS:")
    print("-" * 40)

    # Calculate composite score (weighted average of key metrics)
    df['composite_score'] = (
        df['context_precision'] * 0.3 +
        df['context_recall'] * 0.3 +
        df['answer_relevancy'] * 0.2 +
        df['faithfulness'] * 0.2
    )

    composite = df['composite_score']
    # A retriever with any NaN metric gets a NaN composite; only pick a winner
    # when at least one retriever has a complete score.
    best_overall = df.loc[composite.idxmax()] if composite.notna().any() else None
    fastest = df.loc[df['avg_retrieval_time'].idxmin()]

    if best_overall is not None:
        print(f"Best Overall Performance: {best_overall['retriever']} (score: {best_overall['composite_score']:.3f})")
    else:
        print("Best Overall Performance: n/a (no retriever has a complete set of scores)")
    print(f"Fastest Retriever: {fastest['retriever']} ({fastest['avg_retrieval_time']:.3f}s)")

    # Recommendations
    print(f"\nRECOMMENDATIONS:")
    if best_overall is not None:
        print(f"- For best accuracy: Use {best_overall['retriever']}")
    print(f"- For best speed: Use {fastest['retriever']}")
    print(f"- For production balance: Consider ensemble or rerank approaches")

    return df

# Main Execution
if __name__ == "__main__":
    # Run the comprehensive evaluation
    results, testset = run_comprehensive_evaluation()
    
    # Analyze results
    results_df = analyze_results(results)
    
    # Save results
    if results_df is not None:
        results_df.to_csv("retriever_evaluation_results.csv", index=False)
        print(f"\nResults saved to 'retriever_evaluation_results.csv'")
    
    print("\nEvaluation complete!")

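# Notebook output from the last run:
# ModuleNotFoundError: No module named 'pandas'
# (pandas was not installed in the kernel environment when this cell was executed)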