# Comprehensive RAG System Evaluation

This notebook builds and evaluates a Retrieval-Augmented Generation (RAG) pipeline with enhanced metrics and analysis.

## 1. Setup and Configuration

In [4]:
# First install these core packages
!pip install scikit-learn rouge-score

# Then install bert-score with its specific requirements
!pip install bert-score torch transformers

# Finally install visualization packages
!pip install plotly pandas matplotlib seaborn

Collecting rouge-score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting absl-py (from rouge-score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (pyproject.toml): started
  Building wheel for rouge-score (pyproject.toml): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=25027 sha256=19457051722dbd68c65b71e841ef0e7ee3efe352b1e6cf42e18d20ca161049f6
  Stored in directory: c:\users\villian\appdata\local\pip\cache\wheels\44\af\da\5ffc433e2786f0b1

In [5]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from datetime import datetime
import json
import warnings
warnings.filterwarnings('ignore')

# Evaluation metrics
from sklearn.metrics import precision_score, recall_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score

# LangChain imports
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from langchain.schema import Document
from langchain.evaluation import load_evaluator

# Visualization
from IPython.display import HTML, display
import plotly.express as px

# Config
plt.style.use('ggplot')
sns.set(style='whitegrid')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

# Constants
CONFIG = {
    "vector_store_dir": "../vector_store",
    "results_dir": "../results",
    "eval_data_path": "../data/evaluation_questions.json",
    "llm_model": "mistralai/Mistral-7B-Instruct-v0.2",
    "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    "retrieval_k": 5,
    "evaluation_metrics": ["rouge", "bert_score", "answer_relevance", "faithfulness"]
}

# Create directories if not exists
os.makedirs(CONFIG["results_dir"], exist_ok=True)

## 2. Load Evaluation Dataset

In [None]:
def load_evaluation_data(file_path):
    """Load evaluation questions with optional ground truth answers."""
    with open(file_path) as f:
        data = json.load(f)
    
    return pd.DataFrame(data["questions"])

# Sample structure for evaluation_questions.json:
# {
#   "questions": [
#     {
#       "question": "What are common credit card issues?",
#       "ground_truth": "Customers report...",
#       "expected_products": ["Credit card"],
#       "expected_issues": ["Late fees", "Fraud"]
#     }
#   ]
# }

eval_df = load_evaluation_data(CONFIG["eval_data_path"])
print(f"Loaded {len(eval_df)} evaluation questions")

IndentationError: unindent does not match any outer indentation level (<string>, line 8)

## 3. Initialize RAG Components

In [None]:
def initialize_rag_pipeline(config):
    """Initialize all RAG pipeline components."""
    # Embeddings
    embeddings = HuggingFaceEmbeddings(model_name=config["embedding_model"])
    
    # Vector Store
    vectorstore = FAISS.load_local(config["vector_store_dir"], embeddings)
    
    # LLM
    llm = HuggingFaceHub(
        repo_id=config["llm_model"],
        model_kwargs={"temperature": 0.7, "max_length": 1024}
    )
    
    # Prompt Template
    prompt_template = """
    You are a financial analyst assistant. Answer the question based only on:
    
    Context: {context}
    
    Question: {question}
    
    Guidelines:
    1. Be concise but comprehensive
    2. If unsure, say "I cannot determine from the context"
    3. Highlight patterns when apparent
    4. Never invent information
    
    Answer:"""
    
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )
    
    # QA Chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": config["retrieval_k"]}),
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True
    )
    
    return {"qa_chain": qa_chain, "vectorstore": vectorstore}

rag_components = initialize_rag_pipeline(CONFIG)

## 4. Evaluation Framework

In [None]:
class RAGEvaluator:
    """Comprehensive RAG system evaluator with multiple metrics."""
    
    def __init__(self, config):
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
        self.evaluators = {
            "relevance": load_evaluator("labeled_score_string"),
            "faithfulness": load_evaluator("labeled_score_string")
        }
    
    def calculate_retrieval_metrics(self, retrieved_docs, expected_products, expected_issues):
        """Calculate precision and recall for retrieved documents."""
        retrieved_products = [doc.metadata.get("product", "") for doc in retrieved_docs]
        retrieved_issues = [doc.metadata.get("issue", "") for doc in retrieved_docs]
        
        # Binary relevance for each doc
        product_relevance = [1 if p in expected_products else 0 for p in retrieved_products]
        issue_relevance = [1 if i in expected_issues else 0 for i in retrieved_issues]
        
        # Calculate metrics
        metrics = {
            "retrieval_product_precision": precision_score(product_relevance, [1]*len(product_relevance)),
            "retrieval_product_recall": recall_score(product_relevance, [1]*len(product_relevance)),
            "retrieval_issue_precision": precision_score(issue_relevance, [1]*len(issue_relevance)),
            "retrieval_issue_recall": recall_score(issue_relevance, [1]*len(issue_relevance)),
            "num_relevant_docs": sum(product_relevance)
        }
        
        return metrics
    
    def calculate_generation_metrics(self, generated_answer, ground_truth):
        """Calculate text generation quality metrics."""
        # ROUGE scores
        rouge_scores = self.scorer.score(ground_truth, generated_answer)
        
        # BERTScore
        _, _, bert_f1 = bert_score([generated_answer], [ground_truth], lang="en")
        
        return {
            "rouge1": rouge_scores["rouge1"].fmeasure,
            "rougeL": rouge_scores["rougeL"].fmeasure,
            "bert_score": bert_f1.mean().item()
        }
    
    def evaluate_response(self, question, generated_answer, retrieved_docs, ground_truth=None, expected_products=None, expected_issues=None):
        """Comprehensive evaluation of a RAG response."""
        metrics = {}
        
        # Retrieval metrics
        if expected_products and expected_issues:
            metrics.update(self.calculate_retrieval_metrics(retrieved_docs, expected_products, expected_issues))
        
        # Generation metrics
        if ground_truth:
            metrics.update(self.calculate_generation_metrics(generated_answer, ground_truth))
        
        # LLM-based evaluations
        if "answer_relevance" in CONFIG["evaluation_metrics"]:
            relevance_result = self.evaluators["relevance"].evaluate_strings(
                prediction=generated_answer,
                input=question,
                reference=ground_truth if ground_truth else ""
            )
            metrics["answer_relevance"] = relevance_result["score"]
        
        if "faithfulness" in CONFIG["evaluation_metrics"]:
            faithfulness_result = self.evaluators["faithfulness"].evaluate_strings(
                prediction=generated_answer,
                input=" ".join([doc.page_content for doc in retrieved_docs])
            )
            metrics["faithfulness"] = faithfulness_result["score"]
        
        return metrics

evaluator = RAGEvaluator(CONFIG)

## 5. Run Evaluation

In [None]:
def run_evaluation(qa_chain, eval_df, evaluator):
    """Run full evaluation pipeline."""
    results = []
    
    for _, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
        try:
            # Get RAG response
            result = qa_chain({"query": row["question"]})
            
            # Evaluate response
            metrics = evaluator.evaluate_response(
                question=row["question"],
                generated_answer=result["result"],
                retrieved_docs=result["source_documents"],
                ground_truth=row.get("ground_truth"),
                expected_products=row.get("expected_products", []),
                expected_issues=row.get("expected_issues", [])
            )
            
            # Record results
            record = {
                "question": row["question"],
                "generated_answer": result["result"],
                "ground_truth": row.get("ground_truth"),
                "retrieved_products": [doc.metadata.get("product") for doc in result["source_documents"]],
                "retrieved_issues": [doc.metadata.get("issue") for doc in result["source_documents"]],
                **metrics
            }
            results.append(record)
            
        except Exception as e:
            print(f"Error evaluating question '{row['question']}': {str(e)}")
            results.append({
                "question": row["question"],
                "error": str(e)
            })
    
    return pd.DataFrame(results)

results_df = run_evaluation(rag_components["qa_chain"], eval_df, evaluator)

## 6. Analyze Results

In [None]:
def analyze_results(results_df):
    """Generate comprehensive analysis of evaluation results."""
    analysis = {}
    
    # Basic stats
    analysis["num_questions"] = len(results_df)
    analysis["success_rate"] = 1 - (results_df["error"].notna().sum() / len(results_df))
    
    # Metric averages
    numeric_cols = results_df.select_dtypes(include=np.number).columns
    analysis["metrics"] = results_df[numeric_cols].mean().to_dict()
    
    # Retrieval analysis
    all_products = [p for sublist in results_df["retrieved_products"].dropna() for p in sublist]
    analysis["top_retrieved_products"] = pd.Series(all_products).value_counts().head(5).to_dict()
    
    all_issues = [i for sublist in results_df["retrieved_issues"].dropna() for i in sublist]
    analysis["top_retrieved_issues"] = pd.Series(all_issues).value_counts().head(5).to_dict()
    
    # Correlation analysis
    if "num_relevant_docs" in results_df and "bert_score" in results_df:
        analysis["relevance_correlation"] = results_df[["num_relevant_docs", "bert_score"]].corr().iloc[0,1]
    
    return analysis

analysis_results = analyze_results(results_df)
print("Evaluation Analysis:")
print(json.dumps(analysis_results, indent=2))

## 7. Visualization

In [None]:
def visualize_results(results_df, analysis):
    """Create interactive visualizations of evaluation results."""
    # Metric distribution
    metric_cols = [col for col in results_df.columns if col in ["rouge1", "rougeL", "bert_score", "answer_relevance", "faithfulness"]]
    
    if metric_cols:
        fig = px.box(results_df[metric_cols], 
                    title="Evaluation Metrics Distribution")
        fig.show()
    
    # Retrieval performance
    if "num_relevant_docs" in results_df:
        fig = px.histogram(results_df, x="num_relevant_docs", 
                         title="Number of Relevant Documents Retrieved")
        fig.show()
    
    # Top retrieved products/issues
    for entity_type in ["products", "issues"]:
        data = analysis.get(f"top_retrieved_{entity_type}", {})
        if data:
            df = pd.DataFrame(list(data.items()), columns=[entity_type[:-1], "count"])
            fig = px.bar(df, x=entity_type[:-1], y="count", 
                        title=f"Top Retrieved {entity_type.capitalize()}")
            fig.show()

visualize_results(results_df, analysis_results)

## 8. Generate Report

In [None]:
def generate_report(results_df, analysis, config):
    """Generate comprehensive evaluation report."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = f"{config['results_dir']}/rag_evaluation_report_{timestamp}.html"
    
    # Create report content
    report_content = f"""
    <html>
    <head>
        <title>RAG Evaluation Report</title>
        <style>
            body {{ font-family: Arial; margin: 20px; }}
            h1, h2 {{ color: #2e6c80; }}
            .metric {{ background-color: #f2f2f2; padding: 10px; margin: 5px; }}
            .good {{ color: green; }}
            .bad {{ color: red; }}
        </style>
    </head>
    <body>
        <h1>RAG System Evaluation Report</h1>
        <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        
        <h2>Summary Metrics</h2>
        <div class="metric">Questions Evaluated: {analysis['num_questions']}</div>
        <div class="metric">Success Rate: {analysis['success_rate']:.1%}</div>
        
        <h2>Performance Metrics</h2>
        """
    
    # Add metric cards
    for metric, value in analysis["metrics"].items():
        rating_class = "good" if value > 0.7 else "bad" if value < 0.5 else ""
        report_content += f"<div class='metric {rating_class}'>{metric.replace('_', ' ').title()}: {value:.3f}</div>"
    
    # Add detailed results
    report_content += """
        <h2>Detailed Results</h2>
        <table border="1">
            <tr>
                <th>Question</th>
                <th>BERT Score</th>
                <th>Relevant Docs</th>
                <th>Answer Preview</th>
            </tr>
    """
    
    for _, row in results_df.iterrows():
        preview = row["generated_answer"][:100] + "..." if pd.notna(row.get("generated_answer")) else "ERROR"
        bert_score = f"{row['bert_score']:.3f}" if 'bert_score' in row else "N/A"
        relevant_docs = row.get('num_relevant_docs', "N/A")
        
        report_content += f"""
            <tr>
                <td>{row['question']}</td>
                <td>{bert_score}</td>
                <td>{relevant_docs}</td>
                <td>{preview}</td>
            </tr>
        """
    
    report_content += """
        </table>
    </body>
    </html>
    """
    
    # Save report
    with open(report_path, "w") as f:
        f.write(report_content)
    
    return report_path

report_file = generate_report(results_df, analysis_results, CONFIG)
print(f"Report generated: {report_file}")

## 9. Save Results

In [None]:
def save_results(results_df, analysis, config):
    """Save all evaluation results for future reference."""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Save raw results
    results_file = f"{config['results_dir']}/rag_results_{timestamp}.csv"
    results_df.to_csv(results_file, index=False)
    
    # Save analysis
    analysis_file = f"{config['results_dir']}/rag_analysis_{timestamp}.json"
    with open(analysis_file, "w") as f:
        json.dump(analysis, f, indent=2)
    
    return results_file, analysis_file

results_file, analysis_file = save_results(results_df, analysis_results, CONFIG)
print(f"Results saved to {results_file}")
print(f"Analysis saved to {analysis_file}")

## 10. Recommendations

In [None]:
def generate_recommendations(analysis):
    """Generate actionable recommendations based on evaluation results."""
    recommendations = []
    
    # Retrieval recommendations
    if analysis["metrics"].get("retrieval_product_precision", 0) < 0.6:
        recommendations.append({
            "category": "Retrieval",
            "recommendation": "Improve retrieval precision by experimenting with different embedding models or adding query expansion",
            "priority": "high"
        })
    
    # Generation recommendations
    if analysis["metrics"].get("bert_score", 0) < 0.7:
        recommendations.append({
            "category": "Generation",
            "recommendation": "Refine prompt engineering or try different LLM models to improve answer quality",
            "priority": "high"
        })
    
    # General recommendations
    if analysis["success_rate"] < 0.9:
        recommendations.append({
            "category": "Robustness",
            "recommendation": "Add better error handling for failed questions",
            "priority": "medium"
        })
    
    return pd.DataFrame(recommendations)

recommendations_df = generate_recommendations(analysis_results)
display(recommendations_df)