In [None]:
import os
from pathlib import Path
from typing import List, Dict, Any
import json
from datetime import datetime

from dotenv import load_dotenv
from llama_index.core import (
    VectorStoreIndex, 
    SimpleDirectoryReader, 
    Settings,
    StorageContext,
    load_index_from_storage
)
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

load_dotenv()

# Setup paths
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_RAW_DIR = PROJECT_ROOT / "data" / "raw"
RESULTS_DIR = PROJECT_ROOT / "results"
RESULTS_DIR.mkdir(exist_ok=True)

In [2]:
# Golden Evaluation Dataset for RAG/Eval Papers
# These questions are designed to expose common RAG failure modes

EVAL_QUESTIONS = [
    # ========== EASY QUESTIONS (Should work with basic RAG) ==========
    {
        "id": "easy_01",
        "question": "What does RAG stand for?",
        "expected_answer": "Retrieval-Augmented Generation",
        "difficulty": "easy",
        "failure_mode": None,
        "source_papers": ["original_rag_2020.pdf"],
        "notes": "Basic fact, should work"
    },
    {
        "id": "easy_02",
        "question": "What is the main purpose of RAG systems?",
        "expected_answer": "To augment LLM responses with retrieved external knowledge to improve accuracy and reduce hallucinations",
        "difficulty": "easy",
        "failure_mode": None,
        "source_papers": ["original_rag_2020.pdf"],
        "notes": "Core concept"
    },
    {
        "id": "easy_03",
        "question": "What are the main components of a RAG system?",
        "expected_answer": "A retriever (to find relevant documents) and a generator (LLM to produce answers using retrieved context)",
        "difficulty": "easy",
        "failure_mode": None,
        "source_papers": ["original_rag_2020.pdf"],
        "notes": "Fundamental architecture"
    },
    
    # ========== MEDIUM - SEMANTIC CONFUSION ==========
    {
        "id": "semantic_01",
        "question": "How does Self-RAG differ from standard RAG?",
        "expected_answer": "Self-RAG adds reflection tokens that let the model decide when to retrieve, what to retrieve, and whether retrieved content is useful, making it more autonomous than standard RAG",
        "difficulty": "medium",
        "failure_mode": "semantic_confusion",
        "source_papers": ["self_rag_2023.pdf", "original_rag_2020.pdf"],
        "notes": "Requires distinguishing two similar concepts"
    },
    {
        "id": "semantic_02",
        "question": "What is the difference between RAGAS and traditional RAG evaluation?",
        "expected_answer": "RAGAS is an automated evaluation framework that measures faithfulness, answer relevance, and context precision/recall, whereas traditional evaluation often relies on manual human assessment",
        "difficulty": "medium",
        "failure_mode": "semantic_confusion",
        "source_papers": ["ragas_paper.pdf"],
        "notes": "Similar terminology, different meanings"
    },
    {
        "id": "semantic_03",
        "question": "What is HyDE and how does it work?",
        "expected_answer": "Hypothetical Document Embeddings (HyDE) generates a hypothetical answer to the query first, then uses that answer for retrieval instead of the query itself, improving semantic matching",
        "difficulty": "medium",
        "failure_mode": "semantic_confusion",
        "source_papers": ["hyde.pdf"],
        "notes": "Technical concept that might get confused with other retrieval methods"
    },
    
    # ========== MEDIUM - CONTRADICTORY/TEMPORAL ==========
    {
        "id": "temporal_01",
        "question": "What are the most recent advances in RAG architectures?",
        "expected_answer": "Recent advances include Self-RAG (2023), Corrective RAG/CRAG (2024), Graph RAG (2024), and agentic RAG systems with iterative retrieval",
        "difficulty": "medium",
        "failure_mode": "temporal_confusion",
        "source_papers": ["graph_rag_2024.pdf", "crag_2024.pdf", "self_rag_2023.pdf"],
        "notes": "Needs to retrieve from recent papers, not old ones"
    },
    {
        "id": "temporal_02",
        "question": "How has RAG evaluation evolved over time?",
        "expected_answer": "Early RAG evaluation relied on QA accuracy metrics, while modern approaches like RAGAS and ARES focus on component-level metrics (faithfulness, relevance) and automated LLM-as-judge techniques",
        "difficulty": "medium",
        "failure_mode": "temporal_confusion",
        "source_papers": ["ragas_paper.pdf", "ares_paper.pdf"],
        "notes": "Requires understanding evolution"
    },
    
    # ========== HARD - MULTI-HOP REASONING ==========
    {
        "id": "multihop_01",
        "question": "If I want to build a RAG system that can self-correct its retrieval, which architecture should I use and how would I evaluate it?",
        "expected_answer": "Use Corrective RAG (CRAG) which evaluates retrieval quality and decides whether to refine queries or use web search. Evaluate using RAGAS metrics for faithfulness and answer correctness, plus CRAG-specific metrics for retrieval correction rate",
        "difficulty": "hard",
        "failure_mode": "missing_context",
        "source_papers": ["crag_2024.pdf", "ragas_paper.pdf"],
        "notes": "Needs info from 2 different papers"
    },
    {
        "id": "multihop_02",
        "question": "What are the trade-offs between Dense Passage Retrieval and HyDE for semantic search?",
        "expected_answer": "DPR requires training on query-document pairs and works well with large labeled datasets, while HyDE is zero-shot and generates hypothetical documents but depends on LLM quality. DPR is more precise with training data; HyDE is more flexible without it",
        "difficulty": "hard",
        "failure_mode": "missing_context",
        "source_papers": ["dense_passage_retrieval.pdf", "hyde.pdf"],
        "notes": "Compare/contrast two papers"
    },
    {
        "id": "multihop_03",
        "question": "How would you combine Graph RAG with LLM-as-judge evaluation?",
        "expected_answer": "Graph RAG builds knowledge graphs for better relationship understanding. Evaluate it using LLM-as-judge to assess: (1) whether retrieved graph segments are relevant, (2) if relationships are accurately represented, (3) answer faithfulness to graph structure",
        "difficulty": "hard",
        "failure_mode": "missing_context",
        "source_papers": ["graph_rag_2024.pdf", "llm_as_judge.pdf"],
        "notes": "Synthesize concepts from multiple papers"
    },
    
    # ========== HARD - SPARSE/TECHNICAL DETAILS ==========
    {
        "id": "sparse_01",
        "question": "What is the mathematical formulation for computing context precision in RAGAS?",
        "expected_answer": "Context Precision = (Sum of precision@k for all relevant chunks) / (Total number of relevant chunks in ground truth), where precision@k measures relevant chunks in top-k retrievals",
        "difficulty": "hard",
        "failure_mode": "sparse_info",
        "source_papers": ["ragas_paper.pdf"],
        "notes": "Very specific technical detail, likely buried in paper"
    },
    {
        "id": "sparse_02",
        "question": "What hyperparameters does the original RAG paper use for DPR training?",
        "expected_answer": "The paper uses learning rate 1e-5, batch size 128, max sequence length 256 for passages, and trains for 40 epochs on Natural Questions dataset",
        "difficulty": "hard",
        "failure_mode": "sparse_info",
        "source_papers": ["original_rag_2020.pdf"],
        "notes": "Specific numbers often in appendix or tables"
    },
    
    # ========== HARD - EDGE CASES ==========
    {
        "id": "edge_01",
        "question": "What are the failure modes of RAG systems mentioned across different papers?",
        "expected_answer": "Common failures include: retrieval of irrelevant chunks, hallucination when context is insufficient, 'lost in the middle' problem with long contexts, contradictory information from multiple sources, and poor performance on multi-hop reasoning",
        "difficulty": "hard",
        "failure_mode": "synthesis",
        "source_papers": ["multiple papers"],
        "notes": "Requires synthesizing info from many papers"
    },
    {
        "id": "edge_02",
        "question": "Which RAG papers discuss handling contradictory information in retrieved documents?",
        "expected_answer": "Corrective RAG (CRAG) and Self-RAG both address contradictory information - CRAG filters low-quality retrievals and Self-RAG uses reflection to assess usefulness of retrieved content",
        "difficulty": "hard",
        "failure_mode": "edge_case",
        "source_papers": ["crag_2024.pdf", "self_rag_2023.pdf"],
        "notes": "Uncommon topic, might be mentioned briefly"
    },
    
    # ========== COMPARISON QUESTIONS ==========
    {
        "id": "compare_01",
        "question": "Compare the evaluation approaches in RAGAS vs ARES papers",
        "expected_answer": "RAGAS uses aspect-based evaluation (faithfulness, relevance, context metrics) with synthetic data generation. ARES focuses on automated evaluation with confidence estimation and handles few-shot learning. Both use LLMs for automated assessment but ARES emphasizes statistical confidence",
        "difficulty": "hard",
        "failure_mode": "comparison",
        "source_papers": ["ragas_paper.pdf", "ares_paper.pdf"],
        "notes": "Direct comparison of two frameworks"
    },
    {
        "id": "compare_02",
        "question": "What are the pros and cons of Self-RAG vs Corrective RAG?",
        "expected_answer": "Self-RAG: Pros - adaptive retrieval, learns when to retrieve. Cons - requires training reflection tokens. CRAG: Pros - can use external sources for correction, post-hoc evaluation. Cons - adds latency with correction step. Self-RAG is more integrated; CRAG is more modular",
        "difficulty": "hard",
        "failure_mode": "comparison",
        "source_papers": ["self_rag_2023.pdf", "crag_2024.pdf"],
        "notes": "Architectural trade-offs"
    },
    
    # ========== QUESTIONS THAT SHOULD FAIL BASELINE ==========
    {
        "id": "fail_01",
        "question": "What does the RAGAS paper say about context relevance?",
        "expected_answer": "RAGAS defines context relevance as whether retrieved chunks contain information needed to answer the query. It measures this by checking if each sentence in the context can be attributed to answering the question",
        "difficulty": "medium",
        "failure_mode": "should_fail_baseline",
        "source_papers": ["ragas_paper.pdf"],
        "notes": "Specific term that might retrieve wrong papers mentioning 'context' or 'relevance'"
    },
    {
        "id": "fail_02",
        "question": "How do you prevent RAG systems from hallucinating?",
        "expected_answer": "Multiple approaches: (1) Use faithfulness metrics in evaluation (RAGAS), (2) Add reflection/self-critique (Self-RAG), (3) Implement retrieval quality checks (CRAG), (4) Use constrained generation that stays within retrieved context",
        "difficulty": "hard",
        "failure_mode": "should_fail_baseline",
        "source_papers": ["multiple papers"],
        "notes": "Broad question requiring synthesis - baseline will likely give generic answer"
    },
    {
        "id": "fail_03",
        "question": "What specific metrics does ARES use for RAG evaluation?",
        "expected_answer": "ARES uses three main metrics: Context Relevance (is retrieved context relevant?), Answer Faithfulness (does answer use only retrieved info?), and Answer Relevance (does answer address the question?). It also provides confidence scores for each metric",
        "difficulty": "medium",
        "failure_mode": "should_fail_baseline",
        "source_papers": ["ares_paper.pdf"],
        "notes": "Specific to ARES - might confuse with RAGAS metrics"
    }
]

In [4]:
# ========== METADATA FOR ANALYSIS ==========

# Count by difficulty
difficulty_distribution = {
    "easy": len([q for q in EVAL_QUESTIONS if q["difficulty"] == "easy"]),
    "medium": len([q for q in EVAL_QUESTIONS if q["difficulty"] == "medium"]),
    "hard": len([q for q in EVAL_QUESTIONS if q["difficulty"] == "hard"])
}

# Count by failure mode
failure_mode_distribution = {
    "semantic_confusion": len([q for q in EVAL_QUESTIONS if q.get("failure_mode") == "semantic_confusion"]),
    "temporal_confusion": len([q for q in EVAL_QUESTIONS if q.get("failure_mode") == "temporal_confusion"]),
    "missing_context": len([q for q in EVAL_QUESTIONS if q.get("failure_mode") == "missing_context"]),
    "sparse_info": len([q for q in EVAL_QUESTIONS if q.get("failure_mode") == "sparse_info"]),
    "edge_case": len([q for q in EVAL_QUESTIONS if q.get("failure_mode") == "edge_case"]),
    "comparison": len([q for q in EVAL_QUESTIONS if q.get("failure_mode") == "comparison"]),
    "should_fail_baseline": len([q for q in EVAL_QUESTIONS if q.get("failure_mode") == "should_fail_baseline"])
}

print("Question Distribution:")
print(f"Total Questions: {len(EVAL_QUESTIONS)}")
print(f"\nBy Difficulty:")
for diff, count in difficulty_distribution.items():
    print(f"  {diff}: {count}")
print(f"\nBy Failure Mode:")
for mode, count in failure_mode_distribution.items():
    print(f"  {mode}: {count}")

Question Distribution:
Total Questions: 20

By Difficulty:
  easy: 3
  medium: 7
  hard: 10

By Failure Mode:
  semantic_confusion: 3
  temporal_confusion: 2
  missing_context: 3
  sparse_info: 2
  edge_case: 1
  comparison: 2
  should_fail_baseline: 3


In [5]:
# ============================================================================
# BASELINE RAG (Deliberately Flawed)
# ============================================================================

class BaselineRAG:
    """
    Basic RAG with known issues:
    - Small chunk size (causes context loss)
    - Low top_k (misses relevant chunks)
    - No query rewriting
    - No reranking
    - No metadata filtering
    """
    
    def __init__(self, data_dir: Path):
        # Use small, problematic settings
        Settings.llm = Ollama(
            model="llama3.2", 
            base_url="http://127.0.0.1:11434",
            request_timeout=300.0
        )
        Settings.embed_model = HuggingFaceEmbedding(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        Settings.chunk_size = 512  # DELIBERATELY SMALL - causes issues
        Settings.chunk_overlap = 50  # DELIBERATELY SMALL
        
        # Load documents
        loader = SimpleDirectoryReader(
            input_dir=str(data_dir),
            required_exts=[".pdf"],
            recursive=False
        )
        self.documents = loader.load_data()
        
        # Build index
        self.index = VectorStoreIndex.from_documents(self.documents)
        
    def query(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        """Basic query - no enhancements"""
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,  # DELIBERATELY LOW
            response_mode="compact"
        )
        
        response = query_engine.query(question)
        
        return {
            "answer": str(response),
            "source_nodes": [
                {
                    "text": node.text[:300],
                    "score": node.score,
                    "metadata": node.metadata
                }
                for node in response.source_nodes
            ]
        }

In [6]:
# ============================================================================
# IMPROVED RAG V1: Query Rewriting
# ============================================================================

class ImprovedRAG_V1(BaselineRAG):
    """
    Enhancement 1: Query Rewriting
    - Rephrases user query to match document language
    - Uses LLM to generate better search query
    """
    
    def query(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        # Rewrite query before retrieval
        rewrite_prompt = f"""
        Rewrite this question as a search query that would match formal documentation.
        Use technical/formal language.
        
        Original question: {question}
        
        Rewritten query (be concise, 1 sentence):
        """
        
        rewritten = Settings.llm.complete(rewrite_prompt)
        rewritten_query = str(rewritten).strip()
        
        # Use rewritten query for retrieval
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="compact"
        )
        
        response = query_engine.query(rewritten_query)
        
        return {
            "answer": str(response),
            "rewritten_query": rewritten_query,
            "source_nodes": [
                {
                    "text": node.text[:300],
                    "score": node.score,
                    "metadata": node.metadata
                }
                for node in response.source_nodes
            ]
        }


In [9]:
# ============================================================================
# IMPROVED RAG V2: Increased Retrieval + Better Chunking
# ============================================================================

class ImprovedRAG_V2:
    """
    Enhancement 2: Better Retrieval Settings
    - Larger chunks (more context)
    - More overlap (continuity)
    - Higher top_k (broader search)
    """
    
    def __init__(self, data_dir: Path):
        Settings.llm = Ollama(
            model="llama3.2", 
            base_url="http://127.0.0.1:11434",
            request_timeout=300.0
        )
        Settings.embed_model = HuggingFaceEmbedding(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        Settings.chunk_size = 1024  # LARGER
        Settings.chunk_overlap = 200  # MORE OVERLAP
        
        loader = SimpleDirectoryReader(
            input_dir=str(data_dir),
            required_exts=[".pdf"],
            recursive=False
        )
        self.documents = loader.load_data()
        self.index = VectorStoreIndex.from_documents(self.documents)
        
    def query(self, question: str, top_k: int = 10) -> Dict[str, Any]:
        """Query with higher top_k"""
        # Rewrite query
        rewrite_prompt = f"""
        Rewrite this question as a search query for formal documentation.
        Original: {question}
        Rewritten (concise):
        """
        rewritten = Settings.llm.complete(rewrite_prompt)
        rewritten_query = str(rewritten).strip()
        
        # Retrieve MORE chunks
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,  # INCREASED from 3 to 10
            response_mode="compact"
        )
        
        response = query_engine.query(rewritten_query)
        
        return {
            "answer": str(response),
            "rewritten_query": rewritten_query,
            "source_nodes": [
                {
                    "text": node.text[:300],
                    "score": node.score,
                    "metadata": node.metadata
                }
                for node in response.source_nodes
            ]
        }

In [10]:
# ============================================================================
# IMPROVED RAG V3: Better Prompting
# ============================================================================

class ImprovedRAG_V3(ImprovedRAG_V2):
    """
    Enhancement 3: Structured Prompting
    - Clear instructions
    - Format requirements
    - Citation requirements
    """
    
    def query(self, question: str, top_k: int = 10) -> Dict[str, Any]:
        from llama_index.core import PromptTemplate
        
        # Define structured prompt
        qa_prompt_str = """
Context information is below:
---------------------
{context_str}
---------------------

Instructions:
- Answer the question using ONLY the context above
- If the context doesn't contain the answer, say "I don't have enough information"
- Be specific and cite which document/section you're using
- Format your answer clearly

Question: {query_str}

Answer:
"""
        qa_prompt = PromptTemplate(qa_prompt_str)
        
        # Rewrite query
        rewrite_prompt = f"""
        Rewrite this question as a search query for formal documentation.
        Original: {question}
        Rewritten (concise):
        """
        rewritten = Settings.llm.complete(rewrite_prompt)
        rewritten_query = str(rewritten).strip()
        
        # Query with custom prompt
        query_engine = self.index.as_query_engine(
            similarity_top_k=top_k,
            response_mode="compact",
            text_qa_template=qa_prompt
        )
        
        response = query_engine.query(rewritten_query)
        
        return {
            "answer": str(response),
            "rewritten_query": rewritten_query,
            "source_nodes": [
                {
                    "text": node.text[:300],
                    "score": node.score,
                    "metadata": node.metadata
                }
                for node in response.source_nodes
            ]
        }

In [11]:
# ============================================================================
# EVALUATION FRAMEWORK
# ============================================================================

class RAGEvaluator:
    """
    Evaluates RAG systems on your golden dataset
    Tracks improvements across versions
    """
    
    def __init__(self):
        self.results = []
        
    def evaluate_system(
        self, 
        system, 
        system_name: str,
        questions: List[Dict]
    ) -> Dict[str, Any]:
        """Run all questions through a RAG system"""
        
        print(f"\n{'='*60}")
        print(f"Evaluating: {system_name}")
        print(f"{'='*60}\n")
        
        system_results = {
            "system_name": system_name,
            "timestamp": datetime.now().isoformat(),
            "questions": []
        }
        
        for q in questions:
            print(f"Q: {q['question']}")
            
            try:
                result = system.query(q["question"])
                
                question_result = {
                    "question_id": q["id"],
                    "question": q["question"],
                    "expected": q["expected_answer"],
                    "actual": result["answer"],
                    "difficulty": q["difficulty"],
                    "failure_mode": q.get("failure_mode"),
                    "sources_retrieved": len(result["source_nodes"]),
                    "top_source_score": result["source_nodes"][0]["score"] if result["source_nodes"] else 0,
                    "rewritten_query": result.get("rewritten_query", None)
                }
                
                # Manual assessment (you'll review these)
                print(f"A: {result['answer'][:200]}...")
                print(f"Sources: {len(result['source_nodes'])} chunks retrieved")
                print()
                
                system_results["questions"].append(question_result)
                
            except Exception as e:
                print(f"ERROR: {e}\n")
                question_result = {
                    "question_id": q["id"],
                    "error": str(e)
                }
                system_results["questions"].append(question_result)
        
        # Save results
        results_file = RESULTS_DIR / f"{system_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(results_file, 'w') as f:
            json.dump(system_results, f, indent=2)
        
        print(f"\nResults saved to: {results_file}")
        
        return system_results
    
    def compare_systems(self, results_files: List[Path]):
        """Compare multiple system results side-by-side"""
        # TODO: Implement comparison logic
        # This will generate your "before/after" tables for blog posts
        pass

In [None]:
# ============================================================================
# USAGE EXAMPLE
# ============================================================================

def run_experiments():
    """
    Run all experiments and collect results
    """
    
    evaluator = RAGEvaluator()
    
    # Experiment 1: Baseline (should show problems)
    print("\n" + "="*60)
    print("EXPERIMENT 1: Baseline RAG")
    print("="*60)
    baseline = BaselineRAG(DATA_RAW_DIR)
    baseline_results = evaluator.evaluate_system(
        baseline, 
        "baseline",
        EVAL_QUESTIONS[:5]  # Start with 5 questions
    )
    
    # Experiment 2: Query Rewriting
    print("\n" + "="*60)
    print("EXPERIMENT 2: With Query Rewriting")
    print("="*60)
    improved_v1 = ImprovedRAG_V1(DATA_RAW_DIR)
    v1_results = evaluator.evaluate_system(
        improved_v1,
        "query_rewriting",
        EVAL_QUESTIONS[:5]
    )
    
    # Experiment 3: Better Retrieval Settings
    print("\n" + "="*60)
    print("EXPERIMENT 3: Better Chunking + Higher top_k")
    print("="*60)
    improved_v2 = ImprovedRAG_V2(DATA_RAW_DIR)
    v2_results = evaluator.evaluate_system(
        improved_v2,
        "improved_retrieval",
        EVAL_QUESTIONS[:5]
    )
    
    # Experiment 4: Better Prompting
    print("\n" + "="*60)
    print("EXPERIMENT 4: Structured Prompting")
    print("="*60)
    improved_v3 = ImprovedRAG_V3(DATA_RAW_DIR)
    v3_results = evaluator.evaluate_system(
        improved_v3,
        "structured_prompting",
        EVAL_QUESTIONS[:5]
    )
    
    print("\n" + "="*60)
    print("All experiments complete!")
    print(f"Results saved in: {RESULTS_DIR}")
    print("="*60)

if __name__ == "__main__":
    # Run this to generate all your experimental data
    run_experiments()