# RAG Pipeline Optimization Example

This notebook demonstrates:
1. Setting up an unstructured document dataset
2. Creating an evaluation set with ground truth relevance judgments
3. Testing multiple RAG pipeline configurations using instance-based component variations
4. Evaluating retrieval relevance metrics (Precision@K, Recall@K, MRR)
5. Selecting the best pipeline configuration
6. Integrating the optimized pipeline with LangChain for Q&A

In [None]:
import os
os.environ['OPENAI_API_KEY'] = '<your_openai_api_key>'

In [2]:
from typing import List, Dict, Any
import json

from rag_search.rag_client import RAGClient
from rag_search.experiment.pipeline import generate_client_variations, build_client, artifact_from_client
from rag_search.experiment.artifact.rag_artifact import RAGArtifact

# Ingestion implementations
from rag_search.parameter_impls.ingestion_impls import (
    SimpleIngestion,
)

# Chunking implementations
from rag_search.parameter_impls.chunking_impls import (
    SlidingWindowChunking,
    LangChainRecursiveChunking,
)

# Embedding implementations
from rag_search.parameter_impls.embedding_impls import (
    HuggingFaceEmbedding,
)

# Storage implementations
from rag_search.parameter_impls.storage_impls import (
    SimpleStorage,
)

# Retriever implementations
from rag_search.parameter_impls.retriever_impls import (
    SimpleRetriever,
)

# Optional reranker
from rag_search.parameter_impls.reranking_impls import (
    CrossEncoderReranker,
)

# LangChain imports for Q&A (consume RAGArtifact as a retriever)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI


  from .autonotebook import tqdm as notebook_tqdm


## 1. Setup: Create Sample Document Dataset

We'll create a sample document corpus about various topics for testing.

In [None]:
# Define component variations to test
ingestion_variants = [
    SimpleIngestion(),
]

chunking_variants = [
    SlidingWindowChunking(chunk_size=128, chunk_overlap=20),
    SlidingWindowChunking(chunk_size=256, chunk_overlap=50),
    SlidingWindowChunking(chunk_size=512, chunk_overlap=100),
    LangChainRecursiveChunking(chunk_size=512, chunk_overlap=50),
]

embedding_variants = [
    HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2'),
]

storage_variants = [
    SimpleStorage(),
]

# Retriever variants (optionally with reranker)
retriever_variants = [
    SimpleRetriever(storage=storage_variants[0], embedding=embedding_variants[0], top_k=3, reranker=None),
    SimpleRetriever(storage=storage_variants[0], embedding=embedding_variants[0], top_k=5, reranker=None),
    SimpleRetriever(storage=storage_variants[0], embedding=embedding_variants[0], top_k=5, reranker=CrossEncoderReranker(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')),
]


In [None]:
# -----------------------------
# Define corpus documents
# -----------------------------

documents = [
    {
        "doc_id": "doc_1",
        "text": "Retrieval-Augmented Generation (RAG) combines information retrieval with text generation to improve factual accuracy."
    },
    {
        "doc_id": "doc_2",
        "text": "Vector databases store embeddings and enable efficient similarity search for retrieval systems."
    },
    {
        "doc_id": "doc_3",
        "text": "Chunking strategies such as sliding windows affect recall and precision in RAG pipelines."
    },
    {
        "doc_id": "doc_4",
        "text": "Embedding models map text into dense vector representations used for semantic search."
    },
]


# -----------------------------
# Define evaluation dataset
# -----------------------------

eval_dataset = [
    {
        "query": "What is RAG?",
        "relevant_doc_ids": ["doc_1"]
    },
    {
        "query": "What do vector databases do?",
        "relevant_doc_ids": ["doc_2"]
    },
    {
        "query": "How does chunking affect RAG systems?",
        "relevant_doc_ids": ["doc_3"]
    },
    {
        "query": "What are embeddings used for?",
        "relevant_doc_ids": ["doc_4"]
    },
]


# -----------------------------
# Metric helper functions
# -----------------------------

def calculate_precision_at_k(retrieved_ids, relevant_ids, k):
    if k == 0:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    if not retrieved_k:
        return 0.0
    hits = sum(1 for doc_id in retrieved_k if doc_id in relevant_ids)
    return hits / k


def calculate_recall_at_k(retrieved_ids, relevant_ids, k):
    if not relevant_ids:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    hits = sum(1 for doc_id in retrieved_k if doc_id in relevant_ids)
    return hits / len(relevant_ids)


def calculate_mrr(retrieved_ids, relevant_ids):
    for idx, doc_id in enumerate(retrieved_ids, start=1):
        if doc_id in relevant_ids:
            return 1.0 / idx
    return 0.0



## 2. Create Evaluation Set with Ground Truth

Define queries and their relevant documents for evaluation.

In [5]:
# Run experiments for each pipeline variation (output is a RAGArtifact per config)
experiment_results = []

configs = generate_client_variations(
    ingestion_variants=ingestion_variants,
    chunking_variants=chunking_variants,
    embedding_variants=embedding_variants,
    storage_variants=storage_variants,
    retriever_variants=retriever_variants,
)

print(f"Testing {len(configs)} pipeline configurations...\n")

for cfg in configs:
    storage = SimpleStorage()
    embedding = cfg.embedding
    retriever = SimpleRetriever(storage=storage, embedding=embedding, top_k=getattr(cfg.retriever, 'top_k', 5), reranker=getattr(cfg.retriever, 'reranker', None))
    client = RAGClient(
        ingestion=cfg.ingestion,
        chunking=cfg.chunking,
        embedding=embedding,
        storage=storage,
        retriever=retriever,
    )

    # Ingest corpus
    client.upload_documents(documents)

    # Evaluate retrieval over the evaluation dataset
    metrics_list = []
    for item in eval_dataset:
        q = item['query']
        relevant_ids = item['relevant_doc_ids']
        results = client.retrieve(q)
        retrieved_ids = [r.get('doc_id') for r in results]

        metrics_list.append({
            'precision@3': calculate_precision_at_k(retrieved_ids, relevant_ids, 3),
            'precision@5': calculate_precision_at_k(retrieved_ids, relevant_ids, 5),
            'recall@5': calculate_recall_at_k(retrieved_ids, relevant_ids, 5),
            'mrr': calculate_mrr(retrieved_ids, relevant_ids),
        })

    avg_metrics = {k: sum(m[k] for m in metrics_list)/len(metrics_list) for k in metrics_list[0].keys()}

    params = {
        'ingestion': type(cfg.ingestion).__name__,
        'chunking': type(cfg.chunking).__name__,
        'embedding': type(embedding).__name__,
        'storage': type(storage).__name__,
        'retriever': type(retriever).__name__,
        'top_k': getattr(retriever, 'top_k', None),
        'reranker': type(getattr(retriever, 'reranker', None)).__name__ if getattr(retriever, 'reranker', None) else None,
    }

    artifact = artifact_from_client(client=client, experiment_params=params, metrics=avg_metrics)
    experiment_results.append({
        'config_name': cfg.name,
        'artifact': artifact,
        'metrics': avg_metrics,
    })

    print(cfg.name)
    for metric, value in avg_metrics.items():
        print(f"  {metric}: {value:.4f}")
    print()

# Find best configuration based on MRR
best_result = max(experiment_results, key=lambda x: x['metrics']['mrr'])
best_artifact: RAGArtifact = best_result['artifact']
print("\n" + "="*120)
print(f"Best Configuration: {best_result['config_name']}")
print(f"Best MRR Score: {best_result['metrics']['mrr']:.4f}")
print("="*120)


Testing 12 pipeline configurations...

Config 1: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
  precision@3: 0.4167
  precision@5: 0.2500
  recall@5: 1.2500
  mrr: 0.8750

Config 2: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
  precision@3: 0.4167
  precision@5: 0.2500
  recall@5: 1.2500
  mrr: 0.8750

Config 3: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
  precision@3: 0.4167
  precision@5: 0.2500
  recall@5: 1.2500
  mrr: 1.0000

Config 4: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
  precision@3: 0.3333
  precision@5: 0.2000
  recall@5: 1.0000
  mrr: 0.8750

Config 5: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
  precision@3: 0.3333
  precision@5: 0.2000
  recall@5: 1.0000
  mrr: 0.8750

Config 6: SimpleIngestion | SlidingW

## 3. Define Evaluation Metrics

Implement retrieval evaluation metrics.

In [6]:
# The metrics helper functions are already defined in the cell above (99185da0)
# Here's a summary of what they do:

# calculate_precision_at_k: Measures what fraction of retrieved docs are relevant
# calculate_recall_at_k: Measures what fraction of relevant docs were retrieved  
# calculate_mrr: Mean Reciprocal Rank - measures how high the first relevant doc ranks

# Example usage:
retrieved = ["doc_1", "doc_3", "doc_2"]
relevant = ["doc_1"]

print(f"Precision@3: {calculate_precision_at_k(retrieved, relevant, 3):.4f}")
print(f"Recall@3: {calculate_recall_at_k(retrieved, relevant, 3):.4f}")
print(f"MRR: {calculate_mrr(retrieved, relevant):.4f}")

Precision@3: 0.3333
Recall@3: 1.0000
MRR: 1.0000


## 4. Define Pipeline Component Variations

Create sets of component instances to test different configurations.

In [7]:
# The component variations are already defined in cell-4 above:
# - ingestion_variants: Different document ingestion strategies
# - chunking_variants: Different chunking strategies (sliding window, recursive)
# - embedding_variants: Different embedding models
# - storage_variants: Different storage backends
# - retriever_variants: Different retrieval configurations (top_k, reranker)

# Print summary of variations
print(f"Ingestion variants: {len(ingestion_variants)}")
print(f"Chunking variants: {len(chunking_variants)}")
print(f"Embedding variants: {len(embedding_variants)}")
print(f"Storage variants: {len(storage_variants)}")
print(f"Retriever variants: {len(retriever_variants)}")
print(f"\nTotal possible configurations: {len(ingestion_variants) * len(chunking_variants) * len(retriever_variants)}")

Ingestion variants: 1
Chunking variants: 4
Embedding variants: 1
Storage variants: 1
Retriever variants: 3

Total possible configurations: 12


## 5. Run Optimization Experiments

Test each pipeline configuration and collect metrics.

In [8]:
# The optimization experiments are run in cell-6 above.
# Here we can inspect the experiment results in more detail.

print(f"Total experiments run: {len(experiment_results)}\n")

# Show all results sorted by MRR
sorted_results = sorted(experiment_results, key=lambda x: x['metrics']['mrr'], reverse=True)

print("Rankings by MRR:\n")
for i, result in enumerate(sorted_results, 1):
    print(f"{i}. {result['config_name']}")
    print(f"   MRR: {result['metrics']['mrr']:.4f}, Precision@5: {result['metrics']['precision@5']:.4f}, Recall@5: {result['metrics']['recall@5']:.4f}")
    print()

Total experiments run: 12

Rankings by MRR:

1. Config 3: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
   MRR: 1.0000, Precision@5: 0.2500, Recall@5: 1.2500

2. Config 6: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
   MRR: 1.0000, Precision@5: 0.2000, Recall@5: 1.0000

3. Config 9: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
   MRR: 1.0000, Precision@5: 0.2000, Recall@5: 1.0000

4. Config 12: SimpleIngestion | LangChainRecursiveChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
   MRR: 1.0000, Precision@5: 0.2000, Recall@5: 1.0000

5. Config 1: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
   MRR: 0.8750, Precision@5: 0.2500, Recall@5: 1.2500

6. Config 2: SimpleIngestion | SlidingWindowChunking | HuggingFaceEmbedding | SimpleStorage | SimpleRetriever
   MRR: 0.8750

## 6. Use Best Pipeline for Q&A

Create a simple retrieval function using the best pipeline.

In [10]:
# Test retrieval using the best artifact directly (RAGArtifact is a BaseRetriever)
test_query = "What is RAG?"
print(f"Query: {test_query}\n")

docs = best_artifact.invoke(test_query)
print("Retrieved Context:")
for i, doc in enumerate(docs, 1):
    print(f"{i}. {doc.page_content}")

Query: What is RAG?

Retrieved Context:
1. Retrieval-Augmented Generation (RAG) combines information retrieval with text generation to improve factual accuracy.
2. Chunking strategies such as sliding windows affect recall and precision in RAG pipelines.
3. accuracy.


In [11]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

prompt = ChatPromptTemplate.from_template(
    """You are a helpful assistant. Use the provided context to answer the question.

Context:
{context}

Question: {input}
"""
)

doc_chain = create_stuff_documents_chain(llm, prompt)
qa_chain = create_retrieval_chain(best_artifact, doc_chain)

sample_questions = [
    eval_dataset[0]['query'],
    eval_dataset[1]['query'],
]

for q in sample_questions:
    out = qa_chain.invoke({"input": q})
    print("Q:", q)
    print("A:", out["answer"])
    print("-" * 80)

Q: What is RAG?
A: Retrieval-Augmented Generation (RAG) is a method that combines information retrieval with text generation to enhance the factual accuracy of generated content. It leverages external information sources to provide more reliable and contextually relevant responses.
--------------------------------------------------------------------------------
Q: What do vector databases do?
A: Vector databases store embeddings and enable efficient similarity search for retrieval systems.
--------------------------------------------------------------------------------


## 7. Save Best Configuration

Save the best pipeline configuration for later use.

In [12]:
# Save best configuration info using pickle
import pickle

best_info = {
    'config_name': best_result['config_name'],
    'metrics': best_result['metrics'],
}

with open('best_rag_config.json', 'w') as f:
    json.dump(best_info, f, indent=2)
print("Saved best configuration info to best_rag_config.json")

with open('best_artifact.pkl', 'wb') as f:
    pickle.dump(best_artifact, f)
print("Saved best artifact to best_artifact.pkl")

Saved best configuration info to best_rag_config.json
Saved best artifact to best_artifact.pkl


## 8. Example: Creating a Custom Pipeline

Demonstrate how to create a pipeline with specific component instances.

In [13]:
# Example: Creating a custom pipeline with specific component instances
from rag_search.rag_client import RAGClient

# Create custom components
custom_chunking = SlidingWindowChunking(chunk_size=256, chunk_overlap=30)
custom_embedding = HuggingFaceEmbedding(model_name='sentence-transformers/all-MiniLM-L6-v2')
custom_storage = SimpleStorage()
custom_retriever = SimpleRetriever(
    storage=custom_storage,
    embedding=custom_embedding,
    top_k=5,
    reranker=CrossEncoderReranker(model_name='cross-encoder/ms-marco-MiniLM-L-6-v2')
)

# Build custom client
custom_client = RAGClient(
    ingestion=SimpleIngestion(),
    chunking=custom_chunking,
    embedding=custom_embedding,
    storage=custom_storage,
    retriever=custom_retriever,
)

# Ingest documents
custom_client.upload_documents(documents)

# Test retrieval
query = "What is RAG?"
results = custom_client.retrieve(query)
print(f"Query: {query}\n")
print("Results:")
for i, result in enumerate(results, 1):
    print(f"  {i}. {result}")

Query: What is RAG?

Results:
  1. {'text': 'Retrieval-Augmented Generation (RAG) combines information retrieval with text generation to improve factual accuracy.', 'score': 6.015378475189209, 'chunk_id': 0, 'doc_id': 'doc_1', 'original_score': 0.3975445191976605, 'rerank_score': 6.015378475189209}
  2. {'text': 'Chunking strategies such as sliding windows affect recall and precision in RAG pipelines.', 'score': -4.435868263244629, 'chunk_id': 0, 'doc_id': 'doc_3', 'original_score': 0.3627310530889852, 'rerank_score': -4.435868263244629}
  3. {'text': 'Vector databases store embeddings and enable efficient similarity search for retrieval systems.', 'score': -11.362852096557617, 'chunk_id': 0, 'doc_id': 'doc_2', 'original_score': 0.03317414874458483, 'rerank_score': -11.362852096557617}
  4. {'text': 'Embedding models map text into dense vector representations used for semantic search.', 'score': -11.3738431930542, 'chunk_id': 0, 'doc_id': 'doc_4', 'original_score': 0.034143145429370976