<a target="_parent" href="https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/docs/notebooks/demo/navigator/rag-evals/data_designer_sdk_rag_evals.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Data Designer SDK: Generate Diverse RAG Evaluations

Generate comprehensive evaluation datasets for your RAG systems, customized to your content and use cases. This blueprint helps you create diverse question-answer pairs at scale, testing both answerable and unanswerable scenarios across different difficulty levels and reasoning types. Building such an evaluation set is a critical step in deploying a production-grade RAG system.

> **Note:** The [Data Designer](https://docs.gretel.ai/create-synthetic-data/gretel-data-designer-beta) functionality demonstrated in this notebook is currently in **Early Preview**. To access these features and run this notebook, please [join the waitlist](https://gretel.ai/navigator/data-designer#waitlist).

# 📘 Getting Started

First, let's install and import the required packages:

In [None]:
# Install required libraries

%%capture
!pip install -qq langchain unstructured[pdf] smart_open git+https://github.com/gretelai/gretel-python-client

In [None]:
# Configuration
# -------------
# DOCUMENT_LIST: URIs of the source documents to build evaluations from.
#                Replace the entries with your own documents.
# NUM_EVALS:     total number of evaluation pairs to generate.

DOCUMENT_LIST = [
    "https://gretel-public-website.s3.us-west-2.amazonaws.com/datasets/rag_evals/databricks-state-of-data-ai-report.pdf",
]
NUM_EVALS = 100

In [None]:
# Document Processing
# ------------------
# The DocumentProcessor class handles loading and chunking source documents for RAG evaluation.
# We use unstructured.io for robust document parsing and langchain's RecursiveCharacterTextSplitter for chunking.

from typing import List, Union
from langchain.text_splitter import RecursiveCharacterTextSplitter
from unstructured.partition.auto import partition
from smart_open import open
import tempfile
import os

class DocumentProcessor:
    """Load documents from URIs and split their text into chunks.

    Parsing is delegated to ``partition`` and chunking to a
    ``RecursiveCharacterTextSplitter`` built in ``__init__``.
    """

    def __init__(self, chunk_size: int = 4192, chunk_overlap: int = 200):
        """Build the text splitter used by ``process_documents``.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``. Lengths
                are measured with ``len``, i.e. in characters.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def parse_document(self, uri: str) -> str:
        """Parse a single document from URI into raw text.

        The document's bytes are copied into a local temporary file whose path
        is handed to ``partition``. The temporary file is always removed
        afterwards, including when writing or parsing raises.

        Args:
            uri: Location of the document, opened in binary mode.

        Returns:
            The string form of every parsed element, joined by blank lines.
        """
        with open(uri, 'rb') as source:
            content = source.read()

        # delete=False because the file is reopened by name in partition();
        # cleanup is therefore done explicitly in the finally clause.
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            # Close the handle before parsing so the content is fully written
            # and the path can be reopened regardless of platform.
            with temp_file:
                temp_file.write(content)
            elements = partition(temp_file.name)
        finally:
            os.unlink(temp_file.name)

        return "\n\n".join(str(element) for element in elements)

    def process_documents(self, uris: Union[str, List[str]]) -> List[str]:
        """Process one or more documents into chunks for RAG evaluation.

        Args:
            uris: A single URI or a list of URIs.

        Returns:
            The chunks of all documents in one flat list, in input order.
        """
        if isinstance(uris, str):
            uris = [uris]

        all_chunks: List[str] = []
        for uri in uris:
            text = self.parse_document(uri)
            all_chunks.extend(self.text_splitter.split_text(text))

        return all_chunks

In [None]:
# Data Models
# -----------
# Define Pydantic models for structured output generation:
# 1. QAPair: Schema for question-answer evaluation pairs
# 2. EvalMetrics: Schema for scoring generation quality

from pydantic import BaseModel, Field
from typing import Optional, Literal

# Schema for one generated question-answer evaluation pair.
# Documented with comments rather than a class docstring: pydantic copies a
# class docstring into the model's JSON schema, which would change the schema.
class QAPair(BaseModel):
    # The generated question.
    question: str = Field(
        ...,
        description="A specific question related to the domain of the context",
    )
    # The answer, or the reason the question is unanswerable.
    answer: str = Field(
        ...,
        description="Either a context-supported answer or explanation of why the question cannot be answered",
    )


# Schema for the judge's scores of a Q&A pair. Every field is an integer
# constrained to the inclusive range 1-5.
# Documented with comments rather than a class docstring: pydantic copies a
# class docstring into the model's JSON schema, which would change the schema.
class EvalMetrics(BaseModel):
    context_relevance: int = Field(
        ...,
        description="How relevant the retrieved context is (1=irrelevant, 5=perfectly relevant)",
        ge=1,
        le=5,
    )
    answer_precision: int = Field(
        ...,
        description="Answer accuracy or appropriateness (1=incorrect/inappropriate, 5=perfect)",
        ge=1,
        le=5,
    )
    answer_completeness: int = Field(
        ...,
        description="Information completeness (1=missing critical info, 5=fully complete)",
        ge=1,
        le=5,
    )
    hallucination_avoidance: int = Field(
        ...,
        description="Adherence to facts (1=complete fabrication, 5=no hallucination)",
        ge=1,
        le=5,
    )

In [None]:
# Configure Data Designer
# ---------------------
# Set up the data generation pipeline with:
# 1. Process source document into chunks
# 2. Initialize Data Designer
# 3. Configure seed columns for controlled diversity
# 4. Define generation templates for Q&A pairs and eval metrics

from gretel_client.navigator import DataDesigner

# Split the source documents into text chunks; the chunks become the values
# of the "context" seed column below.
processor = DocumentProcessor(chunk_size=4192, chunk_overlap=200)
chunks = processor.process_documents(DOCUMENT_LIST)

# Initialize Data Designer
designer = DataDesigner(api_key="prompt", cache="yes", model_suite="llama-3.x")

# Categorical seed columns, registered in the order listed.
seed_columns = [
    {"name": "context", "values": chunks},
    {"name": "difficulty", "values": ["easy", "medium", "hard"]},
    {
        "name": "reasoning_type",
        "values": [
            "factual recall",
            "inferential reasoning",
            "comparative analysis",
            "procedural understanding",
            "cause and effect",
        ],
    },
    {
        "name": "question_type",
        "values": ["answerable", "unanswerable"],
        # NOTE(review): presumably relative sampling weights, i.e. answerable
        # questions are drawn ~10x as often -- confirm against the SDK docs.
        "weights": [10, 1],
    },
]
for seed_column in seed_columns:
    designer.add_categorical_seed_column(**seed_column)

# Generation template for Q&A pairs
# The {context}, {difficulty}, {reasoning_type} and {question_type}
# placeholders share their names with the seed columns; presumably Data
# Designer substitutes the per-record seed values -- confirm in the SDK docs.
qa_pair_prompt = (
    "<context>\n{context}\n</context>\n\n"
    "Generate a {difficulty} {reasoning_type} question-answer pair.\n"
    "The question should be {question_type} using the provided context.\n\n"
    "For answerable questions:\n"
    "- Ensure the answer is fully supported by the context\n"
    "- Make the reasoning clear and traceable\n\n"
    "For unanswerable questions:\n"
    "- Keep the question topically relevant\n"
    "- Make it clearly beyond the context's scope\n"
    "- Explain why it cannot be answered\n\n"
    "Put your thoughts within <think>...</think> before providing the JSON."
)

# Structured output following the QAPair model.
qa_pair_config = {"type": "structured", "params": {"model": QAPair}}

designer.add_generated_data_column(
    name="qa_pair",
    generation_prompt=qa_pair_prompt,
    data_config=qa_pair_config,
)

# Eval template for Q&A pairs
# The prompt references the seed placeholders plus {qa_pair}, the column
# generated by the previous step, so this column must be added after it.
eval_metrics_prompt = (
    "<context>\n{context}\n</context>\n\n"
    "For this {difficulty} {reasoning_type} Q&A pair:\n"
    "{qa_pair}\n\n"
    "Rate each criterion on a scale of 1-5:\n"
    "- Context Relevance\n"
    "- Answer Precision\n"
    "- Answer Completeness\n"
    "- Hallucination Avoidance\n"
    "Put your thoughts within <think>...</think> before providing the JSON."
)

# Structured output following the EvalMetrics model.
eval_metrics_config = {"type": "structured", "params": {"model": EvalMetrics}}

designer.add_generated_data_column(
    name="eval_metrics",
    llm_type="judge",
    generation_prompt=eval_metrics_prompt,
    data_config=eval_metrics_config,
)

# Generate a preview dataset and display one sample record from it.
# NOTE(review): the previous comment said the preview holds 10 records; no
# count is passed here, so that is presumably the service default -- confirm.
preview = designer.generate_dataset_preview()
preview.display_sample_record()


In [None]:
# Generate a report analyzing RAG coverage
# ---------------------

import json
from rich.console import Console
from rich.table import Table
from collections import Counter
import pandas as pd

def analyze_rag_coverage(df: pd.DataFrame) -> None:
    """
    Analyze the coverage of RAG evaluation examples with consistent formatting.

    Prints, via ``rich``:
      * the total number of examples,
      * one count/percentage table for each of ``question_type``,
        ``difficulty`` and ``reasoning_type`` that is present and has at
        least one non-empty value (percentages are relative to the non-empty
        values of that column),
      * a table with the average of every flattened ``eval_metrics.*`` column.

    Args:
        df: DataFrame containing RAG evaluation data. The ``eval_metrics``
            column is optional; its cells may be dicts or JSON-encoded
            strings. Score values that are not numeric are ignored when
            averaging.

    Raises:
        json.JSONDecodeError: If an ``eval_metrics`` string is not valid JSON.
    """
    # Decode JSON-string metrics so json_normalize can flatten them into
    # "eval_metrics.<name>" columns. Frames without the column are reported
    # on as-is (categories only) instead of failing with a KeyError.
    records_df = df
    if "eval_metrics" in df.columns:
        records_df = df.assign(
            eval_metrics=df["eval_metrics"].apply(
                lambda raw: json.loads(raw) if isinstance(raw, str) else raw
            )
        )
    qa_df = pd.json_normalize(records_df.to_dict(orient="records"))

    console = Console()
    categories = ['question_type', 'difficulty', 'reasoning_type']

    # Print header
    console.print("\n[bold blue]📊 RAG Evaluation Report[/bold blue]", justify="center")
    console.print("=" * 80, justify="center")
    console.print(f"\n[bold]Total Examples:[/bold] {len(qa_df)}")

    # Category distributions
    for category in categories:
        if category not in qa_df.columns:
            continue

        # Count non-empty values
        counts = Counter(x for x in qa_df[category] if pd.notna(x) and x != '')
        if not counts:
            continue

        table = Table(title=f"\n{category.title()} Distribution")
        table.add_column("Category", style="cyan")
        table.add_column("Count", justify="right")
        table.add_column("Percentage", justify="right")

        total = sum(counts.values())
        for value, count in sorted(counts.items()):
            percentage = (count / total) * 100
            table.add_row(str(value), str(count), f"{percentage:.1f}%")

        console.print(table)

    # Quality metrics summary
    metric_cols = [col for col in qa_df.columns if col.startswith('eval_metrics.')]
    if not metric_cols:
        return

    metrics_table = Table(title="\nQuality Metrics Summary")
    metrics_table.add_column("Metric")
    metrics_table.add_column("Average Score", justify="right")

    for col in metric_cols:
        # Coerce to numbers first: scores that arrive as strings (e.g. "4")
        # are still averaged, and unparseable values become NaN and are
        # dropped rather than breaking mean().
        scores = pd.to_numeric(qa_df[col], errors="coerce").dropna()
        if scores.empty:
            continue

        metric_name = col.replace('eval_metrics.', '').replace('_', ' ').title()
        metrics_table.add_row(metric_name, f"{scores.mean():.2f}/5.00")

    console.print(metrics_table)

# Usage: print the coverage/quality report for the preview dataset.
analyze_rag_coverage(preview.dataset)

In [None]:
# Explore the generated preview as a Pandas DataFrame
# ---------------------------
# A bare expression on the last line of a cell is rendered by the notebook.

preview.dataset

In [None]:
# Generate and Analyze Full Dataset
# ---------------------------
# Uncomment these lines to generate evaluation pairs, save them to JSONL,
# and analyze the coverage and quality of the generated dataset.

#batch_job = designer.submit_batch_workflow(num_records=NUM_EVALS)
#dataset = batch_job.fetch_dataset(wait_for_completion=True)

#analyze_rag_coverage(dataset)

#dataset.to_json('rag_evals.jsonl', orient='records', lines=True)
