# Data Designer SDK: Generate Diverse RAG Evaluations

Generate comprehensive evaluation datasets for your RAG systems, customized to your content and use cases. This blueprint helps you create diverse question-answer pairs at scale, testing both answerable and unanswerable scenarios across different difficulty levels and reasoning types. Building such an evaluation set is a critical step in deploying a production-grade RAG system.

> **Note:** The [Data Designer](https://docs.gretel.ai/create-synthetic-data/gretel-data-designer-beta) functionality demonstrated in this notebook is currently in **Early Preview**. To access these features and run this notebook, please [join the waitlist](https://gretel.ai/navigator/data-designer#waitlist).

# 📘 Getting Started

First, let's install and import the required packages:

In [None]:
# Install required libraries

%%capture
!pip install -qq langchain unstructured[pdf] smart_open git+https://github.com/gretelai/gretel-python-client

In [None]:
# Configuration
# -------------
# DOCUMENT_LIST: URIs of the source documents the evaluations are built from.
#                Swap in your own documents here.
# NUM_EVALS:     Total number of evaluation pairs to generate.

DOCUMENT_LIST = [
    "https://gretel-public-website.s3.us-west-2.amazonaws.com/datasets/rag_evals/databricks-state-of-data-ai-report.pdf",
]
NUM_EVALS = 100

In [None]:
# Document Processing
# ------------------
# The DocumentProcessor class handles loading and chunking source documents for RAG evaluation.
# We use langchain's RecursiveCharacterTextSplitter and unstructured.io for robust document parsing.

from typing import List, Union
from langchain.text_splitter import RecursiveCharacterTextSplitter
from unstructured.partition.auto import partition
from smart_open import open
import tempfile
import os

class DocumentProcessor:
    """Load source documents and split their text into chunks.

    Each document is read from its URI, parsed to text with ``partition``,
    and split with a ``RecursiveCharacterTextSplitter``.
    """

    def __init__(self, chunk_size: int = 4192, chunk_overlap: int = 200):
        """Configure the text splitter.

        Args:
            chunk_size: Chunk size handed to the splitter. Lengths are
                measured with ``len``, i.e. in characters.
            chunk_overlap: Overlap between chunks handed to the splitter.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def parse_document(self, uri: str) -> str:
        """Parse a single document from URI into raw text.

        The document bytes are copied into a local temporary file, which is
        what ``partition`` is given to parse. The temporary file is removed
        afterwards, including when parsing fails.

        Args:
            uri: Location of the document, opened in binary mode with ``open``.

        Returns:
            The string form of every parsed element, joined by blank lines.
        """
        # Read everything first so the source stream is not held open while parsing.
        with open(uri, 'rb') as source:
            content = source.read()

        # delete=False keeps the file on disk after it is closed so it can be
        # handed to partition() by name; cleanup is done explicitly below.
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            # Close (and thereby flush) the handle before parsing.
            with temp_file:
                temp_file.write(content)
            elements = partition(temp_file.name)
        finally:
            # Always remove the temporary copy, even if writing or parsing raised.
            os.unlink(temp_file.name)

        return "\n\n".join(str(element) for element in elements)

    def process_documents(self, uris: Union[str, List[str]]) -> List[str]:
        """Process one or more documents into chunks for RAG evaluation.

        Args:
            uris: A single URI or a list of URIs.

        Returns:
            The chunks of all documents in one flat list, in input order.
            An empty list of URIs yields an empty list.
        """
        if isinstance(uris, str):
            uris = [uris]

        all_chunks: List[str] = []
        for uri in uris:
            text = self.parse_document(uri)
            all_chunks.extend(self.text_splitter.split_text(text))

        return all_chunks

In [None]:
# Data Models
# -----------
# Define Pydantic models for structured output generation:
# 1. QAPair: Schema for question-answer evaluation pairs
# 2. EvalMetrics: Schema for scoring generation quality

from pydantic import BaseModel, Field
from typing import Optional, Literal

class QAPair(BaseModel):
    # Schema for one generated question-answer evaluation pair.
    # Documented with comments rather than a class docstring on purpose:
    # a docstring may be picked up as part of the model's generated schema.
    # All five fields are required (default is `...`).
    question: str = Field(
        ...,
        description="A specific question related to the domain of the context",
    )
    answer: str = Field(
        ...,
        description="Either a context-supported answer or explanation of why the question cannot be answered",
    )
    reasoning: str = Field(
        ...,
        description="Explanation of why this Q&A pair is valuable for evaluation",
    )
    # Restricted to the two literal values below.
    question_type: Literal["answerable", "unanswerable"] = Field(
        ...,
        description="Whether the question should be answerable from the given context",
    )
    expected_behavior: str = Field(
        ...,
        description="Expected RAG system behavior when encountering this question",
    )

class EvalMetrics(BaseModel):
    # Schema for the quality scores assigned to a generated Q&A pair.
    # Documented with comments rather than a class docstring on purpose:
    # a docstring may be picked up as part of the model's generated schema.
    # Every score is a required integer constrained to 1..5 (ge=1, le=5).
    context_relevance: int = Field(
        ...,
        description="How relevant the retrieved context is (1=irrelevant, 5=perfectly relevant)",
        ge=1,
        le=5,
    )
    answer_precision: int = Field(
        ...,
        description="Answer accuracy or appropriateness (1=incorrect/inappropriate, 5=perfect)",
        ge=1,
        le=5,
    )
    answer_completeness: int = Field(
        ...,
        description="Information completeness (1=missing critical info, 5=fully complete)",
        ge=1,
        le=5,
    )
    hallucination_avoidance: int = Field(
        ...,
        description="Adherence to facts (1=complete fabrication, 5=no hallucination)",
        ge=1,
        le=5,
    )

In [None]:
# Configure Data Designer
# ---------------------
# Set up the data generation pipeline:
# 1. Process source documents into chunks
# 2. Initialize Data Designer
# 3. Configure seed columns for controlled diversity
# 4. Define generation templates for Q&A pairs and eval metrics

from gretel_client.navigator import DataDesigner

# Split the source documents into text chunks; they become the values of the
# "context" seed column below.
processor = DocumentProcessor(chunk_size=4192, chunk_overlap=200)
chunks = processor.process_documents(DOCUMENT_LIST)

# Create the Data Designer instance
designer = DataDesigner(api_key="prompt", cache="yes", model_suite="llama-3.x")

# Categorical seed columns, registered in the order they are declared here
seed_columns = {
    "context": chunks,
    "difficulty": ["easy", "medium", "hard"],
    "reasoning_type": [
        "factual recall",
        "inferential reasoning",
        "comparative analysis",
        "procedural understanding",
        "cause and effect",
    ],
    "question_type": ["answerable", "unanswerable"],
}
for seed_name, seed_values in seed_columns.items():
    designer.add_categorical_seed_column(name=seed_name, values=seed_values)

# Prompt for the Q&A pair column; the {placeholders} name the seed columns above
qa_prompt = (
    "<context>\n{context}\n</context>\n\n"
    "Generate a {difficulty} {reasoning_type} question-answer pair.\n"
    "The question should be {question_type} using the provided context.\n\n"
    "For answerable questions:\n"
    "- Ensure the answer is fully supported by the context\n"
    "- Make the reasoning clear and traceable\n\n"
    "For unanswerable questions:\n"
    "- Keep the question topically relevant\n"
    "- Make it clearly beyond the context's scope\n"
    "- Explain why it cannot be answered\n\n"
    "Put your thoughts within <think>...</think> before providing the JSON."
)

# Prompt for the scoring column; it additionally references the generated {qa_pair}
judge_prompt = (
    "<context>\n{context}\n</context>\n\n"
    "For this {difficulty} {reasoning_type} Q&A pair:\n"
    "{qa_pair}\n\n"
    "Score each metric from 1-5 (5 is best):\n"
    "1. Context relevance (1=irrelevant, 5=perfectly relevant)\n"
    "2. Answer precision (1=incorrect/inappropriate, 5=perfect)\n"
    "3. Answer completeness (1=missing critical info, 5=fully complete)\n"
    "4. Hallucination avoidance (1=complete fabrication, 5=no hallucination)\n\n"
    "Put your thoughts within <think>...</think> before providing the JSON."
)

# Generated column: structured Q&A pairs following the QAPair schema
designer.add_generated_data_column(
    name="qa_pair",
    generation_prompt=qa_prompt,
    data_config={"type": "structured", "params": {"model": QAPair}},
)

# Generated column: structured scores following the EvalMetrics schema,
# requested with llm_type="judge"
designer.add_generated_data_column(
    name="eval_metrics",
    llm_type="judge",
    generation_prompt=judge_prompt,
    data_config={"type": "structured", "params": {"model": EvalMetrics}},
)

# Generate a preview dataset and display a sample record from it
preview = designer.generate_dataset_preview()
preview.display_sample_record()


In [None]:
# Analyze Dataset Coverage
# -----------------------
# Analyze the generated evaluation dataset to ensure:
# - Good distribution across question types and difficulty levels
# - Strong evaluation metrics
# - Sufficient coverage of different reasoning types

from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from collections import Counter
import json
import pandas as pd
from typing import List, Dict

def analyze_rag_coverage(df: pd.DataFrame) -> None:
    """
    Analyze the coverage of RAG evaluation examples.

    Args:
        df: DataFrame containing the RAG evaluation data. It must have the
            columns ``question_type``, ``difficulty``, ``reasoning_type`` and
            ``eval_metrics``. Each ``eval_metrics`` cell is either a JSON
            string or an already-parsed mapping of metric name to score.

    Displays:
        Rich console tables showing:
        - Distribution of question types, difficulty levels, and reasoning types
        - Statistics for evaluation metrics
        - Summary of total examples

    Raises:
        KeyError: If one of the required columns is missing.

    An empty DataFrame is accepted: the tables are printed without data rows.
    """
    console = Console()
    total_examples = len(df)

    # Create main table for category coverage
    category_table = Table(title="Category Coverage Analysis", show_header=True, header_style="bold magenta")
    category_table.add_column("Category", style="blue")
    category_table.add_column("Distribution", style="green")

    # Analyze categorical distributions (display label -> DataFrame column)
    categories = {
        "Question Type": "question_type",
        "Difficulty": "difficulty",
        "Reasoning Type": "reasoning_type"
    }

    for category_name, column in categories.items():
        value_counts = df[column].value_counts()
        # The division only runs when there is at least one counted value,
        # so total_examples is never zero here.
        distribution = "\n".join(
            f"{k}: {v} ({v/total_examples*100:.1f}%)"
            for k, v in value_counts.items()
        )
        category_table.add_row(category_name, distribution)

    # Create metrics table
    metrics_table = Table(title="Evaluation Metrics Analysis", show_header=True, header_style="bold magenta")
    metrics_table.add_column("Metric", style="blue")
    metrics_table.add_column("Statistics", style="green")
    metrics_table.add_column("Distribution", style="yellow")

    # Decode JSON-encoded cells; cells that are already parsed are used as-is.
    metrics = [
        json.loads(raw) if isinstance(raw, (str, bytes, bytearray)) else raw
        for raw in df['eval_metrics']
    ]

    # Metric names come from the first record; with no records there is nothing to list.
    metric_names = list(metrics[0].keys()) if metrics else []
    for metric in metric_names:
        values = [m[metric] for m in metrics]
        stats = {
            'mean': sum(values) / len(values),
            'min': min(values),
            'max': max(values)
        }

        # Create distribution buckets
        buckets = Counter(round(v * 2) / 2 for v in values)  # Round to nearest 0.5
        distribution = "\n".join(f"{k:.1f}: {v}" for k, v in sorted(buckets.items()))

        stats_str = f"Mean: {stats['mean']:.2f}\nMin: {stats['min']:.2f}\nMax: {stats['max']:.2f}"
        metrics_table.add_row(metric, stats_str, distribution)

    # Create summary table
    summary_table = Table.grid()
    summary_table.add_column(style="bold blue")
    summary_table.add_column(style="green")
    summary_table.add_row("Total Examples:", str(total_examples))

    # Display all tables
    console.print("\n")
    console.print(Panel(category_table, title="Category Coverage Analysis", expand=False))
    console.print(Panel(metrics_table, title="Metrics Analysis", expand=False))
    console.print(Panel(summary_table, title="Summary", expand=False))

# Run the coverage analysis on the preview dataset generated earlier in this notebook
analyze_rag_coverage(preview.dataset)

In [None]:
# Generate and Analyze Dataset
# ---------------------------
# Uncomment these lines to generate evaluation pairs, save them to JSONL,
# and analyze the coverage and quality of the generated dataset.

#batch_job = designer.submit_batch_workflow(num_records=NUM_EVALS)
#dataset = batch_job.fetch_dataset(wait_for_completion=True)

#analyze_rag_coverage(dataset)

#dataset.to_json('rag_evals.jsonl', orient='records', lines=True)
