---
title: "Synthetic data for RAG"
date: 2025-07-11
date-modified: 2025-07-11
description-meta: "How to use synthetic data to build a RAG system"
categories:
  - llm
  - python
  - rag 
---

In [None]:
import nest_asyncio

# Allow the asyncio event loop to be re-entered; presumably needed because the
# notebook kernel already runs a loop while later cells use top-level `await`
# — TODO confirm.
nest_asyncio.apply()

In [None]:
import asyncio
import os
import random
from textwrap import dedent

import chromadb
import pandas as pd
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langsmith import Client, traceable
from pydantic import BaseModel

# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read from the environment when the embedding function is
# created.
load_dotenv()

In [None]:
# Load every Markdown file below the people-group folder (the "**/*.md" glob
# also matches sub-directories) using the plain-text loader.
loader = DirectoryLoader(
    "../data/synthetic-data-rag/people-group/", glob="**/*.md", loader_cls=TextLoader
)
docs = loader.load()

## Index data

In [None]:
# Embedding function backed by the OpenAI API; the key is taken from the
# OPENAI_API_KEY environment variable.
openai_ef = OpenAIEmbeddingFunction(api_key=os.getenv("OPENAI_API_KEY"))
# Chroma client persisted on disk under the given path.
client = chromadb.PersistentClient(
    path="../data/synthetic-data-rag/chroma",
)
# Reuse the "gitlab-handbook" collection when it exists, otherwise create it.
collection = client.get_or_create_collection(
    "gitlab-handbook", embedding_function=openai_ef
)

In [None]:
from langchain_text_splitters import MarkdownTextSplitter

# Markdown-aware splitter configured with a chunk size of 400 tokens (counted
# with the gpt-4o tiktoken encoding) and no overlap between chunks.
text_splitter = MarkdownTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4o",
    chunk_size=400,
    chunk_overlap=0,
)
# `splits` holds the chunks that get indexed and sampled in the cells below.
splits = text_splitter.split_documents(docs)

In [None]:
def create_batches(ids, documents, metadatas, batch_size=100):
    """Slice three parallel sequences into consecutive batches.

    Args:
        ids: Sequence of ids; its length determines the number of batches.
        documents: Sequence of document texts, parallel to ``ids``.
        metadatas: Sequence of metadata entries, parallel to ``ids``.
        batch_size: Maximum number of items per batch.

    Returns:
        A list of ``(ids, metadatas, documents)`` tuples. Note that the tuple
        order differs from the parameter order.
    """
    return [
        (
            ids[start : start + batch_size],
            metadatas[start : start + batch_size],
            documents[start : start + batch_size],
        )
        for start in range(0, len(ids), batch_size)
    ]

In [None]:
# Ids are the stringified positions of the chunks in `splits`.
ids = [str(position) for position, _ in enumerate(splits)]
documents = [split.page_content for split in splits]
metadatas = [split.metadata for split in splits]


# Only index when the collection is still empty.
if collection.count() > 0:
    print("Collection already exists, skipping creation.")
else:
    print("Adding documents...")
    batches = create_batches(ids=ids, documents=documents, metadatas=metadatas)
    # Each batch is an (ids, metadatas, documents) tuple.
    for i, (batch_ids, batch_metadatas, batch_documents) in enumerate(batches):
        print(f"Adding batch {i} of size {len(batch_ids)}")
        collection.add(
            ids=batch_ids, metadatas=batch_metadatas, documents=batch_documents
        )

In [None]:
class RetrievedDoc(BaseModel):
    # A chunk read back from the Chroma collection.
    id: str  # id of the chunk in the collection
    path: str  # value of the chunk's "source" metadata entry
    page_content: str  # text of the chunk


def get_similar_docs(text: str, top_k: int = 5) -> list[RetrievedDoc]:
    """Query the collection for the chunks most similar to ``text``.

    Args:
        text: Query text passed to the collection.
        top_k: Maximum number of chunks to request.

    Returns:
        The retrieved chunks, in the order the collection returned them. The
        list is shorter than ``top_k`` when the collection yields fewer hits.
    """
    results = collection.query(query_texts=[text], n_results=top_k)
    # A single query text was sent, so only the first result list is relevant.
    # Iterate over what actually came back instead of over range(top_k):
    # indexing by top_k raises IndexError when fewer hits are returned.
    return [
        RetrievedDoc(id=doc_id, path=metadata["source"], page_content=content)
        for content, metadata, doc_id in zip(
            results["documents"][0], results["metadatas"][0], results["ids"][0]
        )
    ]


def get_doc_by_id(doc_id: str) -> RetrievedDoc:
    """Fetch a single chunk from the collection by its id.

    NOTE(review): the first returned entry is indexed unconditionally, so this
    presumably fails with IndexError when the id is unknown — confirm.
    """
    fetched = collection.get(ids=[doc_id])
    content, meta = fetched["documents"][0], fetched["metadatas"][0]
    return RetrievedDoc(id=doc_id, path=meta["source"], page_content=content)

In [None]:
def format_docs(chunks: list[RetrievedDoc]) -> str:
    """Render chunks as prompt text.

    Each chunk becomes a ``*** Filepath: <path> ***`` header line followed by
    its content and a trailing newline; the rendered chunks are joined with a
    newline, which leaves one blank line between consecutive chunks.
    """
    rendered = (
        f"*** Filepath: {chunk.path} ***\n{chunk.page_content}\n" for chunk in chunks
    )
    return "\n".join(rendered)

In [None]:
# Pick the chunks that QA pairs will be generated for. The sample size is
# capped at the number of available chunks because random.sample raises
# ValueError when asked for more items than the population holds.
golden_docs_idx = random.sample(range(len(splits)), min(200, len(splits)))
# Chunk ids are the stringified positions in `splits`.
golden_docs = [get_doc_by_id(str(i)) for i in golden_docs_idx]

## Generate QA Pairs

In [None]:
# System prompt for QA generation: asks for a fact that only the target chunk
# contains, the confounding terms, and a question answered by that fact.
system_prompt_generate = dedent(
    """
    You are a helpful assistant generating synthetic QA pairs for retrieval evaluation.

    Given a target chunk of text and a set of confounding chunks, you must extract a specific, self-contained fact from the target chunk that is not included in the confounding chunks. Then write a question that is directly and unambiguously answered by that fact. The question should only be answered by the fact extracted from the target chunk (and not by any of the confounding chunks) but it should also use themes or terminology that is present in the confounding chunks.

    Always respond with a JSON object with the following keys (in that exact order):
    1. "fact": "<the fact extracted from the target chunk>",
    2. "confounding_terms": "<a list of terms or themes from the confounding chunks that are relevant to the question>",
    3. "question": "<the question that is directly and unambiguously answered by the fact>",
    
    You should write the questions as if you're an employee looking for information in the handbook. The question should be as realistic and natural as possible, reflecting the kind of queries an employee might actually make when searching for information in the handbook.
    """
)

# User prompt template; {target_chunk} and {confounding_chunks} are the two
# template variables supplied when the prompt is invoked.
user_prompt_generate = dedent(
    """
    TARGET CHUNK:
    {target_chunk}

    CONFOUNDING CHUNKS:
    {confounding_chunks} 
    """
)

In [None]:
class Response(BaseModel):
    # Structured output for one generated QA pair; the field names match the
    # JSON keys requested in the generation system prompt.
    fact: str  # fact extracted from the target chunk
    confounding_terms: list[str] = []  # terms/themes taken from the confounding chunks
    question: str  # question answered by `fact`


# Chat model used for QA generation (and reused for rating), created with temperature=1.
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=1)
# Variant of the model that returns `Response` instances.
llm_with_structured_output = llm.with_structured_output(Response)

In [None]:
# Chat prompt template combining the generation system and user prompts.
messages = ChatPromptTemplate.from_messages(
    [("system", system_prompt_generate), ("user", user_prompt_generate)]
)

In [None]:
async def generate_qa_pair(random_chunk):
    """Generate one synthetic QA pair whose answer lives in ``random_chunk``.

    The chunks most similar to ``random_chunk`` are retrieved and used as
    confounders, so that the generated question shares vocabulary with nearby
    content but is only answered by the target chunk.

    Args:
        random_chunk: The target chunk; its ``id`` and ``page_content`` are used.

    Returns:
        The structured output produced by ``llm_with_structured_output``.
    """
    similar_chunks = get_similar_docs(random_chunk.page_content)
    # Do not assume the nearest neighbour is the target itself: with duplicated
    # or near-duplicated chunks another chunk can rank first, and the question
    # would then be written for a chunk other than the one recorded as the
    # reference document. Use the sampled chunk as target and drop it from the
    # confounders by id.
    confounding_chunks = [
        chunk for chunk in similar_chunks if chunk.id != random_chunk.id
    ]
    # Keep the same number of confounders as when the target ranks first.
    confounding_chunks = confounding_chunks[: max(len(similar_chunks) - 1, 0)]
    compiled_messages = await messages.ainvoke(
        {
            "target_chunk": format_docs([random_chunk]),
            "confounding_chunks": format_docs(confounding_chunks),
        }
    )
    return await llm_with_structured_output.ainvoke(compiled_messages)

In [None]:
tasks = [generate_qa_pair(random_split) for random_split in golden_docs]
qa_pairs = await asyncio.gather(*tasks)

df = pd.DataFrame([qa_pair.dict() for qa_pair in qa_pairs])
df.to_excel("../data/synthetic-data-rag/files/qa_pairs.xlsx", index=False)

## Filtering QA pairs

In [None]:
# System prompt for the realism filter: few-shot examples of rated queries
# followed by the rating task (1-5 scale) and the expected output format.
system_prompt_rate = dedent(
    """
    You are an AI assistant helping us curate a high-quality dataset of questions for evaluating an company's internal handbook. We have generated synthetic questions and need to filter out those that are unrealistic or not representative of typical user queries.

    Here are examples of realistic and unrealistic user queries we have manually rated:

    ### Realistic Queries (Good Examples)

    * **Query:** "What is the required process for creating a new learning hub for your team in Level Up at GitLab?"
        * **Explanation:** Very realistic user query. It's concise, information-seeking, and process-oriented.
        * **Rating:** 5
    * **Query:** "Where is the People Operations internal handbook hosted, and how can someone gain access to it?"
        * **Explanation:** Realistic query but might be a bit too detailed for a typical user.
        * **Rating:** 4
    * **Query:** "Who controls access to People Data in the data warehouse at GitLab, and what approvals are required for Analytics Engineers and Data Analysts to obtain access?"
        * **Explanation:** Seems reasonable but too lengthy for a typical user query. 
        * **Rating:** 3

    ### Unrealistic Queries (Bad Examples)

    * **Query:** "If a GitLab team member has been with the company for over 3 months and is interested in participating in the Onboarding Buddy Program, what should they do to express their interest?"
        * **Explanation:** Overly specific and unnatural. No real user would ask this.
        * **Rating:** 1
    * **Query:** "On what date did the 'Managing Burnout with Time Off with John Fitch' session occur as part of the FY21 Learning Speaker Series?"
        * **Explanation:** Irrelevant and overly specific. Not a typical user query. 
        * **Rating:** 2

    ### Your Task

    For the following generated question, please:

    1.  Rate its realism as a typical user query for an internal handbook application on a scale of 1 to 5 (1 = Very Unrealistic, 3 = Neutral/Somewhat Realistic, 5 = Very Realistic).
    2.  Provide a brief explanation for your rating, comparing it to the examples above if helpful.

    ### Output Format

    **Explanation:** `[Your brief explanation]`
    **Rating:** `[Your 1–5 rating]`
    """
)

# User prompt template; {question_to_evaluate} is the only template variable.
user_prompt_rate = dedent(
    """
    **Generated Question to Evaluate:**
    `{question_to_evaluate}`
    """
)

In [None]:
class ResponseFiltering(BaseModel):
    # Structured output of the realism rating.
    explanation: str  # brief justification of the rating
    rating: int  # the rating prompt asks for a value from 1 to 5


# Same chat model as for generation, returning `ResponseFiltering` instances.
llm_with_structured_output_filtering = llm.with_structured_output(ResponseFiltering)

# Chat prompt template combining the rating system and user prompts.
messages_filtering = ChatPromptTemplate.from_messages(
    [("system", system_prompt_rate), ("user", user_prompt_rate)]
)

In [None]:
async def rate_qa_pair(qa_pair):
    """Rate how realistic the question of a generated QA pair is.

    Only ``qa_pair.question`` is sent to the rating prompt; the structured
    output of ``llm_with_structured_output_filtering`` is returned as is.
    """
    prompt_value = await messages_filtering.ainvoke(
        {"question_to_evaluate": qa_pair.question}
    )
    return await llm_with_structured_output_filtering.ainvoke(prompt_value)


# Rate every generated question concurrently; asyncio.gather returns results in
# the order of the awaitables passed in, so `results` lines up with `qa_pairs`.
tasks = [rate_qa_pair(qa_pair) for qa_pair in qa_pairs]
results = await asyncio.gather(*tasks)

In [None]:
# Merge each rating with the QA pair it was computed for; the generated fact
# is stored under the "answer" key.
rated_qa_pairs = [
    dict(
        rating=rating_result.rating,
        explanation=rating_result.explanation,
        question=generated.question,
        answer=generated.fact,
    )
    for rating_result, generated in zip(results, qa_pairs)
]

In [None]:
# The rows in `rated_qa_pairs` use lowercase keys. Passing the capitalised
# names straight to `columns=` selects keys that do not exist and yields
# all-NaN columns, so map the keys to the exported headers explicitly.
df_rated_qa_pairs = pd.DataFrame(
    [
        {
            "Rating": row["rating"],
            "Explanation": row["explanation"],
            "Question": row["question"],
        }
        for row in rated_qa_pairs
    ],
    # Keeps the header row stable even when there are no rated pairs.
    columns=["Rating", "Explanation", "Question"],
)

df_rated_qa_pairs.to_excel(
    "../data/synthetic-data-rag/files/rated_qa_pairs.xlsx", index=False
)

## Evaluating the RAG system

In [None]:
langsmith_client = Client()
dataset_name = "Gitlab Handbook QA Evaluation 2"

# Check for the dataset explicitly instead of treating every exception as
# "already exists": an authentication/network error, or a failure while
# uploading the examples, would otherwise be reported as an existing dataset.
if langsmith_client.has_dataset(dataset_name=dataset_name):
    print("Dataset already exists, skipping creation.")
    dataset = langsmith_client.read_dataset(dataset_name=dataset_name)
else:
    dataset = langsmith_client.create_dataset(dataset_name=dataset_name)
    # rated_qa_pairs[i] was generated from golden_docs[i], so zipping the two
    # attaches the right reference chunk to each question. Only questions with
    # a rating of 5 or more are kept.
    examples = [
        {
            "inputs": {
                "question": h["question"],
            },
            "outputs": {
                "answer": h["answer"],
                "doc": {
                    "id": chunk.id,
                    "path": chunk.path,
                },
            },
        }
        for h, chunk in zip(rated_qa_pairs, golden_docs)
        if h["rating"] >= 5
    ]
    langsmith_client.create_examples(dataset_id=dataset.id, examples=examples)

### Retrieval Metrics

In [None]:
def mrr(inputs: dict, outputs: dict, reference_outputs: dict) -> list[dict]:
    """Reciprocal rank of the reference chunk among the retrieved ids.

    Args:
        inputs: Example inputs (unused).
        outputs: Target outputs; ``outputs["docs"]`` is the ranked list of
            retrieved chunk ids.
        reference_outputs: Example outputs; ``["doc"]["id"]`` is the id of the
            reference chunk and is compared as a string.

    Returns:
        A single ``MRR`` result: ``1 / rank`` of the reference chunk (1-based),
        or ``0.0`` when it was not retrieved or nothing was retrieved.
    """
    target_id = str(reference_outputs["doc"]["id"])
    retrieved = outputs.get("docs", [])

    score = 0.0
    if retrieved:
        for position, doc_id in enumerate(retrieved, start=1):
            if doc_id == target_id:
                score = 1.0 / position
                break

    return [{"key": "MRR", "score": score}]


def recall_at_k(inputs: dict, outputs: dict, reference_outputs: dict) -> list[dict]:
    """Recall@k of the single reference chunk for k in 1, 3, 5 and 10.

    Args:
        inputs: Example inputs (unused).
        outputs: Target outputs; ``outputs["docs"]`` is the ranked list of
            retrieved chunk ids.
        reference_outputs: Example outputs; ``["doc"]["id"]`` is the id of the
            reference chunk and is compared as a string.

    Returns:
        One result per cut-off, keyed ``recall@<k>``, scoring ``1.0`` when the
        reference id is among the first ``k`` retrieved ids and ``0.0``
        otherwise (including when nothing was retrieved).
    """
    target_id = str(reference_outputs["doc"]["id"])
    retrieved = outputs.get("docs", [])
    cutoffs = (1, 3, 5, 10)

    if not retrieved:
        return [{"key": f"recall@{k}", "score": 0.0} for k in cutoffs]

    return [
        {"key": f"recall@{k}", "score": float(target_id in retrieved[:k])}
        for k in cutoffs
    ]

### Generation metrics

#### Answer accuracy

In [None]:
# System prompt for the answer-accuracy judge: grades the user answer against
# the reference answer on a 2 / 1 / 0 scale. The description of the "rating"
# key used to stop mid-sentence ("must be one of the following:"); it now
# lists the allowed values.
system_prompt_answer_accuracy = dedent(
    """
    You are an expert evaluator. Your task is to evaluate the accuracy of a User Answer against a Reference Answer, given a Question.

    Say 2, if User Answer is full contained and equivalent to Reference Answer in all terms, topics, numbers, metrics, dates and units.
    Say 1, if User Answer is partially contained and almost equivalent to Reference Answer in all terms, topics, numbers, metrics, dates and units.
    Say 0, if User Answer is not contained in Reference Answer or not accurate in all terms, topics, numbers, metrics, dates and units or the User Answer do not answer the question.

    Your rating must be only 2, 1 or 0 according to the instructions above.


    Your answer must be a JSON object with the following keys:
    1. "explanation": "<a brief explanation of your rating>",
    2. "rating": <your rating, which must be one of the following integers: 2, 1 or 0>
    """
)

# User prompt template for the judge; {question}, {user_answer} and
# {reference_answer} are the template variables.
user_prompt_answer_accuracy = dedent(
    """
    **Question:** `{question}`
    **User Answer:** `{user_answer}`
    **Reference Answer:** `{reference_answer}`
    """
)

# Chat prompt template combining the judge's system and user prompts.
messages_answer_accuracy = ChatPromptTemplate.from_messages(
    [("system", system_prompt_answer_accuracy), ("user", user_prompt_answer_accuracy)]
)

class ResponseAnswerAccuracy(BaseModel):
    # Structured output of the answer-accuracy judge.
    explanation: str  # brief justification of the rating
    rating: int  # the judge prompt restricts this to 2, 1 or 0

# NOTE(review): this rebinds the module-level name `llm`, which was previously
# created with temperature=1; any code that looks up `llm` after this cell has
# run gets this instance instead.
llm = ChatOpenAI(model="gpt-4.1-mini")

# Judge model returning `ResponseAnswerAccuracy` instances.
llm_with_structured_output_answer_accuracy = llm.with_structured_output(ResponseAnswerAccuracy)

async def answer_accuracy(inputs: dict, outputs: dict, reference_outputs: dict) -> list[dict]:
    """Grade the generated answer against the reference answer with an LLM judge.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is the question asked.
        outputs: Target outputs; ``outputs["answer"]`` is the generated answer.
        reference_outputs: Example outputs; ``reference_outputs["answer"]`` is
            the reference answer.

    Returns:
        A single ``answer_accuracy`` result whose score is the judge's rating
        and whose comment is the judge's explanation. The result is returned
        as a list of dicts, matching both the declared return type and the
        other evaluators in this file (the bare rating returned before
        contradicted the annotation and dropped the explanation).
    """
    compiled_messages = await messages_answer_accuracy.ainvoke(
        {
            "question": inputs["question"],
            "user_answer": outputs["answer"],
            "reference_answer": reference_outputs["answer"],
        }
    )

    output = await llm_with_structured_output_answer_accuracy.ainvoke(compiled_messages)

    return [
        {
            "key": "answer_accuracy",
            "score": output.rating,
            "comment": output.explanation,
        }
    ]

## Run evaluation

In [None]:
# System prompt for answer generation in the RAG pipeline.
system_prompt_generation = dedent(
    """
    You're a helpful assistant. Provided with a question and the most relevant documents, you must generate a concise and accurate answer based on the information in those documents.
    """
)

# User prompt template; {question} and {documents} are the template variables.
user_prompt_generation = dedent(
    """
    QUESTION: {question}

    RELEVANT DOCUMENTS:
    {documents}
    """
)

# Chat prompt template combining the generation system and user prompts.
messages_generation = ChatPromptTemplate.from_messages(
    [("system", system_prompt_generation), ("user", user_prompt_generation)]
)

# Chat model that writes the answers of the RAG pipeline under evaluation.
llm_generation = ChatOpenAI(
    model="gpt-4o-mini",
)

In [None]:
@traceable
async def target(inputs: dict) -> dict:
    """Baseline RAG pipeline: retrieve 10 chunks, then generate an answer.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is the user question.

    Returns:
        A dict with the generated ``answer`` text and ``docs``, the ids of the
        retrieved chunks in retrieval order.
    """
    question = inputs["question"]
    retrieved = get_similar_docs(question, top_k=10)
    prompt_value = await messages_generation.ainvoke(
        {
            "question": question,
            "documents": format_docs(retrieved),
        }
    )
    completion = await llm_generation.ainvoke(prompt_value)
    return {"answer": completion.content, "docs": [chunk.id for chunk in retrieved]}

In [None]:
# Evaluate the baseline pipeline on the LangSmith dataset with the retrieval
# metrics (recall@k, MRR) and the LLM-judged answer accuracy; examples are
# processed with a concurrency limit of 1.
experiment_results = await langsmith_client.aevaluate(
    target,
    data=dataset_name,
    evaluators=[recall_at_k, mrr, answer_accuracy],
    max_concurrency=1,
)

## Improving retrieval with a reranker

In [None]:
from sentence_transformers import CrossEncoder

# Cross-encoder used below to score (query, chunk text) pairs for reranking.
cross_encoder = CrossEncoder("mixedbread-ai/mxbai-rerank-xsmall-v1")

In [None]:
# Rerank the 50 nearest chunks for one sample query with the cross-encoder.
query = "What is the process for creating a new learning hub for your team in Level Up at GitLab?"
hits = get_similar_docs(query, top_k=50)
cross_inp = [[query, h.page_content] for h in hits]
reranker_scores = cross_encoder.predict(cross_inp)
# Pair every hit with its own score before sorting. Looking the score up with
# hits.index(x) inside the sort key is a linear scan per element and resolves
# to the first equal element rather than to the element being sorted.
sorted_hits = [
    hit
    for _, hit in sorted(
        zip(reranker_scores, hits), key=lambda pair: pair[0], reverse=True
    )
]

In [None]:
def get_reranked_docs(
    query: str, similar_docs: list[RetrievedDoc], top_n: int = 10
) -> list[RetrievedDoc]:
    """Rerank retrieved chunks with the cross-encoder and keep the best ones.

    Args:
        query: The user question.
        similar_docs: Candidate chunks, e.g. the output of ``get_similar_docs``.
        top_n: Number of reranked chunks to keep; defaults to the previously
            hard-coded 10.

    Returns:
        At most ``top_n`` chunks ordered by descending cross-encoder score.
        Chunks with equal scores keep their original relative order.
    """
    if not similar_docs:
        # Nothing to score; skip the model call.
        return []

    cross_inp = [[query, doc.page_content] for doc in similar_docs]
    reranker_scores = cross_encoder.predict(cross_inp)
    # Sort (score, doc) pairs by score only. This avoids the linear
    # similar_docs.index(x) lookup per element, which also resolves to the
    # first equal element rather than the element being sorted.
    ranked = sorted(
        zip(reranker_scores, similar_docs), key=lambda pair: pair[0], reverse=True
    )
    return [doc for _, doc in ranked[:top_n]]


@traceable
async def target_with_reranking(inputs: dict) -> dict:
    """RAG pipeline with reranking: retrieve 100 chunks, rerank, then answer.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is the user question.

    Returns:
        A dict with the generated ``answer`` text and ``docs``, the ids of the
        reranked chunks in reranked order.
    """
    relevant_docs = get_similar_docs(inputs["question"], top_k=100)
    reranked_docs = get_reranked_docs(inputs["question"], relevant_docs)
    formatted_docs = format_docs(reranked_docs)
    prompt_value = await messages_generation.ainvoke(
        {
            "question": inputs["question"],
            "documents": formatted_docs,
        }
    )
    # Generate with the same model as the baseline `target` so that the two
    # experiments differ only in retrieval; `llm` is the judge model.
    response = await llm_generation.ainvoke(prompt_value)
    # Return the answer text, as `target` does, not the message object.
    return {"answer": response.content, "docs": [doc.id for doc in reranked_docs]}

In [None]:
# Evaluate the reranking pipeline on the same dataset with the retrieval
# metrics only.
# NOTE(review): unlike the baseline run, `answer_accuracy` is not among the
# evaluators and max_concurrency is 20 instead of 1 — confirm both differences
# are intended. This also rebinds `experiment_results` from the baseline run.
experiment_results = await langsmith_client.aevaluate(
    target_with_reranking,
    data=dataset_name,
    evaluators=[recall_at_k, mrr],
    max_concurrency=20,
)