---
title: "Synthetic data for RAG"
date: 2025-07-11
date-modified: 2025-07-11
description-meta: "How to use synthetic data to build a RAG system"
categories:
  - llm
  - python
  - rag 
---

In [None]:
# Patch asyncio so an already-running event loop can be re-entered; presumably
# needed because the notebook kernel runs its own loop -- TODO confirm.
import nest_asyncio

nest_asyncio.apply()

In [None]:
import asyncio
import os
import random
from textwrap import dedent

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai import ChatOpenAI
from langsmith import Client, traceable
from pydantic import BaseModel

# Load variables from a .env file into the environment; OPENAI_API_KEY is read
# via os.getenv when the embedding function is created.
load_dotenv()

In [None]:
# Recursively pick up every Markdown file below the people-group folder and
# read each one with TextLoader.
loader = DirectoryLoader(
    "../data/synthetic-data-rag/people-group/", glob="**/*.md", loader_cls=TextLoader
)
# `docs` is the list of loaded documents that gets split into chunks later on.
docs = loader.load()

## Index data

In [None]:
# Embedding function that calls OpenAI, authenticated with OPENAI_API_KEY from
# the environment.
openai_ef = OpenAIEmbeddingFunction(api_key=os.getenv("OPENAI_API_KEY"))
# No path is given, so the on-disk location is whatever Chroma uses by default.
client = chromadb.PersistentClient()

# Fetch the "gitlab-handbook" collection, creating it when it does not exist yet.
collection = client.get_or_create_collection(
    "gitlab-handbook", embedding_function=openai_ef
)

In [None]:
from langchain_text_splitters import MarkdownTextSplitter

# Markdown-aware splitter whose length function comes from the tiktoken encoding
# for "gpt-4o"; chunk_size / chunk_overlap are presumably counted in tokens --
# TODO confirm against the splitter documentation.
text_splitter = MarkdownTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4o",
    chunk_size=1000,
    chunk_overlap=200,
)
# `splits` holds the chunked documents that are indexed and sampled below.
splits = text_splitter.split_documents(docs)

In [None]:
def create_batches(ids, documents, metadatas, batch_size=100):
    """Cut three parallel sequences into consecutive, equally sized batches.

    Args:
        ids: Identifiers, one per item.
        documents: Document texts, aligned with ``ids``.
        metadatas: Metadata entries, aligned with ``ids``.
        batch_size: Maximum number of items per batch.

    Returns:
        A list of ``(ids, metadatas, documents)`` tuples. Note that metadatas
        come *before* documents in each tuple, unlike the parameter order.
    """
    starts = range(0, len(ids), batch_size)
    return [
        (
            ids[start : start + batch_size],
            metadatas[start : start + batch_size],
            documents[start : start + batch_size],
        )
        for start in starts
    ]

In [None]:
# Chroma takes parallel lists: one string id, one text body and one metadata
# dict per chunk.
ids = [str(i) for i in range(len(splits))]
documents = [doc.page_content for doc in splits]
metadatas = [doc.metadata for doc in splits]

# The collection was obtained with get_or_create_collection, so it always exists
# by the time this runs; asking the client whether it exists would therefore
# never trigger indexing. Decide based on whether it already holds any records.
if collection.count() > 0:
    print("Collection already exists, skipping creation.")
else:
    batches = create_batches(ids=ids, documents=documents, metadatas=metadatas)
    # create_batches yields (ids, metadatas, documents) tuples, in that order.
    for i, (batch_ids, batch_metadatas, batch_documents) in enumerate(batches):
        print(f"Adding batch {i} of size {len(batch_ids)}")
        collection.add(
            ids=batch_ids, metadatas=batch_metadatas, documents=batch_documents
        )

In [None]:
class Chunk(BaseModel):
    """A piece of indexed text together with the path of the file it came from."""

    path: str  # filled from the "source" entry of the chunk's metadata
    page_content: str  # text of the chunk


@traceable
def get_similar_chunks(
    chunk, n_results: int = 5, pool_size: int = 20
) -> list[Chunk]:
    """Return a random sample of the chunks most similar to ``chunk``.

    The collection is queried with ``chunk.page_content`` for up to
    ``pool_size`` nearest neighbours, and ``n_results`` of them are then picked
    at random (in random order).

    Args:
        chunk: Object exposing a ``page_content`` string used as the query text.
        n_results: How many chunks to return.
        pool_size: How many nearest neighbours to request before sampling.

    Returns:
        At most ``n_results`` ``Chunk`` objects; fewer when the query itself
        returns fewer hits than requested.
    """
    results = collection.query(query_texts=[chunk.page_content], n_results=pool_size)
    # A single query text was sent, so the hits live at index 0.
    hit_documents = results["documents"][0]
    hit_metadatas = results["metadatas"][0]
    # Sample from what actually came back rather than from a fixed range, so a
    # short result list cannot cause an IndexError.
    n_hits = len(hit_documents)
    picked = random.sample(range(n_hits), min(n_results, n_hits))
    return [
        Chunk(path=hit_metadatas[i]["source"], page_content=hit_documents[i])
        for i in picked
    ]

In [None]:
def format_chunks(chunks: list[Chunk]) -> str:
    """Render chunks as prompt text.

    Each chunk becomes a section made of a ``*** Filepath: ... ***`` header line,
    the chunk text and a trailing newline; sections are joined with a newline.
    An empty list yields an empty string.
    """
    sections = []
    for item in chunks:
        sections.append(f"*** Filepath: {item.path} ***\n{item.page_content}\n")
    return "\n".join(sections)

## Generate QA Pairs

In [None]:
# System prompt for QA generation: asks the model to extract a fact that appears
# only in the target chunk and to write a question answered by it, replying as
# JSON with the keys "fact", "confounding_terms" and "question".
system_prompt_generate = dedent(
    """
    You are a helpful assistant generating synthetic QA pairs for retrieval evaluation.

    Given a target chunk of text and a set of confounding chunks, you must extract a specific, self-contained fact from the target chunk that is not included in the confounding chunks. Then write a question that is directly and unambiguously answered by that fact. The question should only be answered by the fact extracted from the target chunk (and not by any of the confounding chunks) but it should also use themes or terminology that is present in the confounding chunks.

    Always respond with a JSON object with the following keys (in that exact order):
    1. "fact": "<the fact extracted from the target chunk>",
    2. "confounding_terms": "<a list of terms or themes from the confounding chunks that are relevant to the question>",
    3. "question": "<the question that is directly and unambiguously answered by the fact>",
    
    You should write the questions as if you're an employee looking for information in the handbook. The question should be as realistic and natural as possible, reflecting the kind of queries an employee might actually make when searching for information in the handbook.
    """
)

# User prompt template; the {target_chunk} and {confounding_chunks} placeholders
# are filled in when the chat prompt is invoked.
user_prompt_generate = dedent(
    """
    TARGET CHUNK:
    {target_chunk}

    CONFOUNDING CHUNKS:
    {confounding_chunks} 
    """
)

In [None]:
# Structured-output schema for one generated QA pair. The fields are declared in
# the same order in which the generation system prompt lists the JSON keys.
class Response(BaseModel):
    fact: str  # fact extracted from the target chunk
    confounding_terms: list[str] = []  # terms/themes taken from the confounding chunks
    question: str  # question answered by `fact`


# Chat model used for QA generation; the filtering step reuses the same `llm`.
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=1)
# Same model, configured to produce structured output following `Response`.
llm_with_structured_output = llm.with_structured_output(Response)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Chat prompt for QA generation: the system instructions followed by the user
# message that carries the target and confounding chunks.
messages = ChatPromptTemplate.from_messages(
    [("system", system_prompt_generate), ("user", user_prompt_generate)]
)

In [None]:
@traceable
async def generate_qa_pair(random_chunk):
    """Generate one synthetic QA pair whose answer lives in ``random_chunk``.

    ``random_chunk`` itself is used as the target chunk, so the chunk a caller
    passes in is the ground truth for the generated question. Chunks similar to
    it are fetched with ``get_similar_chunks`` and supplied as confounders.

    Args:
        random_chunk: Object exposing ``page_content`` and a ``metadata`` mapping
            with a ``"source"`` entry.

    Returns:
        The structured output produced by ``llm_with_structured_output``.
    """
    target = Chunk(
        path=random_chunk.metadata["source"],
        page_content=random_chunk.page_content,
    )
    # The query chunk may come back as one of its own neighbours; drop it so the
    # target never doubles as a confounder.
    confounders = [
        neighbour
        for neighbour in get_similar_chunks(random_chunk)
        if neighbour.page_content != target.page_content
    ]
    compiled_messages = await messages.ainvoke(
        {
            "target_chunk": format_chunks([target]),
            "confounding_chunks": format_chunks(confounders),
        }
    )
    return await llm_with_structured_output.ainvoke(compiled_messages)

In [None]:
# Draw 200 chunks at random. random.choice samples with replacement, so the same
# chunk can be drawn more than once.
random_splits = [random.choice(splits) for _ in range(200)]
# One coroutine per sampled chunk; they are all awaited together below.
tasks = [generate_qa_pair(random_split) for random_split in random_splits]
# Top-level await: relies on a notebook / async-aware interpreter. gather returns
# the results in the same order as `tasks`, i.e. aligned with `random_splits`.
qa_pairs = await asyncio.gather(*tasks)

In [None]:
import pandas as pd

# One row per generated QA pair, written out for manual inspection.
# model_dump() is the Pydantic v2 replacement for the deprecated .dict().
df = pd.DataFrame([qa_pair.model_dump() for qa_pair in qa_pairs])
df.to_excel("qa_pairs.xlsx", index=False)

## Filtering QA pairs

In [None]:
# System prompt for the realism review: shows manually rated example queries and
# asks for an explanation plus a 1-5 rating of a generated question.
# NOTE(review): the first sentence reads "an company's" -- probably a typo for
# "a company's"; left untouched here because it is part of the prompt text.
system_prompt_filtering = dedent(
    """
You are an AI assistant helping us curate a high-quality dataset of questions for evaluating an company's internal handbook. We have generated synthetic questions and need to filter out those that are unrealistic or not representative of typical user queries.

Here are examples of realistic and unrealistic user queries we have manually rated:

### Realistic Queries (Good Examples)

* **Query:** "What is the required process for creating a new learning hub for your team in Level Up at GitLab?"
    * **Explanation:** Very realistic user query. It's concise, information-seeking, and process-oriented.
    * **Rating:** 5
* **Query:** "Where is the People Operations internal handbook hosted, and how can someone gain access to it?"
    * **Explanation:** Realistic query but might be a bit too detailed for a typical user.
    * **Rating:** 4
* **Query:** "Who controls access to People Data in the data warehouse at GitLab, and what approvals are required for Analytics Engineers and Data Analysts to obtain access?"
    * **Explanation:** Seems reasonable but too lengthy for a typical user query. 
    * **Rating:** 3

### Unrealistic Queries (Bad Examples)

* **Query:** "If a GitLab team member has been with the company for over 3 months and is interested in participating in the Onboarding Buddy Program, what should they do to express their interest?"
    * **Explanation:** Overly specific and unnatural. No real user would ask this.
    * **Rating:** 1
* **Query:** "On what date did the 'Managing Burnout with Time Off with John Fitch' session occur as part of the FY21 Learning Speaker Series?"
    * **Explanation:** Irrelevant and overly specific. Not a typical user query. 
    * **Rating:** 2

### Your Task

For the following generated question, please:

1.  Rate its realism as a typical user query for an internal handbook application on a scale of 1 to 5 (1 = Very Unrealistic, 3 = Neutral/Somewhat Realistic, 5 = Very Realistic).
2.  Provide a brief explanation for your rating, comparing it to the examples above if helpful.

### Output Format

**Explanation:** `[Your brief explanation]`
**Rating:** `[Your 1–5 rating]`
"""
)

# User prompt template; {question_to_evaluate} is replaced by the question under review.
user_prompt_filtering = dedent("""
**Generated Question to Evaluate:**
`{question_to_evaluate}`
""")


# Structured-output schema for one realism review; mirrors the Explanation /
# Rating output format requested in the filtering system prompt.
class ResponseFiltering(BaseModel):
    explanation: str  # brief justification for the rating
    rating: int  # realism score; the prompt asks for a value from 1 to 5


# Same chat model as the generation step, configured to produce structured
# output following `ResponseFiltering`.
llm_with_structured_output_filtering = llm.with_structured_output(ResponseFiltering)

# Chat prompt for the realism review: rating instructions plus the question to rate.
messages_filtering = ChatPromptTemplate.from_messages(
    [("system", system_prompt_filtering), ("user", user_prompt_filtering)]
)

In [None]:
async def generate_review_qa_pair(qa_pair):
    """Ask the filtering model to rate how realistic one generated question is.

    Only ``qa_pair.question`` is placed into the filtering prompt; the value
    returned by the structured-output model is handed back unchanged.
    """
    prompt_inputs = {"question_to_evaluate": qa_pair.question}
    prompt_value = await messages_filtering.ainvoke(prompt_inputs)
    return await llm_with_structured_output_filtering.ainvoke(prompt_value)


async def review_qa_pairs(qa_pairs) -> list[ResponseFiltering]:
    """Review every generated QA pair concurrently.

    One ``generate_review_qa_pair`` coroutine is created per pair and all of
    them are awaited together; the reviews come back in the order of
    ``qa_pairs``.
    """
    pending_reviews = (generate_review_qa_pair(pair) for pair in qa_pairs)
    return await asyncio.gather(*pending_reviews)


# Review all generated pairs; `results` is index-aligned with `qa_pairs`.
# (Top-level await: relies on a notebook / async-aware interpreter.)
results = await review_qa_pairs(qa_pairs)

In [None]:
# Keep the pairs whose question was rated 4 or higher, storing the review next
# to the question and the fact that serves as its answer.
high_ratings = [
    dict(
        rating=review.rating,
        explanation=review.explanation,
        question=pair.question,
        answer=pair.fact,
    )
    for review, pair in zip(results, qa_pairs)
    if review.rating >= 4
]
# Number of questions that survived the filter.
len(high_ratings)

In [None]:
# high_ratings holds dicts with lowercase keys, so the frame is built from those
# keys and the headers are capitalized afterwards. Passing the capitalized names
# directly to ``columns=`` would select keys that do not exist and yield a sheet
# filled with NaN.
df_high_ratings = pd.DataFrame(
    high_ratings, columns=["rating", "explanation", "question"]
).rename(columns=str.capitalize)

df_high_ratings.to_excel("high_ratings.xlsx", index=False)

## Evaluating the RAG system

In [None]:
# LangSmith client. NOTE(review): this rebinds the name `client`, which earlier
# in the file refers to the Chroma PersistentClient -- consider a distinct name.
client = Client()

# Create the LangSmith dataset intended to hold the evaluation examples.
dataset = client.create_dataset(dataset_name="Gitlab Handbook QA Evaluation")

In [None]:
# Build one evaluation example per high-rated question.
#
# The entries of `high_ratings` are plain dicts (no attribute access) and the
# list is already filtered, so zipping it with `random_splits` would pair
# questions with the wrong chunks. Iterate the three index-aligned lists
# instead and apply the same `rating >= 4` filter that produced `high_ratings`
# (keep the two thresholds in sync).
#
# NOTE(review): LangSmith's create_examples may expect the keys "inputs" /
# "outputs" rather than "input" / "output" -- confirm before uploading.
examples = [
    {
        "input": {
            "question": qa_pair.question,
        },
        "output": {
            "answer": qa_pair.fact,
            "chunk": chunk,
        },
    }
    for result, qa_pair, chunk in zip(results, qa_pairs, random_splits)
    if result.rating >= 4
]

# client.create_examples(dataset_id=dataset.id, examples=examples)

In [None]:
# Display the assembled examples for inspection.
examples

## Generating more diverse QA pairs