# Synthetic Data Generation (SDG)


## Prepare Dataset

In [1]:
from datasets import load_dataset
from langchain.schema import Document

# Source corpus and sampling parameters, named here so they are easy to tune.
DATASET_NAME = "motionlabs/fineweb-ultra-mini"
# NOTE: machine-specific cache location; adjust when running elsewhere.
DATASET_CACHE_DIR = "/mnt/d/datasets/fineweb-ultra-mini"
# Number of source documents fed to the testset generator.
NUM_SOURCE_DOCS = 3

dataset = load_dataset(DATASET_NAME, cache_dir=DATASET_CACHE_DIR)

# Take the first NUM_SOURCE_DOCS rows of the train split, clamped to the
# split size so a smaller split does not trigger an out-of-range selection.
train_split = dataset['train']
selected_data = train_split.select(range(min(NUM_SOURCE_DOCS, len(train_split))))
texts = [item['text'] for item in selected_data]

# Convert texts to LangChain Document objects
docs = [Document(page_content=text) for text in texts]

Generating train split:   0%|          | 0/72742 [00:00<?, ? examples/s]

In [39]:
import re

def extract_headlines(text):
    """Return the titles of the numbered headline lines found in ``text``.

    A headline line starts (after optional indentation) with a section
    number -- one or more dot-separated integers such as ``1`` or ``2.1`` --
    that is terminated by a period or a closing parenthesis and followed by
    whitespace and a non-empty title, e.g. ``"1. Introduction"``,
    ``"2.1. Prompt Design"`` or ``"3) Results"``.

    Lines that merely begin with a number (``"2024 was a strong year"``,
    ``"1.5 million users"``) are not headlines and are skipped.

    Args:
        text: The document text to scan; may span multiple lines.

    Returns:
        A list of headline titles in order of appearance, without the
        leading section number or surrounding whitespace. The list is empty
        when no headline is found.
    """
    # [ \t] is used instead of \s so a match can never spill across a line
    # break; (\S.*?) forces the title to start with a visible character, and
    # the trailing [ \t\r]* drops end-of-line padding (including a stray CR).
    pattern = r'^[ \t]*\d+(?:\.\d+)*[.)][ \t]+(\S.*?)[ \t\r]*$'
    return re.findall(pattern, text, re.MULTILINE)

In [40]:
# Record each document's numbered headlines in its metadata. Documents with
# no headline get a one-element placeholder list instead of an empty list.
for doc in docs:
    doc.metadata['headlines'] = (
        extract_headlines(doc.page_content) or ['No headlines found']
    )


## Generate Testset


In [1]:
from ragas.testset import TestsetGenerator
from langchain_ollama import ChatOllama
from langchain_huggingface import HuggingFaceEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Chat model served through Ollama, used as the generator LLM.
generator_llm = ChatOllama(model="qwen3:8b", temperature=0.1)

# Smoke test: send one short prompt and print the raw response object so a
# missing or unreachable model shows up here rather than during generation.
smoke_reply = generator_llm.invoke("hi")
print(smoke_reply)

# Sentence-transformers embedding model used alongside the generator LLM.
generator_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Ragas-wrapped versions of the LangChain LLM and embeddings objects.
wrapped_llm, wrapped_embeddings = (
    LangchainLLMWrapper(generator_llm),
    LangchainEmbeddingsWrapper(generator_embeddings),
)

  from .autonotebook import tqdm as notebook_tqdm


content='<think>\nOkay, the user said "hi". I need to respond appropriately. Since it\'s a simple greeting, I should acknowledge their greeting and offer assistance. Maybe start with a friendly "Hello!" and ask how I can help. Keep it open-ended so they feel comfortable to ask anything. Make sure the tone is positive and welcoming. Let me check if there\'s anything else needed. No, that should cover it. Alright, time to put it all together.\n</think>\n\nHello! 😊 How can I assist you today? I\'m here to help with any questions or tasks you might have!' additional_kwargs={} response_metadata={'model': 'qwen3:8b', 'created_at': '2025-08-13T02:48:35.91344224Z', 'done': True, 'done_reason': 'stop', 'total_duration': 24028804272, 'load_duration': 19189712961, 'prompt_eval_count': 9, 'prompt_eval_duration': 482973816, 'eval_count': 122, 'eval_duration': 4353258620, 'model_name': 'qwen3:8b'} id='run--fcaeb002-6783-430c-a700-180d63ba9617-0' usage_metadata={'input_tokens': 9, 'output_tokens': 12

    Found GPU0 NVIDIA GeForce GTX 1070 Ti which is of cuda capability 6.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (7.0) - (12.0)
    
    Please install PyTorch with a following CUDA
    configurations:  12.6 following instructions at
    https://pytorch.org/get-started/locally/
    
NVIDIA GeForce GTX 1070 Ti with CUDA capability sm_61 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_70 sm_75 sm_80 sm_86 sm_90 sm_100 sm_120.
If you want to use the NVIDIA GeForce GTX 1070 Ti GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [None]:
# Build the Ragas testset generator from the LangChain LLM and embeddings.
generator = TestsetGenerator.from_langchain(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# Generate the synthetic testset from the prepared documents.
# NOTE: this rebinds `dataset`, which earlier held the loaded source corpus.
dataset = generator.generate_with_langchain_docs(docs, testset_size=3)

## Analyze the Testset


In [None]:
# Convert the generated testset to a DataFrame for inspection, then show its
# type as a quick check that the conversion produced a pandas object.
df = dataset.to_pandas()
type(df)

pandas.core.frame.DataFrame

In [None]:
# Display the generated testset.
df

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,What contributions has Han Zhou made to the fi...,[5 2 0 2 b e F 4 ] G L . s c [ 1 v 3 3 5 2 0 ....,Han Zhou is one of the authors of a study that...,single_hop_specifc_query_synthesizer
1,How do Kojima et al. contribute to the underst...,[1. Introduction Exemplar Topology Optimizer d...,Kojima et al. (2022) contribute to the underst...,single_hop_specifc_query_synthesizer
2,What is the significance of prompt design in m...,[2.1. Block-level: Prompt Design for Agents At...,The significance of prompt design in multi-age...,single_hop_specifc_query_synthesizer
3,What were the key factors contributing to Meta...,"[<1-hop>\n\nConclusion In summary, 2024 was a ...",Meta's stock gain of 72% in 2024 was driven by...,multi_hop_specific_query_synthesizer
4,How do the findings of Madaan et al. regarding...,[<1-hop>\n\n2.2. Workflow-level Search Space D...,Madaan et al. emphasize the significance of se...,multi_hop_specific_query_synthesizer
5,What was Amazon's stock performance in 2024 an...,[<1-hop>\n\nStock Market Performance in 2024 U...,Amazon's stock staged an impressive rebound in...,multi_hop_specific_query_synthesizer


## Saving DataFrame as CSV


In [None]:
import pandas as pd

from pathlib import Path

# Single source of truth for the output location, so the write and the
# read-back below cannot drift apart.
csv_path = Path("./data/eval_dataframe.csv")

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before writing.
csv_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(csv_path, index=False)

# Read the file back to confirm the round trip preserves the table shape.
# NOTE(review): `reference_contexts` appears to hold lists; CSV stores them
# as text, so `loaded_df` will contain strings in that column -- parse them
# if list values are needed downstream.
loaded_df = pd.read_csv(csv_path)

df.shape, loaded_df.shape

((6, 4), (6, 4))

## Saving DataFrame as Parquet

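Parquet is a better fit than CSV for larger datasets: it is a typed, columnar format that is faster to read and write and produces smaller files.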


In [None]:
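# Parquet export is currently disabled; uncomment the lines below to use it.
# NOTE: these paths point at ../data, whereas the CSV cell writes to ./data --
# pick one location before enabling this cell.
# df.to_parquet("../data/eval_dataframe.parquet")

# loaded_df = pd.read_parquet("../data/eval_dataframe.parquet")

# df.shape, loaded_df.shape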
# ## Saving DataFrame as Feather


((12, 4), (12, 4))