# Intelligent Synthetic Dataset Generator

An AI-powered tool that creates realistic synthetic datasets for any business case—whether you provide the schema or let it intelligently design one for you.

It works with Claude, Gemini, GPT and Hugging Face APIs.

## Imports

In [None]:
!pip install -q requests bitsandbytes anthropic

In [None]:
import os
import requests
import json
from google.colab import userdata

from openai import OpenAI
import anthropic
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
import pandas as pd

import gradio as gr
import gc

In [None]:
# Read the credentials for every provider from the Colab secret store
# (google.colab.userdata). Each secret must exist under exactly this name.
hf_token = userdata.get('HF_TOKEN')
openai_api_key = userdata.get('OPENAI_API_KEY')
anthropic_api_key = userdata.get('ANTHROPIC_API_KEY')
google_api_key = userdata.get('GOOGLE_API_KEY')

# Authenticate with the Hugging Face Hub so the model checkpoints used by
# ask_hf can be downloaded; the token is also handed to the git credential store.
login(hf_token, add_to_git_credential=True)

In [None]:
# Quantization settings passed to AutoModelForCausalLM.from_pretrained in ask_hf:
# 4-bit NF4 weights with double quantization, computing in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

## Configuration

In [None]:
# Model identifiers. The first three are listed in HF_MODELS and are loaded
# locally by ask_hf; the last three are sent to hosted APIs by ask_gpt,
# ask_claude and ask_gemini respectively.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
PHI3 = "microsoft/Phi-3-mini-4k-instruct"
GEMMA2 = "google/gemma-2-2b-it"
GPT = "gpt-4o-mini"
CLAUDE = "claude-3-haiku-20240307"
GEMINI = "gemini-2.0-flash"

In [None]:
# Maps the label shown in the UI's model dropdown to the model identifier
# that query_llm hands to the matching backend function.
MODELS = {
    'LLama 3.1' : LLAMA,
    'Phi 3 mini': PHI3,
    'Gemma 2': GEMMA2,
    'GPT 4.o mini': GPT,
    'Claude 3 Haiku': CLAUDE,
    'Gemini 2.0 Flash': GEMINI,
}

# Identifiers that query_llm routes to ask_hf (local transformers inference)
# rather than to one of the hosted-API helpers.
HF_MODELS = [LLAMA, PHI3, GEMMA2]

In [None]:
# Output formats accepted by generate_dataset / save_dataset; also the choices
# of the UI's "File format" dropdown, whose default is the first entry.
FILE_FORMATS = [".csv", ".tsv", ".jsonl", ".json"]

In [None]:
# Default example schema: one (column name, type, description, example) tuple
# per output column of the restaurant dataset.
SCHEMA = [
    ("Name", "TEXT", "Name of the restaurant", "Blue River Bistro"),
    ("Address", "TEXT", "Restaurant address", "742 Evergreen Terrace, Springfield, IL 62704"),
    ("Type", "TEXT", "Kitchen type", 'One of ["Thai","Mediterranean","Vegan","Steakhouse","Japanese"] or other potential types'),
    ("Average Price", "TEXT", "Average meal price", "$45, or '--' if unknown"),
    ("Year", "INT", "Year of restaurant opening", 2015),
    ("Menu", "Array", "List of meals", '["Grilled Salmon", "Caesar Salad", "Pad Thai", "Margherita Pizza", ...]'),
]

# Numbered, human-readable rendering of SCHEMA (one column per line); used as
# the initial content of the UI's schema textbox.
DEFAULT_SCHEMA_TEXT = "\n".join(
    f"{position}. {name} ({col_type}) - {description}, example: {example}"
    for position, (name, col_type, description, example) in enumerate(SCHEMA, start=1)
)
print(DEFAULT_SCHEMA_TEXT)

In [None]:
# System prompt shared by every backend (ask_gpt, ask_claude, ask_gemini, ask_hf).
# It asks for JSONL-only output because query_llm parses the reply line by line.
# The stray trailing double quote that used to end the last sentence was removed
# so it is no longer sent to the model as part of the instructions.
system_prompt = """
You are an expert in generating synthetic datasets tailored to a given business case and user requirements.
If the user does not specify output columns, infer and create the most appropriate columns based on your expertise.
Do NOT repeat column values from one row to another. Only output valid JSONL without any comments.
"""


def get_user_prompt(business_case, schema_text, nr_records):
    """Build the user prompt sent to the model.

    Args:
        business_case: Free-text description of what the dataset is about.
        schema_text: Optional column description (one column per line). ``None``,
            an empty string, or whitespace-only text all mean "no schema given".
        nr_records: Number of rows the model is asked to generate.

    Returns:
        The prompt string. The field-list sentence is appended only when a
        non-blank schema was supplied.
    """
    prompt = f"The business case is: {business_case}.\nGenerate {nr_records} rows of data in JSONL format.\n"

    # A cleared UI textbox arrives as "" rather than None; treating it as a schema
    # would announce a field list and then provide none, contradicting the system
    # prompt's instruction to infer columns when they are not specified.
    if schema_text is not None and schema_text.strip():
        prompt += f"Each line should be a JSON object with the following fields: \n{schema_text}\n"

    return prompt

## LLM handler

In [None]:
def ask_gpt(model: str, user_prompt: str):
    """Send the shared system prompt plus ``user_prompt`` to an OpenAI chat model.

    Args:
        model: OpenAI model name to query.
        user_prompt: Text sent as the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = OpenAI(api_key=openai_api_key).chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.7,
    )
    return completion.choices[0].message.content

In [None]:
def ask_claude(model: str, user_prompt: str):
    """Send ``user_prompt`` to an Anthropic model, with the shared system prompt.

    Args:
        model: Anthropic model name to query.
        user_prompt: Text sent as the single user message.

    Returns:
        The text of the first content block in the reply.
    """
    reply = anthropic.Anthropic(api_key=anthropic_api_key).messages.create(
        model=model,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        temperature=0.7,
        max_tokens=4000,
    )
    first_block = reply.content[0]
    return first_block.text

In [None]:
def ask_gemini(model: str, user_prompt: str):
    """Query a Gemini model through Google's OpenAI-compatible endpoint.

    Args:
        model: Gemini model name to query.
        user_prompt: Text sent as the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    # The OpenAI client is reused as-is; only the base URL and key differ.
    gemini_client = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=google_api_key,
    )
    completion = gemini_client.chat.completions.create(
        model=model,
        temperature=0.7,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion.choices[0].message.content

In [None]:
def ask_hf(model: str, user_prompt: str):
    """Generate a reply with a locally loaded, 4-bit quantized Hugging Face model.

    The tokenizer, input tensor, model and output tensor are kept in module-level
    globals so the notebook can delete them and free GPU memory afterwards.

    Args:
        model: Hugging Face Hub repository id of the chat model.
        user_prompt: Text sent as the user message.

    Returns:
        The generated assistant reply, stripped of surrounding whitespace.
    """
    global tokenizer, inputs, hf_model, outputs

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    # NOTE(review): some chat templates (Gemma 2, reportedly) reject a "system"
    # role — confirm, and fold the system prompt into the user turn if so.

    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # The model is cached between calls, but it must match the requested
    # checkpoint: otherwise a tokenizer for one model would feed another.
    if hf_model is None or getattr(ask_hf, "_loaded_model_id", None) != model:
        # Release the previous checkpoint before loading the next one so both
        # do not have to fit in GPU memory at the same time.
        hf_model = None
        gc.collect()
        torch.cuda.empty_cache()
        hf_model = AutoModelForCausalLM.from_pretrained(model, device_map="auto", quantization_config=quant_config)
        ask_hf._loaded_model_id = model

    # add_generation_prompt=True ends the prompt with the assistant header, so
    # everything the model generates belongs to the reply.
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    outputs = hf_model.generate(inputs, max_new_tokens=4000)

    # Decode only the tokens produced after the prompt. This works for every chat
    # template, unlike splitting on the Llama-specific "assistant<|end_header_id|>"
    # marker, which yields an empty reply for other model families.
    generated = outputs[0][inputs.shape[-1]:]
    content = tokenizer.decode(generated, skip_special_tokens=True).strip()

    return content

In [None]:
def query_llm(model_name: str, user_prompt):
    """Query the selected model and parse its JSONL reply into records.

    Args:
        model_name: A key of ``MODELS`` (the label shown in the UI).
        user_prompt: Prompt text forwarded to the backend.

    Returns:
        A list with one parsed JSON object per reply line that starts with "{".
        Lines that are not valid JSON (for example a row cut off by the token
        limit) are skipped rather than failing the whole batch.

    Raises:
        Exception: Wraps any failure (unknown model, API error, ...) with a
            "Model query failed" message; the original error is chained.
    """
    try:
        if model_name not in MODELS:
            raise ValueError(f"Unsupported model. Use one of {', '.join(MODELS.keys())}")

        model = MODELS[model_name]
        model_key = model.lower()

        if "gpt" in model_key:
            response = ask_gpt(model, user_prompt)

        elif "claude" in model_key:
            response = ask_claude(model, user_prompt)

        elif "gemini" in model_key:
            response = ask_gemini(model, user_prompt)

        elif model in HF_MODELS:
            response = ask_hf(model, user_prompt)

        else:
            raise ValueError(f"Unsupported model. Use one of {', '.join(MODELS.keys())}")

        records = []
        skipped = 0
        # A backend may return no text at all; treat that as an empty reply.
        for raw_line in (response or "").strip().splitlines():
            # Models sometimes separate rows with a trailing comma; a JSON
            # object line never legitimately ends with one.
            line = raw_line.strip().rstrip(",")
            if not line.startswith("{"):
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1

        if skipped:
            print(f"Skipped {skipped} malformed JSONL line(s) in the model response")

        return records

    except Exception as e:
        # Keep the generic exception type callers already handle, but chain the
        # cause so the original traceback stays available.
        raise Exception(f"Model query failed: {str(e)}") from e

## Output Formatter

In [None]:
def save_dataset(records, file_format: str, file_name: str):
    """Write the generated records to ``file_name`` in the requested format.

    Args:
        records: List of dict-like rows, one per dataset record.
        file_format: One of ".csv", ".tsv", ".jsonl" or ".json".
        file_name: Destination path; it is written as given.

    Raises:
        ValueError: If ``file_format`` is not one of the supported formats.
    """
    df = pd.DataFrame(records)
    print(df.shape)
    if file_format == ".csv":
        df.to_csv(file_name, index=False)
    elif file_format == ".tsv":
        df.to_csv(file_name, sep="\t", index=False)
    elif file_format == ".jsonl":
        # Written from the raw records (not the DataFrame) so rows with
        # differing keys are not padded with nulls.
        with open(file_name, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(record) + "\n" for record in records)
    elif file_format == ".json":
        # orient="records" never emits the index, so no `index` argument is
        # passed: older pandas releases reject index=False for this orientation.
        df.to_json(file_name, orient="records")
    else:
        raise ValueError("Unsupported file format")

In [None]:
def generate_dataset(
    model_name: str,
    business_case: str,
    num_records: int = 100,
    schema_text: str = None,
    file_format: str = '.jsonl',
    file_name: str = 'test_dataset.jsonl'
):
    """
    Generates a synthetic dataset using an LLM based on the given business case and optional schema.

    Args:
        model_name: Key of ``MODELS`` selecting which model to query.
        business_case: Free-text description of the dataset to generate.
        num_records: Number of rows to request; must be a whole number from 11 to 1000.
        schema_text: Optional column description. ``None`` or blank text lets the
            model design the columns itself.
        file_format: One of ``FILE_FORMATS``.
        file_name: Output file name; ``file_format`` is appended when missing.

    Returns:
        Tuple[str, pd.DataFrame | None]: A status message and a preview DataFrame (first 10 rows) if successful.
        Every failure is reported through the status message with ``None`` as preview;
        no exception propagates to the caller.
    """
    try:
        # Validate number of records. A UI number field can deliver None (cleared
        # field) or a float; normalise to int so the prompt never asks for
        # "27.0 rows" and bad input gets a readable message instead of a TypeError.
        try:
            requested = int(num_records)
        except (TypeError, ValueError, OverflowError):
            return "❌ Error: Number of records must be a whole number.", None
        if requested != num_records:
            return "❌ Error: Number of records must be a whole number.", None
        num_records = requested

        if num_records <= 10:
            return "❌ Error: Number of records must be greater than 10.", None
        if num_records > 1000:
            return "❌ Error: Number of records must be less than or equal to 1000.", None

        # Validate file format
        if file_format not in FILE_FORMATS:
            return f"❌ Error: Invalid file format '{file_format}'. Supported formats: {FILE_FORMATS}", None

        # An empty name would otherwise become a hidden file such as ".csv".
        file_name = (file_name or "").strip()
        if not file_name:
            return "❌ Error: File name must not be empty.", None

        # Ensure file name has correct extension
        if not file_name.endswith(file_format):
            file_name += file_format

        # A blank schema box means "let the model choose the columns".
        if schema_text is not None and not schema_text.strip():
            schema_text = None

        # Generate the prompt and query the model
        prompt = get_user_prompt(business_case, schema_text, num_records)
        records = query_llm(model_name, prompt)

        if not records:
            return "❌ Error: No valid records were generated by the model.", None

        # Save dataset
        save_dataset(records, file_format, file_name)

        # Prepare preview
        df = pd.DataFrame(records)
        preview = df.head(10)

        success_message = (
            f"✅ Generated {len(records)} records successfully!\n"
            f"📁 Saved to: {file_name}\n"
        )
        # Models do not always return exactly the requested number of rows;
        # make the mismatch visible instead of implying full success.
        if len(records) != num_records:
            success_message += f"⚠️ Requested {num_records} records; the model returned {len(records)}.\n"

        return success_message, preview

    except Exception as e:
        # UI boundary: report any failure in the status box rather than raising.
        return f"❌ Error: {str(e)}", None

In [None]:
# Build the Gradio UI: inputs on the left, instructions, status and a data
# preview on the right, with the Generate button wired to generate_dataset.
with gr.Blocks(title="Synthetic Dataset Generator", theme=gr.themes.Monochrome()) as interface:
    # Module-level names that ask_hf declares `global`. hf_model starts as None
    # so the first Hugging Face request triggers the model load.
    tokenizer = None
    inputs = None
    hf_model = None
    outputs = None

    gr.Markdown("# Dataset Generator")
    gr.Markdown("Generate synthetic datasets using AI models")

    with gr.Row():
        # Left column: all user inputs.
        with gr.Column(scale=2):
            schema_input = gr.Textbox(
                label="Schema",
                value=DEFAULT_SCHEMA_TEXT,
                lines=15,
                placeholder="Define your dataset schema here... Please follow this format: Name (TYPE) - Description, example: Example"
            )

            business_case_input = gr.Textbox(
                label="Business Case",
                value="I want to generate restaurant dataset",
                lines=1,
                placeholder="Enter business case description..."
            )

            with gr.Row():
                # Choices are the MODELS labels; the first one is preselected.
                model_dropdown = gr.Dropdown(
                    label="Model",
                    choices=list(MODELS.keys()),
                    value=list(MODELS.keys())[0],
                    interactive=True
                )

                # Bounds mirror the validation in generate_dataset (11 to 1000).
                nr_records_input = gr.Number(
                    label="Number of records",
                    value=27,
                    minimum=11,
                    maximum=1000,
                    step=1
                )

            with gr.Row():
                filename_input = gr.Textbox(
                      label="Save as",
                      value="restaurant_dataset",
                      placeholder="Enter filename (extension will be added automatically)"
                  )

                file_format_dropdown = gr.Dropdown(
                    label="File format",
                    choices=FILE_FORMATS,
                    value=FILE_FORMATS[0],
                    interactive=True
                )

            generate_btn = gr.Button("🚀 Generate", variant="secondary", size="lg")

        # Right column: usage instructions, status message and result preview.
        with gr.Column(scale=1):
            gr.Markdown("""
            ### 📝 Dataset Generation Instructions

            1. **🗂 Schema** – Define your dataset structure
              *(default: restaurant schema provided)*
            2. **💡 Business Case** – Enter a prompt to guide the AI for generating data
            3. **🤖 Model** – Choose your AI model: GPT, Claude, Gemini, or Hugging Face
            4. **📊 Number of Records** – Specify entries to generate
              *(min: 11, max: 1000)*
            5. **📁 File Format** – Select output type: `.csv`, `.tsv`, `.jsonl`, or `.json`
            6. **💾 Save As** – Provide a filename *(extension auto-added)*
            7. **🚀 Generate** – Click **Generate** to create your dataset

            ### 🔧 Requirements

            Set API keys in Colab’s secret section:
              `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `HF_TOKEN`
            """)
            output_status = gr.Textbox(
                label="Status",
                lines=4,
                interactive=False
            )

            output_preview = gr.Dataframe(
                label="Preview (first 10 rows)",
                interactive=False,
                wrap=True
            )

    # The inputs are passed positionally, so their order must match the
    # parameter order of generate_dataset: model_name, business_case,
    # num_records, schema_text, file_format, file_name.
    generate_btn.click(
        fn=generate_dataset,
        inputs=[
            model_dropdown,
            business_case_input,
            nr_records_input,
            schema_input,
            file_format_dropdown,
            filename_input
        ],
        outputs=[output_status, output_preview]
    )

# NOTE(review): debug=True presumably keeps this cell running until the app is
# stopped, so the cleanup below would only run afterwards — confirm in Colab.
interface.launch(debug=True)

# Drop the references held for ask_hf, then run garbage collection and release
# PyTorch's cached CUDA memory.
del tokenizer, inputs, hf_model, outputs
gc.collect()
torch.cuda.empty_cache()