# Synthetic Dataset Generator
Generates realistic synthetic datasets from a schema description using Claude Haiku.

```

In [1]:
import json
import csv
import os
import tempfile
from typing import Optional

import anthropic
import gradio as gr
import pandas as pd
from dotenv import load_dotenv

In [None]:
# Load settings before any constant below reads the environment.
# (load_dotenv comes from python-dotenv; presumably it populates os.environ
# from a local .env file — the status message below refers to that file.)
load_dotenv()

# Credentials: empty string when the variable is not set.
ANTHROPIC_API_KEY: str = os.environ.get("ANTHROPIC_API_KEY", "")

# Model identifier sent with every request.
MODEL: str = "claude-haiku-4-5-20251001"

# Upper bound on the number of records requested in a single API call.
BATCH_SIZE: int = 50

# System prompt shared by every request; it pins the output to a bare JSON array.
SYSTEM_PROMPT = """You are a synthetic data generation engine. Your only job is to produce realistic, varied JSON arrays matching the schema and constraints given.

Rules:
- Output ONLY a valid JSON array, no commentary or markdown fences
- Every record must be unique and realistic — no repetitive patterns
- Respect all types, ranges, formats, and constraints precisely
- Vary values naturally across the dataset; avoid clustering
- For names, emails, cities — use diverse, realistic values"""

# Report whether a key was found, without ever printing the key itself.
print(
    "API key loaded: "
    + ("yes" if ANTHROPIC_API_KEY else "no — set ANTHROPIC_API_KEY in .env or below")
)

In [3]:
def build_prompt(schema: str, count: int, example: Optional[str] = None) -> str:
    """Assemble the user-facing prompt for one generation request.

    Args:
        schema: Free-text schema description; surrounding whitespace is dropped.
        count: Number of records to ask for.
        example: Optional sample record; ignored when missing or blank.

    Returns:
        The prompt text: request line, schema, optional example, and a closing
        instruction, each separated by a blank line.
    """
    sections = [
        f"Generate {count} synthetic records matching this schema:",
        schema.strip(),
    ]

    # Only mention an example when there is real content to show.
    trimmed_example = example.strip() if example else ""
    if trimmed_example:
        sections.append(f"Example record:\n{trimmed_example}")

    sections.append(f"Return a JSON array of exactly {count} objects.")
    return "\n\n".join(sections)

In [4]:
def call_claude(api_key: str, prompt: str, max_tokens: int = 4096) -> str:
    """Send a prompt to Claude and return the text of its reply.

    Args:
        api_key: Anthropic API key used to build the client.
        prompt: User message content.
        max_tokens: Maximum number of tokens the model may generate.

    Returns:
        The text of all text blocks in the response, concatenated and stripped.
        An empty string when the response holds no text block.
    """
    client = anthropic.Anthropic(api_key=api_key)
    message = client.messages.create(
        model=MODEL,
        max_tokens=max_tokens,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": prompt}],
    )
    # The response content is a list of blocks. Do not assume it is non-empty
    # or that its first entry is a text block: collect every text block instead
    # of indexing [0].
    text_parts = [
        block.text
        for block in message.content
        if getattr(block, "type", None) == "text"
    ]
    return "".join(text_parts).strip()

In [5]:
def parse_json_response(raw: str) -> list[dict]:
    """Parse the model response into a list of record dicts.

    A response wrapped in a markdown code fence (with or without a language
    tag such as ``json``) is unwrapped before decoding.

    Args:
        raw: Raw text returned by the model.

    Returns:
        The decoded JSON array of objects.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON, or if it decodes
            to anything other than an array of objects.
    """
    text = raw.strip()
    if text.startswith("```"):
        # Keep what sits between the opening fence and the next fence (or the
        # end of the text when the closing fence is missing).
        fenced = text[3:].split("```", 1)[0]
        # Skip the optional language tag as a whole word. str.lstrip("json")
        # strips any run of the characters j/s/o/n rather than a prefix, and
        # misses tags such as "JSON".
        tag_end = next(
            (i for i, ch in enumerate(fenced) if not (ch.isalnum() or ch == "_")),
            len(fenced),
        )
        text = fenced[tag_end:].strip()

    parsed = json.loads(text)
    # Callers extend a record list with the result; a bare object or scalar
    # would corrupt it silently, so reject it with the error type they handle.
    if not isinstance(parsed, list) or not all(isinstance(item, dict) for item in parsed):
        raise json.JSONDecodeError("Expected a JSON array of objects", text, 0)
    return parsed

In [6]:
# Cell 6 — Record Generator (with batching)
def generate_records(
    api_key: str,
    schema: str,
    count: int,
    example: Optional[str] = None,
) -> list[dict]:
    """
    Generate up to `count` records for the given schema.

    Requests are issued in batches of at most BATCH_SIZE records. Progress is
    tracked by the number of records actually received, not the number asked
    for: extra records in a batch are dropped, and a shortfall is requested
    again.

    Args:
        api_key: Anthropic API key forwarded to call_claude.
        schema: Schema description forwarded to build_prompt.
        count: Total number of records wanted; values <= 0 yield an empty list.
        example: Optional example record forwarded to build_prompt.

    Returns:
        A list of at most `count` records. It can be shorter when the model
        keeps under-delivering and the call budget runs out.

    Raises:
        json.JSONDecodeError: Propagated from parse_json_response.
        Exception: Anything raised by call_claude propagates unchanged.
    """
    records: list[dict] = []

    # Allow twice the ideal number of calls so a model that keeps returning
    # short batches cannot loop (and bill) without bound.
    ideal_calls = (count + BATCH_SIZE - 1) // BATCH_SIZE
    calls_left = 2 * max(ideal_calls, 1)

    while len(records) < count and calls_left > 0:
        calls_left -= 1
        batch = min(count - len(records), BATCH_SIZE)
        raw = call_claude(api_key, build_prompt(schema, batch, example))
        # Trim over-delivery so the total never exceeds `count`.
        received = parse_json_response(raw)[:batch]
        if not received:
            # An empty batch makes no progress; stop instead of retrying blindly.
            break
        records.extend(received)
        print(f"  Generated {len(records)} / {count} records...")

    return records

In [7]:
def records_to_csv(records: list[dict]) -> str:
    """Write records to a temporary CSV file and return the file path.

    The header is the union of all record keys in first-seen order; a record
    that lacks a column gets an empty cell there.

    Args:
        records: Record dicts to serialize.

    Returns:
        Path of the CSV file. The file is created with delete=False, so the
        caller owns it and is responsible for removing it.

    Raises:
        Exception: Any error raised while writing propagates after the
            partially written file has been removed.
    """
    # Collect column names directly from the records; dict.fromkeys keeps the
    # first occurrence of each key and preserves order.
    fieldnames = list(dict.fromkeys(key for record in records for key in record))

    tmp = tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", newline="", encoding="utf-8"
    )
    try:
        # The context manager closes the handle even when a row fails to write.
        with tmp:
            writer = csv.DictWriter(tmp, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(records)
    except Exception:
        # delete=False means nothing else cleans up a half-written file.
        os.unlink(tmp.name)
        raise
    return tmp.name

In [8]:
def run_generation(
    api_key_input: str,
    schema: str,
    count: int,
    example: str,
):
    """
    Gradio-facing handler. Returns (DataFrame, status_message, csv_filepath).

    The environment key (ANTHROPIC_API_KEY) takes precedence; the key typed in
    the UI is used only when no environment key is set. On any failure the
    DataFrame and file path are None and the status message explains why, so
    the handler never raises into the UI.

    Args:
        api_key_input: Key entered in the UI; may be empty or None.
        schema: Schema description; required.
        count: Requested number of records (converted with int()).
        example: Optional example record; empty means none.
    """
    # Treat a None input like an empty one instead of failing on .strip().
    key = ANTHROPIC_API_KEY or (api_key_input or "").strip()

    if not key:
        return None, "API key missing. Set ANTHROPIC_API_KEY in .env or enter it above.", None
    if not (schema or "").strip():
        return None, "Schema description is required.", None

    try:
        records = generate_records(key, schema, int(count), example or None)
    except json.JSONDecodeError as exc:
        return None, f"Failed to parse model output as JSON: {exc}", None
    except anthropic.AuthenticationError:
        return None, "Authentication failed. Check your API key.", None
    except Exception as exc:  # noqa: BLE001
        return None, f"Unexpected error: {exc}", None

    if not records:
        return None, "No records returned. Try rephrasing your schema.", None

    # Building the table and the CSV can fail on oddly shaped records; report
    # that through the status box like every other failure.
    try:
        df = pd.DataFrame(records)
        csv_path = records_to_csv(records)
    except Exception as exc:  # noqa: BLE001
        return None, f"Could not build the dataset from the model output: {exc}", None

    status = f"Generated {len(records)} records, {len(df.columns)} columns."
    return df, status, csv_path

In [9]:
# UI styles: stylesheet passed to gr.Blocks(css=...) in build_interface.
# It defines a dark palette through CSS variables and restyles inputs, the
# primary button, tables, and scrollbars. The `.hdr*`, `.gen-btn` and
# `.section-rule` classes are the ones referenced by build_interface.
# The first rule @imports the IBM Plex fonts from fonts.googleapis.com.
# NOTE(review): `span.svelte-1f354aw` looks like a build-generated class name;
# it may not match other Gradio versions — confirm.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&family=IBM+Plex+Sans:wght@300;400;500&display=swap');

:root {
    --bg:          #0d0d0d;
    --surface:     #161616;
    --border:      #2a2a2a;
    --accent:      #e8ff47;
    --accent-dim:  rgba(232,255,71,0.07);
    --text:        #e8e8e8;
    --muted:       #6b6b6b;
    --mono:        'IBM Plex Mono', monospace;
    --sans:        'IBM Plex Sans', sans-serif;
    --r:           2px;
}

*, *::before, *::after { box-sizing: border-box; }

body, .gradio-container {
    background: var(--bg) !important;
    font-family: var(--sans) !important;
    color: var(--text) !important;
}

.gradio-container {
    max-width: 1240px !important;
    margin: 0 auto !important;
    padding: 40px 28px !important;
}

.hdr { border-bottom: 1px solid var(--border); padding-bottom: 20px; margin-bottom: 32px; }
.hdr-title { font-family: var(--mono) !important; font-size: 12px !important; font-weight: 600 !important;
             letter-spacing: .14em !important; text-transform: uppercase !important; color: var(--accent) !important;
             margin: 0 0 5px 0 !important; }
.hdr-sub   { font-size: 12px !important; color: var(--muted) !important; font-weight: 300 !important; margin: 0 !important; }

.gr-block, .gr-box, .form { background: var(--surface) !important; border: 1px solid var(--border) !important; border-radius: var(--r) !important; }

label, span.svelte-1f354aw {
    font-family: var(--mono) !important; font-size: 10px !important; font-weight: 500 !important;
    letter-spacing: .09em !important; text-transform: uppercase !important; color: var(--muted) !important;
}

textarea, input[type=text], input[type=password] {
    background: #0d0d0d !important; border: 1px solid var(--border) !important;
    border-radius: var(--r) !important; color: var(--text) !important;
    font-family: var(--mono) !important; font-size: 12px !important;
    padding: 10px 12px !important; transition: border-color .15s !important;
}
textarea:focus, input:focus {
    border-color: var(--accent) !important; outline: none !important;
    box-shadow: 0 0 0 1px var(--accent) !important;
}

input[type=range] { accent-color: var(--accent) !important; }

.gen-btn button, button.primary {
    background: var(--accent) !important; color: #0d0d0d !important;
    font-family: var(--mono) !important; font-size: 11px !important; font-weight: 600 !important;
    letter-spacing: .12em !important; text-transform: uppercase !important;
    border: none !important; border-radius: var(--r) !important;
    padding: 12px 0 !important; width: 100% !important;
    cursor: pointer !important; transition: opacity .15s !important;
}
.gen-btn button:hover, button.primary:hover { opacity: .82 !important; }

table { font-family: var(--mono) !important; font-size: 11px !important; border-collapse: collapse !important; width: 100% !important; }
th { background: #1e1e1e !important; color: var(--accent) !important; font-size: 10px !important;
     letter-spacing: .1em !important; text-transform: uppercase !important;
     padding: 8px 12px !important; border-bottom: 1px solid var(--border) !important; text-align: left !important; }
td { padding: 7px 12px !important; border-bottom: 1px solid #1a1a1a !important; color: #ccc !important; }
tr:last-child td { border-bottom: none !important; }
tr:hover td { background: var(--accent-dim) !important; }

.section-rule { font-family: var(--mono) !important; font-size: 10px !important; letter-spacing: .15em !important;
                text-transform: uppercase !important; color: var(--muted) !important;
                border-bottom: 1px solid var(--border) !important; padding-bottom: 8px !important;
                margin: 28px 0 14px 0 !important; }

::-webkit-scrollbar { width: 4px; height: 4px; }
::-webkit-scrollbar-track { background: var(--bg); }
::-webkit-scrollbar-thumb { background: var(--border); border-radius: 2px; }
"""

In [10]:
# Example schemas offered below the form.
# Each entry is [schema_description, record_count], in the same order as the
# `inputs=[schema_input, record_count]` list given to gr.Examples in
# build_interface. Schema text uses one "field: constraint" line per field.
EXAMPLE_SCHEMAS = [
    [
        """employee_id: format EMP001, EMP002, etc.
name: realistic full name
department: one of Engineering, Sales, Marketing, HR, Finance
salary: integer between 40000 and 150000
hire_date: ISO date between 2020-01-01 and 2024-12-31
performance_rating: integer 1-5""",
        10,
    ],
    [
        """product_id: format PRD-XXXX (4 digits)
product_name: creative product name
category: one of Electronics, Clothing, Home, Books, Sports
price: float between 5.00 and 500.00
stock_quantity: integer between 0 and 1000
rating: float between 1.0 and 5.0
num_reviews: integer between 0 and 500""",
        15,
    ],
    [
        """student_id: format STU2024XXX (3 digits)
name: realistic full name
major: one of Computer Science, Biology, Business, Arts, Engineering
gpa: float between 2.0 and 4.0 (1 decimal place)
year: one of Freshman, Sophomore, Junior, Senior
credits_completed: integer between 0 and 120""",
        20,
    ],
]

In [11]:
def build_interface() -> gr.Blocks:
    """Build the Gradio UI and wire the generate button to run_generation.

    Layout: a left column with the inputs (API key, schema, optional example,
    record count, generate button) and a right column with the outputs
    (status, preview table, CSV download), followed by clickable examples.

    Returns:
        The assembled gr.Blocks app, not yet launched.
    """
    with gr.Blocks(title="Synthetic Dataset Generator", css=CUSTOM_CSS) as demo:

        gr.HTML("""
        <div class="hdr">
            <p class="hdr-title">Synthetic Dataset Generator</p>
            <p class="hdr-sub">Describe a schema — get a CSV. Powered by Claude Haiku.</p>
        </div>
        """)

        with gr.Row(equal_height=False):
            with gr.Column(scale=1, min_width=300):
                if not ANTHROPIC_API_KEY:
                    api_key_input = gr.Textbox(
                        label="API Key",
                        type="password",
                        placeholder="sk-ant-...",
                        info="Set ANTHROPIC_API_KEY in .env to skip this field.",
                    )
                else:
                    # Never put the environment key into the component value:
                    # component values are delivered to the browser, and
                    # type="password" only masks what is drawn on screen.
                    # run_generation reads the environment key on the server,
                    # so the field can stay empty.
                    api_key_input = gr.Textbox(
                        label="API Key",
                        type="password",
                        placeholder="Using ANTHROPIC_API_KEY from environment",
                        info="Loaded from environment.",
                        interactive=False,
                    )

                schema_input = gr.Textbox(
                    label="Schema Description",
                    placeholder=(
                        "field_name: description / constraints\n\n"
                        "name: realistic full name\n"
                        "email: valid email address\n"
                        "age: integer between 18 and 80\n"
                        "city: US city name\n"
                        "spend: float between 10.00 and 999.99"
                    ),
                    lines=12,
                )

                example_input = gr.Textbox(
                    label="Example Record (optional)",
                    placeholder='{"name": "Alice Chen", "email": "alice@example.com", "age": 34}',
                    lines=3,
                )

                record_count = gr.Slider(
                    minimum=1, maximum=500, value=10, step=1, label="Record Count"
                )

                with gr.Row(elem_classes="gen-btn"):
                    generate_btn = gr.Button("Generate Dataset", variant="primary")

            with gr.Column(scale=2):
                status_output = gr.Textbox(label="Status", lines=2, interactive=False)
                dataframe_output = gr.Dataframe(label="Preview", wrap=True)
                csv_download = gr.File(label="Download CSV", file_types=[".csv"])

        gr.HTML('<p class="section-rule">Example Schemas</p>')
        # Each example fills the schema box and the record-count slider.
        gr.Examples(examples=EXAMPLE_SCHEMAS, inputs=[schema_input, record_count], label="")

        # Input order matches run_generation's parameters; output order matches
        # its (DataFrame, status, csv_path) return tuple.
        generate_btn.click(
            fn=run_generation,
            inputs=[api_key_input, schema_input, record_count, example_input],
            outputs=[dataframe_output, status_output, csv_download],
        )

    return demo

In [None]:
# Build the UI and launch it. This runs as soon as the cell/module executes.
app = build_interface()
app.launch()