# Frank Asket's Week 3 Exercise

[Frank Asket](https://github.com/frank-asket) — *Founder & CTO building Human-Centered AI infrastructure.*

**Synthetic dataset generator:** describe a business scenario (e.g. restaurant reviews, support tickets, product catalog); the LLM generates structured synthetic data (CSV or JSON). Runs locally with **OpenRouter** (or OpenAI); **Gradio UI** to configure scenario, number of rows, and format. No Colab or HuggingFace token required.

In [None]:
# imports

import os
import re
import json
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr

In [None]:
# environment & API client (OpenRouter preferred, fallback OpenAI)
#
# Defines the module-level `client` and `MODEL` that generate_dataset() uses.

# override=True: values from .env take precedence over variables already set
# in the process environment.
load_dotenv(override=True)
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")

if openrouter_api_key and openrouter_api_key.startswith("sk-or-"):
    # OpenRouter exposes an OpenAI-compatible endpoint, so the same client
    # class is reused with a different base URL and a provider-prefixed model.
    client = OpenAI(api_key=openrouter_api_key, base_url="https://openrouter.ai/api/v1")
    MODEL = "openai/gpt-4o-mini"
    print("Using OpenRouter.")
elif openai_api_key:
    client = OpenAI(api_key=openai_api_key)
    MODEL = "gpt-4o-mini"
    print("Using OpenAI.")
else:
    # Neither key is usable. Print the hint *before* building the client:
    # the constructor may raise when no API key is configured, and the hint
    # would otherwise never be shown in exactly the case where it is needed.
    print("Using default client (set OPENROUTER_API_KEY or OPENAI_API_KEY in .env).")
    MODEL = "gpt-4o-mini"
    client = OpenAI()

In [None]:
# System prompt: synthetic data only, no commentary, no real PII
#
# Sent as the "system" message on every generate_dataset() call. It tells the
# model to emit only raw CSV or JSON, to invent a schema from the scenario, and
# to produce exactly the requested number of records. The model may still wrap
# its answer in Markdown code fences despite the rule below, which is why
# generate_dataset() strips them afterwards.

SYSTEM_PROMPT = """You are a synthetic dataset generator. Your only job is to output structured data.

Rules:
- Output ONLY the requested format (CSV or JSON). No explanations, no markdown code fences, no extra text.
- For CSV: first line is the header row, then one row per record. Use commas; escape quotes inside fields.
- For JSON: output a single JSON array of objects. Each object is one record with consistent keys.
- Generate realistic but fake data. No real names, emails, or identifiable information.
- Infer a sensible schema from the user's scenario (e.g. for "restaurant reviews" use: reviewer_name, rating, review_text, date).
- Generate exactly the number of records requested."""

In [None]:
# Opening fence: three backticks, an optional language tag, then a line break.
# When no line break follows (single-line answer) only the backticks are removed.
_OPENING_FENCE = re.compile(r"^```[ \t]*(?:[\w+.-]*[ \t]*\r?\n)?")
# Closing fence: trailing backticks plus any whitespace/newline before them.
_CLOSING_FENCE = re.compile(r"\s*```$")


def _strip_code_fences(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    text = _OPENING_FENCE.sub("", text, count=1)
    text = _CLOSING_FENCE.sub("", text, count=1)
    return text.strip()


def generate_dataset(scenario: str, num_rows: int, output_format: str) -> str:
    """Ask the LLM for a synthetic dataset and return it as raw text.

    Args:
        scenario: Free-text description of the data to generate.
        num_rows: Requested record count; clamped to the range 1-50.
        output_format: "CSV" if the value contains "csv" (case-insensitive),
            otherwise JSON is requested.

    Returns:
        The generated CSV or JSON text with any surrounding Markdown code
        fence removed. Problems are reported as text too, so the result can
        always be shown in the UI: a prompt to describe the scenario when it
        is empty, or a string starting with "Error:" for invalid input or a
        failed API call.
    """
    if not scenario or not scenario.strip():
        return "Please describe the dataset scenario (e.g. 'restaurant reviews with rating and date')."

    # Validate before touching the API so bad input is reported the same way
    # as every other failure instead of raising out of the UI callback.
    try:
        requested = int(num_rows)
    except (TypeError, ValueError, OverflowError):
        return f"Error: number of rows must be a number, got {num_rows!r}."
    num_rows = max(1, min(requested, 50))

    # A missing format falls through to JSON, like any other non-CSV value.
    fmt = "CSV" if "csv" in str(output_format or "").lower() else "JSON"

    user_msg = (
        f"Generate a synthetic dataset with exactly {num_rows} records. "
        f"Scenario: {scenario.strip()}. "
        f"Output format: {fmt} only, no other text."
    )
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_msg},
    ]
    try:
        r = client.chat.completions.create(model=MODEL, messages=messages, temperature=0.7)
        raw = r.choices[0].message.content or ""
    except Exception as e:
        # UI boundary: surface any client/network/response failure as text
        # rather than letting it propagate into the Gradio event handler.
        return f"Error: {e}"
    # The model sometimes adds code fences despite the system prompt.
    return _strip_code_fences(raw)


In [None]:
# Gradio UI
#
# Builds a single-page app: a scenario textbox, a row with the row-count slider
# and format dropdown, a Generate button, and an output textbox. Components are
# laid out in the order they are created inside the Blocks context.

with gr.Blocks() as demo:
    # NOTE(review): the text below mentions "download", but no download
    # component is defined in this block — confirm whether that is intended.
    gr.Markdown(
        """
        ## Synthetic Dataset Generator (Week 3)
        Describe the kind of data you want (e.g. *product catalog with name, price, category* or *customer support tickets with id, subject, status*).
        Choose number of rows and output format. Output is raw CSV or JSON — copy or download.
        """
    )
    with gr.Row():
        scenario = gr.Textbox(
            label="Dataset scenario",
            placeholder="e.g. Restaurant reviews with reviewer_name, rating (1-5), review_text, date",
            lines=2
        )
    with gr.Row():
        # Slider bounds (1-50) match the clamp applied inside generate_dataset().
        num_rows = gr.Slider(1, 50, value=5, step=1, label="Number of rows")
        output_format = gr.Dropdown(["CSV", "JSON"], value="CSV", label="Output format")
    btn = gr.Button("Generate", variant="primary")
    out = gr.Textbox(label="Generated data", lines=12)

    # Input order must match generate_dataset(scenario, num_rows, output_format);
    # the returned string is written to the output textbox.
    btn.click(fn=generate_dataset, inputs=[scenario, num_rows, output_format], outputs=out)

# NOTE(review): `theme` is passed to launch() here; some Gradio versions expect
# it on gr.Blocks(...) instead — confirm against the installed version.
demo.launch(inbrowser=True, theme=gr.themes.Soft())