# Week 3 Exercise — Synthetic Data Generator (Emmanuel)

Chat UI to generate **synthetic data of any type** using an LLM. Choose a model from the dropdown and describe what data you need (e.g. tabular customer records, JSON configs, CSV, sample emails, survey responses). The assistant is system-prompted to produce valid, consistent synthetic data in the format you request.

**Requirements:** For OpenAI models, set `OPENAI_API_KEY` in your environment or a `.env` file. For open-source models (Ollama), run Ollama locally (`ollama serve`, then `ollama pull <model>`).

In [None]:
import os
import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a .env file before reading the API key
# (override=True is passed so .env values are meant to win — see python-dotenv docs).
load_dotenv(override=True)
openai_api_key = os.getenv("OPENAI_API_KEY")

# The OpenAI client only exists when a key was found; otherwise it stays None
# so callers can detect the missing configuration.
openai_client = None
if openai_api_key:
    openai_client = OpenAI()

# Ollama is reached through the same OpenAI client class, pointed at a local
# base URL. This client is always constructed, whether or not Ollama is running.
OLLAMA_BASE_URL = "http://localhost:11434/v1"
ollama_client = OpenAI(api_key="ollama", base_url=OLLAMA_BASE_URL)

# Dropdown entries as (display label, model id sent to the API), kept in one
# list per provider so the provider of every model id has a single source of truth.
_OPENAI_CHOICES = [
    ("OpenAI — GPT-4.1 Mini", "gpt-4.1-mini"),
    ("OpenAI — GPT-4o", "gpt-4o"),
    ("OpenAI — GPT-4 Turbo", "gpt-4-turbo"),
]
_OLLAMA_CHOICES = [
    ("Ollama — Llama 3.2", "llama3.2"),
    ("Ollama — Llama 3.1", "llama3.1"),
    ("Ollama — Mistral", "mistral"),
    ("Ollama — Qwen 2.5 7B", "qwen2.5:7b"),
    ("Ollama — Phi-3", "phi3"),
]

# Full dropdown content: OpenAI first, then Ollama.
MODEL_CHOICES = _OPENAI_CHOICES + _OLLAMA_CHOICES
DEFAULT_MODEL_ID = "gpt-4.1-mini"

# Model ids served by the local Ollama endpoint. Derived from the Ollama
# entries above so a newly added Ollama model cannot be forgotten here.
OLLAMA_MODELS = {model_id for _label, model_id in _OLLAMA_CHOICES}

In [None]:
# System prompt used as the content of the "system" message in every request.
# It instructs the model to act as a synthetic data generator: follow the
# requested schema/format, keep the data internally consistent, avoid real
# personal data, and ask a single clarifying question when the request is ambiguous.
SYSTEM_PROMPT = """You are an expert synthetic data generator. Your role is to produce realistic, consistent synthetic data of any type the user requests.

You must:
- Generate data that matches the user's description (schema, format, volume, domain).
- Output in the exact format requested: CSV, JSON, JSONL, YAML, markdown tables, or plain text.
- Ensure internal consistency (e.g. dates, IDs, referential integrity where relevant).
- Avoid real personal data; use clearly fake names, emails, and identifiers.
- If the user specifies a number of rows or examples, produce at least that many unless they ask for a small sample.

When the user's request is ambiguous, ask one short clarifying question (e.g. number of rows, format, or fields) then generate. Otherwise generate directly."""

In [None]:
def build_messages(message, history):
    """Assemble the message list for one chat-completions request.

    The result starts with the system prompt, continues with the prior turns
    from ``history`` and ends with the new user ``message``. Only the "role"
    and "content" keys of each history entry are copied; any other keys are
    left out.
    """
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    for turn in history:
        conversation.append({"role": turn["role"], "content": turn["content"]})
    conversation.append({"role": "user", "content": message})
    return conversation


def chat(message, history, model):
    """Stream the assistant's reply for one chat turn.

    Args:
        message: The new user message.
        history: Prior turns as dicts with "role" and "content" keys.
        model: Model id; ids in ``OLLAMA_MODELS`` go to the Ollama client,
            every other id goes to the OpenAI client.

    Yields:
        The reply text accumulated so far (each yield is the full text up to
        that point, not a delta). If no client is configured, a single setup
        hint is yielded instead. If the request or the stream fails, the last
        yield is the error message, preceded by any partial reply already
        received.
    """
    use_ollama = model in OLLAMA_MODELS
    client = ollama_client if use_ollama else openai_client
    if not client:
        if use_ollama:
            yield "Ollama is not available. Run `ollama serve` and `ollama pull <model>` (e.g. ollama pull llama3.2)."
        else:
            yield "Set OPENAI_API_KEY in your environment or .env to use OpenAI models."
        return
    messages = build_messages(message, history)
    accumulated = ""
    try:
        stream = client.chat.completions.create(model=model, messages=messages, stream=True)
        # Iterating the stream is kept inside the try block: a failure can
        # surface while chunks are being read, not only when the request is created.
        for chunk in stream:
            # Skip chunks without choices or without text content.
            if chunk.choices and chunk.choices[0].delta.content:
                accumulated += chunk.choices[0].delta.content
                yield accumulated
    except Exception as e:
        error = f"Error calling model: {e}"
        # Keep whatever was already streamed visible and append the error to it.
        yield f"{accumulated}\n\n{error}" if accumulated else error

In [None]:
# Build the UI: a heading, a model dropdown, and a chat interface driven by `chat`.
with gr.Blocks(title="Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("### Synthetic Data Generator — pick a model, then chat below.")
    with gr.Row():
        # Choices are the (label, model id) pairs from MODEL_CHOICES; the
        # initial value is a model id, and custom values are disallowed.
        model_dropdown = gr.Dropdown(
            choices=MODEL_CHOICES,
            value=DEFAULT_MODEL_ID,
            label="Model",
            info="OpenAI models need OPENAI_API_KEY; Ollama models need local Ollama running.",
            allow_custom_value=False,
        )
    # The dropdown is registered as an additional input; presumably Gradio
    # passes its value as the third argument (`model`) of `chat` — confirm
    # against the ChatInterface documentation. type="messages" matches the
    # role/content dict shape that `chat` reads from `history`.
    gr.ChatInterface(
        fn=chat,
        type="messages",
        additional_inputs=[model_dropdown],
    )

# Start the app when this cell runs.
demo.launch()