# Week 3 Exercise: Generating Synthetic Data

**Objectives:**
1. Build models that can generate datasets (JSON, CSV, or Raw Text).
2. Use a variety of Hugging Face models and prompts for diverse outputs.
3. Create a Gradio UI so users can choose the model and generate synthetic data.

**Run this notebook in Google Colab.** Add your Hugging Face token to Colab Secrets under the name `HF_TOKEN`.

In [None]:
%pip install -q --upgrade bitsandbytes accelerate "transformers==4.57.6" sentencepiece gradio torch

In [None]:
import os
import tempfile
from functools import lru_cache

from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import gradio as gr

In [None]:
# Read the Hugging Face token stored in Colab Secrets under the name HF_TOKEN
# and use it to log in to the Hugging Face Hub.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [None]:
# Display name (shown in the UI model dropdown) -> Hugging Face Hub model id
# that is handed to `load_text_components`.
MODELS = {
    "Llama-3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct",
    "SmolLM2-1.7B-Instruct": "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    "Qwen2-1.5B-Instruct": "Qwen/Qwen2-1.5B-Instruct",
}

# Output format name -> instruction sentence that `build_text_messages`
# appends to the system prompt.
FORMAT_RULES = {
    "JSON": "Return a JSON array containing exactly the requested number of objects with consistent fields tailored to the context. No explanations.",
    "CSV": "Return a CSV document with a header row and the requested number of data rows aligned to the context. No explanations.",
    "Raw Text": "Return the requested number of short prose entries separated by blank lines that reflect the context. No explanations.",
}

In [None]:
#quantization
def get_quantization_config():
    """Build the bitsandbytes configuration used for quantized model loading.

    Returns:
        A ``BitsAndBytesConfig`` requesting 4-bit loading with the "nf4"
        quantization type, double quantization enabled, and bfloat16 as the
        compute dtype.
    """
    settings = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**settings)



In [None]:
@lru_cache(maxsize=6)
def load_text_components(model_id: str, use_quant: bool):
    """Load (and memoize) the tokenizer and causal LM for a Hub model id.

    Results are cached per ``(model_id, use_quant)`` pair, so repeated
    requests for the same combination reuse the already-loaded objects.

    Args:
        model_id: Hugging Face Hub model identifier.
        use_quant: Request quantized loading. It only takes effect when CUDA
            is available; otherwise the unquantized path is used (the flag is
            still part of the cache key).

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    # Fall back to the EOS token when the tokenizer defines no pad token.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    has_cuda = torch.cuda.is_available()
    if use_quant and has_cuda:
        load_kwargs = {
            "device_map": "auto",
            "quantization_config": get_quantization_config(),
        }
    else:
        # Unquantized: float16 with automatic placement on GPU, float32 and
        # no device map on CPU.
        load_kwargs = {
            "device_map": "auto" if has_cuda else None,
            "torch_dtype": torch.float16 if has_cuda else torch.float32,
        }

    lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        **load_kwargs,
    )
    lm.eval()
    return tok, lm

In [None]:
# Style name (shown in the UI style dropdown) -> instruction sentence that
# `build_text_messages` places on the "Style:" line of the user prompt.
# "Balanced" doubles as the fallback for missing or unknown style names.
STYLE_TEMPLATES = {
    "Concise": "Keep each record brief and to the point.",
    "Detailed": "Include rich, realistic detail in each record.",
    "Diverse": "Maximize variety across records (names, values, categories).",
    "Technical": "Use precise, technical language where appropriate.",
    "Balanced": "Mix clarity and variety without being verbose.",
}

In [None]:
def build_text_messages(
    style: str,
    context: str,
    return_format: str,
    record_count: int,
) -> list:
    """Build the chat messages (system + user) for a generation request.

    Args:
        style: Key of ``STYLE_TEMPLATES``. Missing, blank, or unknown values
            fall back to "Balanced".
        context: Free-text description of the dataset. Missing or blank
            (including whitespace-only) values fall back to
            "general purpose scenario".
        return_format: Key of ``FORMAT_RULES``. Unknown values fall back to
            "JSON" in both the system directive and the user prompt.
        record_count: Number of records to request.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the
        system message followed by the user message.
    """
    # Strip first and apply the default afterwards, so whitespace-only input
    # gets the default instead of an empty string.
    context_value = (context or "").strip() or "general purpose scenario"
    style_value = (style or "").strip() or "Balanced"
    style_instruction = STYLE_TEMPLATES.get(style_value, STYLE_TEMPLATES["Balanced"])

    # Resolve the format once so the system directive and the user prompt
    # always name the same format.
    format_name = return_format if return_format in FORMAT_RULES else "JSON"
    directive = FORMAT_RULES[format_name]

    system_prompt = (
        "You generate synthetic datasets that are high quality, diverse, and free of personally identifiable information. "
        + directive
        + " Ensure outputs are consistent in structure and avoid any explanation or commentary."
    )
    user_prompt = (
        f"Context: {context_value}\n"
        f"Style: {style_instruction}\n"
        f"Generate exactly {record_count} records. Output format: {format_name}."
    )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

In [None]:
def generate_text_data(
    model_choice: str,
    style: str,
    context: str,
    return_format: str,
    quantize: bool,
    record_count: int,
) -> str:
    """Generate a synthetic dataset as text with the selected model.

    Args:
        model_choice: Key of ``MODELS`` (the display name chosen in the UI).
        style: Style name forwarded to ``build_text_messages``.
        context: Dataset description forwarded to ``build_text_messages``.
        return_format: Output format name forwarded to ``build_text_messages``.
        quantize: Whether to request quantized model loading.
        record_count: Number of records to request; coerced with ``int()``.

    Returns:
        The decoded, stripped text of the newly generated tokens, or the
        string "Error: Unknown model selected." when ``model_choice`` is not
        a key of ``MODELS``.
    """
    from collections.abc import Mapping

    model_id = MODELS.get(model_choice)
    if not model_id:
        return "Error: Unknown model selected."

    tokenizer, model = load_text_components(model_id, bool(quantize))
    messages = build_text_messages(style, context, return_format, int(record_count))

    # `apply_chat_template` exists on every tokenizer, so checking for the
    # method never selects the fallback; check that a template is configured.
    if getattr(tokenizer, "chat_template", None):
        inputs = tokenizer.apply_chat_template(
            messages,
            return_tensors="pt",
            add_generation_prompt=True,
        )
    else:
        # No chat template: send all message contents as one plain prompt so
        # the system instructions are not lost.
        prompt = "\n\n".join(message["content"] for message in messages)
        inputs = tokenizer(prompt, return_tensors="pt")

    device = next(model.parameters()).device

    # Tokenizer output may be a bare tensor, a plain list of ids, or a
    # mapping with an "input_ids" entry. BatchEncoding is a Mapping but not
    # a dict subclass, hence the abstract check.
    input_ids = inputs["input_ids"] if isinstance(inputs, Mapping) else inputs
    if not torch.is_tensor(input_ids):
        input_ids = torch.tensor(input_ids)
    input_ids = input_ids.to(device)
    if input_ids.dim() == 1:
        input_ids = input_ids.unsqueeze(0)
    # Single unpadded sequence, so every position is attended to.
    attention_mask = torch.ones_like(input_ids, device=device)

    with torch.inference_mode():
        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=1024,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.05,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Drop the prompt tokens and decode only what the model produced.
    new_tokens = generated[:, input_ids.shape[-1] :]
    text = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
    return text.strip()

In [None]:
# Gradio UI: choose a model, output format and style, describe the dataset,
# and show the generated text in a textbox.
with gr.Blocks(title="Generating Synthetic Data") as demo:
    gr.Markdown("## Generating Synthetic Data")
    gr.Markdown("Choose a Hugging Face model and generate datasets in JSON, CSV, or Raw Text.")

    # Model, output format and style selectors are created inside one row.
    with gr.Row():
        model_choice = gr.Dropdown(
            choices=list(MODELS.keys()),
            value="SmolLM2-1.7B-Instruct",
            label="Model",
        )
        return_format = gr.Dropdown(
            choices=["JSON", "CSV", "Raw Text"],
            value="JSON",
            label="Output format",
        )
        style = gr.Dropdown(
            choices=list(STYLE_TEMPLATES.keys()),
            value="Balanced",
            label="Style",
        )

    context_input = gr.Textbox(
        label="Context",
        lines=4,
        placeholder="e.g. Product catalog for an online electronics store: name, category, price, sku, in_stock",
    )
    record_count = gr.Slider(1, 20, value=5, step=1, label="Number of records")

    generate_btn = gr.Button("Generate")
    text_output = gr.Textbox(label="Generated data", lines=16)

    def run_generate(model_choice, style, context, return_format, record_count):
        """Click handler: forward the UI values to `generate_text_data`.

        The quantize argument is hard-coded to True; the UI has no control for it.
        """
        return generate_text_data(model_choice, style, context, return_format, True, record_count)

    # The order of `inputs` must match the parameter order of `run_generate`.
    generate_btn.click(
        fn=run_generate,
        inputs=[model_choice, style, context_input, return_format, record_count],
        outputs=text_output,
    )

# NOTE(review): per the Gradio docs, share=True requests a public share link and
# debug=True keeps this cell blocked while the app runs — confirm that is intended.
demo.launch(share=True, debug=True)

In [None]:
def save_generated_to_file(text: str, return_format: str):
    """Write generated text to a temporary file and return its path.

    Args:
        text: The generated text. Nothing is written when it is empty,
            whitespace-only, or starts with "Error:".
        return_format: "JSON", "CSV" or "Raw Text"; selects the file suffix
            (".json", ".csv", ".txt"). Any other value uses ".txt".

    Returns:
        The path of the written file, or ``None`` when there is nothing to
        save or the write fails. On failure the temporary file is removed so
        no empty or partial file is left behind.
    """
    from contextlib import suppress

    if not text or not text.strip() or text.startswith("Error:"):
        return None

    ext = {"JSON": ".json", "CSV": ".csv", "Raw Text": ".txt"}.get(return_format, ".txt")
    fd, path = tempfile.mkstemp(suffix=ext)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            # The file object now owns the descriptor and will close it;
            # clear our copy so the error path never closes it a second time.
            fd = None
            handle.write(text)
    except (OSError, ValueError):
        # ValueError covers UnicodeEncodeError for text that cannot be
        # encoded as UTF-8 (e.g. lone surrogates).
        if fd is not None:
            with suppress(OSError):
                os.close(fd)
        with suppress(OSError):
            os.remove(path)
        return None
    return path

In [None]:
# Gradio UI with download support: the same controls as a plain text preview,
# plus a file component that offers the generated data as a download.
with gr.Blocks(title="Generating Synthetic Data") as demo:
    gr.Markdown("## Generating Synthetic Data")
    gr.Markdown("Choose a Hugging Face model and generate datasets in JSON, CSV, or Raw Text.")

    # Model, output format and style selectors are created inside one row.
    with gr.Row():
        model_choice = gr.Dropdown(
            choices=list(MODELS.keys()),
            value="SmolLM2-1.7B-Instruct",
            label="Model",
        )
        return_format = gr.Dropdown(
            choices=["JSON", "CSV", "Raw Text"],
            value="JSON",
            label="Output format",
        )
        style = gr.Dropdown(
            choices=list(STYLE_TEMPLATES.keys()),
            value="Balanced",
            label="Style",
        )

    context_input = gr.Textbox(
        label="Context",
        lines=4,
        placeholder="e.g. Product catalog for an online electronics store: name, category, price, sku, in_stock",
    )
    record_count = gr.Slider(1, 20, value=5, step=1, label="Number of records")

    generate_btn = gr.Button("Generate")
    text_output = gr.Textbox(label="Generated data (preview)", lines=16)
    file_output = gr.File(label="Download file")

    def run_generate(model_choice, style, context, return_format, record_count):
        """Click handler: generate the text, then save it to a file.

        Returns the generated text and the saved file's path; the path is
        None when `save_generated_to_file` declines or fails to write.
        The quantize argument is hard-coded to True; the UI has no control for it.
        """
        text = generate_text_data(model_choice, style, context, return_format, True, record_count)
        file_path = save_generated_to_file(text, return_format)
        return text, file_path

    # The order of `inputs` must match the parameter order of `run_generate`,
    # and the order of `outputs` must match its returned (text, file_path) tuple.
    generate_btn.click(
        fn=run_generate,
        inputs=[model_choice, style, context_input, return_format, record_count],
        outputs=[text_output, file_output],
    )

# NOTE(review): per the Gradio docs, share=True requests a public share link — confirm that is intended.
demo.launch(share=True)