In [1]:
import gradio as gr
import json
import csv
import io
import os
from huggingface_hub import InferenceClient, login
from google.colab import userdata

In [2]:
# LLM Setup
# Resolve the Hugging Face token without crashing the cell when it is absent:
# try the Colab secret store first, then fall back to the HF_TOKEN environment
# variable. If neither is set, HF_TOKEN stays falsy and generate_dataset()
# reports the problem in the UI instead of the notebook failing here.
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    # Secret not defined, or this notebook has not been granted access to it.
    HF_TOKEN = None
HF_TOKEN = HF_TOKEN or os.environ.get("HF_TOKEN", None)

# Only authenticate when a token is actually available; calling login() with
# no token would fall back to an interactive prompt.
if HF_TOKEN:
    login(HF_TOKEN, add_to_git_credential=True)

# Model preselected in the UI dropdown; must be one of AVAILABLE_MODELS.
DEFAULT_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"

# Chat-capable models offered in the UI dropdown.
AVAILABLE_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "HuggingFaceH4/zephyr-7b-beta",
]


In [3]:
def build_messages(description: str, num_rows: int, schema: str):
    """Build the chat messages that ask the model for one batch of rows.

    Args:
        description: Plain-English description of the desired dataset.
        num_rows: Number of rows to request in this batch.
        schema: Optional column list/hint. ``None``, empty, or
            whitespace-only values mean "no schema" and add nothing to the
            prompt.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: a system message
        that constrains the output to a raw JSON array, followed by the user
        request.
    """
    # Treat a missing schema (None) like an empty one instead of crashing on
    # ``None.strip()``.
    schema = schema or ""
    schema_hint = f"\nUse this schema/columns: {schema}" if schema.strip() else ""
    return [
        {
            "role": "system",
            "content": (
                "You are a synthetic dataset generator. "
                "When asked, return ONLY a valid JSON array of objects — no markdown, no explanation, no code fences. "
                "Each object must have identical keys. Make data realistic, diverse, and non-repetitive."
            ),
        },
        {
            "role": "user",
            "content": (
                f"Generate exactly {num_rows} rows of synthetic data.\n"
                f"Dataset description: {description}{schema_hint}\n\n"
                "Return ONLY the raw JSON array."
            ),
        },
    ]



In [4]:
def _extract_json_array(text):
    """Return the substring of ``text`` spanning the outermost ``[...]``, or None."""
    # Strip a leading markdown code fence (```json ... ```) if the model added one.
    if text.startswith("```"):
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
    start = text.find("[")
    end = text.rfind("]") + 1
    if start == -1 or end <= start:
        return None
    return text[start:end]


def _rows_to_csv(rows):
    """Serialize a list of dict rows to CSV text, tolerating uneven keys."""
    # Header is the ordered union of keys over ALL rows: the model does not
    # always emit identical keys, and DictWriter raises ValueError on a key
    # that is missing from ``fieldnames``.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames, restval="")
    writer.writeheader()
    for row in rows:
        # Nested values are stored as JSON text so each cell stays a scalar.
        flat = {k: json.dumps(v) if isinstance(v, (dict, list)) else v for k, v in row.items()}
        writer.writerow(flat)
    return buf.getvalue()


def generate_dataset(description, num_rows, schema, model_name, output_format, progress=gr.Progress()):
    """Generate a synthetic dataset with an LLM and write it to a download file.

    Rows are requested in batches of at most 20 via the Hugging Face chat
    completion API; each reply is parsed as a JSON array of objects.

    Args:
        description: Plain-English description of the dataset (required).
        num_rows: Requested row count; clamped to the range 1..200.
        schema: Optional column hint forwarded to ``build_messages``.
        model_name: Hugging Face model id used for inference.
        output_format: ``"JSON"`` for a JSON file; any other value yields CSV.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``(file_path, preview_text, status_message)``. On any failure the
        first two items are ``None`` and the status message explains why.
        The result never holds more than ``num_rows`` rows, but may hold
        fewer if the model under-delivers.
    """
    # Local import keeps this block self-contained; tempfile is stdlib.
    import tempfile

    if not HF_TOKEN:
        return None, None, "❌ HF_TOKEN secret is not configured on this Space."

    if not (description or "").strip():
        return None, None, "❌ Please describe your dataset."

    num_rows = max(1, min(int(num_rows), 200))
    client = InferenceClient(model=model_name, token=HF_TOKEN)

    all_rows = []
    batch_size = min(20, num_rows)
    batches = (num_rows + batch_size - 1) // batch_size

    for _ in progress.tqdm(range(batches), desc="Generating batches"):
        rows_needed = min(batch_size, num_rows - len(all_rows))
        messages = build_messages(description, rows_needed, schema)
        # Bound before the try so the error handlers can always reference it.
        text = ""
        try:
            response = client.chat_completion(
                messages=messages,
                max_tokens=2048,
                temperature=0.8,
            )
            text = (response.choices[0].message.content or "").strip()
            payload = _extract_json_array(text)
            if payload is None:
                return None, None, f"❌ Model didn't return valid JSON. Raw response:\n{text[:500]}"
            parsed = json.loads(payload)
            # Keep only JSON objects: stray scalars/arrays cannot become rows.
            batch_rows = [row for row in parsed if isinstance(row, dict)]
            # Models often ignore "exactly N"; trim so the running total never
            # overshoots (which would also make a later batch ask for <= 0 rows).
            all_rows.extend(batch_rows[:rows_needed])
        except json.JSONDecodeError as e:
            return None, None, f"❌ JSON parse error: {e}\n\nRaw output:\n{text[:500]}"
        except Exception as e:
            # UI boundary: surface any API/network failure as a status message.
            return None, None, f"❌ Generation error: {str(e)}"

    if not all_rows:
        return None, None, "❌ No data was generated."

    if output_format == "JSON":
        output_str = json.dumps(all_rows, indent=2, ensure_ascii=False)
        filename = "frexrator_dataset.json"
    else:
        output_str = _rows_to_csv(all_rows)
        filename = "frexrator_dataset.csv"

    # One fresh directory per request: a fixed /tmp/<filename> would let
    # concurrent users of the shared app overwrite each other's download.
    out_dir = tempfile.mkdtemp(prefix="frexrator_")
    tmp_path = os.path.join(out_dir, filename)
    # newline="" stops the platform from translating the csv module's own
    # "\r\n" row terminators a second time.
    with open(tmp_path, "w", encoding="utf-8", newline="") as f:
        f.write(output_str)

    preview = output_str[:3000] + ("\n...(truncated)" if len(output_str) > 3000 else "")
    return tmp_path, preview, f"✅ Generated {len(all_rows)} rows successfully!"

In [5]:
# Gradio UI
# Layout: inputs (description, schema, row count, format, model) on top, then
# the generate button and status line, then download + preview side by side.
# User-facing labels use real emoji/dash characters (the previous text was
# UTF-8 that had been mis-decoded into mojibake).
with gr.Blocks(
    title="🧪 Frexrator – Synthetic Dataset Generator",
    theme=gr.themes.Soft(primary_hue="violet"),
    css=".output-preview { font-family: monospace; font-size: 12px; }"
) as demo:

    gr.Markdown("""
    # 🧪 Frexrator
    ### Synthetic Dataset Generator powered by Open-Source LLMs
    Describe the dataset you want in plain English and get structured synthetic data instantly.
    """)

    with gr.Row():
        # Left column: what to generate.
        with gr.Column(scale=2):
            description = gr.Textbox(
                label="📝 Dataset Description",
                placeholder="e.g. 'A customer support ticket dataset with fields for ticket ID, customer name, issue category, priority level, and resolution status. Include a mix of resolved and open tickets across different product categories.'",
                lines=4,
            )
            schema = gr.Textbox(
                label="📋 Column Schema (optional)",
                placeholder="e.g. ticket_id, customer_name, issue, priority (low/medium/high), status, created_at",
                lines=2,
            )
            with gr.Row():
                # Slider bounds mirror the 1..200 clamp in generate_dataset.
                num_rows = gr.Slider(1, 200, value=20, step=1, label="🔢 Number of Rows")
                output_format = gr.Radio(["JSON", "CSV"], value="JSON", label="📄 Output Format")

        # Right column: model choice and prompt-writing tips.
        with gr.Column(scale=1):
            model_name = gr.Dropdown(
                AVAILABLE_MODELS,
                value=DEFAULT_MODEL,
                label="🤖 LLM Model",
            )
            gr.Markdown("""
            **Tips:**
            - Be specific about data types and value ranges
            - Mention relationships between fields
            - Specify any domain (medical, finance, e-commerce...)
            - Add constraints like date ranges or enums
            """)

    generate_btn = gr.Button("⚡ Generate Dataset", variant="primary", size="lg")
    status_box = gr.Textbox(label="Status", interactive=False)

    with gr.Row():
        download_file = gr.File(label="⬇️ Download Dataset")
        preview = gr.Textbox(
            label="👁️ Preview",
            lines=20,
            interactive=False,
            elem_classes=["output-preview"]
        )

    # Clickable examples; each fills description, row count and schema.
    gr.Examples(
        examples=[
            ["E-commerce product reviews with product_id, product_name, category, rating (1-5), review_text, helpful_votes, verified_purchase, date", 25, "product_id, product_name, category, rating, review_text, helpful_votes, verified_purchase, date"],
            ["Medical patient records with anonymized patient_id, age, gender, diagnosis (ICD codes), medications, blood_pressure, bmi, and admission_date", 15, ""],
            ["Job postings dataset with company, job_title, location, salary_range, required_skills, experience_years, remote_option, posted_date", 20, ""],
            ["Financial transactions with transaction_id, timestamp, merchant, amount, currency, category, is_fraud flag", 30, ""],
        ],
        inputs=[description, num_rows, schema],
        label="Example Prompts"
    )

    # Input order must match generate_dataset's positional parameters; the
    # three outputs match its (file_path, preview, status) return tuple.
    generate_btn.click(
        fn=generate_dataset,
        inputs=[description, num_rows, schema, model_name, output_format],
        outputs=[download_file, preview, status_box],
    )




In [6]:
# Start the Gradio server only when this code is executed as the main module
# (which includes running the cell in a notebook), not when it is imported.
if __name__ == "__main__":
    demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dcfde182e31c681b50.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
