In [None]:
## INSTALL DEPENDENCIES

!pip install transformers diffusers accelerate gradio datasets --quiet

In [54]:
## IMPORTS

import torch
import gradio as gr
import random
import json
import re
import pandas as pd
from transformers import pipeline, AutoTokenizer
from diffusers import DiffusionPipeline
from huggingface_hub import login, whoami
from google.colab import userdata

In [None]:
## REPRODUCIBILITY

# Seed Python's `random` module and PyTorch's default generator with one shared value.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

In [52]:
## LOGIN WITH HUGGING FACE

# Log in to the Hugging Face Hub with a token read from Colab Secrets.
# Failures are printed instead of raised (deliberate best-effort login).
HF_SECRET_NAME = "HF_API_KEY"

try:
    hf_token = userdata.get(HF_SECRET_NAME)
    if hf_token:
        login(hf_token)
        print("Logged into Hugging Face")
    else:
        # Report the secret name that was actually looked up; the message
        # previously named "HF_TOKEN", which is not the key read above.
        print(f"{HF_SECRET_NAME} not found. Make sure to set it in Colab Secrets.")
except Exception as e:
    print("Login skipped or failed:", e)

Logged into Hugging Face


In [None]:
## SETUP DEVICE

# `device` is passed to transformers' pipeline(): 0 when CUDA is available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
    print("Using GPU")
else:
    device = -1
    print("Using CPU")

Using CPU


In [None]:
## LOAD MODELS  SET-UP

# Display name -> Hugging Face Hub model id.
models = {
    "FLAN-T5": "google/flan-t5-base",
    "DistilGPT2": "distilgpt2",
    "TinyLlama": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
}

# Encoder-decoder (seq2seq) models cannot be loaded by the causal-LM
# "text-generation" pipeline and need the "text2text-generation" task instead.
# NOTE(review): "text2text-generation" is a transformers 4.x pipeline task;
# confirm it is still available in the installed transformers version.
_SEQ2SEQ_MODELS = {"FLAN-T5"}

# Cache of already-built pipelines, keyed by display name.
loaded_models = {}

def load_model(model_name):
    """Return the generation pipeline for ``model_name``, building it on first use.

    Args:
        model_name: One of the keys of ``models``.

    Returns:
        The pipeline object created by ``pipeline(...)``; the same object is
        returned on later calls with the same name.

    Raises:
        ValueError: If ``model_name`` is not a key of ``models`` (for example
            ``None`` when nothing was selected in the UI).
    """
    if model_name not in models:
        raise ValueError(
            f"Unknown model {model_name!r}. Choose one of: {', '.join(models)}"
        )

    if model_name not in loaded_models:
        if model_name in _SEQ2SEQ_MODELS:
            task = "text2text-generation"
        else:
            task = "text-generation"
        loaded_models[model_name] = pipeline(
            task,
            model=models[model_name],
            device=device
        )
    return loaded_models[model_name]

In [None]:
## JSON VALIDATION

def extract_json(text):
    """Parse ``text`` as JSON, or pull the first decodable JSON object out of it.

    The whole text is tried first. If that fails, every ``{`` is tried in turn
    as the start of a JSON object and the first one that decodes is returned,
    so surrounding prose, an invalid template, or a second object no longer
    make the result invalid.

    Args:
        text: Model output, normally a ``str``.

    Returns:
        The decoded JSON value, or ``{"error": "Invalid JSON format"}`` when
        braces are present but nothing decodes, or
        ``{"error": "No JSON detected"}`` when there is no ``{ ... }`` span
        (or ``text`` is not usable as text).
    """
    try:
        return json.loads(text)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; TypeError covers None and other non-text input.
        pass

    if not isinstance(text, str):
        return {"error": "No JSON detected"}

    start = text.find("{")
    # Same "nothing to extract" condition as a `{ ... }` search: an opening
    # brace must exist and a closing brace must follow it.
    if start == -1 or text.find("}", start) == -1:
        return {"error": "No JSON detected"}

    decoder = json.JSONDecoder()
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
            return parsed
        except ValueError:
            start = text.find("{", start + 1)

    return {"error": "Invalid JSON format"}

In [50]:
## SYNTHETIC DATA GENERATOR

def generate_synthetic_data(task_type, model_name, num_samples):
    """Generate ``num_samples`` synthetic records for ``task_type``.

    Each sample is produced by sending the task's prompt to the pipeline
    returned by ``load_model(model_name)`` and passing the generated text
    through ``extract_json``.

    Args:
        task_type: One of "Customer Support", "Product Reviews",
            "Meeting Summary" or "QA Dataset".
        model_name: Name forwarded to ``load_model``.
        num_samples: Number of samples; converted with ``int()``.

    Returns:
        A list with one ``extract_json`` result per sample.

    Raises:
        ValueError: If ``task_type`` is not one of the known tasks.
    """

    prompts = {
        "Customer Support":
        """
Generate ONE realistic customer support interaction as VALID JSON only.

Return:
{
  "customer_message": "...",
  "agent_response": "...",
  "issue_category": "...",
  "resolution_status": "resolved or escalated",
  "customer_sentiment": "positive, neutral, or negative"
}
        """,

        "Product Reviews":
        """
Generate ONE realistic product review as VALID JSON only.

Return:
{
  "product_name": "...",
  "category": "...",
  "rating": 1-5,
  "review_title": "...",
  "review_text": "...",
  "verified_purchase": true or false,
  "sentiment": "positive, neutral, or negative"
}
        """,

        "Meeting Summary":
        """
Generate ONE structured meeting summary as VALID JSON only.

Return:
{
  "title": "...",
  "date": "YYYY-MM-DD",
  "participants": ["name1", "name2"],
  "key_points": ["point1", "point2"],
  "decisions_made": ["decision1"],
  "action_items": [
    {"task": "...", "owner": "...", "deadline": "YYYY-MM-DD"}
  ]
}
        """,

        "QA Dataset":
        """
Generate ONE QA training example as VALID JSON only.

Return:
{
  "domain": "...",
  "difficulty": "easy, medium, or hard",
  "context": "...",
  "question": "...",
  "answer": "...",
  "answer_type": "fact, explanation, or reasoning"
}
        """
    }

    # Validate the task before the (potentially slow) model load.
    if task_type not in prompts:
        raise ValueError(
            f"Unknown task type {task_type!r}. Choose one of: {', '.join(prompts)}"
        )
    prompt = prompts[task_type]

    pipe = load_model(model_name)

    results = []

    for _ in range(int(num_samples)):
        output = pipe(
            prompt,
            max_new_tokens=180,
            do_sample=True,
            temperature=0.7
        )[0]["generated_text"]

        # When the pipeline echoes the prompt in front of the completion, drop
        # it so the JSON template inside the prompt is not parsed as the result.
        if output.startswith(prompt):
            output = output[len(prompt):]

        results.append(extract_json(output))

    return results

In [None]:
## JSON CSV EXPORT

def json_to_dataframe(json_data):
    """Convert generated records into a ``pandas.DataFrame``.

    Args:
        json_data: Data accepted by ``pd.DataFrame`` (here, a list of dicts).

    Returns:
        The resulting DataFrame, or a one-row DataFrame with an ``error``
        column containing "Conversion failed" if construction raises.
    """
    try:
        return pd.DataFrame(json_data)
    except Exception:
        # Only ordinary errors fall back to the placeholder frame;
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return pd.DataFrame({"error": ["Conversion failed"]})

In [None]:
## TOKENIZER INSPECTION

def inspect_tokenizer(model_name, sample_text):
    """Tokenize ``sample_text`` with the tokenizer of the selected model.

    Args:
        model_name: Key of ``models``; its value is passed to
            ``AutoTokenizer.from_pretrained``.
        sample_text: Text to tokenize.

    Returns:
        A dict with the token strings (``tokens``), their ids (``token_ids``)
        and the token count (``num_tokens``).
    """
    tok = AutoTokenizer.from_pretrained(models[model_name])
    pieces = tok.tokenize(sample_text)

    report = {"tokens": pieces}
    report["token_ids"] = tok.convert_tokens_to_ids(pieces)
    report["num_tokens"] = len(pieces)
    return report

In [53]:
## LOAD DIFFUSION MODEL

# Build the Stable Diffusion pipeline only when CUDA is available; otherwise
# `image_pipe` is set to None and image generation is reported as disabled.
if torch.cuda.is_available():
    image_pipe = DiffusionPipeline.from_pretrained(
        "stable-diffusion-v1-5/stable-diffusion-v1-5",
        torch_dtype=torch.float16
    )

    image_pipe.enable_attention_slicing()
    # Model CPU offload handles device placement itself, so the pipeline is
    # deliberately NOT moved to "cuda" first (doing so defeats the memory saving).
    image_pipe.enable_model_cpu_offload()

    print("Diffusion model loaded.")
else:
    print("GPU not available. Image mode disabled.")
    image_pipe = None

GPU not available. Diffusion image mode disabled.


In [None]:
## IMAGE GENERATOR

def generate_synthetic_image_dataset(prompt, num_images):
    """Generate ``num_images`` images for ``prompt`` with the module-level ``image_pipe``.

    Args:
        prompt: Text prompt passed to ``image_pipe``.
        num_images: Number of images; converted with ``int()``.

    Returns:
        A list holding the first image of each ``image_pipe(prompt)`` call.

    Raises:
        gr.Error: If ``image_pipe`` is None (no pipeline was loaded). A message
            string must not be returned here because the result feeds a
            ``gr.Gallery``, which interprets strings as image paths.
    """

    if image_pipe is None:
        raise gr.Error("GPU not available — image generation disabled.")

    return [image_pipe(prompt).images[0] for _ in range(int(num_images))]

In [None]:
## GRADIO APPLICATION

def run_generator(task_type, model_name, num_samples):
    """Generate a dataset, save it as CSV, and return both for the UI.

    Returns:
        A ``(records, csv_path)`` tuple: the output of
        ``generate_synthetic_data`` and the path of the CSV file written from
        ``json_to_dataframe(records)``.
    """
    csv_path = "synthetic_dataset.csv"

    records = generate_synthetic_data(task_type, model_name, num_samples)
    json_to_dataframe(records).to_csv(csv_path, index=False)

    return records, csv_path


# Gradio UI with three sections: text dataset generation, tokenizer
# inspection, and image generation. The heading emoji were mis-encoded
# (mojibake) and are restored here.
with gr.Blocks() as demo:

    gr.Markdown("# 🧠 Synthetic Dataset Generator Studio")

    # TEXT DATA SECTION
    gr.Markdown("## 📊 Text Synthetic Dataset Generator")

    task_dropdown = gr.Dropdown(
        ["Customer Support", "Product Reviews", "Meeting Summary", "QA Dataset"],
        label="Select Dataset Type"
    )

    # Shared by the text generator and the tokenizer inspection below.
    model_dropdown = gr.Dropdown(
        list(models.keys()),
        label="Select Model"
    )

    num_samples = gr.Slider(1, 5, value=2, step=1, label="Number of Samples")

    output_json = gr.JSON(label="Validated JSON Output")
    csv_output = gr.File(label="Download CSV")

    generate_btn = gr.Button("Generate Dataset")

    # run_generator returns (records, csv_path), matching the two outputs.
    generate_btn.click(
        run_generator,
        inputs=[task_dropdown, model_dropdown, num_samples],
        outputs=[output_json, csv_output],
        show_progress=True
    )

    # TOKENIZER SECTION
    gr.Markdown("## 🔍 Tokenizer Inspection")

    token_input = gr.Textbox(label="Enter Text")
    token_output = gr.JSON(label="Tokenizer Output")

    inspect_btn = gr.Button("Inspect Tokens")

    inspect_btn.click(
        inspect_tokenizer,
        inputs=[model_dropdown, token_input],
        outputs=token_output,
        show_progress=True
    )

    # IMAGE SECTION
    gr.Markdown("## 🖼 Synthetic Image Dataset Generator")

    image_prompt = gr.Textbox(label="Image Prompt")
    num_images = gr.Slider(1, 3, value=1, step=1, label="Number of Images")
    image_gallery = gr.Gallery(label="Generated Images")

    image_btn = gr.Button("Generate Images")

    image_btn.click(
        generate_synthetic_image_dataset,
        inputs=[image_prompt, num_images],
        outputs=image_gallery,
        show_progress=True
    )

demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5d818ebea3cc01fd74.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


