A dataset generator.

In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr


In [2]:
MODEL_GPT = 'gpt-4.1-mini'
MODEL_LLAMA = 'qwen2.5-coder'
MODEL_CLAUDE = 'claude-sonnet-4-5-20250929'

# Initialize the OpenAI client
openai = OpenAI()

In [3]:
load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
    
if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
else:
    print("Anthropic API Key not set (and this is optional)")

OpenAI API Key exists and begins sk-proj-
Anthropic API Key not set (and this is optional)


In [4]:
openai = OpenAI()

anthropic_url = "https://api.anthropic.com/v1/"
OLLAMA_BASE_URL = "http://localhost:11434/v1"

anthropic = OpenAI(api_key=anthropic_api_key, base_url=anthropic_url)
ollama = OpenAI(api_key="ollama", base_url=OLLAMA_BASE_URL)

openai = OpenAI()

In [5]:
clients = {"gpt": openai, "claude": anthropic, "qwen": ollama, }

In [24]:
def stream_gpt(history, message):
    relevant_system_message = """
    You are a helpful assistant for generating synthetic dataset. Always generate a JSON dataset.
    For example: Generate a dataset for an Employee entity
    [{
        "name": "John Doe",
        "email": "john.doe@example.com",
        "phone": "+1234567890",
        "address": "123 Main St, Anytown, USA"
    }]
    """
    messages = [{"role": "system", "content": relevant_system_message}] + history + [{"role": "user", "content": message}]

    stream = openai.chat.completions.create(
        model=MODEL_GPT,
        messages=messages,
        stream=True
    )
    result = ""
    for chunk in stream:
        result += chunk.choices[0].delta.content or ""
        yield result

In [36]:
def stream_claude(history, message):
    relevant_system_message = """
    You are a helpful assistant for generating synthetic dataset. Always generate a JSON dataset.
    For example: Generate a dataset for an Employee entity
    [{
        "name": "John Doe",
        "email": "john.doe@example.com",
        "phone": "+1234567890",
        "address": "123 Main St, Anytown, USA"
    }]
    """

    messages = [{"role": "system", "content": relevant_system_message}] + history + [{"role": "user", "content": message}]

    stream = anthropic.chat.completions.create(
        model=MODEL_CLAUDE,
        messages=messages,
        stream=True
    )
    result = ""
    for chunk in stream:
        result += chunk.choices[0].delta.content or ""
        yield result

In [39]:
def stream_ollama(history, message):
    relevant_system_message = """
    You are a helpful assistant for generating synthetic dataset. Always generate a JSON dataset.
    For example: Generate a dataset for an Employee entity
    [{
        "name": "John Doe",
        "email": "john.doe@example.com",
        "phone": "+1234567890",
        "address": "123 Main St, Anytown, USA"
    }]
    """
    messages = [{"role": "system", "content": relevant_system_message}] + history + [{"role": "user", "content": message}]
    response = ollama.chat.completions.create(model=MODEL_LLAMA, messages=messages)
    print(response)

    yield response.choices[0].message.content

In [None]:
STREAM_FUNCTIONS = {
    "GPT": stream_gpt,
    "QWEN": stream_ollama,
    "Claude": stream_claude,
}

def run_chat(msg):
    """Add user message to display and pass context for API call. Clears previous history for each new message."""
    updated = [{"role": "user", "content": msg}]  # Fresh display: only current message
    return "", updated, ([], msg)  # Empty history for API - each message is independent

def stream_to_chatbot(context, model):
    """Stream API response based on selected model and yield message history for Chatbot."""
    history, message = context
    stream_fn = STREAM_FUNCTIONS.get(model, stream_gpt)
    for chunk in stream_fn(history, message):
        yield history + [{"role": "user", "content": message}, {"role": "assistant", "content": chunk}]

# UI definition
COL_HEIGHT = 450

with gr.Blocks() as ui:
    chat_context = gr.State(value=([], ""))
    with gr.Row(elem_id="columns-row"):
        with gr.Column(scale=1, elem_classes=["scrollable-column"]):
            message = gr.Textbox(
                label="Message",
                lines=15,
                max_lines=20,
                placeholder="Please provide the entity/schema you want to generate a dataset for..."
            )
        with gr.Column(scale=1, elem_classes=["scrollable-column"]):
            chatbot = gr.Chatbot(
                height=COL_HEIGHT,
                type="messages",
            )
    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=["GPT", "QWEN", "Claude"],
            value="GPT",
            label="Select Model",
        )
    with gr.Row():
        submit_btn = gr.Button("Generate Dataset")

    # Hooking up events to callbacks
    submit_btn.click(run_chat, [message, chatbot], [message, chatbot, chat_context]).then(
        stream_to_chatbot, [chat_context, model_dropdown], chatbot
    )
    message.submit(run_chat, [message, chatbot], [message, chatbot, chat_context]).then(
        stream_to_chatbot, [chat_context, model_dropdown], chatbot
    )

ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7882
* To create a public link, set `share=True` in `launch()`.




ChatCompletion(id='chatcmpl-309', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='```json\n[\n    {\n        "name": "John Doe",\n        "email": "john.doe@example.com",\n        "phone": "+1234567890",\n        "address": "123 Main St, Anytown, USA"\n    },\n    {\n        "name": "Jane Smith",\n        "email": "jane.smith@example.com",\n        "phone": "+0987654321",\n        "address": "456 Elm St, Othercity, USA"\n    },\n    {\n        "name": "Mike Johnson",\n        "email": "mike.johnson@example.com",\n        "phone": "+1123456789",\n        "address": "789 Oak St,anothertown,USA"\n    }\n]\n```', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None))], created=1772248913, model='qwen2.5-coder', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=177, prompt_tokens=106, total_tokens=283, completion_tokens_details=