# DataForge: Open Source Dataset Synthesizer
A utility to generate structured synthetic datasets using OpenRouter.

In [20]:
import gradio as gr
import os
import json
from openai import OpenAI

# Shared API client: the OpenAI SDK is pointed at OpenRouter's base URL, and the
# key is read from the OPENROUTER_API_KEY environment variable (None when unset).
client = OpenAI(
    api_key=os.environ.get("OPENROUTER_API_KEY"),
    base_url="https://openrouter.ai/api/v1",
)

In [21]:
def get_models():
    """Return the hard-coded list of model identifiers offered by this tool.

    The identifiers are open-source models meant to be requested through
    OpenRouter. A fresh list is built on every call.
    """
    model_ids = [
        "meta-llama/llama-3.1-8b-instruct",
        "mistralai/mistral-nemo-12b-instruct-v1",
        "qwen/qwen-2.5-7b-instruct",
    ]
    return model_ids

In [22]:
def generate_dataset(topic, model_name, count):
    """Request ``count`` input/output samples about ``topic`` from a model.

    Args:
        topic: Subject of the dataset; interpolated verbatim into the prompt.
        model_name: Model identifier passed as ``model`` to the
            chat-completions call.
        count: Number of samples to request. Converted with ``int()`` once so
            that the prompt text and the schema's ``minItems``/``maxItems``
            always state the same number (e.g. ``3.0`` becomes ``3`` in both).

    Returns:
        The ``content`` of the first choice's message exactly as the API
        returned it. It is requested to match the ``{"samples": [...]}``
        schema below but is not parsed or validated here.

    Raises:
        TypeError, ValueError: If ``count`` cannot be converted by ``int()``.
    """
    # Normalise once; previously the prompt used the raw value while the
    # schema used int(count), so a float count made the two disagree.
    sample_count = int(count)

    prompt = f'Generate {sample_count} JSON objects for a dataset about "{topic}". Structure: {{"input": "...", "output": "..."}}'

    # Shape of a single dataset row: exactly two string fields.
    sample_schema = {
        "type": "object",
        "properties": {
            "input": {"type": "string"},
            "output": {"type": "string"},
        },
        "required": ["input", "output"],
        "additionalProperties": False,
    }

    # The rows are wrapped in a top-level object under "samples", with the
    # array length pinned to the requested count.
    json_schema = {
        "name": "dataset_schema",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "samples": {
                    "type": "array",
                    "minItems": sample_count,
                    "maxItems": sample_count,
                    "items": sample_schema,
                },
            },
            "required": ["samples"],
            "additionalProperties": False,
        },
    }

    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a data engineer. Output only raw JSON."},
            {"role": "user", "content": prompt},
        ],
        response_format={
            "type": "json_schema",
            "json_schema": json_schema,
        },
    )
    return response.choices[0].message.content

In [23]:
def save_to_file(json_data):
    """Write ``json_data`` to ``dataset.json`` and return that path.

    The file is created in the current working directory and overwritten if
    it already exists.

    Args:
        json_data: Text to write; it is written as-is, without validation.

    Returns:
        The relative path ``"dataset.json"``.
    """
    output_path = "dataset.json"
    # Explicit UTF-8: without it the platform default encoding is used (e.g.
    # cp1252 on Windows), which cannot represent all text a model may emit and
    # would yield a file that is not valid UTF-8 JSON.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json_data)
    return output_path

In [24]:
def orchestrate_generation(topic, model_name, count):
    """Generate a dataset, save it to disk, and return both results.

    Args:
        topic: Dataset topic, forwarded to ``generate_dataset``.
        model_name: Model identifier, forwarded to ``generate_dataset``.
        count: Requested number of samples, forwarded to ``generate_dataset``.

    Returns:
        A ``(generated_text, saved_path)`` tuple: the value returned by
        ``generate_dataset`` and the path returned by ``save_to_file``.
    """
    generated = generate_dataset(topic, model_name, count)
    return generated, save_to_file(generated)

In [25]:
# Gradio UI: one row of inputs (topic, model, sample count), a button that
# triggers generation, and two outputs (JSON preview and downloadable file).
with gr.Blocks(title="DataForge") as demo:
    # The heading emoji was previously stored as mojibake (UTF-8 bytes decoded
    # with the wrong charset); restored to the intended character.
    gr.Markdown("# 🛠️ DataForge: Open Source Synthesis")

    with gr.Row():
        topic_in = gr.Textbox(label="Dataset Topic")
        model_drop = gr.Dropdown(get_models(), label="Select Open Source Model")
        count_slider = gr.Slider(1, 10, value=3, label="Sample Count", step=1)

    gen_btn = gr.Button("Forge Dataset")
    json_out = gr.JSON(label="Preview")
    file_out = gr.File(label="Download Dataset")

    # orchestrate_generation returns (text, path), mapped in order onto the
    # JSON preview and the file download component.
    gen_btn.click(orchestrate_generation, inputs=[topic_in, model_drop, count_slider], outputs=[json_out, file_out])

In [None]:
# Start the Gradio server for the `demo` app; inbrowser=True asks Gradio to
# open the interface in a browser tab on launch.
demo.launch(inbrowser=True)