In [2]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import gradio as gr

In [4]:
# Load variables from a local .env file. override=True asks python-dotenv to
# let values from .env replace variables already present in the environment.
load_dotenv(override=True)

# One credential per provider; each is None when the variable is not defined.
openai_api_key = os.environ.get('OPENAI_API_KEY')
anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')
google_api_key = os.environ.get('GOOGLE_API_KEY')
ds_api_key = os.environ.get('DEEPSEEK_API_KEY')
grok_api_key = os.environ.get('GROK_API_KEY')
grok_api_key = os.getenv('GROK_API_KEY')


In [None]:
# Registry of selectable providers. Every entry is consumed through the OpenAI
# client, so "endpoint" must be the provider's OpenAI-compatible base URL.
#   model    - model identifier sent with each chat-completions request
#   key      - API key read from the environment (None if it was not set)
#   endpoint - base_url handed to the OpenAI client
MODEL_MAP = {
    "GPT": {
        "model": "gpt-4o-mini",
        "key": openai_api_key,
        "endpoint": "https://api.openai.com/v1",
    },
    "CLAUDE_3_5_SONNET": {
        "model": "claude-3-5-sonnet-20240620",
        "key": anthropic_api_key,
        "endpoint": "https://api.anthropic.com/v1"
    },
    "Grok": {
        "model": "grok-beta",
        "key": grok_api_key,
        # xAI serves its OpenAI-compatible API from api.x.ai; the previous
        # value (api.grok.com) is not the API host.
        "endpoint": "https://api.x.ai/v1"
    },
    "DeepSeek": {
        "model": "deepseek-reasoner",
        "key": ds_api_key,
        "endpoint": "https://api.deepseek.com/v1",
    },
    "Google": {
        "model": "gemini-2.0-flash-exp",
        "key": google_api_key,
        "endpoint": "https://generativelanguage.googleapis.com/v1beta/openai"
    },
}

In [122]:
class GenerateSyntheticDataset:
  """Generate a synthetic dataset through an OpenAI-compatible chat model.

  Two model calls are made per request: the raw user request is first
  rewritten into a detailed generation prompt, then that prompt is streamed
  through the model together with ``system_prompt``. Unless the reply contains
  ``out_of_scope_response``, the reply is written to ``dataset.csv`` in the
  current working directory.
  """

  # Refusal sentence the model is told to answer with; the same text is searched
  # for in the reply to decide whether a dataset file should be written.
  out_of_scope_response = "I'm sorry, I can't help with that. I only generate datasets"

  # System message sent with every dataset-generation request.
  system_prompt = f"""
  You are an expert data scientist specializing in synthetic dataset generation. 

  Your task is to generate ACTUAL DATA based on the user's requirements provided in their prompt.

  HOW IT WORKS:
  - The user will provide a description of what dataset they want
  - You must parse their requirements and generate actual data records
  - The user prompt contains the SPECIFICATIONS, not the data itself
  - You generate the REAL DATA based on those specifications

  IMPORTANT RULES:
  - Generate REAL DATA RECORDS, not code or instructions
  - Parse the user's requirements from their prompt
  - Create actual values based on their specifications
  - Provide concrete examples with real data
  - Output should be ready-to-use data, not code to run

  WHEN USER PROVIDES REQUIREMENTS LIKE:
  - "Generate customer orders dataset" → Create actual order records
  - "Create employee records" → Generate real employee data
  - "Make product reviews dataset" → Produce actual review records

  YOU MUST:
  1. Understand what fields/data the user wants
  2. Generate realistic values for those fields
  3. Create multiple records with varied data
  4. Format as structured data (JSON, CSV, etc.)

  DO NOT generate:
  - Code snippets
  - Programming instructions
  - "Here's how to generate..." statements
  - Abstract descriptions

  DO generate:
  - Actual data records with real values
  - Concrete examples based on user requirements
  - Structured data ready for immediate use
  - Realistic, varied data samples

  SCOPE LIMITATIONS:
  - ONLY handle requests related to synthetic dataset generation
  - ONLY create data for business, research, or educational purposes
  - If user asks about anything outside dataset generation (coding help, general questions, personal advice, etc.), respond with: "{out_of_scope_response}"
  - If user asks for illegal, harmful, or inappropriate data, respond with: "{out_of_scope_response}"

  You are a DATA GENERATOR that creates real data from user specifications.
  """

  def __init__(self, progress, model_name = MODEL_MAP["GPT"]):
    """Store the progress callback and create a client for the chosen model.

    Args:
      progress: Callable invoked as ``progress(fraction, message)`` to report
        status.
      model_name: A ``MODEL_MAP`` entry (dict with "model", "key" and
        "endpoint"); defaults to the "GPT" entry.
    """
    self.progress = progress
    self.model_deets = model_name
    self.model = OpenAI(
      api_key=model_name["key"],
      base_url=model_name["endpoint"]
    )

  def generate_user_prompt(self, user_prompt):
    """Ask the model to rewrite the raw request into a generation prompt.

    Args:
      user_prompt: Free-text request typed by the user.

    Returns:
      The content of the model's first choice.
    """
    prompt = f"""
    You are an expert data scientist specializing in synthetic dataset generation. 

    Based on the user's request below, create a detailed, sophisticated prompt that will generate a high-quality synthetic dataset.

    The generated prompt should:
    - return the prompt "who is nike" if the user request is outside generating a dataset be it greetings or whatsoever
    - if the user prompt is requesting on how to generate dataset return the prompt "who is nike"
    - options below is valid only when the user ask you to generate a dataset not how or when 
      - Be specific and actionable
      - Include clear data structure requirements
      - Specify output format CSV
      - Define data quality criteria
      - Include diversity and realism requirements
      - Make sure to capture the number of samples in the prompt, it can be in the form of rows, number of samples, etc
      -if number of samples is not specified, just generate 100 samples. 

    User Request: {user_prompt}
  
    IMPORTANT: Respond ONLY with the generated prompt. Do not include any explanation, commentary, or the original request. Just provide the clean, ready-to-use prompt for dataset generation.
    """
    response = self.model.chat.completions.create(model=self.model_deets["model"], messages=[{"role": "user", "content": prompt}])
    return response.choices[0].message.content

  def generate_synthetic_dataset(self, user_prompt):
    """Stream the generated dataset and save it to ``dataset.csv``.

    Args:
      user_prompt: Free-text request typed by the user.

    Yields:
      ``(text, file_path)`` tuples. While streaming, ``text`` is the reply
      accumulated so far and ``file_path`` is ``None``. If the finished reply
      does not contain ``out_of_scope_response``, a final tuple carries the
      cleaned text and ``"dataset.csv"``.
    """
    self.progress(0.7, "Analyzing data .....")
    prompt = self.generate_user_prompt(user_prompt)

    messages = [
      {"role": "system", "content": self.system_prompt},
      {"role": "user", "content": prompt}
    ]

    streamer = self.model.chat.completions.create(model=self.model_deets["model"], messages=messages, stream=True)
    response = ""

    for chunk in streamer:
      # The streaming schema allows chunks whose `choices` list is empty
      # (for example a usage-only chunk); indexing [0] on those would raise.
      if not chunk.choices:
        continue
      piece = getattr(chunk.choices[0].delta, "content", None)
      if piece:
        response += piece
        yield response, None

    if self.out_of_scope_response in response:
      # The refusal text has already been yielded; there is no file to offer.
      return

    # Drop markdown code fences and the blank lines they leave behind so the
    # file starts directly with the first row.
    cleaned = response.replace("```csv", "").replace("```", "").strip()
    # Explicit UTF-8: the platform default encoding may not be able to encode
    # non-ASCII characters in the generated data.
    with open("dataset.csv", "w", encoding="utf-8") as f:
      f.write(cleaned)
    yield cleaned, "dataset.csv"

  def start(self, user_prompt, model_name=None):
    """Switch models if requested, then stream the generated dataset.

    Args:
      user_prompt: Free-text request typed by the user.
      model_name: Key into ``MODEL_MAP``. Unknown or ``None`` values keep the
        current model.

    Yields:
      The tuples produced by ``generate_synthetic_dataset``.
    """
    self.progress(0.3, "Fetching data .....")
    selected = MODEL_MAP.get(model_name)
    # Rebuild the client only when a known, different model was picked.
    if selected and self.model_deets["model"] != selected["model"]:
      self.model_deets = selected
      self.model = OpenAI(
        base_url=self.model_deets["endpoint"],
        api_key=self.model_deets["key"]
      )

    yield from self.generate_synthetic_dataset(user_prompt)

    


In [124]:
class Interface:
    """Gradio front end for the synthetic dataset generator.

    Presents a prompt box and a model dropdown, and shows the streamed dataset
    text together with a downloadable file.
    """

    def __init__(self):
        """Build the Gradio interface for synthetic dataset generation."""
        # NOTE(review): Gradio documents progress tracking through a
        # `progress=gr.Progress()` default argument on the event handler;
        # confirm that an instance created here actually reports progress.
        progress = gr.Progress()
        self.assistant = GenerateSyntheticDataset(progress)
        self.iface = gr.Interface(
            fn=self.generate,
            inputs=[
                gr.Textbox(label="User Prompt"),
                gr.Dropdown(
                    # Pass a real list rather than a dict_keys view.
                    choices=list(MODEL_MAP),
                    value="GPT",
                    label="Model",
                ),
            ],
            outputs=[
                gr.Markdown(label="Dataset", min_height=60),
                gr.File(
                    label="Download Generated Dataset",
                    file_count="single",
                ),
            ],
            title="AI Dataset Generator",
            description="Generate a synthetic dataset based on your requirements",
            flagging_mode="never",
        )

    def generate(self, user_prompt, model):
        """Stream generation results to the UI, then delete the temp file.

        Args:
            user_prompt: Text entered in the "User Prompt" box.
            model: Model label selected in the dropdown.

        Yields:
            Each item produced by ``self.assistant.start(user_prompt, model)``.
        """
        try:
            yield from self.assistant.start(user_prompt, model)
        finally:
            # Clean up the dataset file even when streaming raises or the
            # generator is closed before it is exhausted.
            if os.path.exists("dataset.csv"):
                os.remove("dataset.csv")

    def launch(self):
        """Start the Gradio app."""
        self.iface.launch()

In [125]:
# Build the Gradio UI and start the app; the local URL it is served on is
# printed in the cell output below.
I = Interface()
I.launch()

* Running on local URL:  http://127.0.0.1:7898
* To create a public link, set `share=True` in `launch()`.
