# Data Generator

Generate synthetic data given a schema, a business problem description, a model, the number of records, a file name, a file type, and an environment.

# Available models
  Model API:

    1. gpt-4o-mini
    2. claude-3-haiku-20240307
    3. gemini-2.0-flash
    4. deepseek-chat

  HuggingFace API:

    5. meta-llama/Meta-Llama-3.1-8B-Instruct


# Available environments

Colab: set up the HF token and API keys in the Colab Secrets section.

Local: set up the HF token and API keys in a `.env` file.



### This project is based on the idea of 'week3/community-contributions/Week3-Dataset_Generator-DP' — really appreciated! It has been extended to run both on Colab and locally, and to integrate the HuggingFace API.

In [None]:
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0
!pip install anthropic dotenv pyarrow

In [None]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
from bs4 import BeautifulSoup
from typing import List
import google.generativeai
import anthropic
from itertools import chain
from dotenv import load_dotenv
import gradio as gr
import json
import pandas as pd
import random
import re
import subprocess
import pyarrow as pa
import torch
import gc

In [None]:
# --- Schema Definition ---
# Default restaurant schema. Each entry is a (field name, field type, example/hint)
# tuple; the entries are rendered into the numbered text that pre-fills the
# "Schema" textbox, so the third element is free-form guidance for the model.
SCHEMA = [
    ("Name", "TEXT", '"Northern Cafe"'),
    ("Location", "TEXT", '"2904 S Figueroa St, Los Angeles, CA 90007"'),
    ("Type", "TEXT", 'One of ["Chinese","Mexico","French","Korean","Italy"] or other potential types'),
    ("Average Price", "TEXT", '"$30", or "--" if unkown'),
    ("History/Age", "INT", 'integer age of resturant, e.g., 7'),
    ("Menu", "Array", '["Beef Noodle", "Fried Rice", "Dumpling", ...]'),
]

In [None]:
# Default schema text for the textbox: one numbered line per SCHEMA entry,
# formatted as "<n>. <name> (<type>) Example: <example>".
DEFAULT_SCHEMA_TEXT = "\n".join(
    f"{position}. {field_name} ({field_type}) Example: {example}"
    for position, (field_name, field_type, example) in enumerate(SCHEMA, start=1)
)

In [None]:
# Available models (choices of the "Model" dropdown; the first entry is the default).
# `query` picks the backend by substring match on the model name
# ("gpt", "claude", "gemini", "deepseek", "llama").
MODELS = [
    "gpt-4o-mini",
    "claude-3-haiku-20240307",
    "gemini-2.0-flash",
    "deepseek-chat",
    "meta-llama/Meta-Llama-3.1-8B-Instruct"
]

In [None]:
# Available file formats (choices of the "File format" dropdown; the first entry
# is the default). Each value is the extension handled by `save_dataset`.
FILE_FORMATS = [".csv", ".tsv", ".jsonl", ".parquet", ".arrow"]

In [None]:
# System message sent with every model request made by `query`.
system_prompt = """You are a helpful assistant whose main purpose is to generate datasets for a given business problem based on given schema."""

In [None]:
def get_env_info(env):
    """Load the Hugging Face token and provider API keys into module globals.

    Sets ``hf_token``, ``openai_api_key``, ``anthropic_api_key``,
    ``google_api_key`` and ``deepseek_api_key``.

    Args:
        env: ``"Colab"`` to read the values through ``google.colab.userdata``,
            or ``"Local"`` to read them from environment variables after
            loading a ``.env`` file.

    Raises:
        Exception: If ``env`` is not one of the supported values or reading the
            credentials fails. The message starts with
            "Please check your environment:" and the original error is chained.
    """
    global hf_token, openai_api_key, anthropic_api_key, google_api_key, deepseek_api_key
    try:
        if env == "Colab":
            # Imported lazily: google.colab is only importable inside a Colab runtime.
            from google.colab import userdata
            read_secret = userdata.get
        elif env == "Local":
            # override=True lets values from .env replace variables already set.
            load_dotenv(override=True)
            read_secret = os.getenv
        else:
            # Previously an unknown value fell through silently and left the keys unset.
            raise ValueError(f"Unsupported environment {env!r}; expected 'Colab' or 'Local'")

        hf_token = read_secret('HF_TOKEN')
        openai_api_key = read_secret('OPENAI_API_KEY')
        anthropic_api_key = read_secret('ANTHROPIC_API_KEY')
        google_api_key = read_secret('GOOGLE_API_KEY')
        deepseek_api_key = read_secret('DEEPSEEK_API_KEY')
    except Exception as e:
        raise Exception(f"Please check your environment: {str(e)}") from e

In [None]:
def get_prompt(schema_text, business_problem, nr_records):
    """Build the user prompt that asks the model for JSONL rows.

    Args:
        schema_text: Field descriptions, inserted verbatim.
        business_problem: Description of the problem, inserted verbatim.
        nr_records: Number of rows to request.

    Returns:
        The prompt text: five paragraphs separated by blank lines, every
        paragraph after the first indented by six spaces.
    """
    indent = " " * 6
    paragraphs = [
        f"The problem is: {business_problem}",
        f"{indent}Generate {nr_records} rows data in JSONL format, each line a JSON object with the following fields:",
        f"{indent}{schema_text}",
        f"{indent}Do NOT repeat column values from one row to another.",
        f"{indent}Only output valid JSONL.",
    ]
    return "\n\n".join(paragraphs)

In [None]:
# --- LLM Interface ---
def _openai_compatible_completion(api_key, model, user_prompt, base_url=None):
    """Send the system + user prompt to an OpenAI-compatible chat endpoint and return the reply text."""
    client = OpenAI(api_key=api_key, base_url=base_url)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.7,
    )
    return response.choices[0].message.content


def _llama_completion(model, user_prompt):
    """Run the prompt through the 4-bit quantized Hugging Face model and return only the generated text."""
    # These stay module globals: the model is cached between calls and the
    # notebook deletes all four names after the UI shuts down.
    global tokenizer, inputs, llama_model, outputs
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    login(hf_token, add_to_git_credential=True)

    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    # add_generation_prompt=True ends the prompt with the assistant header so the
    # model's new tokens are the answer itself.
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    if llama_model is None:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
        )
        llama_model = AutoModelForCausalLM.from_pretrained(
            model, device_map="auto", quantization_config=quant_config
        )
    outputs = llama_model.generate(inputs, max_new_tokens=4000)

    # Decode only the tokens produced after the prompt and drop special tokens;
    # decoding the full sequence leaves markers such as the end-of-turn token
    # glued to the last JSON line, which then fails to parse.
    generated_tokens = outputs[0][inputs.shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


def _parse_jsonl_records(content):
    """Parse every line of *content* that starts with "{" as a JSON object."""
    if content is None:
        raise ValueError("The model returned no text content")
    candidate_lines = (line.strip() for line in content.strip().splitlines())
    return [json.loads(line) for line in candidate_lines if line.startswith("{")]


def query(user_prompt, model):
    """Ask *model* to answer *user_prompt* and return the parsed JSONL records.

    The backend is selected by substring match on the lower-cased model name:
    "gpt", "gemini" and "deepseek" use OpenAI-compatible chat endpoints,
    "claude" uses the Anthropic client, and "llama" runs a quantized model
    through transformers.

    Args:
        user_prompt: Prompt text sent as the user message.
        model: Model identifier, one of ``MODELS``.

    Returns:
        A list with one parsed JSON object per output line that starts with "{".

    Raises:
        Exception: On any failure (unsupported model, API error, invalid JSON).
            The message starts with "Model query failed:" and the original
            error is chained.
    """
    try:
        model_key = model.lower()
        if "gpt" in model_key:
            content = _openai_compatible_completion(openai_api_key, model, user_prompt)
        elif "claude" in model_key:
            client = anthropic.Anthropic(api_key=anthropic_api_key)
            response = client.messages.create(
                model=model,
                messages=[{"role": "user", "content": user_prompt}],
                max_tokens=4000,
                temperature=0.7,
                system=system_prompt,
            )
            content = response.content[0].text
        elif "gemini" in model_key:
            content = _openai_compatible_completion(
                google_api_key,
                model,
                user_prompt,
                base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
            )
        elif "deepseek" in model_key:
            content = _openai_compatible_completion(
                deepseek_api_key,
                model,
                user_prompt,
                base_url="https://api.deepseek.com",
            )
        elif "llama" in model_key:
            content = _llama_completion(model, user_prompt)
        else:
            raise ValueError(f"Unsupported model. Use one of {MODELS}")

        # Parse JSONL output
        return _parse_jsonl_records(content)

    except Exception as e:
        raise Exception(f"Model query failed: {str(e)}") from e

In [None]:
# --- Output Formatter ---
def save_dataset(records, file_format, filename):
    """Write *records* to *filename* in the format named by *file_format*.

    Args:
        records: Sequence of row dictionaries.
        file_format: One of ".csv", ".tsv", ".jsonl", ".parquet", ".arrow".
        filename: Destination path.

    Raises:
        ValueError: If *file_format* is not one of the supported extensions.
    """
    frame = pd.DataFrame(records)

    # Delimited text: CSV and TSV differ only in the separator.
    if file_format in (".csv", ".tsv"):
        separator = "\t" if file_format == ".tsv" else ","
        frame.to_csv(filename, sep=separator, index=False)
        return

    # JSON Lines: one serialized record per line, written from the raw records.
    if file_format == ".jsonl":
        with open(filename, "w") as handle:
            handle.writelines(json.dumps(row) + "\n" for row in records)
        return

    if file_format == ".parquet":
        frame.to_parquet(filename, engine="pyarrow", index=False)
        return

    # Arrow IPC file format.
    if file_format == ".arrow":
        arrow_table = pa.Table.from_pandas(frame)
        with pa.OSFile(filename, "wb") as sink:
            with pa.ipc.new_file(sink, arrow_table.schema) as ipc_writer:
                ipc_writer.write(arrow_table)
        return

    raise ValueError("Unsupported file format")

In [None]:
# --- Main Generation Function ---
def generate_dataset(schema_text, business_problem, model, nr_records, file_format, save_as, env):
    """Validate the inputs, query the model, save the records and build a preview.

    Args:
        schema_text: Schema description inserted into the prompt.
        business_problem: Problem description inserted into the prompt.
        model: Model identifier passed to ``query``.
        nr_records: Requested number of rows; must be between 11 and 1000.
        file_format: Output extension, one of ``FILE_FORMATS``.
        save_as: Target file name. Blank or missing falls back to
            ``default<file_format>``; the extension is appended when absent.
        env: Environment name passed to ``get_env_info``.

    Returns:
        A ``(status_message, preview)`` tuple. ``preview`` is a DataFrame with
        the first 10 rows on success and ``None`` on any error; errors are
        reported through the message rather than raised.
    """
    try:
        # Validation
        try:
            # The UI's number field may deliver a float or nothing at all.
            nr_records = int(nr_records)
        except (TypeError, ValueError, OverflowError):
            return "❌ Error: Number of records must be a whole number.", None

        if nr_records <= 10:
            return "❌ Error: Number of records must be greater than 10.", None
        if nr_records > 1000:
            return "❌ Error: Number of records must be less than or equal to 1000.", None

        if file_format not in FILE_FORMATS:
            return "❌ Error: Invalid file format.", None

        # The original test `not (save_as or save_as.strip() == "")` never chose
        # the default: an empty name became just the extension and None raised.
        save_as = (save_as or "").strip()
        if not save_as:
            save_as = f"default{file_format}"
        elif not save_as.endswith(file_format):
            save_as = save_as + file_format

        # Load env
        get_env_info(env)

        # Generate prompt
        user_prompt = get_prompt(schema_text, business_problem, nr_records)

        # Query model
        records = query(user_prompt, model)

        if not records:
            return "❌ Error: No valid records generated from the model.", None

        # Save dataset
        save_dataset(records, file_format, save_as)

        # Create preview
        df = pd.DataFrame(records)
        preview = df.head(10)  # Show first 10 rows

        success_message = f"✅ Generated {len(records)} records successfully!\n📁 Saved to: {save_as}\n📊 "

        return success_message, preview

    except Exception as e:
        # UI boundary: surface every failure as a status message.
        return f"❌ Error: {str(e)}", None

In [None]:
# --- Gradio Interface ---
# Builds the single-page UI, wires the Generate button to `generate_dataset`,
# and launches the app.

with gr.Blocks(title="Dataset Generator", theme=gr.themes.Citrus()) as interface:
    # Module-level state shared with `get_env_info` and `query` through their
    # `global` statements. A `with` block does not open a new scope, so these
    # assignments create module globals.
    hf_token = None
    openai_api_key = None
    anthropic_api_key = None
    google_api_key = None
    deepseek_api_key = None
    tokenizer = None
    inputs = None
    # `query` loads the Llama model only while this is still None.
    llama_model = None
    outputs = None

    gr.Markdown("# Dataset Generator")
    gr.Markdown("Generate synthetic datasets using AI models")

    with gr.Row():
        # Left column: all inputs plus the Generate button.
        with gr.Column(scale=2):
            schema_input = gr.Textbox(
                label="Schema",
                value=DEFAULT_SCHEMA_TEXT,
                lines=15,
                placeholder="Define your dataset schema here... Please follow this format: Field_Name, Field_Type, Field Example"
            )

            business_problem_input = gr.Textbox(
                label="Business Problem",
                value="I want to generate restuant records",
                lines=1,
                placeholder="Enter business problem desciption for the model..."
            )

            with gr.Row():
                model_dropdown = gr.Dropdown(
                    label="Model",
                    choices=MODELS,
                    value=MODELS[0],
                    interactive=True
                )

                # The 11..1000 bounds mirror the range check in `generate_dataset`.
                nr_records_input = gr.Number(
                    label="Number of records",
                    value=27,
                    minimum=11,
                    maximum=1000,
                    step=1
                )

            with gr.Row():
                save_as_input = gr.Textbox(
                      label="Save as",
                      value="restaurant_dataset",
                      placeholder="Enter filename (extension will be added automatically)"
                  )

                file_format_dropdown = gr.Dropdown(
                    label="File format",
                    choices=FILE_FORMATS,
                    value=FILE_FORMATS[0],
                    interactive=True
                )

                # These two values are the ones `get_env_info` branches on.
                env_dropdown = gr.Dropdown(
                    label="Environment",
                    choices=["Colab", "Local"],
                    value="Colab",
                    interactive=True
                )



            generate_btn = gr.Button("🚀 Generate", variant="secondary", size="lg")

        # Right column: status message and data preview.
        with gr.Column(scale=1):
            output_status = gr.Textbox(
                label="Status",
                lines=4,
                interactive=False
            )

            output_preview = gr.Dataframe(
                label="Preview (First 10 rows)",
                interactive=False,
                wrap=True
            )

    # Connect the generate button
    # The order of `inputs` matches the positional parameters of
    # `generate_dataset`; `outputs` receives its (message, preview) tuple.
    generate_btn.click(
        fn=generate_dataset,
        inputs=[
            schema_input,
            business_problem_input,
            model_dropdown,
            nr_records_input,
            file_format_dropdown,
            save_as_input,
            env_dropdown
        ],
        outputs=[output_status, output_preview]
    )

    gr.Markdown("""
    ### 📝 Instructions:
    1. **Schema**: Define the structure of your dataset (pre-filled with restaurant schema)
    2. **Business problem**: User prompt to guide the AI model
    3. **Model**: Choose between GPT, Claude, Gemini, DeepSeek or Llama models
    4. **Number of records**: Number of records to generate (minimum 11)
    5. **File format**: Choose output format (.csv, .tsv, .jsonl, .parquet, .arrow)
    6. **Save as**: Filename (extension added automatically)
    7. Click **Generate** to create your dataset

    ### 🔧 Requirements:
    - For local mode, set up HF token and API keys in `.env` file (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)
    - For colab mode, set up HF token and API keys in Colab secret section (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `DEEPSEEK_API_KEY`, `HF_TOKEN`)
    """)

# Start the Gradio app with debug mode enabled.
interface.launch(debug=True)

# Teardown: drop the module-level references to the tokenizer, tensors and model,
# run the garbage collector, then ask PyTorch to release its cached CUDA memory.
del tokenizer, inputs, llama_model, outputs
gc.collect()
torch.cuda.empty_cache()