
## **Week 3 task.**
Create your own tool that generates synthetic data/test data. Given a description of the kind of dataset you need (e.g., products, job postings), the tool dreams up a variety of data samples.

https://colab.research.google.com/drive/13wR4Blz3Ot_x0GOpflmvvFffm5XU3Kct?usp=sharing

In [2]:
# imports

import json
import os

import gradio as gr
import requests
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from IPython.display import Markdown, display, update_display
from openai import APIError, OpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

In [None]:

# Load environment variables from a .env file, overriding any already set.
load_dotenv(override=True)

openai_api_key = os.getenv('OPENAI_API_KEY')
# Placeholder key for the local OpenAI-compatible endpoint configured below.
# NOTE(review): presumably the local server does not validate it - confirm.
llama_api_key = "ollama"

# hf_token = userdata.get('HF_TOKEN')
# login(hf_token, add_to_git_credential=True)


if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")

if llama_api_key:
    print("LLama API Key exists")
else:
    print("LLama API Key not set")

GPT_MODEL = "gpt-4.1-mini"
LLAMA_MODEL = "llama3.1"


# Only build the OpenAI client when a key is available: constructing it
# without one raises, which would abort this cell even for users who only
# want the local model. Leaving it as None lets callers test `if not openai`.
openai = OpenAI(api_key=openai_api_key) if openai_api_key else None

# Local endpoint that is addressed through the same OpenAI client class.
llama_url = "http://localhost:11434/v1"
llama = OpenAI(api_key=llama_api_key, base_url=llama_url)

In [12]:
def generate_with_gpt(user_prompt: str, num_samples: int = 5) -> str:
    """
    Generate synthetic data with the OpenAI chat model named by GPT_MODEL.

    Args:
        user_prompt: Free-text description of the data to generate.
        num_samples: Number of objects requested in the system prompt.
            Must be a positive integer.

    Returns:
        The content of the model's first choice (requested as JSON) on
        success, otherwise a JSON string of the form {"error": "..."}.
        Invalid arguments, a missing client, API errors and unexpected
        errors are all reported this way instead of being raised.
    """
    # Reject nonsensical sample counts before spending an API call on them.
    if not isinstance(num_samples, int) or num_samples < 1:
        return json.dumps({"error": "num_samples must be a positive integer."}, indent=2)

    if not openai:
        return json.dumps({"error": "OpenAI client not initialized. Please check your API key."}, indent=2)

    try:
        # NOTE(review): the prompt asks for a JSON array while response_format
        # requests a JSON object; the model may wrap the array in an object -
        # confirm the shape downstream consumers expect.
        response = openai.chat.completions.create(
            model=GPT_MODEL,
            messages=[
                {"role": "system", "content": f"You are a data generation assistant. Generate a JSON array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting."},
                {"role": "user", "content": user_prompt}
            ],
            response_format={"type": "json_object"}
        )

        json_text = response.choices[0].message.content
        if json_text is None:
            # Keep the "always a JSON string" contract when no content comes back.
            return json.dumps({"error": "The model returned an empty response."}, indent=2)
        return json_text
    except APIError as e:
        # e.body is None for errors without a response body (e.g. connection
        # failures); fall back to the exception text so the message is useful.
        return json.dumps({"error": f"Error from OpenAI API: {e.body or e}"}, indent=2)
    except Exception as e:
        # Boundary handler: the UI expects a JSON string, never an exception.
        return json.dumps({"error": f"An unexpected error occurred: {e}"}, indent=2)

def generate_with_llama(user_prompt: str, num_samples: int = 5) -> str:
    """
    Generate synthetic data with the local model named by LLAMA_MODEL.

    Uses the module-level `llama` client. This definition was previously an
    exact duplicate of generate_with_gpt, which left generate_with_llama
    (called by generate_data) undefined.

    Args:
        user_prompt: Free-text description of the data to generate.
        num_samples: Number of objects requested in the system prompt.
            Must be a positive integer.

    Returns:
        The content of the model's first choice (requested as JSON) on
        success, otherwise a JSON string of the form {"error": "..."}.
        Invalid arguments, a missing client, API errors and unexpected
        errors are all reported this way instead of being raised.
    """
    # Reject nonsensical sample counts before calling the model.
    if not isinstance(num_samples, int) or num_samples < 1:
        return json.dumps({"error": "num_samples must be a positive integer."}, indent=2)

    if not llama:
        return json.dumps({"error": "Llama client not initialized. Please check that the local server is configured."}, indent=2)

    try:
        # NOTE(review): relies on the local endpoint honouring response_format
        # in its OpenAI-compatible API - confirm for the server version in use.
        response = llama.chat.completions.create(
            model=LLAMA_MODEL,
            messages=[
                {"role": "system", "content": f"You are a data generation assistant. Generate a JSON array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting."},
                {"role": "user", "content": user_prompt}
            ],
            response_format={"type": "json_object"}
        )

        json_text = response.choices[0].message.content
        if json_text is None:
            # Keep the "always a JSON string" contract when no content comes back.
            return json.dumps({"error": "The model returned an empty response."}, indent=2)
        return json_text
    except APIError as e:
        # e.body is None when there is no response body (e.g. the local
        # server is not running); fall back to the exception text.
        return json.dumps({"error": f"Error from Llama API: {e.body or e}"}, indent=2)
    except Exception as e:
        # Boundary handler: the UI expects a JSON string, never an exception.
        return json.dumps({"error": f"An unexpected error occurred: {e}"}, indent=2)

In [13]:
def generate_data(user_prompt: str, model_choice: str) -> str:
    """
    Route a generation request to the backend selected in the UI.

    Args:
        user_prompt: Description of the data to generate. Empty, None, or
            whitespace-only values are rejected.
        model_choice: One of the labels "Hugging Face (<LLAMA_MODEL>)" or
            "OpenAI (<GPT_MODEL>)".

    Returns:
        Whatever the selected generator returns, or a JSON string of the
        form {"error": "..."} for a missing prompt or unknown model label.
    """
    # A prompt made only of spaces/newlines carries no description either,
    # so treat it the same as an empty one instead of calling a model.
    if not user_prompt or not user_prompt.strip():
        return json.dumps({"error": "Please provide a description for the data."}, indent=2)

    if model_choice == f"Hugging Face ({LLAMA_MODEL})":
        return generate_with_llama(user_prompt)
    if model_choice == f"OpenAI ({GPT_MODEL})":
        return generate_with_gpt(user_prompt)
    return json.dumps({"error": "Invalid model choice."}, indent=2)

In [None]:
# Gradio UI: prompt box and model selector on the first row, JSON result below.
llama_label = f"Hugging Face ({LLAMA_MODEL})"
gpt_label = f"OpenAI ({GPT_MODEL})"

with gr.Blocks(title="Synthetic Data Generator", theme=gr.themes.Glass()) as ui:
    gr.Markdown(value="# Synthetic Data Generator")
    gr.Markdown(value="Describe the type of data you need, select a model, and click 'Generate'.")

    with gr.Row():
        # Wide column: free-text description of the desired data.
        with gr.Column(scale=3):
            data_prompt = gr.Textbox(
                label="Data Prompt",
                placeholder="e.g., a list of customer profiles with name, email, and a favorite product",
                lines=5,
            )

        # Narrow column: backend selection and the trigger button.
        with gr.Column(scale=1):
            model_choice = gr.Radio(
                choices=[llama_label, gpt_label],
                value=llama_label,
                label="Choose a Model",
            )

            generate_btn = gr.Button(value="Generate Data")

    with gr.Row():
        output_json = gr.JSON(label="Generated Data")

    # Clicking the button sends (prompt, model label) to generate_data and
    # shows its return value in the JSON component.
    generate_btn.click(generate_data, [data_prompt, model_choice], output_json)

ui.launch(inbrowser=True, debug=True)