# Test Data Generator

This application creates test data in various formats. 
The test data is described in the prompt to the LLM.
The output is plain text (not Markdown), so it can be copied or written to a file without modification.
There's a dropdown box that allows LLM selection.

You can use several models: 
- Local - HuggingFace (Llama 3.1 8B, KriKri 8B for native Greek support)
- Anthropic Claude (haiku)
- OpenAI (gpt-4o-mini)
- Google Gemini

## To be added later
- Local Ollama & LM Studio
- *Amazon Bedrock (To be added in a later version)*

The notebook can run locally, in Google Colab, or in an Amazon SageMaker notebook (in the SageMaker case, make sure to also import your environment variables).

In [1]:
# install dependencies for HuggingFace and other models
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai anthropic google

In [15]:
# Import libraries
# commented out the ones not used in this notebook
# If running in Google Colab, uncomment the drive import to access files and the userdata import to access user data

from dotenv import load_dotenv
import os
#import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import anthropic
from google import genai
from google.genai import types
# from google.colab import drive
from huggingface_hub import login
#from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, TextIteratorStreamer
import torch
import gradio as gr
#import threading    

In [34]:
# Constants
# Model identifiers. The same strings are offered as choices in the UI model
# dropdown and compared against `model_name` inside invoke_model, so each
# value must match exactly in both places.
LLAMA_MODEL = "meta-llama/Llama-3.1-8B-Instruct"  # loaded locally via transformers from_pretrained
KRIKRI_MODEL = "ilsp/Llama-Krikri-8B-Instruct"  # loaded locally via transformers from_pretrained
OPENAI_MODEL = "gpt-4o-mini"  # served through the OpenAI client
ANTHROPIC_MODEL = "claude-3-haiku-20240307"  # served through the Anthropic client
GOOGLE_MODEL = "gemini-2.5-flash"  # served through the Google genai client


In [35]:
# Load environment variables and set up API connections.
# Each provider's client is created only when its key is present, so the
# globals `openai`, `claude` and `gemini` exist only for configured providers.
load_dotenv(override=True)  # values from .env take precedence over the process environment
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
weather_api_key = os.getenv('WEATHER_API_KEY')
hf_api_key = os.getenv('HF_API_KEY')

#ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key="ollama")

# Only a short prefix of each key is printed, as a sanity check.
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
    openai = OpenAI(api_key=openai_api_key)
else:
    print("OpenAI API Key not set")

if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
    # Pass the key explicitly, like the other clients, instead of relying on
    # the SDK picking it up from the environment.
    claude = anthropic.Anthropic(api_key=anthropic_api_key)
else:
    print("Anthropic API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
    gemini = genai.Client(api_key=google_api_key)
else:
    print("Google API Key not set")

if weather_api_key:
    print(f"Weather API Key exists and begins {weather_api_key[:7]}")
else:
    print("Weather API Key not set")

if hf_api_key:
    # Print the HuggingFace key's own prefix (previously the weather key was
    # printed here, which also crashed when no weather key was set).
    print(f"HuggingFace API Key exists and begins {hf_api_key[:7]}")
    login(hf_api_key, add_to_git_credential=True)
else:
    print("HuggingFace API Key not set")

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyB5
Weather API Key exists and begins 51c3669
HuggingFace API Key exists and begins 51c3669


In [38]:
# Model invocation function
# This function will handle the invocation of different models based on the selected model name.

def invoke_model(model_name, prompt, max_tokens=1000, temperature=0.4):
    """Send a chat prompt to the selected model and return the generated text.

    Args:
        model_name: One of OPENAI_MODEL, ANTHROPIC_MODEL, GOOGLE_MODEL,
            LLAMA_MODEL or KRIKRI_MODEL.
        prompt: Chat message list where ``prompt[0]`` is the system message
            and ``prompt[1]`` is the user message; each is a dict with
            ``role`` and ``content`` keys.
        max_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature. For the local HuggingFace models a
            value of 0 (or less) selects greedy decoding.

    Returns:
        The generated text as a string.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    # Callers such as UI sliders may hand over a float; token limits are counts.
    max_tokens = int(max_tokens)

    if model_name == OPENAI_MODEL:
        response = openai.chat.completions.create(
            model=OPENAI_MODEL,
            messages=prompt,
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response.choices[0].message.content

    elif model_name == ANTHROPIC_MODEL:
        # Anthropic takes the system prompt as a separate argument rather than
        # as a message in the list.
        response = claude.messages.create(
            model=ANTHROPIC_MODEL,
            system=prompt[0]['content'],
            messages=[prompt[1]],
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response.content[0].text

    elif model_name == GOOGLE_MODEL:
        # Use the constant (not a repeated literal) so changing GOOGLE_MODEL
        # changes the model that is actually called.
        response = gemini.models.generate_content(
            model=GOOGLE_MODEL,
            contents=prompt[1]['content'],
            config=types.GenerateContentConfig(
                temperature=temperature,
                max_output_tokens=max_tokens,
                system_instruction=prompt[0]['content'],
            )
        )
        return response.text

    elif model_name in (LLAMA_MODEL, KRIKRI_MODEL):
        if torch.cuda.is_available():
            print("CUDA is available, setting up quantization configuration for HuggingFace models.")
            # Set up quantization configuration for 4-bit loading
            quant_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_quant_type="nf4"
            )
        else:
            print("CUDA is not available, using default configuration for HuggingFace models.")
            quant_config = None

        # NOTE: the tokenizer and model are loaded on every call.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quant_config)

        # add_generation_prompt appends the assistant header so the model
        # answers instead of continuing the user turn. Move the inputs to
        # wherever the model was placed (works on CPU as well as CUDA).
        inputs = tokenizer.apply_chat_template(
            prompt,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(model.device)

        # Sampling requires a strictly positive temperature; fall back to
        # greedy decoding when the caller asks for temperature 0.
        generation_kwargs = {"max_new_tokens": max_tokens}
        if temperature is not None and temperature > 0:
            generation_kwargs["do_sample"] = True
            generation_kwargs["temperature"] = temperature
        else:
            generation_kwargs["do_sample"] = False

        outputs = model.generate(inputs, **generation_kwargs)
        # generate() returns prompt + completion; keep only the new tokens so
        # the caller receives just the generated data.
        completion_tokens = outputs[0][inputs.shape[-1]:]
        return tokenizer.decode(completion_tokens, skip_special_tokens=True)

    else:
        raise ValueError("Unsupported model name")

# Save text to a selected location
# This function saves the generated text to the given file name, or to
# "output.txt" when no file name is given.
def save_text_to_selected_location(text_content, filename=None):
    """Write ``text_content`` to a UTF-8 text file and report the outcome.

    Args:
        text_content: The text to save. ``None``, empty or whitespace-only
            content is not written.
        filename: Target file name or path. Defaults to ``"output.txt"``.

    Returns:
        A human-readable status string: a success message containing the
        path written, ``"No content to save"``, or an error description.
    """
    if not text_content or not text_content.strip():
        return "No content to save"

    target_name = filename if filename else "output.txt"

    # No `uploaded_file` is defined in this notebook, so referencing it
    # directly raised NameError. Look it up defensively: if a module-level
    # `uploaded_file` exists, keep saving next to it; otherwise save relative
    # to the current directory.
    uploaded = globals().get("uploaded_file")
    if uploaded is None:
        save_path = target_name
    else:
        # Use the directory of the uploaded file
        save_path = os.path.join(os.path.dirname(uploaded.name), target_name)

    try:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(text_content)
        return f"Successfully saved to: {save_path}"
    except Exception as e:
        # This is a UI handler: report the failure as status text instead of raising.
        return f"Error saving file: {str(e)}"
# Build the chat message list (system + user) that is sent to the model

def create_prompt_messages(prompt):
    """Wrap the user's prompt in a two-message chat list.

    Args:
        prompt: The user's description of the test data to generate.

    Returns:
        A list of two dicts: the fixed system message followed by a user
        message whose content is ``prompt``.
    """
    # The continuation lines keep their four leading spaces so the text is
    # identical to the original triple-quoted literal.
    system_message = {
        "role": "system",
        "content": (
            "You are a test data generator. Your task is to generate test data based on the provided prompt and format instructions. \n"
            "    You will respond with the generated test data only, without any additional explanations or comments.\n"
            "    Follow the format specified in the prompt and ensure that the generated data is relevant and accurate."
        ),
    }
    user_message = {"role": "user", "content": prompt}
    return [system_message, user_message]
    
# Event handler used by the "Generate Response" button
def generate_response(prompt, max_tokens, temperature, model_name):
    """Build the chat messages for ``prompt`` and run the chosen model.

    Args:
        prompt: The user's free-text prompt.
        max_tokens: Token limit forwarded to invoke_model.
        temperature: Sampling temperature forwarded to invoke_model.
        model_name: Identifier of the model to call.

    Returns:
        The model's response text, or ``"Error: <message>"`` if the model
        call raised an exception.
    """
    chat_messages = create_prompt_messages(prompt)
    try:
        return invoke_model(
            model_name=model_name,
            prompt=chat_messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
    except Exception as err:
        # Report the failure as text so it appears in the response box.
        return "Error: " + str(err)



In [37]:
# Set up UI components
# Builds the Gradio Blocks interface: a prompt box and a read-only response box,
# generation controls (max tokens, temperature, model), a status line, and two
# buttons wired to generate_response and save_text_to_selected_location.

with gr.Blocks() as ui:
    gr.Markdown("# LLM Test Data Generator")
    # Row 1: user prompt and the model's (read-only) response
    with gr.Row():
        prompt_input= gr.Textbox(label="Enter your prompt", placeholder="Type your prompt here...", lines=4)
        response_output = gr.Textbox(label="Model Response", lines=10, interactive=False)
    
    # Row 2: generation parameters and model choice
    with gr.Row():
        max_tokens_input = gr.Slider(minimum=1, maximum=4096, value=1000, step=1, label="Max Tokens")
        # The UI default (0.7) is what reaches invoke_model, overriding that function's own default of 0.4
        temperature_input = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="Temperature")
        # Choices are the same constants invoke_model compares against
        model_selector = gr.Dropdown(
            label="Select Model", choices=[
                OPENAI_MODEL,
                ANTHROPIC_MODEL,
                GOOGLE_MODEL,
                LLAMA_MODEL,
                KRIKRI_MODEL
            ], value=OPENAI_MODEL)  
    # Row 3: status text, filled with the message returned by the save handler
    with gr.Row():
        status_output = gr.Textbox(label="Status", interactive=False, visible=True)
        #status_output.value = "Ready to generate responses."
        # file_output = gr.File(label="Download Response", visible=True)
        #intermediate_data = gr.State() # not displayed, used to store intermediate data
    # Row 4: action buttons
    with gr.Row():
        generate_button = gr.Button("Generate Response")
        download_button = gr.Button("Download your file", visible=True)

    # Generate: the input order must match generate_response's parameter order
    # (prompt, max_tokens, temperature, model_name)
    generate_button.click(
        fn=generate_response,
        inputs=[prompt_input, max_tokens_input, temperature_input, model_selector],
        outputs=response_output
    )
    # "Download": passes only the response text, so save_text_to_selected_location
    # runs with filename=None; its return string is shown in the status box
    download_button.click(
        fn=save_text_to_selected_location,
        inputs=response_output,
        outputs= status_output
    )

# Start the Gradio server for this interface
ui.launch(inbrowser=True)
    

* Running on local URL:  http://127.0.0.1:7874
* To create a public link, set `share=True` in `launch()`.




You are a test data generator. Your task is to generate test data based on the provided prompt and format instructions. 
    You will respond with the generated test data only, without any additional explanations or comments.
    Follow the format specified in the prompt and ensure that the generated data is relevant and accurate.
{'role': 'user', 'content': 'Generate a test dataset of 15 records for books, with the following fields: ISBN, Book Title, Author, Publication Year, Genre. The ISBN should follow the standard international book number format. The output should be in JSON'}
