# Synthetic data generator

https://colab.research.google.com/drive/13wR4Blz3Ot_x0GOpflmvvFffm5XU3Kct?usp=sharing

## **Week 3 task.**
Create your own tool that generates synthetic (test) data. The user describes the kind of dataset they need (for example products, job postings, etc.), and the tool dreams up a variety of data samples.



In [None]:
# imports
import gradio as gr, requests, json, time, os, torch
from transformers import pipeline, set_seed
from functools import partial
from openai import OpenAI, APIError, AuthenticationError
from google.colab import drive, userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Example prompt to paste into the UI:
#   "a list of student profiles with full name, email, course studied, and GPA for each of 6 semesters, and a CGPA for the 6 semesters"

# Authenticate with the Hugging Face Hub; the token is read from Colab Secrets.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

# The OpenAI API key is read from Colab Secrets as well.
openai_api_key = userdata.get('OPENAI_API_KEY')

# Build the OpenAI client. `openai` stays None when construction fails, which
# generate_with_gpt checks before making any request.
openai = None
try:
    openai = OpenAI(api_key=openai_api_key)
except Exception as e:
    print(f"OpenAI client not initialized: {e}")

# Name of the hosted OpenAI chat model used by generate_with_gpt.
GPT_MODEL = "gpt-3.5-turbo"

# Local Llama model
# The checkpoint below is pulled from the Hugging Face Hub and run locally.
# Note: this needs a capable GPU plus extra libraries (e.g., bitsandbytes, accelerate).
LLAMA_MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"

try:
    # 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Tokenizer first, then the quantized model, placed automatically on the available devices.
    tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL)
    model = AutoModelForCausalLM.from_pretrained(
        LLAMA_MODEL,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
    )

    # Inference only: switch the model to evaluation mode.
    model.eval()

except Exception as e:
    # Any failure leaves both names as None; generate_with_llama checks for that.
    model, tokenizer = None, None
    print(f"Failed to load local Llama model: {e}")


def generate_with_llama(user_prompt: str, num_samples: int = 5):
    """
    Generates synthetic data using a local Llama model.

    Args:
        user_prompt: Free-text description of the data to generate.
        num_samples: Number of objects the model is asked to produce.

    Returns:
        A JSON string (indent=2). On success it is the parsed model output
        re-serialized; otherwise it is an object with a single "error" key
        (model not loaded, generation failure, or output that is not valid JSON).
    """
    # Explicit None checks: truthiness of a tokenizer goes through len().
    if model is None or tokenizer is None:
        return json.dumps({"error": "Llama model not loaded. Check model paths and hardware compatibility."}, indent=2)

    # Llama 3.1 uses a specific chat template for conversation formatting.
    messages = [
        {"role": "system", "content": f"You are a data generation assistant. Generate a JSON array of exactly {num_samples} objects based on the user's request. The output must be valid JSON only, without any other text or formatting."},
        {"role": "user", "content": user_prompt}
    ]

    try:
        # add_generation_prompt appends the assistant header so the model starts
        # a reply instead of continuing the user turn. return_dict also yields the
        # attention mask. Inputs go to wherever the model lives, not a fixed "cuda".
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        ).to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        outputs = model.generate(**inputs, max_new_tokens=2000, do_sample=True, top_p=0.9, temperature=0.7)

        # Decode only the newly generated tokens. Decoding the whole sequence with
        # skip_special_tokens=True would leave the system/user prompt text in the
        # string with no special-token marker left to split on.
        response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        # Models often wrap JSON in a markdown fence (``` or ```json); remove it.
        if response_text.startswith("```"):
            response_text = response_text.split("\n", 1)[-1]
            response_text = response_text.rsplit("```", 1)[0].strip()

        # Parse to validate the output, then return it normalized.
        parsed_json = json.loads(response_text)
        return json.dumps(parsed_json, indent=2)

    except Exception as e:
        # UI boundary: report any failure as JSON rather than raising into Gradio.
        return json.dumps({"error": f"An error occurred during local model generation: {e}"}, indent=2)



def generate_with_gpt(user_prompt: str, num_samples: int = 5):
    """
    Generates synthetic data using OpenAI's GPT.

    Args:
        user_prompt: Free-text description of the data to generate.
        num_samples: Number of objects the model is asked to produce.

    Returns:
        A JSON string (indent=2). On success it is the array of generated
        objects (or the whole parsed reply if it has no "data" array);
        otherwise it is an object with a single "error" key.
    """
    if openai is None:
        return json.dumps({"error": "OpenAI client not initialized. Please check your API key."}, indent=2)

    # JSON mode (response_format json_object) makes the model emit a JSON *object*,
    # so ask for the array under a known key instead of a bare top-level array.
    system_prompt = (
        f"You are a data generation assistant. Generate exactly {num_samples} objects based on the user's request. "
        'Respond with a single JSON object of the form {"data": [...]} whose "data" array contains the generated objects. '
        "The output must be valid JSON only, without any other text or formatting."
    )

    try:
        response = openai.chat.completions.create(
            model=GPT_MODEL,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            response_format={"type": "json_object"}
        )

        json_text = response.choices[0].message.content
        if not json_text:
            # The message content may be missing; report it explicitly.
            return json.dumps({"error": "OpenAI returned an empty response."}, indent=2)

        # Validate the reply and unwrap the array, matching what the Llama path returns.
        parsed_json = json.loads(json_text)
        if isinstance(parsed_json, dict) and isinstance(parsed_json.get("data"), list):
            parsed_json = parsed_json["data"]
        return json.dumps(parsed_json, indent=2)
    except APIError as e:
        # body can be None (e.g. on connection errors); fall back to the message text.
        detail = e.body if e.body is not None else e.message
        return json.dumps({"error": f"Error from OpenAI API: {detail}"}, indent=2)
    except Exception as e:
        # UI boundary: report any other failure (including invalid JSON) as JSON.
        return json.dumps({"error": f"An unexpected error occurred: {e}"}, indent=2)


def generate_data(user_prompt, model_choice):
    """
    Wrapper function that calls the appropriate generation function based on model choice.

    Args:
        user_prompt: Description of the data to generate; None, empty, or
            whitespace-only values are rejected.
        model_choice: Label selected in the UI, either
            "Hugging Face (<LLAMA_MODEL>)" or "OpenAI (<GPT_MODEL>)".

    Returns:
        A JSON string: the selected generator's result, or an object with a
        single "error" key for a missing prompt or an unknown model choice.
    """
    # Reject whitespace-only prompts too, so no model call is made for blank input.
    if not user_prompt or not user_prompt.strip():
        return json.dumps({"error": "Please provide a description for the data."}, indent=2)

    prompt = user_prompt.strip()
    llama_label = f"Hugging Face ({LLAMA_MODEL})"
    gpt_label = f"OpenAI ({GPT_MODEL})"

    if model_choice == llama_label:
        return generate_with_llama(prompt)
    if model_choice == gpt_label:
        return generate_with_gpt(prompt)
    return json.dumps({"error": "Invalid model choice."}, indent=2)

# Gradio UI
# Page layout: heading and instructions, a row with the prompt box beside the
# model picker and button, then a row showing the generated JSON.
with gr.Blocks(title="Synthetic Data Generator", theme=gr.themes.Soft()) as ui:
    gr.Markdown("# Synthetic Data Generator")
    gr.Markdown("Describe the type of data you need, select a model, and click 'Generate'.")

    with gr.Row():
        # Wide column: free-text description of the desired dataset.
        with gr.Column(scale=3):
            data_prompt = gr.Textbox(
                label="Data Prompt",
                placeholder="e.g., a list of customer profiles with name, email, and a favorite product",
                lines=5,
            )

        # Narrow column: backend selection and the trigger button.
        with gr.Column(scale=1):
            model_choice = gr.Radio(
                choices=[f"Hugging Face ({LLAMA_MODEL})", f"OpenAI ({GPT_MODEL})"],
                value=f"Hugging Face ({LLAMA_MODEL})",
                label="Choose a Model",
            )

            generate_btn = gr.Button(value="Generate Data")

    with gr.Row():
        output_json = gr.JSON(label="Generated Data")

    # Clicking the button sends (prompt, model label) to generate_data and shows the result.
    generate_btn.click(generate_data, [data_prompt, model_choice], output_json)

ui.launch(debug=True, inbrowser=True)
