### Importing Libraries

In [1118]:
import ast
import json
import os
import tempfile

import anthropic
import google.generativeai
import gradio as gr
# The httpx library is used to disable SSL certificate verification in case of use from a corporate computer
import httpx
import pandas as pd
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [706]:
# Sign in to HuggingFace Hub

# Load environment variables in a file called .env
# Print the key prefixes to help with any debugging

load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
    
if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
else:
    print("Anthropic API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
else:
    print("Google API Key not set")

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyCC


In [707]:
# Connect to OpenAI, Anthropic and Google; comment out the Claude or Google lines if you're not using them

openai = OpenAI(http_client=httpx.Client(verify=False))

claude = anthropic.Anthropic(http_client=httpx.Client(verify=False))

#google.generativeai.configure(transport='rest')

In [925]:
# Defining the used models in the App.
GPT_3_5 = 'gpt-3.5-turbo'
GPT_4o_MINI = 'gpt-4o-mini'
GPT_4o = 'gpt-4o'
claude_3_5_sonnet_latest = "claude-3-5-sonnet-latest"
claude_3_HAIKU = "claude-3-haiku-20240307"

### Testing the GPT and Claude models before implementing in the app

In [1110]:
data_subject = "Job Posting for an AI company"
n=3
user_prompt = f"Generate {n} instances of synthesized data of {data_subject}"
system_prompt = f"You are an AI assistant dedicated to generating high-quality synthetic testing data. Provide the generated data as a list of dictionaries that can be directly converted into a Pandas DataFrame. Ensure all dictionary values are strings. Do not use nested lists or nested dictionaries. Respond only with the list of dictionaries—no explanation or extra text."

In [1111]:
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
  ]

In [1112]:
print(system_prompt)
print(user_prompt)

You are an AI assistant dedicated to generating high-quality synthetic testing data. Provide the generated data as a list of dictionaries that can be directly converted into a Pandas DataFrame. Ensure all dictionary values are strings. Do not use nested lists or nested dictionaries. Respond only with the list of dictionaries—no explanation or extra text.
Generate 3 instances of synthesized data of Job Posting for an AI company


In [1113]:
completion = openai.chat.completions.create(
    model=GPT_4o_MINI,
    messages=messages,
    temperature=0.7
)

In [1116]:
print(completion.choices[0].message.content)

[
    {
        "job_title": "Machine Learning Engineer",
        "company_name": "AI Innovations Inc.",
        "location": "San Francisco, CA",
        "employment_type": "Full-time",
        "experience_level": "Mid-level",
        "job_description": "Develop and implement machine learning models to enhance product capabilities.",
        "required_skills": "Python, TensorFlow, Scikit-learn, Data Analysis",
        "salary_range": "$120,000 - $150,000",
        "posting_date": "2023-10-10",
        "application_link": "http://ai-innovations.com/careers/machine-learning-engineer"
    },
    {
        "job_title": "Data Scientist",
        "company_name": "Future Tech Solutions",
        "location": "New York, NY",
        "employment_type": "Part-time",
        "experience_level": "Entry-level",
        "job_description": "Analyze large datasets to derive actionable insights and enhance decision-making.",
        "required_skills": "R, SQL, Data Visualization, Statistics",
        "s

In [1119]:
pd.DataFrame(ast.literal_eval(completion.choices[0].message.content))

Unnamed: 0,job_title,company_name,location,employment_type,experience_level,job_description,required_skills,salary_range,posting_date,application_link
0,Machine Learning Engineer,AI Innovations Inc.,"San Francisco, CA",Full-time,Mid-level,Develop and implement machine learning models ...,"Python, TensorFlow, Scikit-learn, Data Analysis","$120,000 - $150,000",2023-10-10,http://ai-innovations.com/careers/machine-lear...
1,Data Scientist,Future Tech Solutions,"New York, NY",Part-time,Entry-level,Analyze large datasets to derive actionable in...,"R, SQL, Data Visualization, Statistics","$80,000 - $100,000",2023-10-05,http://futuretechsolutions.com/careers/data-sc...
2,AI Research Scientist,Cognitive Technologies LLC,Remote,Contract,Senior-level,Conduct research on advanced AI algorithms and...,"Deep Learning, NLP, Python, Research Publications","$130,000 - $160,000",2023-10-08,http://cognitivetechnologies.com/careers/ai-re...


In [1120]:
completions = claude.messages.create(
    model= "claude-3-5-sonnet-latest",
    max_tokens = 4000,
    system = system_prompt,
    messages = [{"role": "user", "content": user_prompt}])

In [1122]:
completions.content[0].text

'[\n    {\n        "job_id": "AI2023001",\n        "title": "Senior Machine Learning Engineer",\n        "department": "Research & Development",\n        "location": "San Francisco, CA",\n        "employment_type": "Full-time",\n        "experience_level": "5+ years",\n        "salary_range": "150000-180000",\n        "posting_date": "2023-10-15",\n        "skills_required": "Python, TensorFlow, PyTorch, Deep Learning",\n        "education": "PhD in Computer Science or related field",\n        "status": "Open"\n    },\n    {\n        "job_id": "AI2023002",\n        "title": "AI Product Manager",\n        "department": "Product Management",\n        "location": "New York, NY",\n        "employment_type": "Full-time",\n        "experience_level": "3-5 years",\n        "salary_range": "120000-150000",\n        "posting_date": "2023-10-16",\n        "skills_required": "Product Strategy, AI/ML Knowledge, Agile",\n        "education": "Masters in Computer Science or MBA",\n        "status": 

In [1123]:
pd.DataFrame(ast.literal_eval(completions.content[0].text))

Unnamed: 0,job_id,title,department,location,employment_type,experience_level,salary_range,posting_date,skills_required,education,status
0,AI2023001,Senior Machine Learning Engineer,Research & Development,"San Francisco, CA",Full-time,5+ years,150000-180000,2023-10-15,"Python, TensorFlow, PyTorch, Deep Learning",PhD in Computer Science or related field,Open
1,AI2023002,AI Product Manager,Product Management,"New York, NY",Full-time,3-5 years,120000-150000,2023-10-16,"Product Strategy, AI/ML Knowledge, Agile",Masters in Computer Science or MBA,Open
2,AI2023003,Computer Vision Engineer,AI Solutions,"Boston, MA",Contract,2-4 years,130000-160000,2023-10-17,"OpenCV, Python, Deep Learning, CNN",Masters in Computer Vision or related field,Open


### The main Function to use in the App

In [1132]:
# Step 1: The Main Function to Generate the synthesized data
def gen_syn_data(data_subject, model, n):
    global df
    user_prompt = f"Generate {n} instances of synthesized data of {data_subject}"
    system_prompt = f"You are an AI assistant dedicated to generating high-quality synthetic testing data. Provide the generated data as a list of dictionaries that can be directly converted into a Pandas DataFrame. Ensure all dictionary values are strings. Do not use nested lists or nested dictionaries. Respond only with the list of dictionaries—no explanation or extra text."
    
    messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
    ]

    if "gpt" in model:
        completions = openai.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.7)
        
        res = ast.literal_eval(completions.choices[0].message.content)
        
    elif "claude" in model:
        completions = claude.messages.create(
            model = model,
            max_tokens = 4000 ,
            temperature = 0.7 ,
            system = system_prompt ,
            messages = [
                {"role": "user", "content": user_prompt},
            ])
        res = ast.literal_eval(completions.content[0].text)


    df = pd.DataFrame(res)
    
    return df

# Step 2: Function to save the DataFrame and return CSV path
def download_csv():
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv", mode='w') as f:
        df.to_csv(f.name, index=False)
        
        return f.name
        
# Step 3: Function to Change the visibility of the file component when the download button is clicked
def visible_component():
    
    return gr.update(visible=True)

### The Gradio Page

In [1135]:
with gr.Blocks() as demo:
    
    gr.Markdown("# Welcome the the Synthetic Data Generator")
    with gr.Row():
        input_subject = gr.Textbox( label = "Input Subject Here" , placeholder="AI Job Postings.." , scale = 3 , lines = 2 )
        input_model = gr.Dropdown( [GPT_3_5, GPT_4o_MINI, GPT_4o, claude_3_5_sonnet_latest, claude_3_HAIKU] , label="Select an LLM" ,value= GPT_3_5, scale = 1 )
        input_n = gr.Number( label = "Enter Number of Data records", minimum=2 , value=2 , scale = 1 )

    # Generate Data Button
    generate_button = gr.Button("Generate Data")
    
    # Output components
    dataframe_output = gr.Dataframe()
 
    # Download Data as .csv Button    
    download_button = gr.Button("Download CSV")
    file_output = gr.File(label="Download CSV", interactive=False, visible=False)

    # Hook up buttons to actions
    generate_button.click(
        fn=gen_syn_data,
        inputs= [input_subject , input_model , input_n],
        outputs=dataframe_output
    )

    download_button.click(
        visible_component,
        outputs = file_output     
        ).then(
        fn=download_csv,
        outputs=file_output
        )

demo.launch(server_port=28058, inbrowser=True, share=True)

* Running on local URL:  http://127.0.0.1:28058

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


