# Synthetic Data Generator

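A tool for generating sample synthetic data as JSON with a local Llama model, accessed through an OpenAI-compatible endpoint (for example, Ollama running on `localhost:11434`).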

In [40]:
# imports 

from openai import OpenAI
import json


In [41]:
# Client for the local server's OpenAI-compatible API. The key is only a
# placeholder value, not a real credential.
openai = OpenAI(
  api_key='ollama',
  base_url='http://localhost:11434/v1',
)

In [42]:
# Model tag passed as `model=` on every chat completion request.
MODEL: str = "llama3.2"

In [43]:
def generate_synthetic_data(user_prompt = (
      "Generate 5 realistic customer reviews for a product. "
      "The review should be 1-2 sentences long and contain a mix of positive and negative comments. "
      "The review should be formatted as a JSON object with the following fields: "
      "review: a string containing the review text"
    )):
  """Ask the chat model for synthetic data and return the parsed JSON reply.

  Args:
    user_prompt: Description of the data to generate. Defaults to a request
      for five short customer reviews.

  Returns:
    Whatever ``json.loads`` produces from the model's reply (typically a
    dict, since the request asks for a JSON object).

  Raises:
    ValueError: If the reply carries no text content.
    json.JSONDecodeError: If the reply text is not valid JSON.
  """
  # JSON mode only constrains the output format; the conversation itself must
  # still ask for JSON. Stating it here keeps prompts that never mention JSON
  # (such as a plain "generate a dataset of employees") working reliably.
  system_message = (
    "You are a helpful assistant that generates synthetic data. "
    "Always respond with a single valid JSON object."
  )
  response = openai.chat.completions.create(
    model=MODEL,
    messages=[
      {"role": "system", "content": system_message},
      {"role": "user", "content": user_prompt}
    ],
    response_format={"type": "json_object"}
  )
  content = response.choices[0].message.content
  if content is None:
    # json.loads(None) would fail with an opaque TypeError; report the real cause.
    raise ValueError("Model returned no text content to parse as JSON")
  return json.loads(content)
    

In [44]:
# Generate data using the function's default customer-review prompt.
result = generate_synthetic_data()

# Serialize the parsed reply with 4-space indentation so it prints readably.
formatted_json_result = json.dumps(result, indent=4)

In [45]:
# print() renders the newlines in the formatted string; a bare expression would show its repr.
print(formatted_json_result)

{
    "review1": "I'm really impressed with how easy the setup was for this product! It only took me about 10 minutes to get everything up and running.",
    "review2": "The quality of the material is top-notch, but I've noticed a few scratches after using it for a week.",
    "review3": "I was skeptical at first, but this product has truly exceeded my expectations - it's even more functional than I thought it would be!",
    "review4": "Unfortunately, the battery life could be longer. It's fine for occasional use, but it doesn't hold up as well during extended periods.",
    "review5": "I love how compact and lightweight this product is - perfect for my morning commute! The only reason I'm giving 4 stars instead of 5 is because the charging port can get a bit finicky."
}


In [46]:
# Custom prompt: a small employee table instead of the default reviews.
# The leading and trailing newlines are part of the prompt text.
user_prompt = (
  "\n"
  "Generate a dataset of 5 employees with name, department, salary, and years of experience."
  "\n"
)

In [47]:
# Generate data for the custom prompt held in `user_prompt`.
result = generate_synthetic_data(user_prompt)

# Serialize the parsed reply with 4-space indentation so it prints readably.
formatted_json_result = json.dumps(result, indent=4)

In [48]:
# print() renders the newlines in the formatted string; a bare expression would show its repr.
print(formatted_json_result)

{
    "employees": [
        {
            "name": "John Doe",
            "department": "Marketing",
            "salary": 60000,
            "years_of_experience": 8
        },
        {
            "name": "Jane Smith",
            "department": "IT",
            "salary": 70000,
            "years_of_experience": 5
        },
        {
            "name": "Bob Johnson",
            "department": "Sales",
            "salary": 55000,
            "years_of_experience": 10
        },
        {
            "name": "Emily Chen",
            "department": "Marketing",
            "salary": 65000,
            "years_of_experience": 6
        },
        {
            "name": "Michael Davis",
            "department": "IT",
            "salary": 75000,
            "years_of_experience": 7
        }
    ]
}
