## Week 3 Exercise - Synthetic Customer Database Generator

This project demonstrates **synthetic data generation**. It combines the efficiency of the `Faker` library for structured, rule-based data (names, emails, and cities) with the reasoning power of **OpenAI GPT-OSS** and other models to generate context-aware "smart" data.

In [None]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import gradio as gr
from faker import Faker
load_dotenv(override=True)

In [None]:
# Base URLs of the OpenAI-compatible endpoints this notebook knows about.
OLLAMA_BASE_URL = "http://localhost:11434/v1"
OPEN_ROUTER_BASE_URL = "https://openrouter.ai/api/v1"

# API key is read from the environment; it is None when the variable is unset.
OPEN_ROUTER_API_KEY = os.environ.get("OPEN_ROUTER_API_KEY")

# Chat-completions client pointed at the OpenRouter endpoint.
client = OpenAI(api_key=OPEN_ROUTER_API_KEY, base_url=OPEN_ROUTER_BASE_URL)

# Faker instance configured with the 'en_NG' locale.
fake = Faker("en_NG")


# Model identifiers offered in the UI dropdown; the first entry is the default.
MODELS = [
    "openai/gpt-oss-120b",
    "microsoft/phi-4",
    "qwen/qwen3.5-flash-02-23",
    "meta-llama/llama-4-maverick",
    "anthropic/claude-sonnet-4.6",
    "google/gemini-3-flash-preview",
]

# Industry choices offered in the UI; the selected value is inserted into the
# prompt as the context for the generated job titles and bios.
INDUSTRIES = [
    "Technology",
    "Finance",
    "Healthcare",
    "Education",
    "Entertainment",
    "Retail",
    "Manufacturing",
    "Agriculture",
    "Transportation",
    "Energy",
]

In [None]:
def generate_customer_data(industry, num_records, model=MODELS[0]):
    """Generate synthetic customer records enriched with LLM-written details.

    Name, email and city come from Faker; a single chat-completion request
    then asks the model for a job title and a one-sentence bio per person.

    Args:
        industry: Industry context inserted into the prompt.
        num_records: Number of customers to generate. Zero or a negative
            value yields an empty list without calling the model.
        model: Model identifier passed to the chat-completions endpoint.

    Returns:
        A list of dicts with the keys "Name", "Email", "City", "Occupation"
        and "Bio". "Occupation" and "Bio" are "N/A" for any record the model
        did not return usable details for.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
    """
    base_data = [
        {
            "Name": fake.name(),
            "Email": fake.email(),
            "City": fake.city(),
        }
        for _ in range(num_records)
    ]

    # Nothing to enrich, so skip the (billable) model round-trip.
    if not base_data:
        return []

    prompt = f"""
    Generate a realistic job title and a 1-sentence bio for each of the following people 
    living in Nigeria. The industry context is {industry}.
    
    People:
    {json.dumps(base_data)}
    
    Return the results as a JSON object with a key 'results' containing an array of objects. 
    Each object must have the keys: 'Job' and 'Bio'. 
    The order must match the input list.
    """

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a precise synthetic data generator. Output ONLY JSON."},
            {"role": "user", "content": prompt}
        ],
        response_format={"type": "json_object"}
    )

    payload = json.loads(response.choices[0].message.content)

    # The prompt asks for {"results": [...]}, but tolerate a bare top-level
    # array as well; any other shape is treated as "no details returned".
    if isinstance(payload, dict):
        results = payload.get("results", [])
    elif isinstance(payload, list):
        results = payload
    else:
        results = []

    return _merge_generated_details(base_data, results)


def _merge_generated_details(base_data, results):
    """Pair each base record with the model details at the same index."""
    if not isinstance(results, list):
        results = []

    customers = []
    for index, person in enumerate(base_data):
        details = results[index] if index < len(results) else None
        # Missing or malformed entries fall back to the "N/A" placeholders.
        if not isinstance(details, dict):
            details = {}
        customers.append({
            "Name": person["Name"],
            "Email": person["Email"],
            "City": person["City"],
            # Look up the same capitalised keys the prompt requests.
            "Occupation": details.get("Job", "N/A"),
            "Bio": details.get("Bio", "N/A"),
        })
    return customers

In [None]:
# Quick manual check: request five Technology-industry records.
generate_customer_data(industry="Technology", num_records=5)

In [None]:
def generate_data(industry, count, model):
    """Build the customer table and save it as a CSV file.

    Args:
        industry: Industry context forwarded to generate_customer_data.
        count: Requested number of records; converted with int() before use.
        model: Model identifier forwarded to generate_customer_data.

    Returns:
        A tuple of (DataFrame of the generated records, path of the CSV file
        written to the current working directory).
    """
    csv_path = "synthetic_customers.csv"
    records = generate_customer_data(industry, int(count), model=model)
    frame = pd.DataFrame(records)
    frame.to_csv(csv_path, index=False)
    return frame, csv_path

# Gradio front end: pick a model, an industry and a record count, then
# generate the dataset and expose it both as a table and as a CSV file.
with gr.Blocks() as ui:
    gr.Markdown("# Fake Customer Database Generator")
    gr.Markdown("Generate realistic Nigerian customer datasets for testing your apps.")
    
    # Model selector; defaults to the first entry of MODELS.
    model_dropdown = gr.Dropdown(choices=MODELS, value=MODELS[0], label="Model")
    
    # Industry and record count share one row.
    with gr.Row():
        industry_input = gr.Dropdown(
            choices=INDUSTRIES, 
            label="Target Industry", 
            value=INDUSTRIES[0]
        )
        # Whole-number slider limited to 1-10 records, starting at 3.
        count_input = gr.Slider(minimum=1, maximum=10, step=1, label="Number of Records", value=3)
    
    gen_btn = gr.Button("Generate Dataset")
    
    # Output widgets that receive the two values returned by generate_data.
    out_table = gr.Dataframe(label="Generated Data")
    out_file = gr.File(label="Download CSV")
    
    # Clicking the button calls generate_data(industry, count, model) and
    # routes its (DataFrame, file path) result to the table and file outputs.
    gen_btn.click(
        fn=generate_data, 
        inputs=[industry_input, count_input, model_dropdown], 
        outputs=[out_table, out_file]
    )

# Start the Gradio app.
ui.launch()