# Synthetic Dataset Generator
A Gradio web app where Nigerian businesses and developers describe a dataset they need, pick an AI model, and get a ready-to-use synthetic CSV file back in seconds.

In [None]:
!pip install anthropic openai google-generativeai gradio pandas

In [None]:

import os
import gradio as gr
import anthropic
import openai
import google.generativeai as genai
import pandas as pd
import json
import traceback

# Environment variables the app needs, mapped to their current values.
# Each label is the exact variable name that is read, so the error below
# tells the user which variable to set (the Gemini key was previously
# reported under the name "GOOGLE_API_KEY", which this code never reads).
required_keys = {
    name: os.getenv(name)
    for name in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GEMINI_API_KEY")
}

# An unset variable and an empty string both count as missing.
missing = [k for k, v in required_keys.items() if not v]

if missing:
    raise EnvironmentError(f"Missing API keys: {missing}")

print("All API keys found.")
# Gemini is the only SDK configured explicitly here; reuse the value that was
# just validated instead of reading the environment a second time.
genai.configure(api_key=required_keys["GEMINI_API_KEY"])

In [None]:
# Domain schemas: for every supported sector, a short description of the
# data and the ordered list of column names each generated row must carry.
schemas = {
    "Fintech": {
        "description": "Nigerian financial transactions and customer data",
        "columns": [
            "transaction_id",
            "customer_name",
            "bank",
            "amount_naira",
            "transaction_type",
            "status",
            "city",
            "timestamp",
            "is_fraud",
        ],
    },
    "Healthcare": {
        "description": "Nigerian patient records and diagnoses",
        "columns": [
            "patient_id",
            "name",
            "age",
            "gender",
            "state",
            "diagnosis",
            "hospital",
            "admission_date",
            "discharge_date",
            "outcome",
        ],
    },
    "Agriculture": {
        "description": "Nigerian farm yield and crop data",
        "columns": [
            "farmer_id",
            "name",
            "state",
            "crop_type",
            "farm_size_hectares",
            "yield_kg",
            "season",
            "rainfall_mm",
            "fertilizer_used",
            "revenue_naira",
        ],
    },
    "E-commerce": {
        "description": "Nigerian online shopping orders and customers",
        "columns": [
            "order_id",
            "customer_name",
            "product",
            "category",
            "price_naira",
            "quantity",
            "city",
            "delivery_status",
            "payment_method",
            "order_date",
        ],
    },
    "Logistics": {
        "description": "Nigerian package delivery and route data",
        "columns": [
            "delivery_id",
            "sender",
            "receiver",
            "origin_city",
            "destination_city",
            "weight_kg",
            "distance_km",
            "status",
            "delivery_days",
            "cost_naira",
        ],
    },
    "Education": {
        "description": "Nigerian student performance and school data",
        "columns": [
            "student_id",
            "name",
            "age",
            "gender",
            "state",
            "school_type",
            "subject",
            "score",
            "grade",
            "year",
        ],
    },
}

print(f"Loaded {len(schemas)} domain schemas.")

In [None]:
# Prompt Builder
def build_prompt(domain, num_rows):
    """Return the text prompt asking a model for synthetic rows.

    Args:
        domain: Key into the module-level ``schemas`` dict; an unknown key
            raises ``KeyError``.
        num_rows: Row count interpolated into the prompt as-is.

    Returns:
        A newline-terminated prompt that states the domain, its description,
        the comma-separated column list, and the JSON-only output rules.
    """
    spec = schemas[domain]
    column_list = ", ".join(spec["columns"])

    prompt_lines = [
        f"Generate {num_rows} rows of realistic synthetic data for Nigerian {domain} sector.",
        f"Context: {spec['description']}",
        f"Columns: {column_list}",
        "",
        "Rules:",
        "- Use real Nigerian names, states, and context",
        f"- Return only valid JSON: a list of {num_rows} objects",
        f"- Each object must have exactly these keys: {column_list}",
        "- No explanation, no markdown, just the JSON array",
    ]
    # The prompt ends with a single trailing newline.
    return "\n".join(prompt_lines) + "\n"



In [None]:
#  Model Callers

def call_claude(prompt):
    """Send ``prompt`` to Claude as one user message and return the reply text.

    Only the text of the first content block of the response is returned.
    """
    user_message = {"role": "user", "content": prompt}
    reply = anthropic.Anthropic().messages.create(
        model="claude-haiku-4-5",
        max_tokens=16000,
        messages=[user_message],
    )
    first_block = reply.content[0]
    return first_block.text


def call_gpt(prompt):
    """Send ``prompt`` to the OpenAI chat API and return the reply text.

    Only the message content of the first choice is returned.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai.OpenAI().chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        max_tokens=16000,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def call_gemini(prompt):
    """Send ``prompt`` to Gemini and return the ``text`` of the response."""
    return genai.GenerativeModel("gemini-2.5-flash").generate_content(prompt).text


# Dispatch table: display name of a model -> function that takes a prompt
# string and returns the model's raw text reply.
MODEL_CALLERS = {
    "Claude": call_claude,
    "GPT-4": call_gpt,
    "Gemini": call_gemini,
}



In [None]:
# Data Generator

def generate_data(domain, model_name, num_rows):
    """Ask the chosen model for synthetic rows and return them as a DataFrame.

    Args:
        domain: Schema key handed to ``build_prompt``.
        model_name: Key into ``MODEL_CALLERS`` selecting the model.
        num_rows: Row count handed to ``build_prompt``.

    Returns:
        A ``pandas.DataFrame`` built from the JSON the model replied with.

    Raises:
        KeyError: If ``model_name`` is not in ``MODEL_CALLERS`` (or the
            prompt builder rejects ``domain``).
        json.JSONDecodeError: If the cleaned reply is not valid JSON.
    """
    request_text = build_prompt(domain, num_rows)
    ask_model = MODEL_CALLERS[model_name]
    reply = ask_model(request_text)

    # Models sometimes wrap the JSON in a markdown code fence; peel off an
    # opening "```json" or "```" and a closing "```" before parsing.
    payload = reply.strip()
    for opening_fence in ("```json", "```"):
        payload = payload.removeprefix(opening_fence)
    payload = payload.removesuffix("```").strip()

    return pd.DataFrame(json.loads(payload))




In [None]:
# Export to CSV

def export_csv(df, domain, model_name):
    """Write ``df`` to a CSV file in the working directory and return its name.

    The file name is ``<domain>_<model_name>_synthetic.csv`` with every
    hyphen removed and every space turned into an underscore. The frame's
    index is not written.
    """
    # One pass over the name: drop "-" and map " " to "_".
    cleanup = str.maketrans({"-": None, " ": "_"})
    filename = f"{domain}_{model_name}_synthetic.csv".translate(cleanup)
    df.to_csv(filename, index=False)
    return filename

# Test
# Smoke-test export_csv with a tiny hand-built frame. The earlier call passed
# a global `df` that none of the visible cells define, so running the cells
# top to bottom stopped here with a NameError.
_sample_df = pd.DataFrame(
    [{"transaction_id": "TXN-0001", "customer_name": "Test Customer", "amount_naira": 1500}]
)
export_csv(_sample_df, "Fintech", "Claude")

In [None]:
# Cell 9 — Gradio UI (Gradio 6.x compatible)





def run_generator(domain, model_name, num_rows):
    """Gradio click handler: generate a dataset and export it to CSV.

    Args:
        domain: Schema key chosen in the "Domain" dropdown.
        model_name: Key into ``MODEL_CALLERS`` chosen in the "Model" dropdown.
        num_rows: Value of the "Rows" slider.

    Returns:
        A ``(DataFrame, csv_path)`` pair for the preview table and the
        download component.

    Raises:
        gr.Error: On any failure, so the message is surfaced in the UI. The
            original exception is chained as the cause.
    """
    # Pre-bind so the JSON handler below can always print it, even if the
    # decode error was raised before a reply was assigned.
    raw = None

    try:
        # The row count (not the model name) is the second argument of
        # build_prompt. int() keeps the prompt free of values like "10.0"
        # should the slider deliver a float.
        prompt = build_prompt(domain, int(num_rows))

        caller = MODEL_CALLERS[model_name]
        raw = caller(prompt)

        # Strip a surrounding markdown code fence if the model added one.
        cleaned = raw.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
        data = json.loads(cleaned)

        df = pd.DataFrame(data)
        filepath = export_csv(df, domain, model_name)

        return df, filepath

    except json.JSONDecodeError as e:
        print(f"[ERROR] JSON parse failed: {e}")
        print(f"[ERROR] Raw content was: {raw}")
        raise gr.Error(f"JSON parse error: {e}") from e
    except Exception as e:
        # UI boundary: log the full traceback, show a short message to the user.
        print(f"[ERROR] {traceback.format_exc()}")
        raise gr.Error(str(e)) from e

# Single-page UI: one row of inputs (domain, model, row count), a trigger
# button, then a table preview and a file download fed by run_generator.
with gr.Blocks(title="Nigeria Synthetic Data Generator") as app:
    gr.Markdown("## Nigeria Synthetic Data Generator")

    with gr.Row():
        domain = gr.Dropdown(
            label="Domain",
            choices=list(schemas),
            value="Fintech",
        )
        model = gr.Dropdown(
            label="Model",
            choices=list(MODEL_CALLERS),
            value="Claude",
        )
        num_rows = gr.Slider(
            label="Rows",
            minimum=5,
            maximum=500,
            step=5,
            value=10,
        )

    generate_btn = gr.Button("Generate")
    table = gr.Dataframe(label="Preview")
    download = gr.File(label="Download CSV")

    # Inputs are passed to run_generator in this order; its two return
    # values fill the preview table and the download slot respectively.
    generate_btn.click(
        fn=run_generator,
        inputs=[domain, model, num_rows],
        outputs=[table, download],
    )

app.launch()