# 🎭 Story-Driven Synthetic Dataset Generator

A creative approach to generating interconnected, narrative-driven synthetic data.

**Features:**
- 📊 **Standard Mode** - Classic row-by-row generation
- 📖 **Story Chain** - Chronological narrative data
- 🥊 **Model Battle** - Two AI models compete
- 🔄 **Data Remix** - Generate more data matching your sample
- 🎨 **Style Personas** - Corporate, Creative, or Data Scientist styles

## Setup & Imports

In [None]:
import os
import json
import random
from io import StringIO
from dotenv import load_dotenv
import pandas as pd
import gradio as gr
from openai import OpenAI
import google.generativeai as genai

# Load environment variables from a local .env file. override=True asks
# python-dotenv to let .env values replace variables already set in the
# process environment.
load_dotenv(override=True)

# Initialize the API clients used by the call_* helpers.
# OpenAI() is built with no arguments — presumably it reads its API key from
# the environment loaded above; confirm against the OpenAI SDK docs.
openai_client = OpenAI()
# The Gemini SDK is configured once, module-wide. os.getenv returns None when
# GOOGLE_API_KEY is not set, so a missing key is passed through as None here.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

print("‚úÖ Clients initialized!")

## Model Configurations

In [None]:
# Registry of selectable models.
# Each key is a display name (also used as the UI dropdown choices); each
# value holds the "provider" used for routing ("openai" or "gemini") and the
# provider-side "model" identifier passed to that provider's API.
MODELS = {
    "GPT-4o-mini": {"provider": "openai", "model": "gpt-4o-mini"},
    "GPT-4o": {"provider": "openai", "model": "gpt-4o"},
    "Gemini 1.5 Flash": {"provider": "gemini", "model": "gemini-1.5-flash"},
    "Gemini 2.0 Flash": {"provider": "gemini", "model": "gemini-2.0-flash"},
}

print(f"Available models: {list(MODELS.keys())}")

## Style Personas

In [None]:
# Style personas.
# Each key is a display name (used as the UI dropdown choices); each value is
# a block of instructions that the generate_* functions interpolate verbatim
# into the system prompt, so the text below is runtime prompt content.
PERSONAS = {
    "üè¢ Corporate Analyst": """You write like a formal business analyst. Use precise business terminology, 
    include realistic numeric data with proper formatting, formal company names, 
    and professional titles. Data should feel like it came from a Fortune 500 quarterly report.""",
    
    "üé® Creative Writer": """You write like a creative storyteller. Use colorful, unique names for products 
    and companies, include descriptive text fields, varied and interesting patterns. 
    Data should feel imaginative yet plausible - like a quirky startup ecosystem.""",
    
    "üî¨ Data Scientist": """You write like a data scientist creating test data. Include realistic statistical 
    distributions, edge cases (nulls, outliers, boundary values), proper data types. 
    Data should feel like it was carefully curated for ML model training.""",
}

print(f"Available personas: {list(PERSONAS.keys())}")

## Domain Templates

In [None]:
# Domain templates.
# Each key is a display name (used as the UI dropdown choices). Each value has:
#   "description" - one-line subject inserted into the user prompt
#   "schema"      - comma-separated column names; the generate_* functions
#                   also count these to tell the model how many columns to emit
#   "story_hint"  - narrative arc, read only by generate_story_chain
DOMAINS = {
    "üöÄ Startup Journey": {
        "description": "A startup's evolution from founding to exit",
        "schema": "date,event_type,company_name,valuation_usd,employees,funding_round,milestone",
        "story_hint": "Show chronological growth: founding ‚Üí seed ‚Üí series A/B/C ‚Üí growth ‚Üí exit"
    },
    "üõí Customer Journey": {
        "description": "A customer's relationship with an e-commerce brand",  
        "schema": "date,customer_id,event,product_category,order_value,loyalty_points,satisfaction",
        "story_hint": "Show relationship evolution: first visit ‚Üí first purchase ‚Üí repeat buyer ‚Üí loyal customer"
    },
    "üè• Patient Treatment": {
        "description": "A patient's healthcare journey",
        "schema": "date,patient_id,visit_type,diagnosis,treatment,provider,outcome",
        "story_hint": "Show treatment progression: initial symptoms ‚Üí diagnosis ‚Üí treatment ‚Üí recovery"
    },
    "üìà Stock Performance": {
        "description": "A company's stock performance over time",
        "schema": "date,ticker,open,high,low,close,volume,sentiment",
        "story_hint": "Show market story: IPO ‚Üí growth ‚Üí volatility events ‚Üí recovery/decline"
    },
    "üéÆ Game Player Progress": {
        "description": "A player's journey through a game",
        "schema": "date,player_id,level,xp_gained,items_acquired,achievements,playtime_mins",
        "story_hint": "Show player engagement: onboarding ‚Üí learning ‚Üí mastery ‚Üí engagement patterns"
    },
}

print(f"Available domains: {list(DOMAINS.keys())}")

## Core Functions - API Calls

In [None]:
def call_openai(model: str, system_prompt: str, user_prompt: str) -> str:
    """Send a system + user prompt to an OpenAI chat model and return the reply text.

    Args:
        model: OpenAI model identifier (for example "gpt-4o-mini").
        system_prompt: Instructions sent as the "system" message.
        user_prompt: Request sent as the "user" message.

    Returns:
        The text content of the first completion choice.

    Raises:
        ValueError: If the API returns no choices or the first choice has no
            text content. Without this check a ``None`` would leak out of a
            function annotated ``-> str`` and fail later in CSV cleaning with
            an unrelated-looking ``TypeError``.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.8,
        max_tokens=2000,
    )
    # message.content is optional in the SDK (e.g. refusals), so guard it.
    content = response.choices[0].message.content if response.choices else None
    if not content:
        raise ValueError("OpenAI returned an empty response")
    return content


def call_gemini(model: str, system_prompt: str, user_prompt: str) -> str:
    """Send a system + user prompt to a Google Gemini model and return the reply text.

    The system prompt is passed through the SDK's ``system_instruction``
    parameter instead of being pasted in front of the user prompt, and the
    sampling temperature is set to 0.8 to match ``call_openai`` so both
    providers are driven with comparable settings.

    Args:
        model: Gemini model identifier (for example "gemini-2.0-flash").
        system_prompt: Instructions for the model; an empty value is sent as
            no system instruction at all.
        user_prompt: The request content.

    Returns:
        The text of the model's response (``response.text``).
    """
    gemini_model = genai.GenerativeModel(
        model,
        # An empty string is not a meaningful instruction; send None instead.
        system_instruction=system_prompt or None,
        generation_config={"temperature": 0.8},
    )
    response = gemini_model.generate_content(user_prompt)
    return response.text


def generate_with_model(model_name: str, system_prompt: str, user_prompt: str) -> str:
    """Dispatch a prompt pair to the provider registered for ``model_name``.

    Args:
        model_name: Key into MODELS.
        system_prompt: System-level instructions for the model.
        user_prompt: The request content.

    Returns:
        The text returned by the selected provider helper.
    """
    entry = MODELS[model_name]
    # Any provider other than "openai" is handled by the Gemini helper.
    backend = call_openai if entry["provider"] == "openai" else call_gemini
    return backend(entry["model"], system_prompt, user_prompt)

print("‚úÖ API functions defined!")

## Helper Functions - Data Parsing

In [None]:
def clean_csv_response(response: str) -> str:
    """Extract clean CSV text from a model response.

    Models often wrap their output in markdown code fences. This returns the
    contents of the first fence tagged ``csv`` (case-insensitive) if there is
    one, otherwise the contents of the first fenced block, otherwise the
    whole response. A language tag on the opening fence line (``csv``,
    ``text``, ...) is dropped so it cannot end up as the first CSV line.

    Args:
        response: Raw text returned by the model; ``None`` or empty is
            treated as no data.

    Returns:
        The extracted CSV text with surrounding whitespace removed, or an
        empty string when there is no input.
    """
    if not response:
        return ""

    fence = "```"
    if fence not in response:
        return response.strip()

    # Splitting on the fence puts fenced contents at the odd indexes. An
    # unclosed final fence (e.g. a truncated reply) still lands on an odd index.
    candidates = []
    for block in response.split(fence)[1::2]:
        first_line, newline, remainder = block.partition("\n")
        tag = first_line.strip()
        # The text right after the opening fence is a language tag only when
        # it is on its own line and cannot be a CSV row (contains no comma).
        if newline and "," not in tag:
            candidates.append((tag.lower(), remainder))
        else:
            candidates.append(("", block))

    for tag, content in candidates:
        if tag == "csv":
            return content.strip()
    return candidates[0][1].strip()


def safe_parse_csv(csv_string: str) -> pd.DataFrame:
    """Parse CSV text into a DataFrame, tolerating malformed rows.

    Three strategies are tried in order:

    1. ``pd.read_csv`` with default settings.
    2. ``pd.read_csv`` with the python engine, skipping bad lines.
    3. A manual comma split that pads short rows with empty strings and
       truncates long rows to the header width.

    Args:
        csv_string: CSV text whose first line is the header.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If both pandas attempts fail and fewer than two non-blank
            lines remain. The last pandas error is attached as the cause.
    """
    last_error = None
    for options in ({}, {"engine": "python", "on_bad_lines": "skip"}):
        try:
            return pd.read_csv(StringIO(csv_string), **options)
        except Exception as exc:
            # Deliberate best-effort: remember the failure and fall through
            # to the next, more lenient strategy.
            last_error = exc

    # Manual fallback. splitlines() handles \r\n endings, and blank lines are
    # dropped so they do not turn into rows of empty strings.
    lines = [line for line in csv_string.splitlines() if line.strip()]
    if len(lines) < 2:
        raise ValueError("Not enough data rows") from last_error

    header = lines[0].split(',')
    num_cols = len(header)

    rows = []
    for line in lines[1:]:
        cells = line.split(',')
        # Pad short rows and truncate long ones to exactly num_cols cells.
        rows.append((cells + [''] * num_cols)[:num_cols])

    return pd.DataFrame(rows, columns=header)

print("‚úÖ Helper functions defined!")

## Generation Mode: Standard

In [None]:
def generate_standard(model_name: str, domain: str, persona: str, num_records: int) -> tuple:
    """Produce independent synthetic rows for a predefined domain.

    Args:
        model_name: Key into MODELS selecting the model to call.
        domain: Key into DOMAINS supplying the description and schema.
        persona: Key into PERSONAS supplying the writing style.
        num_records: Number of rows requested in the prompt.

    Returns:
        A ``(DataFrame, raw CSV text, status message)`` tuple. If the
        generate / clean / parse pipeline raises, the DataFrame and CSV text
        are empty and the status message carries the error.
    """
    spec = DOMAINS[domain]
    style = PERSONAS[persona]
    schema = spec['schema']
    column_count = len(schema.split(','))

    system_prompt = "\n".join([
        f"You are an expert synthetic data generator. {style}",
        "Generate realistic, high-quality synthetic data. Output ONLY valid CSV data, no explanations.",
    ])

    user_prompt = "\n".join([
        f"Generate {num_records} rows of synthetic data for: {spec['description']}",
        "",
        f"Schema (use these exact column names): {schema}",
        "",
        "CRITICAL REQUIREMENTS:",
        "- Output ONLY CSV data starting with the header row",
        "- Do NOT use commas inside any field values",
        "- Use underscores instead of spaces in text",
        f"- Every row MUST have exactly {column_count} columns",
        "- Use YYYY-MM-DD for dates",
        "- Use plain numbers without commas (1000000 not 1,000,000)",
        "- No markdown code blocks, no explanations, just raw CSV",
    ])

    try:
        reply = generate_with_model(model_name, system_prompt, user_prompt)
        csv_text = clean_csv_response(reply)
        frame = safe_parse_csv(csv_text)
    except Exception as err:
        # UI boundary: report the failure in the status line instead of raising.
        return pd.DataFrame(), "", f"‚ùå Error: {err!s}"
    return frame, csv_text, f"‚úÖ Generated {len(frame)} rows with {model_name}"

print("‚úÖ Standard generation defined!")

## Generation Mode: Story Chain

In [None]:
def generate_story_chain(model_name: str, domain: str, persona: str, num_records: int) -> tuple:
    """Produce rows that form one chronological narrative for a domain.

    Args:
        model_name: Key into MODELS selecting the model to call.
        domain: Key into DOMAINS supplying description, schema and story arc.
        persona: Key into PERSONAS supplying the narrative style.
        num_records: Number of rows requested in the prompt.

    Returns:
        A ``(DataFrame, raw CSV text, status message)`` tuple. If the
        generate / clean / parse pipeline raises, the DataFrame and CSV text
        are empty and the status message carries the error.
    """
    story = DOMAINS[domain]
    voice = PERSONAS[persona]

    system_prompt = (
        f"You are a creative synthetic data storyteller. {voice}\n"
        "Your specialty is creating data that tells a STORY - each row logically follows from the previous one."
    )

    schema = story['schema']
    n_columns = len(schema.split(','))
    user_prompt = (
        f"Create a STORY through data for: {story['description']}\n"
        "\n"
        f"Schema (use these exact column names): {schema}\n"
        f"Story arc: {story['story_hint']}\n"
        "\n"
        f"Generate {num_records} rows that tell a coherent chronological story.\n"
        "\n"
        "CRITICAL REQUIREMENTS:\n"
        "- Output ONLY CSV data starting with the header row\n"
        "- Do NOT use commas inside any field values\n"
        f"- Every row MUST have exactly {n_columns} columns\n"
        "- Use YYYY-MM-DD for dates, show time progression\n"
        "- No markdown code blocks, no explanations, just raw CSV"
    )

    try:
        reply = generate_with_model(model_name, system_prompt, user_prompt)
        csv_text = clean_csv_response(reply)
        chapters = safe_parse_csv(csv_text)
    except Exception as err:
        # UI boundary: report the failure in the status line instead of raising.
        return pd.DataFrame(), "", f"‚ùå Error: {err!s}"
    return chapters, csv_text, f"üìñ Story created: {len(chapters)} chapters with {model_name}"

print("‚úÖ Story Chain generation defined!")

## Generation Mode: Model Battle

In [None]:
def generate_battle(domain: str, persona: str, num_records: int) -> tuple:
    """Run the same prompt against two randomly chosen models.

    Each contender is generated and parsed independently, so a failure of one
    model (for example a missing API key for one provider) no longer throws
    away the other model's valid output.

    Args:
        domain: Key into DOMAINS supplying the description and schema.
        persona: Key into PERSONAS supplying the writing style.
        num_records: Number of rows requested from each model.

    Returns:
        A 7-tuple ``(df_a, raw_a, label_a, df_b, raw_b, label_b, status)``.
        A contender that failed gets an empty DataFrame, an empty raw string
        and the label ``"Error"``; the status then lists each failed model
        with its error message.
    """
    # Pick two different models
    model_a, model_b = random.sample(list(MODELS), 2)

    domain_info = DOMAINS[domain]
    persona_style = PERSONAS[persona]

    system_prompt = f"""You are an expert synthetic data generator. {persona_style}
Generate realistic, high-quality synthetic data. Output ONLY valid CSV data."""

    user_prompt = f"""Generate {num_records} rows of synthetic data for: {domain_info['description']}
Schema: {domain_info['schema']}

CRITICAL: Output ONLY raw CSV. No commas in text fields. Exactly {len(domain_info['schema'].split(','))} columns per row."""

    def run_contender(model_name: str) -> tuple:
        """Return (DataFrame, raw CSV, label, error) for one model; never raises."""
        try:
            raw = clean_csv_response(generate_with_model(model_name, system_prompt, user_prompt))
            return safe_parse_csv(raw), raw, f"ü•ä {model_name}", None
        except Exception as e:
            # UI boundary: capture the failure so the other contender survives.
            return pd.DataFrame(), "", "Error", e

    df_a, data_a, label_a, error_a = run_contender(model_a)
    df_b, data_b, label_b, error_b = run_contender(model_b)

    failures = [
        f"{name}: {err!s}"
        for name, err in ((model_a, error_a), (model_b, error_b))
        if err is not None
    ]
    if failures:
        status = "‚ùå Error: " + "; ".join(failures)
    else:
        status = "‚öîÔ∏è Battle complete! Vote for the most realistic dataset!"

    return df_a, data_a, label_a, df_b, data_b, label_b, status

print("‚úÖ Model Battle generation defined!")

## Generation Mode: Data Remix

In [None]:
def generate_remix(sample_data: str, model_name: str, num_records: int) -> tuple:
    """Analyze sample data and generate more rows in the same style.

    Args:
        sample_data: CSV text pasted by the user; it is embedded verbatim in
            the prompt as the pattern to imitate.
        model_name: Key into MODELS selecting the model to call.
        num_records: Number of new rows requested in the prompt.

    Returns:
        A ``(DataFrame, raw CSV text, status message)`` tuple. If the sample
        is empty or the generate / clean / parse pipeline raises, the
        DataFrame and CSV text are empty and the status message carries the
        error.
    """
    # Without a sample there is nothing to imitate; skip the API call rather
    # than let the model invent an arbitrary schema.
    if not sample_data or not sample_data.strip():
        return pd.DataFrame(), "", "‚ùå Error: Please paste sample CSV data before remixing."

    system_prompt = """You are an expert at pattern recognition and data synthesis.
Analyze the provided sample data and generate MORE data that matches the exact same style and schema."""

    user_prompt = f"""Here is sample data:

{sample_data}

Generate {num_records} NEW rows matching this exact schema and style.

CRITICAL: Output ONLY raw CSV matching the sample schema. No commas in text fields. No explanations."""

    try:
        response = generate_with_model(model_name, system_prompt, user_prompt)
        clean_data = clean_csv_response(response)
        df = safe_parse_csv(clean_data)
        return df, clean_data, f"üîÑ Remixed {len(df)} new rows matching your style"
    except Exception as e:
        # UI boundary: report the failure in the status line instead of raising.
        return pd.DataFrame(), "", f"‚ùå Error: {str(e)}"

print("‚úÖ Data Remix generation defined!")

## Generation Mode: Custom

In [None]:
def generate_custom(model_name: str, custom_prompt: str, persona: str, num_records: int) -> tuple:
    """Generate data from a free-form, user-written description or schema.

    Args:
        model_name: Key into MODELS selecting the model to call.
        custom_prompt: The user's description of the data; it is placed at
            the top of the user prompt verbatim.
        persona: Key into PERSONAS supplying the writing style.
        num_records: Number of rows requested in the prompt.

    Returns:
        A ``(DataFrame, raw CSV text, status message)`` tuple. If the
        description is empty or the generate / clean / parse pipeline raises,
        the DataFrame and CSV text are empty and the status message carries
        the error.
    """
    # With no description the model has nothing to go on; skip the API call.
    if not custom_prompt or not custom_prompt.strip():
        return pd.DataFrame(), "", "‚ùå Error: Please describe the data you want to generate."

    persona_style = PERSONAS[persona]

    system_prompt = f"""You are an expert synthetic data generator. {persona_style}
Generate realistic, high-quality synthetic data. Output ONLY valid CSV data."""

    user_prompt = f"""{custom_prompt}

Generate {num_records} rows of CSV data.

CRITICAL: Output ONLY raw CSV. No commas in text fields. No markdown. No explanations."""

    try:
        response = generate_with_model(model_name, system_prompt, user_prompt)
        clean_data = clean_csv_response(response)
        df = safe_parse_csv(clean_data)
        return df, clean_data, f"‚úÖ Generated {len(df)} custom rows with {model_name}"
    except Exception as e:
        # UI boundary: report the failure in the status line instead of raising.
        return pd.DataFrame(), "", f"‚ùå Error: {str(e)}"

print("‚úÖ Custom generation defined!")

## 🎨 Gradio UI

In [None]:
# Build the Gradio interface: one tab per generation mode. Each tab declares
# its input controls, its output widgets, and a button whose click handler is
# the matching generate_* function. In every .click(...) call the input list
# follows the handler's parameter order and the output list follows the order
# of the tuple the handler returns.
with gr.Blocks(title="üé≠ Story-Driven Dataset Generator", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # üé≠ Story-Driven Synthetic Dataset Generator
    **Generate interconnected, narrative-driven synthetic data with AI**
    """)
    
    with gr.Tabs():
        # ===== STANDARD TAB =====
        with gr.TabItem("üìä Standard"):
            with gr.Row():
                # Left column: inputs
                with gr.Column(scale=1):
                    std_model = gr.Dropdown(list(MODELS.keys()), label="Model", value="GPT-4o-mini")
                    std_domain = gr.Dropdown(list(DOMAINS.keys()), label="Domain", value="üöÄ Startup Journey")
                    std_persona = gr.Dropdown(list(PERSONAS.keys()), label="Style Persona", value="üè¢ Corporate Analyst")
                    std_records = gr.Slider(5, 50, value=10, step=5, label="Number of Records")
                    std_btn = gr.Button("üöÄ Generate", variant="primary")
                
                # Right column: outputs
                with gr.Column(scale=2):
                    std_status = gr.Markdown("Ready to generate...")
                    std_preview = gr.Dataframe(label="Preview")
                    std_raw = gr.Textbox(label="Raw CSV Output", lines=10, show_copy_button=True)
            
            # generate_standard returns (DataFrame, raw CSV, status)
            std_btn.click(generate_standard, [std_model, std_domain, std_persona, std_records], 
                         [std_preview, std_raw, std_status])
        
        # ===== STORY CHAIN TAB =====
        with gr.TabItem("üìñ Story Chain"):
            gr.Markdown("*Generate data that tells a chronological story!*")
            with gr.Row():
                with gr.Column(scale=1):
                    story_model = gr.Dropdown(list(MODELS.keys()), label="Model", value="GPT-4o-mini")
                    story_domain = gr.Dropdown(list(DOMAINS.keys()), label="Story Type", value="üöÄ Startup Journey")
                    story_persona = gr.Dropdown(list(PERSONAS.keys()), label="Narrative Style", value="üé® Creative Writer")
                    story_records = gr.Slider(5, 30, value=10, step=5, label="Story Length (rows)")
                    story_btn = gr.Button("üìñ Create Story", variant="primary")
                
                with gr.Column(scale=2):
                    story_status = gr.Markdown("Ready to tell a story...")
                    story_preview = gr.Dataframe(label="Story Preview")
                    story_raw = gr.Textbox(label="Story Data", lines=10, show_copy_button=True)
            
            # generate_story_chain returns (DataFrame, raw CSV, status)
            story_btn.click(generate_story_chain, [story_model, story_domain, story_persona, story_records],
                           [story_preview, story_raw, story_status])
        
        # ===== BATTLE TAB =====
        # No model dropdown here: generate_battle picks the two models itself.
        with gr.TabItem("ü•ä Model Battle"):
            gr.Markdown("*Two AI models compete on the same prompt!*")
            with gr.Row():
                battle_domain = gr.Dropdown(list(DOMAINS.keys()), label="Domain", value="üõí Customer Journey")
                battle_persona = gr.Dropdown(list(PERSONAS.keys()), label="Style", value="üè¢ Corporate Analyst")
                battle_records = gr.Slider(5, 20, value=10, step=5, label="Records Each")
            
            battle_btn = gr.Button("‚öîÔ∏è Start Battle!", variant="primary")
            battle_status = gr.Markdown("Pick your settings and fight!")
            
            # Side-by-side result panels, one per contender
            with gr.Row():
                with gr.Column():
                    battle_label_a = gr.Markdown("**Contender A**")
                    battle_preview_a = gr.Dataframe(label="Model A Output")
                    battle_raw_a = gr.Textbox(label="Raw A", lines=8, show_copy_button=True)
                with gr.Column():
                    battle_label_b = gr.Markdown("**Contender B**")
                    battle_preview_b = gr.Dataframe(label="Model B Output")
                    battle_raw_b = gr.Textbox(label="Raw B", lines=8, show_copy_button=True)
            
            # generate_battle returns a 7-tuple:
            # (df_a, raw_a, label_a, df_b, raw_b, label_b, status)
            battle_btn.click(generate_battle, [battle_domain, battle_persona, battle_records],
                            [battle_preview_a, battle_raw_a, battle_label_a, 
                             battle_preview_b, battle_raw_b, battle_label_b, battle_status])
        
        # ===== REMIX TAB =====
        with gr.TabItem("üîÑ Data Remix"):
            gr.Markdown("*Paste sample data and AI will generate more matching your style!*")
            with gr.Row():
                with gr.Column(scale=1):
                    remix_sample = gr.Textbox(
                        label="Paste Your Sample Data (CSV format)",
                        lines=8,
                        placeholder="name,age,city,salary\nJohn_Smith,32,New_York,85000\nJane_Doe,28,Boston,72000"
                    )
                    remix_model = gr.Dropdown(list(MODELS.keys()), label="Model", value="GPT-4o-mini")
                    remix_records = gr.Slider(5, 50, value=10, step=5, label="New Records to Generate")
                    remix_btn = gr.Button("üîÑ Remix Data", variant="primary")
                
                with gr.Column(scale=2):
                    remix_status = gr.Markdown("Paste your sample data to begin...")
                    remix_preview = gr.Dataframe(label="Generated Data")
                    remix_raw = gr.Textbox(label="Raw Output", lines=10, show_copy_button=True)
            
            # generate_remix takes (sample_data, model_name, num_records)
            remix_btn.click(generate_remix, [remix_sample, remix_model, remix_records],
                           [remix_preview, remix_raw, remix_status])
        
        # ===== CUSTOM TAB =====
        with gr.TabItem("‚úèÔ∏è Custom"):
            gr.Markdown("*Define your own schema and requirements!*")
            with gr.Row():
                with gr.Column(scale=1):
                    custom_prompt = gr.Textbox(
                        label="Describe Your Data",
                        lines=5,
                        placeholder="Generate data for a bookstore: book_title,author,isbn,price,genre,stock"
                    )
                    custom_model = gr.Dropdown(list(MODELS.keys()), label="Model", value="GPT-4o-mini")
                    custom_persona = gr.Dropdown(list(PERSONAS.keys()), label="Style", value="üî¨ Data Scientist")
                    custom_records = gr.Slider(5, 50, value=10, step=5, label="Records")
                    custom_btn = gr.Button("‚úèÔ∏è Generate Custom", variant="primary")
                
                with gr.Column(scale=2):
                    custom_status = gr.Markdown("Describe your data needs...")
                    custom_preview = gr.Dataframe(label="Preview")
                    custom_raw = gr.Textbox(label="Raw Output", lines=10, show_copy_button=True)
            
            # generate_custom takes (model_name, custom_prompt, persona, num_records),
            # so the model dropdown is listed before the prompt textbox here.
            custom_btn.click(generate_custom, [custom_model, custom_prompt, custom_persona, custom_records],
                            [custom_preview, custom_raw, custom_status])

print("‚úÖ Gradio UI is ready!")

## 🚀 Launch the App!

In [None]:
# Start the Gradio app with its default launch settings.
app.launch()