# Synthetic Dataset Generator with Quality Scoring

An AI-powered tool that creates realistic synthetic datasets for any business case with flexible schema creation, synonym permutation for diversity, and automated quality scoring.

## Features
- **Multi-Model Support**: HuggingFace models (primary) + Commercial APIs
- **Flexible Schema Creation**: LLM-generated, manual, or hybrid approaches
- **Synonym Permutation**: Post-process datasets to increase diversity
- **Quality Scoring**: Separate LLM model evaluates dataset quality
- **GPU Optimized**: Designed for Google Colab T4 GPUs
- **Multiple Output Formats**: CSV, TSV, JSON, JSONL

## Quick Start
1. **Schema Tab**: Define your dataset structure
2. **Generation Tab**: Generate synthetic data
3. **Permutation Tab**: Add diversity with synonyms
4. **Scoring Tab**: Evaluate data quality
5. **Export Tab**: Download your dataset


In [None]:
# Install dependencies
# `%pip` is an IPython/Jupyter magic, so this cell only runs inside a notebook kernel.
# Line 1 upgrades the quantized model-loading stack (bitsandbytes, accelerate, transformers).
# Line 2 installs the API client, UI and NLP packages (openai, gradio, nltk).
# NOTE(review): setup_api_keys() imports `dotenv` in its local fallback, but python-dotenv
# is not installed here - confirm it is available when running outside Colab.
%pip install -q --upgrade bitsandbytes accelerate transformers
%pip install -q openai gradio nltk


In [None]:
# Report which GPU (if any) is attached by inspecting the text printed by `nvidia-smi`.
# `!cmd` is IPython shell-capture syntax; the captured output lines are joined into one string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# Presumably nvidia-smi prints a message containing "failed" when no GPU/driver is reachable.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)
  # The check is a plain substring match on the device name shown in the nvidia-smi table.
  if gpu_info.find('Tesla T4') >= 0:
    print("Success - Connected to a T4")
  else:
    print("NOT CONNECTED TO A T4")

## Start

In [None]:
# Imports and Setup
import os
import io
import time
import json
import pandas as pd
import random
import re
import gc
import torch
from typing import List, Dict, Any, Tuple
import warnings
warnings.filterwarnings("ignore")

# Google Colab
from google.colab import files

# LLM APIs
from openai import OpenAI

# HuggingFace
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Data processing
import nltk
from nltk.corpus import wordnet

# UI
import gradio as gr

# Download NLTK data
# nltk.download() reports most failures (for example no network access) by
# returning False instead of raising, so the return values are checked in
# addition to catching unexpected exceptions. Both corpora are always attempted.
try:
    _nltk_results = [nltk.download(_pkg, quiet=True) for _pkg in ('wordnet', 'omw-1.4')]
    _nltk_ready = all(_nltk_results)
except Exception:
    _nltk_ready = False

if not _nltk_ready:
    print("NLTK data download may have failed - synonym features may not work")

print("✅ All imports successful!")


In [None]:
# API Key Setup - Support both Colab and Local environments
def setup_api_keys():
    """Collect API keys and build one OpenAI-compatible client per provider.

    Keys are read from Colab secrets when `google.colab.userdata` can be
    imported; each key individually falls back to the process environment, so
    one missing or access-denied secret no longer discards all the others.
    Outside Colab the environment is used, after loading a local `.env` file
    when python-dotenv is installed.

    Side effects: prints which key source is used and, when an `HF_TOKEN` is
    present, logs in to the HuggingFace Hub.

    Returns:
        (api_keys, clients): `api_keys` maps provider name -> key string or
        None; `clients` maps provider name -> OpenAI client and contains only
        the providers whose key is set.
    """
    # Internal provider name -> name of the secret / environment variable.
    key_names = {
        'openai': 'OPENAI_API_KEY',
        'anthropic': 'ANTHROPIC_API_KEY',
        'google': 'GOOGLE_API_KEY',
        'deepseek': 'DEEPSEEK_API_KEY',
        # 'groq': 'GROQ_API_KEY',
        'grok': 'GROK_API_KEY',
        # 'openrouter': 'OPENROUTER_API_KEY',
        # 'ollama': 'OLLAMA_API_KEY',
        'hf_token': 'HF_TOKEN',
    }

    try:
        # Try Colab environment first
        from google.colab import userdata
    except ImportError:
        userdata = None

    def read_key(name):
        """Return the secret called *name*, or None when it is not configured."""
        if userdata is not None:
            try:
                value = userdata.get(name)
            except Exception:
                # A missing or access-denied Colab secret must not abort the
                # lookup of the remaining keys.
                value = None
            if value:
                return value
        return os.getenv(name)

    if userdata is not None:
        print("✅ Using Colab secrets")
    else:
        # Fallback to local environment
        try:
            from dotenv import load_dotenv
        except ImportError:
            print("⚠️ python-dotenv not installed - using existing environment variables")
        else:
            load_dotenv()
            print("✅ Using local .env file")

    api_keys = {provider: read_key(secret) for provider, secret in key_names.items()}

    # OpenAI-compatible endpoints; None means the OpenAI SDK default base URL.
    base_urls = {
        'openai': None,
        'anthropic': "https://api.anthropic.com/v1/",
        'google': "https://generativelanguage.googleapis.com/v1beta/openai/",
        'deepseek': "https://api.deepseek.com",
        # 'groq': "https://api.groq.com/openai/v1",
        'grok': "https://api.x.ai/v1",
        # 'openrouter': "https://openrouter.ai/api/v1",
        # 'ollama': "http://localhost:11434/v1",
    }

    # Initialize API clients (only for providers that actually have a key)
    clients = {}
    for provider, base_url in base_urls.items():
        key = api_keys[provider]
        if not key:
            continue
        if base_url is None:
            clients[provider] = OpenAI(api_key=key)
        else:
            clients[provider] = OpenAI(api_key=key, base_url=base_url)

    if api_keys['hf_token']:
        login(api_keys['hf_token'], add_to_git_credential=True)

    return api_keys, clients

# Initialize API keys and clients
# Runs when the cell executes. `api_keys` and `clients` become module-level globals;
# the commercial query helpers below look clients up with `clients[provider]`.
api_keys, clients = setup_api_keys()

In [None]:
# Model Configuration

# HuggingFace Models
# Display name -> settings. "model_class" names the transformers class that
# load_huggingface_model() should try before falling back to AutoModelForCausalLM,
# so it must match the architecture declared by the checkpoint.
HUGGINGFACE_MODELS = {
    "Llama 3.1 8B": {
        "model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "description": "Good for structured data generation",
        "size": "8B",
        "type": "huggingface",
        "model_class": "LlamaForCausalLM"
    },
    "Llama 3.2 3B": {
        "model_id": "meta-llama/Llama-3.2-3B-Instruct",
        "description": "Smaller and faster model for simple schemas",
        "size": "3B",
        "type": "huggingface",
        "model_class": "LlamaForCausalLM"
    },
    "Phi-3.5 Mini": {
        "model_id": "microsoft/Phi-3.5-mini-instruct",
        "description": "Reasoning capabilities",
        "size": "3.8B",
        "type": "huggingface",
        "model_class": "Phi3ForCausalLM"
    },
    "Gemma 2 9B": {
        "model_id": "google/gemma-2-9b-it",
        "description": "Instruction-tuned model",
        "size": "9B",
        "type": "huggingface",
        # Gemma 2 checkpoints use the Gemma2 architecture, not the Gemma 1 class.
        "model_class": "Gemma2ForCausalLM"
    },
    "Qwen 2.5 7B": {
        "model_id": "Qwen/Qwen2.5-7B-Instruct",
        "description": "Multilingual that is good for diverse data",
        "size": "7B",
        "type": "huggingface",
        "model_class": "Qwen2ForCausalLM"
    },
    "Mistral 7B": {
        "model_id": "mistralai/Mistral-7B-Instruct-v0.3",
        "description": "Fast inference",
        "size": "7B",
        "type": "huggingface",
        "model_class": "MistralForCausalLM"
    },
    "Zephyr 7B": {
        "model_id": "HuggingFaceH4/zephyr-7b-beta",
        "description": "Fine-tuned for instruction following",
        "size": "7B",
        "type": "huggingface",
        # Zephyr is a Mistral fine-tune; transformers has no "ZephyrForCausalLM" class.
        "model_class": "MistralForCausalLM"
    }
}

# Commercial Models
# Display name -> settings. "provider" must match a key of the `clients` dict
# built by setup_api_keys(); "model_id" is sent verbatim to the provider's API.
COMMERCIAL_MODELS = {
    "GPT-5 Mini": {
        "model_id": "gpt-5-mini",
        "description": "Fast, cost-effective OpenAI model",
        "provider": "openai",
        "type": "commercial"
    },
    "Claude 4.5 Haiku": {
        # Anthropic ids follow the pattern claude-<family>-<major>-<minor>-<date>.
        "model_id": "claude-haiku-4-5-20251001",
        "description": "Balance of speed and quality",
        "provider": "anthropic",
        "type": "commercial"
    },
    "Gemini 2.5 Flash": {
        # NOTE(review): the display name says "Flash" but the id selects the
        # Flash-Lite variant - confirm which one is intended.
        "model_id": "gemini-2.5-flash-lite",
        "description": "Fast Google model",
        "provider": "google",
        "type": "commercial"
    },
    "DeepSeek Chat": {
        "model_id": "deepseek-chat",
        "description": "Cost-effective with good performance",
        "provider": "deepseek",
        "type": "commercial"
    },
    "Grok 4": {
        "model_id": "grok-4",
        "description": "Grok 4",
        "provider": "grok",
        "type": "commercial"
    }
}

# Output formats
OUTPUT_FORMATS = [".csv", ".tsv", ".json", ".jsonl"]

# Default schema for pharmacogenomics (PGx) example
# Each entry is (field name, type, description, example value).
DEFAULT_SCHEMA = [
    ("patient_id", "TEXT", "Unique patient identifier", "PGX_001"),
    ("age", "INT", "Patient age in years", 45),
    ("gender", "TEXT", "Patient gender", "Female"),
    ("ethnicity", "TEXT", "Patient ethnicity", "Caucasian"),
    ("gene_variant", "TEXT", "Genetic variant", "CYP2D6*1/*4"),
    ("drug_name", "TEXT", "Medication name", "Warfarin"),
    ("dosage", "TEXT", "Drug dosage", "5mg daily"),
    ("adverse_reaction", "TEXT", "Any adverse reactions", "None"),
    ("efficacy_score", "INT", "Treatment efficacy (1-10)", 8),
    ("metabolizer_status", "TEXT", "Drug metabolizer phenotype", "Intermediate")
]

# Numbered, one field per line: "N. name (TYPE) - description, example: value".
DEFAULT_SCHEMA_TEXT = "\n".join(
    f"{position}. {name} ({field_type}) - {description}, example: {example}"
    for position, (name, field_type, description, example) in enumerate(DEFAULT_SCHEMA, 1)
)

print(f"📊 Available HuggingFace models: {len(HUGGINGFACE_MODELS)}")
print(f"🌐 Available Commercial models: {len(COMMERCIAL_MODELS)}")


In [None]:
# HuggingFace Model Loading
def load_huggingface_model(model_id, model_class_name, quantization_config, torch_dtype):
    """Load a HuggingFace causal-LM, preferring the named model class.

    Args:
        model_id: Hub repository id passed to `from_pretrained`.
        model_class_name: Name of a class exported by `transformers`
            (e.g. "LlamaForCausalLM"). Unknown names fall back to
            `AutoModelForCausalLM`.
        quantization_config: Quantization settings forwarded to `from_pretrained`.
        torch_dtype: dtype forwarded to `from_pretrained`.

    Returns:
        The loaded model.

    Raises:
        RuntimeError: when neither the specific class nor the auto class can
            load the model. The underlying error is chained as `__cause__`.
    """
    load_kwargs = {
        "device_map": "auto",
        "quantization_config": quantization_config,
        "torch_dtype": torch_dtype,
    }

    # Resolve the class by name instead of a hard-coded if/elif chain, so any
    # class that transformers exports works without touching this function.
    model_class = AutoModelForCausalLM
    try:
        import transformers
        candidate = getattr(transformers, str(model_class_name), None)
        if candidate is not None and hasattr(candidate, "from_pretrained"):
            model_class = candidate
    except Exception as e:
        print(f"Error loading {model_class_name}: {str(e)}")

    try:
        return model_class.from_pretrained(model_id, **load_kwargs)
    except Exception as e:
        print(f"Error loading {model_class_name}: {str(e)}")
        if model_class is AutoModelForCausalLM:
            # The auto class was already tried; repeating the identical load
            # would only repeat the download / memory allocation and fail again.
            raise RuntimeError(
                f"Failed to load model {model_id} with AutoModelForCausalLM: {str(e)}"
            ) from e

    # Fallback to AutoModelForCausalLM
    try:
        return AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    except Exception as e2:
        raise RuntimeError(
            f"Failed to load model with both specific and auto classes: {str(e2)}"
        ) from e2

In [None]:
# Shared 4-bit quantization settings, read by SchemaManager and DatasetGenerator
# and passed to every HuggingFace model load.
# NOTE(review): the compute dtype is bfloat16 while the notebook targets T4 GPUs -
# confirm bf16 performs acceptably there (float16 may be the better fit).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [None]:
# Schema Management Module
class SchemaManager:
    """Handles schema creation, parsing, and enhancement.

    A schema is a list of field lines in the form
    `field_name (TYPE) - Description, example: example_value`.

    Instance state:
        current_schema: result of the last `parse_manual_schema` call, or None.
        schema_text: last schema text successfully returned by an LLM, or None.
        quantization_config: the module-level quantization settings.
    """

    # Leading list decoration ("1. ", "2) ", "- ", "* ") in front of a field line.
    _LIST_MARKER = re.compile(r'^(?:\d+[.)]\s*|[-*•]\s+)')
    # field_name (TYPE) - Description, example: example_value
    # The description group is lazy so that it may itself contain commas.
    _FIELD_LINE = re.compile(r'^([^(]+)\s*\(([^)]+)\)\s*-\s*(.+?),\s*example:\s*(.+)$')

    def __init__(self):
        self.current_schema = None
        self.schema_text = None
        self.quantization_config = quantization_config

    def generate_schema_with_llm(self, business_case: str, model_name: str, temperature: float = 0.7) -> str:
        """Generate a complete schema from a business case using an LLM.

        Returns the schema text, or a string starting with
        "Error generating schema:" when the query fails. `schema_text` is only
        updated on success.
        """
        system_prompt = """You are an expert data scientist. Given a business case, generate a comprehensive dataset schema.
        Return the schema in this exact format:
        field_name (TYPE) - Description, example: example_value

        Include 8-12 relevant fields that would be useful for the business case.
        Use realistic field names and appropriate data types (TEXT, INT, FLOAT, BOOLEAN, ARRAY).
        Provide clear descriptions and realistic examples."""

        user_prompt = f"""\n\nBusiness case: {business_case}

        Generate a dataset schema for this business case. Include fields that would be relevant for analysis and decision-making."""

        try:
            response = self._query_llm(model_name, system_prompt, user_prompt, temperature)
            self.schema_text = response
            return response
        except Exception as e:
            return f"Error generating schema: {str(e)}"

    def enhance_schema_with_llm(self, partial_schema: str, business_case: str, model_name: str, temperature: float = 0.7) -> str:
        """Enhance a user-provided partial schema using an LLM.

        Returns the enhanced schema text, or a string starting with
        "Error enhancing schema:" when the query fails. `schema_text` is only
        updated on success.
        """
        system_prompt = """You are an expert data scientist. Given a partial schema and business case, enhance it by:
        1. Adding missing relevant fields
        2. Improving field descriptions
        3. Adding realistic examples
        4. Ensuring proper data types

        Return the enhanced schema in the same format as the original."""

        user_prompt = f"""\n\nBusiness case: {business_case}

        Current partial schema:
        {partial_schema}

        Please enhance this schema by adding missing fields and improving the existing ones."""

        try:
            response = self._query_llm(model_name, system_prompt, user_prompt, temperature)
            self.schema_text = response
            return response
        except Exception as e:
            return f"Error enhancing schema: {str(e)}"

    def parse_manual_schema(self, schema_text: str) -> List[Dict[str, Any]]:
        """Parse schema text into a list of field dicts.

        Each recognised line yields `{'name', 'type', 'description', 'example'}`.
        Leading numbering or bullets are ignored, descriptions may contain
        commas, and lines that do not follow the format are skipped.

        Returns:
            The parsed field list (also stored in `current_schema`). For
            backward compatibility an unexpected failure (e.g. `schema_text`
            is None) returns `{"error": "..."}` instead of raising.
        """
        try:
            parsed_schema = []
            for raw_line in schema_text.split('\n'):
                line = self._LIST_MARKER.sub('', raw_line.strip())
                match = self._FIELD_LINE.match(line)
                if not match:
                    continue
                field_name, field_type, description, example = (
                    part.strip() for part in match.groups()
                )
                parsed_schema.append({
                    'name': field_name,
                    'type': field_type,
                    'description': description,
                    'example': example
                })

            self.current_schema = parsed_schema
            return parsed_schema
        except Exception as e:
            return {"error": f"Error parsing schema: {str(e)}"}

    def format_schema_for_prompt(self, schema: List[Dict]) -> str:
        """Convert a parsed schema to numbered prompt text.

        An empty schema falls back to the last LLM-generated `schema_text`
        (or an empty string when there is none).
        """
        if not schema:
            return self.schema_text or ""

        return "\n".join(
            f"{i}. {field['name']} ({field['type']}) - {field['description']}, example: {field['example']}"
            for i, field in enumerate(schema, 1)
        )

    def _query_llm(self, model_name: str, system_prompt: str, user_prompt: str, temperature: float) -> str:
        """Dispatch the query to the HuggingFace or commercial backend.

        Raises:
            ValueError: when `model_name` is in neither model table.
        """
        if model_name in HUGGINGFACE_MODELS:
            return self._query_huggingface(model_name, system_prompt, user_prompt, temperature)
        if model_name in COMMERCIAL_MODELS:
            return self._query_commercial(model_name, system_prompt, user_prompt, temperature)
        raise ValueError(f"Unknown model: {model_name}")

    @staticmethod
    def _encode_chat(tokenizer, messages):
        """Tokenize chat messages, ending with the assistant generation prompt.

        Some chat templates (Gemma, for example) reject the "system" role; in
        that case the system text is folded into the first user message.
        """
        try:
            return tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_dict=True, return_tensors="pt"
            )
        except Exception:
            system_text = "\n\n".join(m["content"] for m in messages if m["role"] == "system")
            remaining = [dict(m) for m in messages if m["role"] != "system"]
            if not system_text or not remaining:
                raise
            remaining[0]["content"] = f"{system_text}\n\n{remaining[0]['content']}"
            return tokenizer.apply_chat_template(
                remaining, add_generation_prompt=True, return_dict=True, return_tensors="pt"
            )

    def _query_huggingface(self, model_name: str, system_prompt: str, user_prompt: str, temperature: float) -> str:
        """Query a local HuggingFace model, loading it on first use.

        Models are cached in the global `dataset_generator.loaded_models` so
        schema generation and dataset generation share one copy.

        Raises:
            RuntimeError: on any loading/generation failure. The cached model
                is dropped and GPU memory released before raising.
        """
        model_info = HUGGINGFACE_MODELS[model_name]
        model_id = model_info["model_id"]

        try:
            # Check if model is already loaded
            if model_name not in dataset_generator.loaded_models:
                print(f"🔄 Loading {model_name} for schema generation...")

                # NOTE(review): trust_remote_code executes code shipped with the
                # checkpoint - acceptable only for the vetted ids in HUGGINGFACE_MODELS.
                tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
                tokenizer.pad_token = tokenizer.eos_token
                print(f"Tokenizer loaded for {model_name}")

                # Load model with quantization using correct model class
                model_class_name = model_info.get("model_class", "AutoModelForCausalLM")
                model = load_huggingface_model(
                    model_id,
                    model_class_name,
                    dataset_generator.quantization_config,
                    torch.bfloat16
                )

                dataset_generator.loaded_models[model_name] = {
                    'model': model,
                    'tokenizer': tokenizer
                }
                print(f"✅ {model_name} loaded successfully for schema generation!")

            cached = dataset_generator.loaded_models[model_name]
            model = cached['model']
            tokenizer = cached['tokenizer']

            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]

            # Follow the model's own device instead of assuming "cuda".
            inputs = self._encode_chat(tokenizer, messages).to(model.device)

            generation_args = {
                "max_new_tokens": 2000,
                "pad_token_id": tokenizer.eos_token_id,
            }
            if temperature is not None and temperature > 0:
                generation_args.update(do_sample=True, temperature=temperature)
            else:
                # Sampling with temperature 0 is rejected by generate(); use greedy decoding.
                generation_args["do_sample"] = False

            with torch.no_grad():
                outputs = model.generate(**inputs, **generation_args)

            # Decode only the newly generated tokens. Splitting the full text on
            # the word "assistant" truncates answers that contain that word.
            prompt_length = inputs["input_ids"].shape[-1]
            return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        except Exception as e:
            # Clean up on error
            if model_name in dataset_generator.loaded_models:
                del dataset_generator.loaded_models[model_name]
                gc.collect()
                torch.cuda.empty_cache()
            raise RuntimeError(f"HuggingFace schema generation error: {str(e)}") from e

    def _query_commercial(self, model_name: str, system_prompt: str, user_prompt: str, temperature: float) -> str:
        """Query a commercial model through its OpenAI-compatible client.

        Raises:
            ValueError: when no client exists for the provider (missing API key).
            RuntimeError: when the API call fails. Raising (rather than
                returning the error text) keeps error messages from being
                stored as if they were a schema.
        """
        model_info = COMMERCIAL_MODELS[model_name]
        provider = model_info["provider"]
        model_id = model_info["model_id"]

        client = clients.get(provider)
        if client is None:
            raise ValueError(f"No API key configured for provider '{provider}' (required by {model_name})")

        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                # gpt-5-mini is always called with temperature 1.0.
                temperature=temperature if model_id != "gpt-5-mini" else 1.0
            )
        except Exception as e:
            raise RuntimeError(f"Error querying {model_name}: {str(e)}") from e

        return response.choices[0].message.content

# Initialize schema manager
# Module-level singleton. Its HuggingFace query path reads the global `dataset_generator`,
# which is created in a later cell, so that cell must run before a local model is queried.
schema_manager = SchemaManager()



In [None]:
# Dataset Generation Module
class DatasetGenerator:
    """Handles synthetic dataset generation using multiple LLM models.

    Instance state:
        loaded_models: cache mapping display name -> {'model', 'tokenizer'}
            for HuggingFace models that are currently loaded.
        quantization_config: the module-level quantization settings.
    """

    def __init__(self):
        self.loaded_models = {}  # Cache for HuggingFace models
        self.quantization_config = quantization_config

    def generate_dataset(self, schema_text: str, business_case: str, model_name: str,
                        temperature: float, num_records: int, examples: str = "") -> Tuple[str, List[Dict]]:
        """Generate a synthetic dataset with the given model.

        Returns:
            (status_message, records). Failures never raise: they are reported
            through the status message with an empty record list.
        """
        try:
            prompt = self._build_generation_prompt(schema_text, business_case, num_records, examples)
            response = self._query_llm(model_name, prompt, temperature)
            records = self._parse_jsonl_response(response)

            if not records:
                return "❌ Error: No valid records generated", []

            if len(records) < num_records:
                return f"⚠️ Warning: Generated {len(records)} records (requested {num_records})", records

            return f"✅ Generated {len(records)} records successfully!", records

        except Exception as e:
            return f"❌ Error: {str(e)}", []

    def _build_generation_prompt(self, schema_text: str, business_case: str, num_records: int, examples: str) -> str:
        """Build the generation prompt; `examples` may be empty or None."""
        prompt = f"""You are a data generation expert. Generate {num_records} realistic records for the following business case:

Business Case: {business_case}

Schema:
{schema_text}

Requirements:
- Generate exactly {num_records} records
- Each record must be a valid JSON object
- Do NOT repeat values across records
- Make data realistic and diverse
- Output only valid JSONL (one JSON object per line)
- No additional text or explanations

"""

        if examples and examples.strip():
            prompt += f"""
Examples to follow (but do NOT repeat these exact examples):
{examples}

"""

        prompt += "Generate the dataset now:"
        return prompt

    def _query_llm(self, model_name: str, prompt: str, temperature: float) -> str:
        """Dispatch the query to the HuggingFace or commercial backend.

        Raises:
            ValueError: when `model_name` is in neither model table.
        """
        if model_name in HUGGINGFACE_MODELS:
            return self._query_huggingface(model_name, prompt, temperature)
        if model_name in COMMERCIAL_MODELS:
            return self._query_commercial(model_name, prompt, temperature)
        raise ValueError(f"Unknown model: {model_name}")

    @staticmethod
    def _encode_chat(tokenizer, messages):
        """Tokenize chat messages, ending with the assistant generation prompt.

        Some chat templates (Gemma, for example) reject the "system" role; in
        that case the system text is folded into the first user message.
        """
        try:
            return tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_dict=True, return_tensors="pt"
            )
        except Exception:
            system_text = "\n\n".join(m["content"] for m in messages if m["role"] == "system")
            remaining = [dict(m) for m in messages if m["role"] != "system"]
            if not system_text or not remaining:
                raise
            remaining[0]["content"] = f"{system_text}\n\n{remaining[0]['content']}"
            return tokenizer.apply_chat_template(
                remaining, add_generation_prompt=True, return_dict=True, return_tensors="pt"
            )

    def _query_huggingface(self, model_name: str, prompt: str, temperature: float) -> str:
        """Query a local HuggingFace model, loading and caching it on first use.

        Raises:
            RuntimeError: on any loading/generation failure. The cached model
                is dropped and GPU memory released before raising.
        """
        model_info = HUGGINGFACE_MODELS[model_name]
        model_id = model_info["model_id"]

        try:
            # Check if model is already loaded
            if model_name not in self.loaded_models:
                print(f"🔄 Loading {model_name}...")

                # NOTE(review): trust_remote_code executes code shipped with the
                # checkpoint - acceptable only for the vetted ids in HUGGINGFACE_MODELS.
                tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
                tokenizer.pad_token = tokenizer.eos_token

                # Load model with quantization using correct model class
                model_class_name = model_info.get("model_class", "AutoModelForCausalLM")
                model = load_huggingface_model(
                    model_id,
                    model_class_name,
                    self.quantization_config,
                    torch.bfloat16
                )

                self.loaded_models[model_name] = {
                    'model': model,
                    'tokenizer': tokenizer
                }
                print(f"✅ {model_name} loaded successfully!")

            cached = self.loaded_models[model_name]
            model = cached['model']
            tokenizer = cached['tokenizer']

            messages = [
                {"role": "system", "content": "You are a helpful assistant that generates realistic datasets."},
                {"role": "user", "content": prompt}
            ]

            # Follow the model's own device instead of assuming "cuda".
            inputs = self._encode_chat(tokenizer, messages).to(model.device)

            generation_args = {
                "max_new_tokens": 4000,
                "pad_token_id": tokenizer.eos_token_id,
            }
            if temperature is not None and temperature > 0:
                generation_args.update(do_sample=True, temperature=temperature)
            else:
                # Sampling with temperature 0 is rejected by generate(); use greedy decoding.
                generation_args["do_sample"] = False

            with torch.no_grad():
                outputs = model.generate(**inputs, **generation_args)

            # Decode only the newly generated tokens. Splitting the full text on
            # the word "assistant" truncates answers that contain that word.
            prompt_length = inputs["input_ids"].shape[-1]
            return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

        except Exception as e:
            # Clean up on error
            if model_name in self.loaded_models:
                del self.loaded_models[model_name]
                gc.collect()
                torch.cuda.empty_cache()
            raise RuntimeError(f"HuggingFace model error: {str(e)}") from e

    def _query_commercial(self, model_name: str, prompt: str, temperature: float) -> str:
        """Query a commercial model through its OpenAI-compatible client.

        Raises:
            RuntimeError: when the provider has no configured client or the
                API call fails.
        """
        model_info = COMMERCIAL_MODELS[model_name]
        provider = model_info["provider"]
        model_id = model_info["model_id"]

        client = clients.get(provider)
        if client is None:
            raise RuntimeError(
                f"Commercial API error: no API key configured for provider '{provider}'"
            )

        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that generates realistic datasets."},
                    {"role": "user", "content": prompt}
                ],
                # gpt-5-mini is always called with temperature 1.0.
                temperature=temperature if model_id != "gpt-5-mini" else 1.0
            )
        except Exception as e:
            raise RuntimeError(f"Commercial API error: {str(e)}") from e

        return response.choices[0].message.content

    @staticmethod
    def _scan_json_objects(text: str) -> List[Dict]:
        """Extract every top-level JSON object found anywhere in *text*.

        Handles JSON arrays, pretty-printed (multi-line) objects and output
        wrapped in markdown code fences. Text that fails to decode is skipped.
        """
        decoder = json.JSONDecoder()
        found = []
        position = 0
        while True:
            start = text.find('{', position)
            if start == -1:
                return found
            try:
                value, end = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                position = start + 1
                continue
            if isinstance(value, dict):
                found.append(value)
            position = end

    def _parse_jsonl_response(self, response: str) -> List[Dict]:
        """Extract the JSON object records from a model response.

        Strict JSONL (one object per line, optional trailing comma) is tried
        first. If that yields nothing, the whole response is scanned for JSON
        objects so that arrays, fenced blocks and pretty-printed output are
        still usable.
        """
        records = []
        if not response:
            return records

        for raw_line in response.strip().split('\n'):
            line = raw_line.strip().rstrip(',')
            # Skip non-JSON lines
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(record, dict):
                records.append(record)

        if records:
            return records
        return self._scan_json_objects(response)

    def unload_model(self, model_name: str):
        """Unload a cached HuggingFace model to free memory (no-op if not loaded)."""
        if model_name in self.loaded_models:
            del self.loaded_models[model_name]
            gc.collect()
            torch.cuda.empty_cache()
            print(f"✅ {model_name} unloaded from memory")

    def get_memory_usage(self) -> str:
        """Return a one-line summary of allocated/reserved GPU memory in GB."""
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3
            reserved = torch.cuda.memory_reserved() / 1024**3
            return f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved"
        return "GPU not available"

# Initialize dataset generator
# Module-level singleton. SchemaManager and QualityScorer refer to this global by name
# for local-model queries, so they share its `loaded_models` cache.
dataset_generator = DatasetGenerator()
print("✅ Dataset Generation Module loaded!")


In [None]:
# Quality Scoring Module
class QualityScorer:
    """Evaluates dataset quality using separate LLM models.

    Instance state:
        quality_rules: rubric text built by `extract_quality_rules`; scoring
            returns 0 for every record until it has been set.
        scoring_model: initialised to None and not used by the methods below.
    """

    # Explicit score formats, most specific first. The first pattern tolerates
    # markdown emphasis around the label or number ("**Score:** 85", "Score: **85**").
    _SCORE_PATTERNS = [
        r'Score\**\s*[:=]\s*\**\s*(\d+)',
        r'(\d+)/100',
        r'(\d+)\s*points',
        r'(\d+)\s*out of 100'
    ]

    def __init__(self):
        self.quality_rules = None
        self.scoring_model = None

    def extract_quality_rules(self, original_prompt: str, schema_text: str) -> str:
        """Build, store and return the scoring rubric for a dataset.

        The rubric is a fixed five-criterion template with the schema and the
        original business-case prompt appended.
        """
        rules = f"""Quality Assessment Rules for Dataset:

1. **Schema Compliance (25 points)**
   - All required fields from schema are present
   - Data types match schema specifications
   - No missing values in critical fields

2. **Uniqueness (20 points)**
   - No duplicate records
   - Diverse values across records
   - Avoid repetitive patterns

3. **Relevance to Business Case (25 points)**
   - Data aligns with business context
   - Realistic scenarios and values
   - Appropriate level of detail

4. **Realism and Coherence (20 points)**
   - Values are realistic and plausible
   - Internal consistency within records
   - Logical relationships between fields

5. **Diversity (10 points)**
   - Varied values across the dataset
   - Different scenarios represented
   - Balanced distribution where appropriate

Schema Requirements:
{schema_text}

Original Business Case Context:
{original_prompt}

Score each record from 0-100 based on these criteria."""

        self.quality_rules = rules
        return rules

    def score_single_record(self, record: Dict, model_name: str, temperature: float = 0.3) -> int:
        """Score a single dataset record (0-100).

        Returns 0 when no rubric has been set or when scoring fails; failures
        are printed rather than raised so one bad record cannot stop a batch.
        """
        if not self.quality_rules:
            return 0

        try:
            prompt = f"""{self.quality_rules}

Record to evaluate:
{json.dumps(record, indent=2)}

Provide a score from 0-100 and brief explanation. Format: "Score: XX - Explanation" """

            response = self._query_scoring_model(model_name, prompt, temperature)
            return self._extract_score_from_response(response)

        except Exception as e:
            print(f"Error scoring record: {e}")
            return 0

    def score_dataset(self, dataset: List[Dict], model_name: str, temperature: float = 0.3) -> Tuple[List[int], Dict[str, Any]]:
        """Score every record and summarise the results.

        Returns:
            (scores, stats): `scores` is aligned with `dataset`; `stats` holds
            the average/min/max plus counts and percentages per quality band
            (excellent >= 90, good 70-89, fair 50-69, poor < 50). An empty
            dataset returns `([], {})`.
        """
        if not dataset:
            return [], {}

        scores = []
        total = len(dataset)

        print(f"🔄 Scoring {total} records with {model_name}...")

        for i, record in enumerate(dataset):
            scores.append(self.score_single_record(record, model_name, temperature))

            if (i + 1) % 10 == 0:
                print(f"   Scored {i + 1}/{total} records...")

        # Count quality levels
        excellent = sum(1 for s in scores if s >= 90)
        good = sum(1 for s in scores if 70 <= s < 90)
        fair = sum(1 for s in scores if 50 <= s < 70)
        poor = sum(1 for s in scores if s < 50)

        stats = {
            'total_records': total,
            'average_score': round(sum(scores) / len(scores), 2),
            'min_score': min(scores),
            'max_score': max(scores),
            'excellent_count': excellent,
            'good_count': good,
            'fair_count': fair,
            'poor_count': poor,
            'excellent_pct': round(excellent / total * 100, 1),
            'good_pct': round(good / total * 100, 1),
            'fair_pct': round(fair / total * 100, 1),
            'poor_pct': round(poor / total * 100, 1)
        }

        return scores, stats

    def generate_quality_report(self, scores: List[int], dataset: List[Dict],
                             flagged_threshold: int = 70) -> Dict[str, Any]:
        """Generate a quality report for already-scored records.

        Records scoring below `flagged_threshold` are flagged; only the first
        ten flagged records are included in the report.

        Returns:
            The report dict, or `{"error": "No data to analyze"}` when either
            input is empty.
        """
        if not scores or not dataset:
            return {"error": "No data to analyze"}

        # Find flagged records (low quality)
        flagged_records = [
            {'index': i, 'score': score, 'record': record}
            for i, (record, score) in enumerate(zip(dataset, scores))
            if score < flagged_threshold
        ]

        # Quality distribution
        score_ranges = {
            '90-100': sum(1 for s in scores if s >= 90),
            '80-89': sum(1 for s in scores if 80 <= s < 90),
            '70-79': sum(1 for s in scores if 70 <= s < 80),
            '60-69': sum(1 for s in scores if 60 <= s < 70),
            '50-59': sum(1 for s in scores if 50 <= s < 60),
            '0-49': sum(1 for s in scores if s < 50)
        }

        return {
            'total_records': len(dataset),
            'average_score': round(sum(scores) / len(scores), 2),
            'flagged_count': len(flagged_records),
            'flagged_percentage': round(len(flagged_records) / len(dataset) * 100, 1),
            'score_distribution': score_ranges,
            'flagged_records': flagged_records[:10],  # Limit to first 10 for display
            'recommendations': self._generate_recommendations(scores, flagged_records)
        }

    def _query_scoring_model(self, model_name: str, prompt: str, temperature: float) -> str:
        """Query the scoring model through the shared `dataset_generator` backends.

        Raises:
            ValueError: when `model_name` is in neither model table.
        """
        if model_name in HUGGINGFACE_MODELS:
            return dataset_generator._query_huggingface(model_name, prompt, temperature)
        if model_name in COMMERCIAL_MODELS:
            return dataset_generator._query_commercial(model_name, prompt, temperature)
        raise ValueError(f"Unknown scoring model: {model_name}")

    def _extract_score_from_response(self, response: str) -> int:
        """Extract the numeric score from a model response, clamped to 0-100.

        Explicit formats ("Score: 85", "85/100", "85 points", "85 out of 100")
        are tried in order; failing that the first number in the text is used,
        and 50 is returned when the response contains no number at all.
        """
        for pattern in self._SCORE_PATTERNS:
            match = re.search(pattern, response, re.IGNORECASE)
            if match:
                return max(0, min(100, int(match.group(1))))  # Clamp between 0-100

        # If no pattern found, fall back to the first number in the response
        first_number = re.search(r'\d+', response)
        if first_number:
            return max(0, min(100, int(first_number.group())))

        return 50  # Default score if no number found

    def _generate_recommendations(self, scores: List[int], flagged_records: List[Dict]) -> List[str]:
        """Turn score statistics into human-readable recommendations.

        Returns an empty list when there are no scores to analyse.
        """
        if not scores:
            return []

        recommendations = []
        avg_score = sum(scores) / len(scores)

        if avg_score < 70:
            recommendations.append("Consider regenerating the dataset with a different model or parameters")

        if len(flagged_records) > len(scores) * 0.3:
            recommendations.append("High number of low-quality records - review generation prompt")

        if max(scores) - min(scores) > 50:
            recommendations.append("High variance in quality - consider more consistent generation approach")

        if avg_score >= 85:
            recommendations.append("Excellent dataset quality - ready for use")
        elif avg_score >= 70:
            recommendations.append("Good dataset quality - minor improvements possible")
        else:
            recommendations.append("Dataset needs improvement - consider regenerating")

        return recommendations

# Initialize quality scorer
# Module-level singleton; extract_quality_rules() must be called on it before scoring,
# otherwise score_single_record() returns 0 for every record.
quality_scorer = QualityScorer()


In [None]:
# Synonym Permutation Module
class SynonymPermutator:
    """Handles synonym replacement to increase dataset diversity.

    Synonyms are looked up in NLTK WordNet and memoised per lower-cased word
    in ``synonym_cache``.
    """

    # Maximum number of synonyms kept per word
    MAX_SYNONYMS = 5

    def __init__(self):
        self.synonym_cache = {}  # Cache for synonyms to avoid repeated lookups

    @staticmethod
    def _is_suitable_synonym(word: str, candidate: str) -> bool:
        """Return True when ``candidate`` may replace ``word``.

        A candidate is rejected when it is shorter than three characters,
        equal to the word, or ends in "ing"/"ed" while the word does not
        (so the replacement keeps the same rough grammatical form).
        """
        if len(candidate) < 3 or candidate == word:
            return False
        for suffix in ('ing', 'ed'):
            if candidate.endswith(suffix) and not word.endswith(suffix):
                return False
        return True

    def get_synonyms(self, word: str) -> List[str]:
        """Get synonyms for a word using NLTK WordNet.

        Args:
            word: Word to look up; matching is case-insensitive.

        Returns:
            Up to ``MAX_SYNONYMS`` lower-case synonyms in WordNet order
            (possibly empty). The result is cached per lower-cased word.
        """
        key = word.lower()
        if key in self.synonym_cache:
            return self.synonym_cache[key]

        candidates = []
        try:
            for synset in wordnet.synsets(key):
                for lemma in synset.lemmas():
                    candidates.append(lemma.name().replace('_', ' ').lower())
        except Exception:
            # Deliberate best effort: if the WordNet lookup fails (for
            # example because the corpus is unavailable), carry on with
            # whatever was collected so the caller leaves the word as is.
            pass

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # truncation below is deterministic (a set would not be).
        synonyms = [
            candidate for candidate in dict.fromkeys(candidates)
            if self._is_suitable_synonym(key, candidate)
        ][:self.MAX_SYNONYMS]

        self.synonym_cache[key] = synonyms
        return synonyms

    def identify_text_fields(self, dataset: List[Dict]) -> List[str]:
        """Auto-detect text fields suitable for synonym permutation.

        Only the first record is inspected. A field qualifies when its value
        is a string longer than three characters that is neither purely
        numeric nor made up only of upper-case letters, digits, "_" and "-"
        (which is treated as an ID or code).

        Args:
            dataset: List of record dicts.

        Returns:
            Names of the qualifying fields, in record order; empty for an
            empty dataset.
        """
        if not dataset:
            return []

        return [
            key for key, value in dataset[0].items()
            if isinstance(value, str)
            and len(value) > 3
            and not re.match(r'^[A-Z0-9_\-]+$', value)
            and not value.isdigit()
        ]

    def permute_with_synonyms(self, dataset: List[Dict], fields_to_permute: List[str],
                            permutation_rate: float = 0.3) -> Tuple[List[Dict], Dict[str, int]]:
        """Replace words with synonyms in specified fields.

        Args:
            dataset: List of record dicts; the input records are not mutated.
            fields_to_permute: Names of the string fields to rewrite.
            permutation_rate: Fraction of the words in each text to try to
                replace.

        Returns:
            ``(permuted_dataset, replacement_stats)`` where the stats map
            each field name to the number of records whose text changed.
            When the dataset or the field list is empty, the dataset is
            returned unchanged together with an empty dict.
        """
        if not dataset or not fields_to_permute:
            return dataset, {}

        permuted_dataset = []
        replacement_stats = {field: 0 for field in fields_to_permute}

        for record in dataset:
            permuted_record = record.copy()

            for field in fields_to_permute:
                original_text = record.get(field)
                if not isinstance(original_text, str):
                    continue

                permuted_text = self._permute_text(original_text, permutation_rate)
                permuted_record[field] = permuted_text

                # Count records whose text actually changed
                if original_text != permuted_text:
                    replacement_stats[field] += 1

            permuted_dataset.append(permuted_record)

        return permuted_dataset, replacement_stats

    def _swap_token(self, token: str) -> str:
        """Return ``token`` with its core word replaced by a random synonym.

        Leading and trailing punctuation is kept, and upper-case or
        title-case spelling of the core word is carried over to the synonym.
        The token is returned unchanged when the core word has three or
        fewer characters, contains punctuation, or has no known synonyms.
        """
        lead, core, trail = re.fullmatch(r'(\W*)(.*?)(\W*)', token, re.DOTALL).groups()

        # Only replace meaningful, punctuation-free words
        if len(core) <= 3 or not re.fullmatch(r'\w+', core):
            return token

        synonyms = self.get_synonyms(core.lower())
        if not synonyms:
            return token

        chosen_synonym = random.choice(synonyms)
        if core.isupper():
            chosen_synonym = chosen_synonym.upper()
        elif core.istitle():
            chosen_synonym = chosen_synonym.title()

        return f"{lead}{chosen_synonym}{trail}"

    def _permute_text(self, text: str, permutation_rate: float) -> str:
        """Permute words in text with synonyms.

        Args:
            text: Text to rewrite.
            permutation_rate: Fraction of the words to try to replace. A
                rate of zero or less leaves the text untouched; any positive
                rate attempts at least one word.

        Returns:
            The rewritten text with words joined by single spaces, or the
            original text (whitespace intact) when nothing was replaced.
        """
        if permutation_rate <= 0:
            return text

        words = text.split()
        if len(words) < 2:  # Skip very short texts
            return text

        num_replacements = max(1, int(len(words) * permutation_rate))
        positions = random.sample(range(len(words)), min(num_replacements, len(words)))

        permuted_words = list(words)
        for position in positions:
            permuted_words[position] = self._swap_token(words[position])

        # Return the input untouched when no word changed, so that collapsed
        # whitespace alone is never reported as a replacement.
        if permuted_words == words:
            return text

        return ' '.join(permuted_words)

    def get_permutation_preview(self, text: str, permutation_rate: float = 0.3) -> str:
        """Get a preview of how text would look after permutation."""
        return self._permute_text(text, permutation_rate)

    def clear_cache(self):
        """Clear the synonym cache to free memory."""
        self.synonym_cache.clear()

# Initialize synonym permutator: one shared instance, so its synonym cache persists across calls
synonym_permutator = SynonymPermutator()


In [None]:
# Output & Export Module
class DatasetExporter:
    """Handles dataset export to multiple formats."""

    # File formats save_dataset knows how to write
    SUPPORTED_FORMATS = (".csv", ".tsv", ".json", ".jsonl")

    def __init__(self):
        self.current_dataset = None
        self.current_scores = None
        self.export_history = []  # One dict per successful save_dataset call

    def save_dataset(self, records: List[Dict], file_format: str, filename: str,
                     output_dir: str = "/content") -> str:
        """Write the records to a uniquely named file and return its path.

        Args:
            records: List of record dicts to export.
            file_format: One of ``SUPPORTED_FORMATS`` (including the dot).
            filename: Desired file name; a trailing ``file_format`` extension
                is optional and any directory part is discarded.
            output_dir: Directory to write into (defaults to ``/content``).

        Returns:
            Path of the written file, or ``None`` when there are no records,
            the format is unsupported, or writing failed (the error is
            printed, not raised).
        """
        if not records:
            return None  # Return None to indicate no file

        try:
            if file_format not in self.SUPPORTED_FORMATS:
                return None

            # The filename is user-supplied (UI textbox): keep only its last
            # path component so it cannot point outside output_dir.
            base_name = os.path.basename(filename)

            # Strip the extension only from the end; replacing it anywhere in
            # the name would mangle e.g. "data.jsonl" when exporting ".json".
            if base_name.endswith(file_format):
                base_name = base_name[:-len(file_format)]
            if not base_name:
                base_name = "dataset"

            # Generate unique filename to avoid caching issues
            timestamp = int(time.time())
            unique_filename = f"{base_name}_{timestamp}{file_format}"

            os.makedirs(output_dir, exist_ok=True)
            file_path = os.path.join(output_dir, unique_filename)

            # Create DataFrame
            df = pd.DataFrame(records)

            if file_format == ".csv":
                df.to_csv(file_path, index=False)
            elif file_format == ".tsv":
                df.to_csv(file_path, sep="\t", index=False)
            elif file_format == ".json":
                df.to_json(file_path, orient="records", indent=2)
            else:  # ".jsonl": one JSON object per line
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.writelines(json.dumps(record) + '\n' for record in records)

            self.export_history.append({
                'file_path': file_path,
                'file_format': file_format,
                'record_count': len(records),
                'timestamp': timestamp,
            })

            print(f"File generated and saved at: {file_path}")
            return file_path

        except Exception as e:
            print(f"Error saving dataset: {str(e)}")
            return None

    def save_with_scores(self, records: List[Dict], scores: List[int], file_format: str, filename: str,
                         output_dir: str = "/content") -> str:
        """Save the dataset with a ``quality_score`` field added to each record.

        Records beyond the end of ``scores`` receive a score of 0. The input
        records are not mutated.

        Args:
            records: List of record dicts to export.
            scores: Quality scores, aligned with ``records`` by position.
            file_format: See ``save_dataset``.
            filename: See ``save_dataset``.
            output_dir: See ``save_dataset``.

        Returns:
            Path of the written file, or ``None`` when records or scores are
            empty or saving failed.
        """
        if not records or not scores:
            return None

        try:
            # Pad with zeros so every record gets a score
            padded_scores = list(scores) + [0] * (len(records) - len(scores))
            records_with_scores = [
                {**record, 'quality_score': score}
                for record, score in zip(records, padded_scores)
            ]

            return self.save_dataset(records_with_scores, file_format, filename, output_dir)

        except Exception as e:
            print(f"Error saving dataset with scores: {str(e)}")
            return None

    @staticmethod
    def _score_statistics(scores: List[int]) -> Dict[str, float]:
        """Return mean, median and sample standard deviation rounded to 2 places.

        The standard deviation of a single score is reported as 0.0 rather
        than NaN so the statistics stay valid JSON.
        """
        series = pd.Series(scores, dtype="float64")
        spread = float(series.std()) if len(series) > 1 else 0.0
        return {
            'mean': round(sum(scores) / len(scores), 2),
            'median': round(float(series.median()), 2),
            'std': round(spread, 2),
        }

    def export_quality_report(self, scores: List[int], dataset: List[Dict], filename: str) -> str:
        """Export quality report as JSON.

        The report produced by ``quality_scorer.generate_quality_report`` is
        extended with an export timestamp, the dataset size and score
        statistics, then written to ``filename``.

        Args:
            scores: Per-record quality scores.
            dataset: The scored records.
            filename: Path of the JSON file to write.

        Returns:
            A status message; errors are reported in the message, not raised.
        """
        try:
            if not scores or not dataset:
                return "❌ Error: No data to analyze"

            # Generate quality report
            report = quality_scorer.generate_quality_report(scores, dataset)

            report['export_timestamp'] = pd.Timestamp.now().isoformat()
            report['dataset_size'] = len(dataset)
            report['score_statistics'] = self._score_statistics(scores)

            # Save report
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(report, f, indent=2)

            return f"✅ Quality report saved to {filename}"

        except Exception as e:
            return f"❌ Error saving quality report: {str(e)}"

    def create_preview_dataframe(self, records: List[Dict], num_rows: int = 20) -> pd.DataFrame:
        """Create preview DataFrame for display.

        Returns the first ``num_rows`` rows, or an empty DataFrame when there
        are no records.
        """
        if not records:
            return pd.DataFrame()

        df = pd.DataFrame(records)
        return df.head(num_rows)

    def get_dataset_summary(self, records: List[Dict]) -> Dict[str, Any]:
        """Get summary statistics for the dataset.

        Returns:
            A dict with record and field counts, field names, per-column
            dtypes and missing-value counts, total memory usage and the
            first three records; ``{"error": ...}`` when there are no
            records.
        """
        if not records:
            return {"error": "No data available"}

        df = pd.DataFrame(records)

        summary = {
            'total_records': len(records),
            'total_fields': len(df.columns),
            'field_names': list(df.columns),
            'data_types': df.dtypes.to_dict(),
            'missing_values': df.isnull().sum().to_dict(),
            'memory_usage': df.memory_usage(deep=True).sum(),
            'sample_records': records[:3]  # First 3 records as sample
        }

        return summary

    def get_export_history(self) -> List[Dict]:
        """Get history of all exports (a shallow copy of the internal list)."""
        return self.export_history.copy()

    def clear_history(self):
        """Clear export history."""
        self.export_history.clear()

# Initialize dataset exporter: shared instance used by the Gradio callbacks for previews and file export
dataset_exporter = DatasetExporter()


In [None]:
# Global state variables shared by the Gradio callbacks defined below
current_dataset = []  # Latest generated (or permuted) list of record dicts
current_scores = []  # Per-record quality scores from the last scoring run
current_schema_text = DEFAULT_SCHEMA_TEXT  # Initial value of the schema textboxes; read by score_dataset_quality
current_business_case = "Pharmacogenomics patient data for drug response analysis"  # Default business-case text

# Gradio UI Functions
def generate_schema(business_case, schema_mode, schema_text, model_name, temperature):
    """Generate or enhance schema based on mode.

    Args:
        business_case: Free-text description of the data requirements.
        schema_mode: "LLM Generate", "LLM Enhance Manual", or anything else
            for manual entry (the schema text is used as is).
        schema_text: Manually entered schema text.
        model_name: Model to use for the LLM modes.
        temperature: Sampling temperature for the LLM modes.

    Returns:
        ``(schema, schema, schema, business_case)``: the resulting schema
        repeated once per UI output that displays it, plus the business case.
    """
    # Without this declaration the assignments below would create locals and
    # the module-level state read by score_dataset_quality would never change.
    global current_schema_text, current_business_case

    if schema_mode == "LLM Generate":
        result = schema_manager.generate_schema_with_llm(business_case, model_name, temperature)
    elif schema_mode == "LLM Enhance Manual":
        result = schema_manager.enhance_schema_with_llm(schema_text, business_case, model_name, temperature)
    else:  # Manual Entry
        result = schema_text

    current_schema_text = result
    current_business_case = business_case
    return result, result, result, business_case

def generate_dataset_ui(schema_text, business_case, model_name, temperature, num_records, examples):
    """Run dataset generation for the UI and remember the result.

    The generated records replace the module-level ``current_dataset``.

    Returns:
        ``(status, preview, count)``: the generator's status message, a
        DataFrame preview of up to 20 records, and the number of records.
    """
    global current_dataset

    generation_status, current_dataset = dataset_generator.generate_dataset(
        schema_text, business_case, model_name, temperature, num_records, examples
    )

    return (
        generation_status,
        dataset_exporter.create_preview_dataframe(current_dataset, 20),
        len(current_dataset),
    )

def apply_synonym_permutation(enable_permutation, fields_to_permute, permutation_rate):
    """Apply synonym permutation to the current dataset.

    On success the module-level ``current_dataset`` is replaced by the
    permuted records.

    Args:
        enable_permutation: Nothing is changed unless this is truthy.
        fields_to_permute: Field names to rewrite; when empty, the first two
            auto-detected text fields are used.
        permutation_rate: Percentage of words to try to replace.

    Returns:
        ``(preview, status)``. The preview is always a pandas DataFrame, on
        success and on every early-exit or error path alike, because it
        feeds a Dataframe output component.
    """
    global current_dataset

    def as_frame(records):
        # One return type for every path
        return pd.DataFrame(records) if records else pd.DataFrame()

    if not enable_permutation:
        return as_frame(current_dataset), "❌ Permutation is disabled - check the 'Enable Synonym Permutation' checkbox"

    if not current_dataset:
        return pd.DataFrame(), "❌ No dataset available - generate a dataset first"

    if not fields_to_permute:
        # Try to auto-identify fields if none are selected
        try:
            auto_fields = synonym_permutator.identify_text_fields(current_dataset)
        except Exception as e:
            return as_frame(current_dataset), f"❌ Error identifying fields: {str(e)}"

        if not auto_fields:
            return as_frame(current_dataset), "❌ No text fields found for permutation"

        fields_to_permute = auto_fields[:2]  # Use first 2 fields as default
        print(f"DEBUG: Auto-selected fields: {fields_to_permute}")

    try:
        # The rate arrives as a percentage; the permutator expects a fraction
        permuted_dataset, stats = synonym_permutator.permute_with_synonyms(
            current_dataset, fields_to_permute, permutation_rate / 100
        )

        current_dataset = permuted_dataset

        # Convert to DataFrame for proper display
        preview_df = pd.DataFrame(permuted_dataset)

        stats_text = f"✅ Permutation applied to {len(fields_to_permute)} fields. "
        stats_text += f"Replacement counts: {stats}"

        return preview_df, stats_text

    except Exception as e:
        print(f"DEBUG: Error during permutation: {str(e)}")
        return as_frame(current_dataset), f"❌ Error during permutation: {str(e)}"

def score_dataset_quality(scoring_model, scoring_temperature):
    """Score the current dataset and build the values shown on the scoring tab.

    The scores are stored in the module-level ``current_scores``.

    Returns:
        ``(status, scores_table, report)``: a status message, a DataFrame
        with one row per record (index, score, quality level) and the
        quality report. When no dataset exists, a message with an empty
        list and an empty dict is returned.
    """
    global current_dataset, current_scores

    def quality_level(score):
        # Bucket a numeric score into the label shown in the table
        if score >= 90:
            return 'Excellent'
        if score >= 70:
            return 'Good'
        if score >= 50:
            return 'Fair'
        return 'Poor'

    if not current_dataset:
        return "No dataset available for scoring", [], {}

    # Extract quality rules from the business case and the schema
    quality_scorer.extract_quality_rules(
        f"Business case: {current_business_case}", current_schema_text
    )

    # Score dataset
    current_scores, score_stats = quality_scorer.score_dataset(
        current_dataset, scoring_model, scoring_temperature
    )

    # Table of per-record results for display
    table_columns = {
        'Record_Index': range(len(current_scores)),
        'Quality_Score': current_scores,
        'Quality_Level': [quality_level(value) for value in current_scores],
    }
    scores_table = pd.DataFrame(table_columns)

    quality_report = quality_scorer.generate_quality_report(current_scores, current_dataset)

    return (
        f"✅ Scored {len(current_scores)} records. Average score: {score_stats['average_score']}",
        scores_table,
        quality_report,
    )

def export_dataset(file_format, filename, include_scores):
    """Write the current dataset to disk in the requested format.

    Quality scores are included only when requested and available.

    Returns:
        Whatever the exporter returns for the save, or a message string when
        there is no dataset or the export raised.
    """
    global current_dataset, current_scores

    if not current_dataset:
        return "No dataset to export"

    want_scores = include_scores and current_scores
    try:
        if want_scores:
            return dataset_exporter.save_with_scores(current_dataset, current_scores, file_format, filename)
        return dataset_exporter.save_dataset(current_dataset, file_format, filename)
    except Exception as e:
        return f"❌ Error exporting dataset: {str(e)}"

def get_available_fields():
    """Return the text fields of the current dataset that can be permuted.

    An empty list is returned when no dataset has been generated yet.
    """
    return synonym_permutator.identify_text_fields(current_dataset) if current_dataset else []


## Gradio Interface

In [None]:
# Create Gradio Interface
def create_gradio_interface():
    """Create the main Gradio interface with 5 tabs.

    The tabs are schema definition, dataset generation, synonym permutation,
    quality scoring and export. Every event handler is registered exactly
    once; in particular the generate button has a single click handler, with
    the refresh of the permutable-field choices chained after it.

    Returns:
        The assembled ``gr.Blocks`` interface (not yet launched).
    """

    # Combine all models for dropdowns
    all_models = list(COMMERCIAL_MODELS.keys()) + list(HUGGINGFACE_MODELS.keys())

    with gr.Blocks(title="Synthetic Dataset Generator", theme=gr.themes.Soft()) as interface:

        gr.Markdown("# Synthetic Dataset Generator with Quality Scoring")
        gr.Markdown("Generate realistic synthetic datasets using multiple LLM models with flexible schema creation, synonym permutation, and automated quality scoring.")

        # Status bar
        with gr.Row():
            gpu_status = gr.Textbox(
                label="GPU Status",
                value=dataset_generator.get_memory_usage(),
                interactive=False,
                scale=1
            )
            current_status = gr.Textbox(
                label="Current Status",
                value="Ready to generate datasets",
                interactive=False,
                scale=2
            )

        # Tab 1: Schema Definition
        with gr.Tab("📋 Schema Definition"):
            gr.Markdown("### Define your dataset schema")

            with gr.Row():
                with gr.Column(scale=2):
                    schema_mode = gr.Radio(
                        choices=["LLM Generate", "Manual Entry", "LLM Enhance Manual"],
                        value="Manual Entry",
                        label="Schema Mode"
                    )

                    business_case_input = gr.Textbox(
                        label="Business Case",
                        value=current_business_case,
                        lines=3,
                        placeholder="Describe your business case or data requirements..."
                    )

                    schema_input = gr.Textbox(
                        label="Schema Definition",
                        value=current_schema_text,
                        lines=15,
                        placeholder="Define your dataset schema here..."
                    )

                    with gr.Row():
                        schema_model = gr.Dropdown(
                            choices=all_models,
                            value=all_models[0],
                            label="Model for Schema Generation"
                        )
                        schema_temperature = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature"
                        )

                    generate_schema_btn = gr.Button("🔄 Generate/Enhance Schema", variant="primary")

                with gr.Column(scale=1):
                    schema_output = gr.Textbox(
                        label="Generated Schema",
                        lines=15,
                        interactive=False
                    )

        # Tab 2: Dataset Generation
        with gr.Tab("🚀 Dataset Generation"):
            gr.Markdown("### Generate synthetic dataset")

            with gr.Row():
                with gr.Column(scale=2):
                    generation_schema = gr.Textbox(
                        label="Schema (from Tab 1)",
                        value=current_schema_text,
                        lines=8,
                        interactive=False
                    )

                    generation_business_case = gr.Textbox(
                        label="Business Case",
                        value=current_business_case,
                        lines=2
                    )

                    examples_input = gr.Textbox(
                        label="Few-shot Examples (JSON format)",
                        lines=5,
                        placeholder='[{"instruction": "example", "response": "example"}]',
                        value=""
                    )

                    with gr.Row():
                        generation_model = gr.Dropdown(
                            choices=all_models,
                            value=all_models[0],
                            label="Generation Model"
                        )
                        generation_temperature = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            value=0.7,
                            step=0.1,
                            label="Temperature"
                        )
                        num_records = gr.Number(
                            value=50,
                            minimum=11,
                            maximum=1000,
                            step=1,
                            label="Number of Records"
                        )

                    generate_dataset_btn = gr.Button("🚀 Generate Dataset", variant="primary", size="lg")

                with gr.Column(scale=1):
                    generation_status = gr.Textbox(
                        label="Generation Status",
                        lines=3,
                        interactive=False
                    )

                    dataset_preview = gr.Dataframe(
                        label="Dataset Preview (First 20 rows)",
                        interactive=False,
                        wrap=True
                    )

                    record_count = gr.Number(
                        label="Total Records Generated",
                        interactive=False
                    )

        # Tab 3: Synonym Permutation
        with gr.Tab("🔄 Synonym Permutation"):
            gr.Markdown("### Add diversity with synonym replacement")

            with gr.Row():
                with gr.Column(scale=2):
                    enable_permutation = gr.Checkbox(
                        label="Enable Synonym Permutation",
                        value=False
                    )

                    fields_to_permute = gr.CheckboxGroup(
                        label="Fields to Permute",
                        choices=[],
                        value=[]
                    )

                    permutation_rate = gr.Slider(
                        minimum=0,
                        maximum=50,
                        value=20,
                        step=5,
                        label="Permutation Rate (%)"
                    )

                    apply_permutation_btn = gr.Button("🔄 Apply Permutation", variant="secondary")

                with gr.Column(scale=1):
                    permutation_status = gr.Textbox(
                        label="Permutation Status",
                        lines=2,
                        interactive=False
                    )

                permuted_preview = gr.Dataframe(
                    label="Permuted Dataset Preview",
                    interactive=False,
                    wrap=True,
                    datatype=["str"] * 10
                )

        # Tab 4: Quality Scoring
        with gr.Tab("📊 Quality Scoring"):
            gr.Markdown("### Evaluate dataset quality")

            with gr.Row():
                with gr.Column(scale=2):
                    scoring_model = gr.Dropdown(
                        choices=all_models,
                        value=all_models[0],
                        label="Scoring Model"
                    )

                    scoring_temperature = gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        value=0.3,
                        step=0.1,
                        label="Temperature"
                    )

                    score_dataset_btn = gr.Button("📊 Score Dataset Quality", variant="primary")

                with gr.Column(scale=1):
                    scoring_status = gr.Textbox(
                        label="Scoring Status",
                        lines=2,
                        interactive=False
                    )

                    scores_dataframe = gr.Dataframe(
                        label="Quality Scores",
                        interactive=False
                    )

                    quality_report = gr.JSON(
                        label="Quality Report"
                    )

        # Tab 5: Export
        with gr.Tab("💾 Export"):
            gr.Markdown("### Export your dataset")

            with gr.Row():
                with gr.Column(scale=2):
                    file_format = gr.Dropdown(
                        choices=OUTPUT_FORMATS,
                        value=".csv",
                        label="File Format"
                    )

                    filename = gr.Textbox(
                        label="Filename",
                        value="synthetic_dataset",
                        placeholder="Enter filename (extension added automatically)"
                    )

                    include_scores = gr.Checkbox(
                        label="Include Quality Scores",
                        value=False
                    )

                    export_btn = gr.Button("💾 Export Dataset", variant="primary")

                with gr.Column(scale=1):
                    # Use gr.File component for download
                    download_file = gr.File(
                        label="Download your file here",
                        interactive=False,
                        visible=True
                    )

                    export_status = gr.Textbox(
                        label="Export Status",
                        lines=3,
                        interactive=False
                    )

        # Handlers that only exist inside the interface
        def export_dataset_with_file(file_format, filename, include_scores):
            """Export dataset with file download.

            Returns ``(file_path_or_None, status_message)`` for the download
            component and the export status box.
            """
            global current_dataset, current_scores

            if not current_dataset:
                return None, "❌ No dataset to export"

            try:
                if include_scores and current_scores:
                    file_path = dataset_exporter.save_with_scores(current_dataset, current_scores, file_format, filename)
                else:
                    file_path = dataset_exporter.save_dataset(current_dataset, file_format, filename)

                if file_path:
                    return file_path, f"✅ Dataset ready for download: {filename}"
                else:
                    return None, "❌ Error creating file"

            except Exception as e:
                return None, f"❌ Error exporting dataset: {str(e)}"

        def update_field_choices():
            """Refresh the "Fields to Permute" choices from the current dataset."""
            global current_dataset

            if not current_dataset:
                print("DEBUG: No current dataset available")
                return gr.CheckboxGroup(choices=[], value=[])

            try:
                fields = synonym_permutator.identify_text_fields(current_dataset)
                print(f"DEBUG: Available fields for permutation: {fields}")

                if not fields:
                    print("DEBUG: No text fields identified")
                    return gr.CheckboxGroup(choices=[], value=[])

                return gr.CheckboxGroup(choices=fields, value=[])
            except Exception as e:
                print(f"DEBUG: Error identifying fields: {str(e)}")
                return gr.CheckboxGroup(choices=[], value=[])

        # Event handlers
        generate_schema_btn.click(
            generate_schema,
            inputs=[business_case_input, schema_mode, schema_input, schema_model, schema_temperature],
            outputs=[schema_output, schema_input, generation_schema, generation_business_case]
        )

        # Single registration: attaching generate_dataset_ui to this button a
        # second time would run the whole generation twice for each click.
        generate_dataset_btn.click(
            generate_dataset_ui,
            inputs=[generation_schema, generation_business_case, generation_model, generation_temperature, num_records, examples_input],
            outputs=[generation_status, dataset_preview, record_count]
        ).then(
            update_field_choices,  # Runs after dataset generation
            outputs=[fields_to_permute]
        )

        apply_permutation_btn.click(
            apply_synonym_permutation,
            inputs=[enable_permutation, fields_to_permute, permutation_rate],
            outputs=[permuted_preview, permutation_status]
        )

        score_dataset_btn.click(
            score_dataset_quality,
            inputs=[scoring_model, scoring_temperature],
            outputs=[scoring_status, scores_dataframe, quality_report]
        )

        export_btn.click(
            export_dataset_with_file,
            inputs=[file_format, filename, include_scores],
            outputs=[download_file, export_status]
        )

    return interface


In [None]:
# Launch the Gradio Interface
# NOTE(review): share=True asks Gradio for a public share link; confirm that public access is intended.
interface = create_gradio_interface()
interface.launch(debug=True, share=True)


## Example Workflow: Pharmacogenomics Dataset

This section demonstrates the complete pipeline using a pharmacogenomics (PGx) example.

### Step 1: Schema Definition
The default schema is already configured for pharmacogenomics data, including:
- Patient demographics (age, gender, ethnicity)
- Genetic variants (CYP2D6, CYP2C19, etc.)
- Drug information (name, dosage)
- Clinical outcomes (efficacy, adverse reactions)
- Metabolizer status

### Step 2: Dataset Generation
1. Select a model (recommended: Llama 3.1 8B for quality, Llama 3.2 3B for speed)
2. Set temperature (0.7 for balanced creativity/consistency)
3. Specify number of records (50-100 for testing, 500+ for production)
4. Add few-shot examples if needed

### Step 3: Synonym Permutation
1. Enable permutation checkbox
2. Select text fields (e.g., drug_name, adverse_reaction)
3. Set permutation rate (20-30% recommended)
4. Apply to increase diversity

### Step 4: Quality Scoring
1. Select scoring model (can be different from generation model)
2. Use lower temperature (0.3) for consistent scoring
3. Review quality report and flagged records
4. Regenerate if quality is insufficient

### Step 5: Export
1. Choose format (CSV for analysis, JSON for APIs)
2. Include quality scores if needed
3. Download your dataset


In [None]:
# Testing and Validation Functions
def test_schema_generation():
    """Test schema generation functionality.

    Parses a small manual schema and, when an OpenAI client is configured,
    generates a schema with the same commercial model that
    test_dataset_generation uses. Failures surface as exceptions.

    Returns:
        True when every step ran without raising.
    """
    print("🧪 Testing Schema Generation...")

    # Test manual schema parsing
    test_schema = """1. patient_id (TEXT) - Unique patient identifier, example: PGX_001
2. age (INT) - Patient age in years, example: 45
3. drug_name (TEXT) - Medication name, example: Warfarin"""

    parsed = schema_manager.parse_manual_schema(test_schema)
    print(f"✅ Manual schema parsing: {len(parsed)} fields")

    # Test commercial API schema generation
    if "openai" in clients:
        print("🔄 Testing OpenAI schema generation...")
        result = schema_manager.generate_schema_with_llm(
            "Generate a dataset for e-commerce customer analysis",
            "GPT-5 Mini",  # This branch is guarded by the OpenAI client, so use the OpenAI model
            1
        )
        print(f"✅ OpenAI schema generation: {len(result)} characters")

    return True

def test_dataset_generation():
    """Smoke-test dataset generation with a five-record sample.

    Generation only runs when an OpenAI client is configured; otherwise the
    function just reports that the test started. Failures surface as
    exceptions.

    Returns:
        True when every step ran without raising.
    """
    print("🧪 Testing Dataset Generation...")

    # Minimal three-field schema, one field per line
    schema_lines = [
        "1. name (TEXT) - Customer name, example: John Doe",
        "2. age (INT) - Customer age, example: 30",
        "3. purchase_amount (FLOAT) - Purchase amount, example: 99.99",
    ]
    sample_schema = "\n".join(schema_lines)
    sample_case = "Generate customer purchase data for a retail store"

    # Nothing further to exercise without a commercial API client
    if "openai" not in clients:
        return True

    print("🔄 Testing OpenAI dataset generation...")
    outcome, generated = dataset_generator.generate_dataset(
        sample_schema, sample_case, "GPT-5 Mini", 1, 5, ""
    )
    print(f"✅ OpenAI generation: {outcome}")
    if generated:
        print(f"   Generated {len(generated)} records")

    return True

def test_synonym_permutation():
    """Smoke-test the synonym lookup and the text permutation preview.

    Results are printed rather than asserted; failures surface as exceptions.

    Returns:
        True when every step ran without raising.
    """
    print("🧪 Testing Synonym Permutation...")

    # Synonym lookup for a single word
    probe_word = "excellent"
    found = synonym_permutator.get_synonyms(probe_word)
    print(f"✅ Synonym lookup for '{probe_word}': {len(found)} synonyms found")

    # Permutation preview of a whole sentence at a 30% rate
    sentence = "The patient showed excellent response to treatment"
    rewritten = synonym_permutator.get_permutation_preview(sentence, 0.3)
    print(f"✅ Text permutation: '{sentence}' -> '{rewritten}'")

    return True

def test_quality_scoring():
    """Test quality scoring functionality.

    Only the quality-rule extraction step is exercised; no record is scored
    here. Failures surface as exceptions.

    Returns:
        True when the step ran without raising.
    """
    print("🧪 Testing Quality Scoring...")

    # Test quality rules extraction
    rules = quality_scorer.extract_quality_rules(
        "Test business case",
        "1. patient_id (TEXT) - Patient ID, example: P001"
    )
    print(f"✅ Quality rules extraction: {len(rules)} characters")

    return True

def run_integration_test():
    """Run the four smoke tests in order and report the overall outcome.

    Each test is followed by a blank line of output. The first exception
    stops the run and is reported.

    Returns:
        True when every test finished without raising, False otherwise.
    """
    print("🚀 Running Integration Tests...")
    print("=" * 50)

    # Lambdas defer the name lookup until each step actually runs
    steps = (
        lambda: test_schema_generation(),
        lambda: test_dataset_generation(),
        lambda: test_synonym_permutation(),
        lambda: test_quality_scoring(),
    )

    try:
        for step in steps:
            step()
            print()

        print("✅ All integration tests passed!")
        return True

    except Exception as e:
        print(f"❌ Integration test failed: {str(e)}")
        return False

# Run integration tests when this cell executes; the True/False result is not stored
run_integration_test()
