In [None]:
%pip install -q bitsandbytes>=0.43.1 accelerate transformers torch sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import pandas as pd
import random
from io import StringIO
from openai import OpenAI
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from dotenv import load_dotenv
import os

load_dotenv(override=True)
openai = OpenAI()

LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

try:
    tokenizer = AutoTokenizer.from_pretrained(LLAMA)
    tokenizer.pad_token = tokenizer.eos_token
    
    if torch.cuda.is_available():
        print("🚀 CUDA available - loading with quantization...")
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )
        model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="auto", quantization_config=quant_config)
    else:
        print("💻 CPU mode - loading without quantization...")
        model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="cpu", torch_dtype=torch.float16)
    
    print("Llama model loaded successfully!")
except Exception as e:
    print(f"Llama model failed to load: {e}")
    print("Trying alternative loading method...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(LLAMA)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(LLAMA, device_map="cpu", torch_dtype=torch.float32)
        print("Llama model loaded in CPU mode!")
    except Exception as e2:
        print(f"Llama model completely failed: {e2}")
        print("Will use OpenAI only mode.")
        model = None
        tokenizer = None


💻 CPU mode - loading without quantization...


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Llama model failed to load: <ContextVar name='shell_parent' at 0x1061d0220>
Trying alternative loading method...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Exception ignored in: <function tqdm.__del__ at 0x126b14720>
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x126b14720>
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/opt/miniconda3/lib/python3.13/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'
Exception ignored in: <function tqdm.__del__ at 0x126b14720>
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.13/site-packages/tqdm/std.py", line 1148, in __del__
    self.close(

Llama model completely failed: <ContextVar name='shell_parent' at 0x1061d0220>
Will use OpenAI only mode.


In [7]:
def generate_with_openai(dataset_type, num_records, region):
    prompts = {
        "employees": f"Generate {num_records} synthetic employee records with {region} addresses. Include: employee_id, first_name, last_name, email, phone, department, salary, hire_date, address, city, state, country.",
        "customers": f"Generate {num_records} synthetic customer records with {region} addresses. Include: customer_id, first_name, last_name, email, phone, company, address, city, state, country, registration_date.",
        "products": f"Generate {num_records} synthetic product records. Include: product_id, name, category, price, description, brand, stock_quantity, supplier, created_date.",
        "transactions": f"Generate {num_records} synthetic transaction records. Include: transaction_id, customer_id, product_id, amount, quantity, transaction_date, payment_method, status."
    }
    
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a data generation expert. Create realistic, diverse synthetic data in CSV format."},
            {"role": "user", "content": prompts[dataset_type]}
        ]
    )
    
    return clean_csv_response(response.choices[0].message.content)

def generate_with_llama(dataset_type, num_records, region):
    if model is None or tokenizer is None:
        return "❌ Llama model not available. Please use OpenAI option."
    
    prompts = {
        "employees": f"Create {num_records} employee records with {region} addresses: employee_id, first_name, last_name, email, phone, department, salary, hire_date, address, city, state, country. Format as CSV.",
        "customers": f"Create {num_records} customer records with {region} addresses: customer_id, first_name, last_name, email, phone, company, address, city, state, country, registration_date. Format as CSV.",
        "products": f"Create {num_records} product records: product_id, name, category, price, description, brand, stock_quantity, supplier, created_date. Format as CSV.",
        "transactions": f"Create {num_records} transaction records: transaction_id, customer_id, product_id, amount, quantity, transaction_date, payment_method, status. Format as CSV."
    }
    
    try:
        inputs = tokenizer(prompts[dataset_type], return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2048,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )
        
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return clean_csv_response(response)
    except Exception as e:
        return f"❌ Error generating with Llama: {str(e)}"

def clean_csv_response(response):
    response = response.strip()
    if "```" in response:
        response = response.split("```")[1] if len(response.split("```")) > 1 else response
    return response


In [8]:
def generate_dataset(dataset_type, num_records, region, model_choice):
    try:
        if model_choice == "OpenAI GPT-4o-mini":
            csv_data = generate_with_openai(dataset_type, num_records, region)
        else:
            csv_data = generate_with_llama(dataset_type, num_records, region)
        
        df = pd.read_csv(StringIO(csv_data))
        return df, csv_data, f"✅ Generated {len(df)} records successfully!"
    except Exception as e:
        return pd.DataFrame(), "", f"❌ Error: {str(e)}"

def download_csv(csv_data):
    return csv_data if csv_data else ""


In [None]:
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="blue",
        neutral_hue="gray",
        font=["Inter", "ui-sans-serif", "system-ui"]
    ),
    css="""
    .gradio-container { max-width: 1200px !important; margin: auto !important; }
    .header { text-align: center; margin-bottom: 2em; }
    .header h1 { color: #1f2937; font-size: 2.5em; margin-bottom: 0.5em; }
    .header p { color: #6b7280; font-size: 1.1em; }
    .generate-btn { background: linear-gradient(135deg, #3b82f6 0%, #1d4ed8 100%) !important; }
    .generate-btn:hover { transform: translateY(-2px) !important; box-shadow: 0 8px 25px rgba(59, 130, 246, 0.3) !important; }
    .stats-card { background: linear-gradient(135deg, #f8fafc 0%, #e2e8f0 100%); border-radius: 12px; padding: 1.5em; margin: 1em 0; }
    """
) as demo:
    
    gr.HTML("""
    <div class="header">
        <h1>Synthetic Dataset Generator</h1>
        <p>Generate realistic synthetic datasets using AI models for testing and development</p>
    </div>
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Configuration")
            
            dataset_type = gr.Dropdown(
                choices=["employees", "customers", "products", "transactions"],
                value="employees",
                label="Dataset Type",
                info="Choose the type of data to generate"
            )
            
            num_records = gr.Slider(
                minimum=5, maximum=100, step=5, value=20,
                label="Number of Records",
                info="How many records to generate"
            )
            
            region = gr.Dropdown(
                choices=["US Only", "International", "Mixed", "Europe", "Asia"],
                value="US Only",
                label="Geographic Region",
                info="Location for addresses and phone numbers"
            )
            
            model_choice = gr.Radio(
                choices=["OpenAI GPT-4o-mini", "Llama 3.1 8B"],
                value="OpenAI GPT-4o-mini",
                label="AI Model",
                info="Choose the AI model for generation"
            )
            
            generate_btn = gr.Button(
                "Generate Dataset",
                variant="primary",
                elem_classes="generate-btn",
                size="lg"
            )
        
        with gr.Column(scale=2):
            gr.Markdown("### Generated Dataset")
            
            status = gr.Markdown("Ready to generate your dataset!")
            
            dataframe_output = gr.Dataframe(
                value=pd.DataFrame(),
                label="Dataset Preview",
                wrap=True
            )
            
            with gr.Row():
                csv_output = gr.Textbox(
                    value="",
                    label="CSV Data",
                    lines=10,
                    max_lines=15
                )
                
                download_btn = gr.DownloadButton(
                    "Download CSV",
                    elem_id="download-btn"
                )
    
    generate_btn.click(
        generate_dataset,
        inputs=[dataset_type, num_records, region, model_choice],
        outputs=[dataframe_output, csv_output, status]
    )
    
    csv_output.change(
        download_csv,
        inputs=[csv_output],
        outputs=[download_btn]
    )

demo.launch(share=True, inbrowser=True)


* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://aaf0c65f7daaafbd21.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.13/site-packages/gradio/queueing.py", line 759, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
    )
    ^
  File "/opt/miniconda3/lib/python3.13/site-packages/gradio/route_utils.py", line 354, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<11 lines>...
    )
    ^
  File "/opt/miniconda3/lib/python3.13/site-packages/gradio/blocks.py", line 2127, in process_api
    data = await self.postprocess_data(block_fn, result["prediction"], state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda3/lib/python3.13/site-packages/gradio/blocks.py", line 1910, in postprocess_data
    await processing_utils.async_move_files_to_cache(
    ...<3 lines>...
    )
  File "/opt/miniconda3/lib/python3.13/site-packages/g