## Synthetic User Data Generator
A simple toolkit for generating realistic but completely fictional user profiles using a language model. 

These profiles can be used for testing, demos, or filling sample databases without exposing real user data.

## Generator Class Features

- Generate any number of synthetic user profiles with customizable attributes

- Uses example profiles as templates to ensure consistent formatting

- Filter by demographics, location, industry, and more

- Export data as CSV or JSON

- Leverages GPU acceleration when available for faster generation

- Creates completely fictional data that looks realistic but doesn't represent real individuals

In [9]:
import json
import logging
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import re
import os

# setup basic logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('generator.log')
    ]
)
logger = logging.getLogger(__name__)

def generate_synthetic_data(
    num_records=10,
    model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct",
    temperature=0.7,
    max_new_tokens=3000,
    output_file="synthetic_users.csv",
    example_data=None,
    save_raw=True,
    # guidance params
    countries=None,
    age_range=None,
    states=None,
    occupations=None,
    industries=None,
    email_domains=None,
    time_period=None,
    gender=None,
    guidance_notes=None
):
    """
    Generate synthetic data based on example structure with optional guidance parameters.
    """
    # fallback to defaults if no examples given
    if example_data is None:
        example_data = [
            {
                "ID": 1,
                "Name": "Melissa Thornton",
                "Age": 34,
                "Email": "m.thornton83@fastmail.net",
                "Phone": "507-382-9155",
                "Address": "726 Willow Lane",
                "City": "Rochester",
                "State": "MN",
                "Country": "United States",
                "ZIP": "55901",
                "Occupation": "Dental Hygienist",
                "Account_Created": "2022-03-17",
                "Last_Login": "2025-02-15"
            },
            {
                "ID": 2,
                "Name": "Jamal Washington",
                "Age": 42,
                "Email": "jwash_business@gmail.com",
                "Phone": "213-555-8071",
                "Address": "1840 Crenshaw Blvd",
                "City": "Los Angeles",
                "State": "CA",
                "Country": "United States",
                "ZIP": "90008",
                "Occupation": "Marketing Director",
                "Account_Created": "2021-06-04",
                "Last_Login": "2025-02-25"
            },
            {
                "ID": 3,
                "Name": "Mei-Ling Chen",
                "Age": 27,
                "Email": "meiling.chen@outlook.com",
                "Phone": "415-222-3644",
                "Address": "892 Stockton Street",
                "City": "San Francisco",
                "State": "CA",
                "Country": "United States",
                "ZIP": "94108",
                "Occupation": "Software Engineer",
                "Account_Created": "2023-08-15",
                "Last_Login": "2025-03-01"
            }
        ]
    
    # count our examples
    num_examples = len(example_data)
    
    # adjust requested records to include examples
    total_records = num_records + num_examples
    
    # figure out where to start the IDs
    next_id = max([ex["ID"] for ex in example_data]) + 1
    
    logger.info(f"Loading model: {model_name}")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {device}")
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        logger.info("Model loaded successfully")
    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        raise

    # format examples for the prompt
    examples_formatted = ""
    for example in example_data:
        examples_formatted += "STARTRECORD\n"
        for key, value in example.items():
            examples_formatted += f"{key}: {value}\n"
        examples_formatted += "ENDRECORD\n\n"
    
    # build up the guidance text based on params
    guidance_instructions = []
    
    if countries:
        if isinstance(countries, list):
            countries_text = ", ".join(countries)
        else:
            countries_text = countries
        guidance_instructions.append(f"- Only generate profiles from these countries/regions: {countries_text}")
    
    if age_range:
        if isinstance(age_range, list) and len(age_range) == 2:
            age_text = f"between {age_range[0]} and {age_range[1]}"
        else:
            age_text = str(age_range)
        guidance_instructions.append(f"- Age should be {age_text}")
    
    if states:
        if isinstance(states, list):
            states_text = ", ".join(states)
        else:
            states_text = states
        guidance_instructions.append(f"- Only use these states/provinces: {states_text}")
    
    if occupations:
        if isinstance(occupations, list):
            occupations_text = ", ".join(occupations)
        else:
            occupations_text = occupations
        guidance_instructions.append(f"- Use these occupations: {occupations_text}")
    
    if industries:
        if isinstance(industries, list):
            industries_text = ", ".join(industries)
        else:
            industries_text = industries
        guidance_instructions.append(f"- Focus on these industries: {industries_text}")
    
    if email_domains:
        if isinstance(email_domains, list):
            domains_text = ", ".join(email_domains)
        else:
            domains_text = email_domains
        guidance_instructions.append(f"- Use these email domains: {domains_text}")
    
    if time_period:
        guidance_instructions.append(f"- All dates should be within: {time_period}")
    
    if gender:
        if isinstance(gender, list):
            gender_text = ", ".join(gender)
        else:
            gender_text = gender
        guidance_instructions.append(f"- Only include {gender_text} individuals")
    
    if guidance_notes:
        guidance_instructions.append(f"- Additional guidance: {guidance_notes}")
    
    # default to something if nothing specified
    if not guidance_instructions:
        guidance_instructions.append("- Include diverse demographics, backgrounds, locations, and occupations")
    
    # stick it all together
    guidance_text = "\n".join(guidance_instructions)
    
    prompt = f"""Generate {total_records} completely fictional user profiles.

Here are some example profiles in the required format:

{examples_formatted}

Rules:
1. Generate exactly {total_records} total records
2. Use the EXACT same field names and format as the examples
3. Include the initial sample examples in the answer.
3. Each profile must be wrapped with STARTRECORD and ENDRECORD delimiters
4. Make the data realistic but entirely fictional
5. Make sure some fields such as address, phone number, email address, etc, are all unique. We should not have this sort of data repeated between individuals.

Specific guidance for these profiles:
{guidance_text}

ONLY generate profiles with the STARTRECORD/ENDRECORD format. No other text."""

    logger.debug(f"Prompt: {prompt}")

    # setup for the model
    messages = [{"role": "user", "content": prompt}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    
    logger.info(f"Generating {total_records} records (including {num_examples} examples)")
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # get what the model output
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = output_text.split(input_text)[-1].strip()
    
    # raw output for debugging
    if save_raw:
        with open("raw_model_output.txt", "w") as f:
            f.write(response)
        logger.info("Raw model output saved to raw_model_output.txt")
    
    # parse out the records from the response
    records = []
    pattern = r"STARTRECORD(.*?)ENDRECORD"
    matches = re.findall(pattern, response, re.DOTALL)
    
    logger.info(f"Found {len(matches)} record blocks in response")
    
    for i, match in enumerate(matches):
        record = {}
        
        # split into lines and extract field:value pairs
        lines = match.strip().split('\n')
        for line in lines:
            line = line.strip()
            if ':' in line:
                key, value = line.split(':', 1)
                key = key.strip()
                value = value.strip()
                record[key] = value
        
        if record:
            records.append(record)
    
    logger.info(f"Final record count: {len(records)}")
    
    # save records to dataframe
    df = pd.DataFrame(records)
    
    # save to disk if needed
    if output_file:
        output_dir = os.path.dirname(output_file)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        if output_file.endswith('.csv'):
            df.to_csv(output_file, index=False)
        elif output_file.endswith('.json'):
            df.to_json(output_file, orient='records', indent=2)
        else:
            df.to_csv(output_file, index=False)
            
        logger.info(f"Saved {len(df)} records to {output_file}")
    
    return df

In [10]:
tech_professionals_df = generate_synthetic_data(
    num_records=10, 
    states=["CA", "NY"],
    industries="Technology",
    occupations=["Software Engineer", "Data Scientist", "Product Manager", "UX Designer"],
    age_range=[25, 45],
    email_domains=["gmail.com", "outlook.com", "icloud.com", "protonmail.com"],
    time_period="2020-present",
    output_file="tech_professionals.csv"
)

2025-03-04 04:46:49,783 - INFO - Loading model: HuggingFaceTB/SmolLM2-1.7B-Instruct
2025-03-04 04:46:49,784 - INFO - Using device: cpu
2025-03-04 04:46:53,071 - INFO - Model loaded successfully
2025-03-04 04:46:53,072 - INFO - Generating 13 records (including 3 examples)
2025-03-04 04:50:49,605 - INFO - Raw model output saved to raw_model_output.txt
2025-03-04 04:50:49,607 - INFO - Found 18 record blocks in response
2025-03-04 04:50:49,608 - INFO - Final record count: 16
2025-03-04 04:50:49,612 - INFO - Saved 16 records to tech_professionals.csv
