In [1]:
import pandas as pd
import random, uuid, math
from datetime import datetime

# ----------------------------
# Constants
# ----------------------------
GENDERS = ["Woman", "Man", "Non-binary"]

CITIES = [
    "Mumbai", "Delhi", "Bengaluru", "Hyderabad", "Chennai",
    "Pune", "Kolkata", "Ahmedabad", "Jaipur", "Surat"
]

INTEREST_CLUSTERS = {
    "Active": ["Hiking", "Running", "Yoga", "Dancing", "Photography"],
    "Arts": ["Art", "Theatre", "Poetry", "Movies", "Music"],
    "Geek": ["Tech", "Gaming", "Startups", "Board Games"],
    "Social": ["Foodie", "Travel", "Standup Comedy", "Volunteering"],
    "Sports": ["Cricket", "Football", "Basketball"],
}

CITY_META = {
    "Mumbai": 10, "Delhi": 10, "Bengaluru": 9, "Hyderabad": 8, "Chennai": 7,
    "Pune": 6, "Kolkata": 6, "Ahmedabad": 5, "Jaipur": 4, "Surat": 4
}

FEMALE_FIRST = ["Aditi","Aarohi","Anaya","Diya","Isha","Myra","Sara","Siya","Tara","Zara",
                "Neha","Priya","Naina","Rhea","Meera","Anika","Kavya","Ritu","Pooja","Sana"]
MALE_FIRST   = ["Aarav","Vivaan","Aditya","Vihaan","Arjun","Sai","Krishna","Ishaan","Rohan","Kabir",
                "Raghav","Aman","Rajat","Varun","Anil","Rahul","Aakash","Nikhil","Sandeep","Yash"]
UNISEX_FIRST = ["Sam","Krishna","Dev","Shiv","Arya","Sasha","Riyaan","Jai","Ray","Kiran"]

rng = random.Random(42)

# ----------------------------
# Helpers
# ----------------------------
def face_url_from_randomuser(pid: str, gender: str) -> str:
    idx = int(pid, 16) % 100
    folder = "women" if (gender == "Woman" or (gender == "Non-binary" and idx % 2 == 0)) else "men"
    return f"https://randomuser.me/api/portraits/{folder}/{idx}.jpg"

def weighted_choice(items, weights):
    return rng.choices(items, weights=weights, k=1)[0]

def sample_gender():
    return rng.choices(GENDERS, weights=[0.47, 0.47, 0.06], k=1)[0]

def sample_name(gender):
    if gender == "Woman":
        pool = FEMALE_FIRST + UNISEX_FIRST
    elif gender == "Man":
        pool = MALE_FIRST + UNISEX_FIRST
    else:
        pool = UNISEX_FIRST + FEMALE_FIRST[:8] + MALE_FIRST[:8]
    return rng.choice(pool)

def truncated_normal(mean, sd, min_v, max_v):
    while True:
        x = rng.gauss(mean, sd)
        if min_v <= x <= max_v:
            return int(round(x))

def sample_age(city):
    mean = 27 if city in {"Bengaluru","Pune","Hyderabad"} else 29
    sd = 4.5
    return truncated_normal(mean, sd, 21, 45)

def sample_distance_km(city):
    lam = 1/6.0
    val = int(round(min(30, max(1, rng.expovariate(lam)))))
    if city in {"Mumbai","Delhi"} and rng.random() < 0.3:
        val = min(30, val + rng.randint(2,5))
    return val

def pick_interest_cluster(age, city):
    weights = {"Active":1,"Arts":1,"Geek":1,"Social":1,"Sports":1}
    if age <= 26:
        weights["Geek"] += 0.6; weights["Social"] += 0.4
    if age >= 30:
        weights["Arts"] += 0.4; weights["Active"] += 0.2
    if city in {"Bengaluru","Hyderabad","Pune"}:
        weights["Geek"] += 0.6
    if city in {"Mumbai","Delhi"}:
        weights["Arts"] += 0.3; weights["Social"] += 0.3
    keys = list(INTEREST_CLUSTERS.keys())
    w = [weights[k] for k in keys]
    return rng.choices(keys, weights=w, k=1)[0]

def sample_interests(age, city):
    k = rng.randint(3,6)
    base = pick_interest_cluster(age, city)
    alt = base if rng.random() < 0.6 else rng.choice(list(INTEREST_CLUSTERS.keys()))
    pool = INTEREST_CLUSTERS[base] + INTEREST_CLUSTERS[alt]
    pool = list(dict.fromkeys(pool))
    if rng.random() < 0.35:
        all_interests = [i for v in INTEREST_CLUSTERS.values() for i in v]
        extras = [i for i in all_interests if i not in pool]
        if extras:
            pool += rng.sample(extras, k=min(3, len(extras)))
    rng.shuffle(pool)
    return pool[:k]

def make_bio(name, age, city, interests):
    lead = rng.choice([
        "Powered by chai and chaotic good energy.",
        "Part-time explorer, full-time snack enthusiast.",
        "Weekends = long walks + long playlists.",
        "Trying new things and new foods—recommendations welcome.",
        "Recovering overthinker, thriving bruncher.",
        "Swaps memes for restaurant tips.",
    ])
    hook = rng.choice([
        f"Into {interests[0].lower()} and {interests[1].lower()}",
        f"{interests[0]} > {interests[1]}? Discuss.",
        f"If you like {interests[0].lower()}, we’ll get along.",
        f"From {city}, chasing {interests[-1].lower()} vibes.",
        f"{interests[0]}, {interests[1]}, and probably {interests[-1].lower()}",
    ])
    closer = rng.choice([
        "Coffee then a walk?",
        "Open to spontaneous day trips.",
        "Here for good banter and better food.",
        "Teach me your niche skill.",
        "Playlist swaps encouraged.",
    ])
    return f"{lead} {hook}. {closer}"

# ----------------------------
# Main generator
# ----------------------------
def make_fake_profiles(n=10000, seed=123):
    rng.seed(seed)
    city_names = list(CITY_META.keys())
    city_weights = list(CITY_META.values())
    rows = []
    for _ in range(n):
        pid = str(uuid.uuid4())[:8]
        city = weighted_choice(city_names, city_weights)
        gender = sample_gender()
        name = sample_name(gender)
        age = sample_age(city)
        distance = sample_distance_km(city)
        interests = sample_interests(age, city)
        about = make_bio(name, age, city, interests)
        photo_url = face_url_from_randomuser(pid, gender)
        rows.append({
            "id": pid,
            "name": name,
            "age": age,
            "gender": gender,
            "city": city,
            "distance_km": distance,
            "interests": interests,
            "about": about,
            "photo_url": photo_url,
        })
    return pd.DataFrame(rows)

# ----------------------------
# Run + save


In [2]:

df = make_fake_profiles(n=10000, seed=123)
print(df.head())
df.to_csv("profiles.csv", index=False)
print(f"\n✅ Saved {len(df)} profiles to profiles.csv at {datetime.now().strftime('%H:%M:%S')}")


         id     name  age gender       city  distance_km  \
0  bd652cd1     Rhea   28  Woman     Mumbai           12   
1  bb835d19     Isha   38  Woman     Jaipur            1   
2  4954b69b  Sandeep   30    Man  Ahmedabad            3   
3  077a429d      Sai   25    Man     Jaipur            2   
4  115c0f49   Riyaan   32    Man  Bengaluru           29   

                                           interests  \
0  [Theatre, Dancing, Music, Travel, Standup Comedy]   
1                           [Music, Poetry, Theatre]   
2  [Travel, Foodie, Football, Standup Comedy, Bas...   
3  [Foodie, Travel, Volunteering, Standup Comedy,...   
4                   [Startups, Gaming, Theatre, Art]   

                                               about  \
0  Weekends = long walks + long playlists. From M...   
1  Recovering overthinker, thriving bruncher. Fro...   
2  Swaps memes for restaurant tips. Travel > Food...   
3  Powered by chai and chaotic good energy. If yo...   
4  Part-time explorer,

In [3]:
df

Unnamed: 0,id,name,age,gender,city,distance_km,interests,about,photo_url
0,bd652cd1,Rhea,28,Woman,Mumbai,12,"[Theatre, Dancing, Music, Travel, Standup Comedy]",Weekends = long walks + long playlists. From M...,https://randomuser.me/api/portraits/women/33.jpg
1,bb835d19,Isha,38,Woman,Jaipur,1,"[Music, Poetry, Theatre]","Recovering overthinker, thriving bruncher. Fro...",https://randomuser.me/api/portraits/women/41.jpg
2,4954b69b,Sandeep,30,Man,Ahmedabad,3,"[Travel, Foodie, Football, Standup Comedy, Bas...",Swaps memes for restaurant tips. Travel > Food...,https://randomuser.me/api/portraits/men/39.jpg
3,077a429d,Sai,25,Man,Jaipur,2,"[Foodie, Travel, Volunteering, Standup Comedy,...",Powered by chai and chaotic good energy. If yo...,https://randomuser.me/api/portraits/men/57.jpg
4,115c0f49,Riyaan,32,Man,Bengaluru,29,"[Startups, Gaming, Theatre, Art]","Part-time explorer, full-time snack enthusiast...",https://randomuser.me/api/portraits/men/97.jpg
...,...,...,...,...,...,...,...,...,...
9995,76fe9929,Myra,23,Non-binary,Pune,1,"[Gaming, Dancing, Volunteering, Tech]","Recovering overthinker, thriving bruncher. Fro...",https://randomuser.me/api/portraits/men/41.jpg
9996,357e201e,Jai,29,Man,Chennai,1,"[Hiking, Basketball, Cricket, Movies, Football...","Recovering overthinker, thriving bruncher. If ...",https://randomuser.me/api/portraits/men/6.jpg
9997,2fb19fa9,Aarav,30,Man,Pune,1,"[Tech, Startups, Board Games]",Swaps memes for restaurant tips. Into tech and...,https://randomuser.me/api/portraits/men/97.jpg
9998,9bbcb7ee,Rahul,32,Man,Mumbai,4,"[Foodie, Standup Comedy, Music]","Recovering overthinker, thriving bruncher. Foo...",https://randomuser.me/api/portraits/men/34.jpg


In [4]:
    import pandas as pd
    import random, uuid
    from datetime import datetime

    # ================================================================
    # Config: global RNG & sizes
    # ================================================================
    DEFAULT_SEED = 123
    N_PROFILES = 10_000
    rng = random.Random(DEFAULT_SEED)

    # ================================================================
    # Taxonomies
    # ================================================================
    GENDERS = ["Woman", "Man", "Non-binary"]

    INTEREST_CLUSTERS = {
        "Active": ["Hiking", "Running", "Yoga", "Dancing", "Photography"],
        "Arts": ["Art", "Theatre", "Poetry", "Movies", "Music"],
        "Geek": ["Tech", "Gaming", "Startups", "Board Games"],
        "Social": ["Foodie", "Travel", "Standup Comedy", "Volunteering"],
        "Sports": ["Cricket", "Football", "Basketball"],
    }
    ALL_INTERESTS = sorted({i for v in INTEREST_CLUSTERS.values() for i in v})

    # Name pools (extend as you like)
    FEMALE_FIRST = [
        "Aditi","Aarohi","Anaya","Diya","Isha","Myra","Sara","Siya","Tara","Zara",
        "Neha","Priya","Naina","Rhea","Meera","Anika","Kavya","Ritu","Pooja","Sana",
        "Anna","Maria","Sofia","Emma","Olivia","Mia","Aisha","Fatima","Yuna","Mei",
        "Camila","Valentina","Amara","Zainab","Helena","Elena","Giulia","Lina","Aya"
    ]
    MALE_FIRST = [
        "Aarav","Vivaan","Aditya","Vihaan","Arjun","Sai","Krishna","Ishaan","Rohan","Kabir",
        "Raghav","Aman","Rajat","Varun","Anil","Rahul","Aakash","Nikhil","Sandeep","Yash",
        "Liam","Noah","Lucas","Mateo","Ethan","Leo","Hiro","Daichi","Minjun","Jae",
        "Luis","Diego","Andre","Omar","Youssef","Ali","Marco","Jonas","Felix","Tariq"
    ]
    UNISEX_FIRST = ["Sam","Dev","Shiv","Arya","Sasha","Riyaan","Jai","Ray","Kiran","Alex","Charlie","Noor","Ariel","Jordan","Kai"]

    # ================================================================
    # Geography: India tiers + global regions
    # Weights are rough, population-leaning proxies (tune freely)
    # ================================================================
    INDIA_TIERS = {
        "Tier-1": [
            ("India", "Mumbai", 10),
            ("India", "Delhi", 10),
            ("India", "Bengaluru", 9),
            ("India", "Hyderabad", 8),
            ("India", "Chennai", 7),
            ("India", "Kolkata", 7),
            ("India", "Pune", 6),
            ("India", "Ahmedabad", 5),
        ],
        "Tier-2": [
            ("India", "Jaipur", 4), ("India", "Surat", 4), ("India", "Lucknow", 4),
            ("India", "Kanpur", 3), ("India", "Nagpur", 3), ("India", "Indore", 3),
            ("India", "Bhopal", 3), ("India", "Chandigarh", 2), ("India", "Kochi", 2),
            ("India", "Coimbatore", 2),
        ],
        "Tier-3": [
            ("India", "Patna", 2), ("India", "Guwahati", 2), ("India", "Visakhapatnam", 2),
            ("India", "Vijayawada", 2), ("India", "Bhubaneswar", 2), ("India", "Thiruvananthapuram", 2),
            ("India", "Vadodara", 2), ("India", "Nashik", 2), ("India", "Ludhiana", 2), ("India", "Rajkot", 2),
        ],
    }

    WORLD_REGIONS = {
        "South Asia (non-India)": [
            ("Bangladesh", "Dhaka", 8), ("Bangladesh", "Chittagong", 3),
            ("Pakistan", "Karachi", 9), ("Pakistan", "Lahore", 6), ("Pakistan", "Islamabad", 2),
            ("Sri Lanka", "Colombo", 2), ("Nepal", "Kathmandu", 2),
        ],
        "East Asia": [
            ("Japan", "Tokyo", 10), ("Japan", "Osaka", 4),
            ("South Korea", "Seoul", 8), ("South Korea", "Busan", 3),
            ("China", "Shanghai", 10), ("China", "Beijing", 9), ("China", "Shenzhen", 7), ("China", "Guangzhou", 7),
            ("Taiwan", "Taipei", 4), ("Hong Kong", "Hong Kong", 5),
        ],
        "Southeast Asia": [
            ("Singapore", "Singapore", 6),
            ("Malaysia", "Kuala Lumpur", 4),
            ("Thailand", "Bangkok", 7),
            ("Indonesia", "Jakarta", 9), ("Vietnam", "Ho Chi Minh City", 6), ("Vietnam", "Hanoi", 5),
            ("Philippines", "Manila", 8),
        ],
        "North America": [
            ("USA", "New York", 9), ("USA", "Los Angeles", 8), ("USA", "Chicago", 6),
            ("USA", "San Francisco", 5), ("USA", "Houston", 5), ("USA", "Miami", 5),
            ("Canada", "Toronto", 6), ("Canada", "Vancouver", 4), ("Canada", "Montreal", 4),
            ("Mexico", "Mexico City", 9), ("Mexico", "Guadalajara", 4),
        ],
        "Europe": [
            ("UK", "London", 9), ("France", "Paris", 8), ("Germany", "Berlin", 6),
            ("Spain", "Madrid", 5), ("Spain", "Barcelona", 5),
            ("Italy", "Rome", 5), ("Italy", "Milan", 4),
            ("Netherlands", "Amsterdam", 4), ("Austria", "Vienna", 4), ("Sweden", "Stockholm", 3),
        ],
        "MENA": [
            ("UAE", "Dubai", 7), ("UAE", "Abu Dhabi", 4),
            ("Saudi Arabia", "Riyadh", 6), ("Saudi Arabia", "Jeddah", 5),
            ("Egypt", "Cairo", 8), ("Egypt", "Alexandria", 4),
            ("Türkiye", "Istanbul", 8), ("Morocco", "Casablanca", 4),
        ],
        "Sub-Saharan Africa": [
            ("Nigeria", "Lagos", 9), ("Nigeria", "Abuja", 4),
            ("Kenya", "Nairobi", 6), ("Kenya", "Mombasa", 3),
            ("Ghana", "Accra", 4), ("Ghana", "Kumasi", 3),
            ("South Africa", "Johannesburg", 5), ("South Africa", "Cape Town", 5), ("South Africa", "Durban", 3),
            ("Ethiopia", "Addis Ababa", 5),
        ],
        "Latin America": [
            ("Brazil", "São Paulo", 10), ("Brazil", "Rio de Janeiro", 7),
            ("Argentina", "Buenos Aires", 8), ("Chile", "Santiago", 6),
            ("Peru", "Lima", 7), ("Colombia", "Bogotá", 7), ("Colombia", "Medellín", 4),
            ("Ecuador", "Quito", 3), ("Uruguay", "Montevideo", 3),
        ],
        "Oceania": [
            ("Australia", "Sydney", 6), ("Australia", "Melbourne", 6),
            ("Australia", "Brisbane", 3), ("Australia", "Perth", 3),
            ("New Zealand", "Auckland", 3), ("New Zealand", "Wellington", 2),
        ],
    }

    def build_city_table(include_india=True, india_tier_bias=(0.5, 0.35, 0.15)):
        rows = []
        if include_india:
            tiers = ["Tier-1", "Tier-2", "Tier-3"]
            tier_w = dict(zip(tiers, india_tier_bias))
            for tier in tiers:
                for country, city, w in INDIA_TIERS[tier]:
                    rows.append(("South Asia", country, city, w * (1 + 9 * tier_w[tier])))
        for region, cities in WORLD_REGIONS.items():
            for country, city, w in cities:
                rows.append((region, country, city, w))
        return rows

    WORLD_CITY_TABLE = build_city_table()

    # ================================================================
    # Background attributes (coarse) — optional
    # ================================================================
    COUNTRY_LANG_PALETTE = {
        "India": ["Hindi","English","Bengali","Telugu","Marathi","Tamil","Urdu","Gujarati","Kannada","Malayalam","Punjabi"],
        "USA": ["English","Spanish"], "UK": ["English"], "Canada": ["English","French"],
        "Mexico": ["Spanish"], "Brazil": ["Portuguese"], "France": ["French"], "Germany": ["German"],
        "Spain": ["Spanish","Catalan"], "Italy": ["Italian"], "Netherlands": ["Dutch"], "Sweden": ["Swedish"],
        "Turkey": ["Turkish"], "UAE": ["Arabic","English"], "Saudi Arabia": ["Arabic"], "Egypt": ["Arabic"],
        "Nigeria": ["English","Yoruba","Hausa","Igbo"], "Kenya": ["English","Swahili"],
        "South Africa": ["English","Zulu","Xhosa","Afrikaans"], "Ethiopia": ["Amharic","Oromo","Tigrinya"],
        "Bangladesh": ["Bengali"], "Pakistan": ["Urdu","Punjabi","Pashto","Sindhi"], "Sri Lanka": ["Sinhala","Tamil"],
        "Nepal": ["Nepali"], "Japan": ["Japanese"], "South Korea": ["Korean"], "China": ["Mandarin"],
        "Hong Kong": ["Cantonese","English"], "Taiwan": ["Mandarin"], "Singapore": ["English","Mandarin","Malay","Tamil"],
        "Malaysia": ["Malay","English","Mandarin","Tamil"], "Thailand": ["Thai"], "Indonesia": ["Indonesian"],
        "Vietnam": ["Vietnamese"], "Philippines": ["Filipino","English"], "Australia": ["English"],
        "New Zealand": ["English","Māori"], "Argentina": ["Spanish"], "Chile": ["Spanish"], "Peru": ["Spanish"],
        "Colombia": ["Spanish"], "Uruguay": ["Spanish"], "Ecuador": ["Spanish"],
    }

    COUNTRY_RELIGION_PALETTE = {
        "India": ["Hindu","Muslim","Christian","Sikh","Buddhist","Jain","Other"],
        "USA": ["Christian","Unaffiliated","Jewish","Muslim","Hindu","Buddhist","Other"],
        "UK": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Jewish","Buddhist"],
        "Canada": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Buddhist","Jewish"],
        "Mexico": ["Christian","Unaffiliated","Other"], "Brazil": ["Christian","Spiritist","Afro-Brazilian","Unaffiliated","Other"],
        "France": ["Unaffiliated","Christian","Muslim","Jewish","Buddhist","Other"],
        "Germany": ["Christian","Unaffiliated","Muslim","Other"], "Spain": ["Christian","Unaffiliated","Other"],
        "Italy": ["Christian","Unaffiliated","Other"], "Netherlands": ["Unaffiliated","Christian","Muslim","Other"],
        "Sweden": ["Unaffiliated","Christian","Other"], "Turkey": ["Muslim","Other"], "UAE": ["Muslim","Christian","Hindu","Buddhist","Other"],
        "Saudi Arabia": ["Muslim","Other"], "Egypt": ["Muslim","Christian","Other"],
        "Nigeria": ["Christian","Muslim","Traditional","Other"], "Kenya": ["Christian","Muslim","Traditional","Other"],
        "South Africa": ["Christian","Traditional","Unaffiliated","Other"], "Ethiopia": ["Christian","Muslim","Other"],
        "Bangladesh": ["Muslim","Hindu","Other"], "Pakistan": ["Muslim","Other"], "Sri Lanka": ["Buddhist","Hindu","Muslim","Christian"],
        "Nepal": ["Hindu","Buddhist","Other"], "Japan": ["Shinto","Buddhist","Other"], "South Korea": ["Unaffiliated","Christian","Buddhist","Other"],
        "China": ["Unaffiliated","Folk/Traditional","Buddhist","Christian","Other"], "Hong Kong": ["Buddhist","Taoist","Christian","Other"],
        "Taiwan": ["Folk/Traditional","Buddhist","Taoist","Other"], "Singapore": ["Buddhist","Taoist","Muslim","Christian","Hindu","Other"],
        "Malaysia": ["Muslim","Buddhist","Christian","Hindu","Other"], "Thailand": ["Buddhist","Other"],
        "Indonesia": ["Muslim","Christian","Hindu","Buddhist","Other"], "Vietnam": ["Unaffiliated","Buddhist","Christian","Other"],
        "Philippines": ["Christian","Muslim","Other"], "Australia": ["Christian","Unaffiliated","Other"],
        "New Zealand": ["Unaffiliated","Christian","Other"], "Argentina": ["Christian","Unaffiliated","Other"],
        "Chile": ["Christian","Unaffiliated","Other"], "Peru": ["Christian","Unaffiliated","Other"],
        "Colombia": ["Christian","Unaffiliated","Other"], "Uruguay": ["Unaffiliated","Christian","Other"],
        "Ecuador": ["Christian","Other"],
    }

    # ================================================================
    # Ethnicity mapping (coarse) by country -> ethnicity label
    # You can refine these weights per your needs.
    # ================================================================
    ETHNICITY_LABELS = [
        "South Asian","East Asian","Southeast Asian","Middle Eastern/North African",
        "Black/African","White/European","Latino/Hispanic","Pacific Islander","Mixed/Other"
    ]

    COUNTRY_TO_ETHNICITY = {
        # South Asia
        "India": "South Asian", "Pakistan": "South Asian", "Bangladesh": "South Asian",
        "Sri Lanka": "South Asian", "Nepal": "South Asian",
        # East Asia
        "Japan": "East Asian", "South Korea": "East Asian", "China": "East Asian",
        "Taiwan": "East Asian", "Hong Kong": "East Asian",
        # SE Asia
        "Singapore": "Southeast Asian", "Malaysia": "Southeast Asian", "Thailand": "Southeast Asian",
        "Indonesia": "Southeast Asian", "Vietnam": "Southeast Asian", "Philippines": "Southeast Asian",
        # MENA / West Asia
        "UAE": "Middle Eastern/North African", "Saudi Arabia": "Middle Eastern/North African",
        "Egypt": "Middle Eastern/North African", "Türkiye": "Middle Eastern/North African", "Morocco": "Middle Eastern/North African",
        # Sub-Saharan Africa
        "Nigeria": "Black/African", "Kenya": "Black/African", "Ghana": "Black/African",
        "South Africa": "Black/African", "Ethiopia": "Black/African",
        # Latin America
        "Brazil": "Latino/Hispanic", "Argentina": "Latino/Hispanic", "Chile": "Latino/Hispanic",
        "Peru": "Latino/Hispanic", "Colombia": "Latino/Hispanic", "Uruguay": "Latino/Hispanic", "Ecuador": "Latino/Hispanic",
        # North America & Europe (coarse defaults)
        "USA": "White/European", "Canada": "White/European", "Mexico": "Latino/Hispanic",
        "UK": "White/European", "France": "White/European", "Germany": "White/European",
        "Spain": "White/European", "Italy": "White/European", "Netherlands": "White/European",
        "Austria": "White/European", "Sweden": "White/European",
        # Oceania
        "Australia": "White/European", "New Zealand": "White/European",
    }

    # ================================================================
    # Photo provider
    # ------------------------------------------------
    # Provide lists of headshot URLs per ethnicity. Expand with your own CDN.
    # When a bucket is empty, fallback to RandomUser portraits (gendered).
    # ================================================================
    PHOTO_CATALOG = {
        # Fill these with URLs you control or curated placeholders.
        # Example placeholder sets (add many more per bucket for realism):
        "South Asian": [
            # e.g., "https://your-cdn.example/faces/south_asian/women/sa_w_01.jpg",
            #       "https://your-cdn.example/faces/south_asian/men/sa_m_07.jpg",
        ],
        "East Asian": [],
        "Southeast Asian": [],
        "Middle Eastern/North African": [],
        "Black/African": [],
        "White/European": [],
        "Latino/Hispanic": [],
        "Pacific Islander": [],
        "Mixed/Other": [],
    }

    def randomuser_url(pid: str, gender: str) -> str:
        idx = int(pid, 16) % 100
        if gender == "Woman":
            folder = "women"
        elif gender == "Man":
            folder = "men"
        else:
            folder = "women" if (idx % 2 == 0) else "men"
        return f"https://randomuser.me/api/portraits/{folder}/{idx}.jpg"

    def photo_url_for(gender: str, ethnicity: str, pid: str) -> str:
        pool = PHOTO_CATALOG.get(ethnicity, [])
        if pool:
            # deterministic pick from pool by pid
            return pool[int(pid, 16) % len(pool)]
        # fallback to generic RandomUser portraits (not guaranteed ethnicity)
        return randomuser_url(pid, gender)

    # ================================================================
    # Helpers
    # ================================================================
    def weighted_choice(items, weights):
        return rng.choices(items, weights=weights, k=1)[0]

    def sample_world_city():
        weights = [w for (_, _, _, w) in WORLD_CITY_TABLE]
        choices = [(r, ctry, cty) for (r, ctry, cty, _) in WORLD_CITY_TABLE]
        return rng.choices(choices, weights=weights, k=1)[0]

    def sample_gender():
        return rng.choices(GENDERS, weights=[0.47, 0.47, 0.06], k=1)[0]

    def sample_name(gender):
        if gender == "Woman":
            pool = FEMALE_FIRST + UNISEX_FIRST
        elif gender == "Man":
            pool = MALE_FIRST + UNISEX_FIRST
        else:
            pool = UNISEX_FIRST + FEMALE_FIRST[:10] + MALE_FIRST[:10]
        return rng.choice(pool)

    def truncated_normal(mean, sd, lo, hi):
        while True:
            x = rng.gauss(mean, sd)
            if lo <= x <= hi:
                return int(round(x))

    def sample_age(region):
        # slight regional tilt
        mean = 27
        if region in {"Europe","North America"}: mean = 29
        if region in {"South Asia","South Asia (non-India)","Africa","Sub-Saharan Africa"}: mean = 26
        return truncated_normal(mean, 4.5, 21, 45)

    def sample_distance_km(region):
        # not geographic; think "city-scale distance"
        lam = 1 / 6.0
        val = int(round(min(30, max(1, rng.expovariate(lam)))))
        if region in {"Europe","North America"} and rng.random() < 0.25:
            val = min(30, val + rng.randint(2,5))
        return val

    def pick_interest_cluster(age, region):
        w = {"Active":1,"Arts":1,"Geek":1,"Social":1,"Sports":1}
        if age <= 26: w["Geek"] += 0.6; w["Social"] += 0.4
        if age >= 30: w["Arts"] += 0.4; w["Active"] += 0.2
        if region in {"Europe","North America"}: w["Arts"] += 0.2
        if region in {"South Asia","South Asia (non-India)","East Asia"}: w["Geek"] += 0.3
        keys = list(INTEREST_CLUSTERS.keys())
        return rng.choices(keys, weights=[w[k] for k in keys], k=1)[0]

    def sample_interests(age, region):
        k = rng.randint(3,6)
        base = pick_interest_cluster(age, region)
        alt = base if rng.random() < 0.6 else rng.choice(list(INTEREST_CLUSTERS.keys()))
        pool = list(dict.fromkeys(INTEREST_CLUSTERS[base] + INTEREST_CLUSTERS[alt]))
        if rng.random() < 0.35:
            extras = [i for i in ALL_INTERESTS if i not in pool]
            if extras:
                pool += rng.sample(extras, k=min(3, len(extras)))
        rng.shuffle(pool)
        return pool[:k]

    def make_bio(name, age, city, interests):
        lead = rng.choice([
            "Powered by coffee and chaotic good energy.",
            "Part-time explorer, full-time snack enthusiast.",
            "Weekends = long walks + long playlists.",
            "Trying new things and new foods—recommendations welcome.",
            "Recovering overthinker, thriving bruncher.",
            "Swaps memes for restaurant tips.",
        ])
        hook = rng.choice([
            f"Into {interests[0].lower()} and {interests[1].lower()}",
            f"{interests[0]} > {interests[1]}? Discuss.",
            f"If you like {interests[0].lower()}, we’ll get along.",
            f"From {city}, chasing {interests[-1].lower()} vibes.",
            f"{interests[0]}, {interests[1]}, and probably {interests[-1].lower()}",
        ])
        closer = rng.choice([
            "Coffee then a walk?",
            "Open to spontaneous day trips.",
            "Here for good banter and better food.",
            "Teach me your niche skill.",
            "Playlist swaps encouraged.",
        ])
        return f"{lead} {hook}. {closer}"

    def sample_languages(country, k_max=2):
        pool = COUNTRY_LANG_PALETTE.get(country, ["English"])
        k = 1 if len(pool) == 1 else rng.randint(1, min(k_max, len(pool)))
        return rng.sample(pool, k=k)

    def sample_religion(country):
        pool = COUNTRY_RELIGION_PALETTE.get(country, ["Other"])
        return rng.choice(pool)

    def country_to_ethnicity(country):
        return COUNTRY_TO_ETHNICITY.get(country, "Mixed/Other")

    # ================================================================
    # Main generator
    # ================================================================
    def make_world_profiles(n=N_PROFILES, seed=DEFAULT_SEED):
        rng.seed(seed)
        rows = []
        for _ in range(n):
            pid = str(uuid.uuid4())[:8]
            region, country, city = sample_world_city()
            gender = sample_gender()
            name = sample_name(gender)
            age = sample_age(region)
            distance = sample_distance_km(region)
            interests = sample_interests(age, region)
            bio = make_bio(name, age, city, interests)
            languages = sample_languages(country)
            religion = sample_religion(country)
            ethnicity = country_to_ethnicity(country)
            photo_url = photo_url_for(gender, ethnicity, pid)

            rows.append({
                "id": pid,
                "name": name,
                "age": age,
                "gender": gender,
                "region": region,
                "country": country,
                "city": city,
                "distance_km": distance,
                "ethnicity": ethnicity,
                "languages": languages,
                "religion": religion,
                "interests": interests,
                "about": bio,
                "photo_url": photo_url,
            })
        return pd.DataFrame(rows)

In [5]:

df = make_world_profiles(n=N_PROFILES, seed=DEFAULT_SEED)
# print(df.head(10))
# out = "profiles_world.csv"
# df.to_csv(out, index=False)
# print(f"\n✅ Saved {len(df)} profiles to {out} at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


In [6]:
df

Unnamed: 0,id,name,age,gender,region,country,city,distance_km,ethnicity,languages,religion,interests,about,photo_url
0,3f51fd13,Aisha,25,Woman,South Asia,India,Mumbai,12,South Asian,[Hindi],Sikh,"[Startups, Board Games, Tech, Gaming]","Recovering overthinker, thriving bruncher. If ...",https://randomuser.me/api/portraits/women/11.jpg
1,1e43fa7a,Ethan,38,Man,Europe,France,Paris,1,White/European,[French],Unaffiliated,"[Dancing, Yoga, Photography, Hiking, Running]","Recovering overthinker, thriving bruncher. Fro...",https://randomuser.me/api/portraits/men/14.jpg
2,11f23df6,Ariel,30,Woman,Europe,UK,London,6,White/European,[English],Jewish,"[Running, Hiking, Football, Yoga, Dancing]",Swaps memes for restaurant tips. Running > Hik...,https://randomuser.me/api/portraits/women/46.jpg
3,822e1c98,Elena,25,Woman,South Asia,India,Kolkata,4,South Asian,[Punjabi],Christian,"[Photography, Dancing, Running, Hiking, Yoga]",Trying new things and new foods—recommendation...,https://randomuser.me/api/portraits/women/56.jpg
4,71495956,Kai,28,Non-binary,Oceania,Australia,Perth,5,White/European,[English],Unaffiliated,"[Running, Yoga, Photography, Hiking]",Trying new things and new foods—recommendation...,https://randomuser.me/api/portraits/women/6.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,afdb9594,Sasha,25,Man,South Asia,India,Pune,7,South Asian,"[Tamil, Urdu]",Sikh,"[Volunteering, Travel, Standup Comedy]","Swaps memes for restaurant tips. Volunteering,...",https://randomuser.me/api/portraits/men/76.jpg
9996,25e74314,Zainab,23,Woman,South Asia,India,Hyderabad,1,South Asian,"[Punjabi, Telugu]",Sikh,"[Travel, Volunteering, Standup Comedy, Foodie]",Weekends = long walks + long playlists. If you...,https://randomuser.me/api/portraits/women/80.jpg
9997,fcef6989,Charlie,30,Woman,South Asia,India,Pune,5,South Asian,"[Hindi, Gujarati]",Jain,"[Volunteering, Travel, Standup Comedy, Foodie]","Swaps memes for restaurant tips. Volunteering,...",https://randomuser.me/api/portraits/women/53.jpg
9998,0b9a34c1,Dev,30,Woman,South Asia,India,Bengaluru,6,South Asian,"[Malayalam, Telugu]",Jain,"[Standup Comedy, Travel, Gaming, Volunteering,...",Trying new things and new foods—recommendation...,https://randomuser.me/api/portraits/women/25.jpg
