In [1]:
import pandas as pd
import random, uuid, json
from datetime import datetime

# ================================================================
# Config: global RNG & sizes
# ================================================================
DEFAULT_SEED = 123
N_PROFILES = 10_000
rng = random.Random(DEFAULT_SEED)

# ================================================================
# Taxonomies
# ================================================================
GENDERS = ["Woman", "Man", "Non-binary"]

INTEREST_CLUSTERS = {
    "Active": ["Hiking", "Running", "Yoga", "Dancing", "Photography"],
    "Arts": ["Art", "Theatre", "Poetry", "Movies", "Music"],
    "Geek": ["Tech", "Gaming", "Startups", "Board Games"],
    "Social": ["Foodie", "Travel", "Standup Comedy", "Volunteering"],
    "Sports": ["Cricket", "Football", "Basketball"],
}
ALL_INTERESTS = sorted({i for v in INTEREST_CLUSTERS.values() for i in v})

# Name pools (extend as you like)
FEMALE_FIRST = [
    "Aditi","Aarohi","Anaya","Diya","Isha","Myra","Sara","Siya","Tara","Zara",
    "Neha","Priya","Naina","Rhea","Meera","Anika","Kavya","Ritu","Pooja","Sana",
    "Anna","Maria","Sofia","Emma","Olivia","Mia","Aisha","Fatima","Yuna","Mei",
    "Camila","Valentina","Amara","Zainab","Helena","Elena","Giulia","Lina","Aya"
]
MALE_FIRST = [
    "Aarav","Vivaan","Aditya","Vihaan","Arjun","Sai","Krishna","Ishaan","Rohan","Kabir",
    "Raghav","Aman","Rajat","Varun","Anil","Rahul","Aakash","Nikhil","Sandeep","Yash",
    "Liam","Noah","Lucas","Mateo","Ethan","Leo","Hiro","Daichi","Minjun","Jae",
    "Luis","Diego","Andre","Omar","Youssef","Ali","Marco","Jonas","Felix","Tariq"
]
UNISEX_FIRST = ["Sam","Dev","Shiv","Arya","Sasha","Riyaan","Jai","Ray","Kiran","Alex","Charlie","Noor","Ariel","Jordan","Kai"]

# ================================================================
# Geography: India tiers + global regions
# Weights are rough, population-leaning proxies (tune freely)
# ================================================================
INDIA_TIERS = {
    "Tier-1": [
        ("India", "Mumbai", 10),
        ("India", "Delhi", 10),
        ("India", "Bengaluru", 9),
        ("India", "Hyderabad", 8),
        ("India", "Chennai", 7),
        ("India", "Kolkata", 7),
        ("India", "Pune", 6),
        ("India", "Ahmedabad", 5),
    ],
    "Tier-2": [
        ("India", "Jaipur", 4), ("India", "Surat", 4), ("India", "Lucknow", 4),
        ("India", "Kanpur", 3), ("India", "Nagpur", 3), ("India", "Indore", 3),
        ("India", "Bhopal", 3), ("India", "Chandigarh", 2), ("India", "Kochi", 2),
        ("India", "Coimbatore", 2),
    ],
    "Tier-3": [
        ("India", "Patna", 2), ("India", "Guwahati", 2), ("India", "Visakhapatnam", 2),
        ("India", "Vijayawada", 2), ("India", "Bhubaneswar", 2), ("India", "Thiruvananthapuram", 2),
        ("India", "Vadodara", 2), ("India", "Nashik", 2), ("India", "Ludhiana", 2), ("India", "Rajkot", 2),
    ],
}

WORLD_REGIONS = {
    "South Asia (non-India)": [
        ("Bangladesh", "Dhaka", 8), ("Bangladesh", "Chittagong", 3),
        ("Pakistan", "Karachi", 9), ("Pakistan", "Lahore", 6), ("Pakistan", "Islamabad", 2),
        ("Sri Lanka", "Colombo", 2), ("Nepal", "Kathmandu", 2),
    ],
    "East Asia": [
        ("Japan", "Tokyo", 10), ("Japan", "Osaka", 4),
        ("South Korea", "Seoul", 8), ("South Korea", "Busan", 3),
        ("China", "Shanghai", 10), ("China", "Beijing", 9), ("China", "Shenzhen", 7), ("China", "Guangzhou", 7),
        ("Taiwan", "Taipei", 4), ("Hong Kong", "Hong Kong", 5),
    ],
    "Southeast Asia": [
        ("Singapore", "Singapore", 6),
        ("Malaysia", "Kuala Lumpur", 4),
        ("Thailand", "Bangkok", 7),
        ("Indonesia", "Jakarta", 9), ("Vietnam", "Ho Chi Minh City", 6), ("Vietnam", "Hanoi", 5),
        ("Philippines", "Manila", 8),
    ],
    "North America": [
        ("USA", "New York", 9), ("USA", "Los Angeles", 8), ("USA", "Chicago", 6),
        ("USA", "San Francisco", 5), ("USA", "Houston", 5), ("USA", "Miami", 5),
        ("Canada", "Toronto", 6), ("Canada", "Vancouver", 4), ("Canada", "Montreal", 4),
        ("Mexico", "Mexico City", 9), ("Mexico", "Guadalajara", 4),
    ],
    "Europe": [
        ("UK", "London", 9), ("France", "Paris", 8), ("Germany", "Berlin", 6),
        ("Spain", "Madrid", 5), ("Spain", "Barcelona", 5),
        ("Italy", "Rome", 5), ("Italy", "Milan", 4),
        ("Netherlands", "Amsterdam", 4), ("Austria", "Vienna", 4), ("Sweden", "Stockholm", 3),
    ],
    "MENA": [
        ("UAE", "Dubai", 7), ("UAE", "Abu Dhabi", 4),
        ("Saudi Arabia", "Riyadh", 6), ("Saudi Arabia", "Jeddah", 5),
        ("Egypt", "Cairo", 8), ("Egypt", "Alexandria", 4),
        ("Türkiye", "Istanbul", 8), ("Morocco", "Casablanca", 4),
    ],
    "Sub-Saharan Africa": [
        ("Nigeria", "Lagos", 9), ("Nigeria", "Abuja", 4),
        ("Kenya", "Nairobi", 6), ("Kenya", "Mombasa", 3),
        ("Ghana", "Accra", 4), ("Ghana", "Kumasi", 3),
        ("South Africa", "Johannesburg", 5), ("South Africa", "Cape Town", 5), ("South Africa", "Durban", 3),
        ("Ethiopia", "Addis Ababa", 5),
    ],
    "Latin America": [
        ("Brazil", "São Paulo", 10), ("Brazil", "Rio de Janeiro", 7),
        ("Argentina", "Buenos Aires", 8), ("Chile", "Santiago", 6),
        ("Peru", "Lima", 7), ("Colombia", "Bogotá", 7), ("Colombia", "Medellín", 4),
        ("Ecuador", "Quito", 3), ("Uruguay", "Montevideo", 3),
    ],
    "Oceania": [
        ("Australia", "Sydney", 6), ("Australia", "Melbourne", 6),
        ("Australia", "Brisbane", 3), ("Australia", "Perth", 3),
        ("New Zealand", "Auckland", 3), ("New Zealand", "Wellington", 2),
    ],
}

def build_city_table(include_india=True, india_tier_bias=(0.5, 0.35, 0.15)):
    rows = []
    if include_india:
        tiers = ["Tier-1", "Tier-2", "Tier-3"]
        tier_w = dict(zip(tiers, india_tier_bias))
        for tier in tiers:
            for country, city, w in INDIA_TIERS[tier]:
                rows.append(("South Asia", country, city, w * (1 + 9 * tier_w[tier])))
    for region, cities in WORLD_REGIONS.items():
        for country, city, w in cities:
            rows.append((region, country, city, w))
    return rows

WORLD_CITY_TABLE = build_city_table()

# ================================================================
# Background attributes (coarse) — optional
# ================================================================
COUNTRY_LANG_PALETTE = {
    "India": ["Hindi","English","Bengali","Telugu","Marathi","Tamil","Urdu","Gujarati","Kannada","Malayalam","Punjabi"],
    "USA": ["English","Spanish"], "UK": ["English"], "Canada": ["English","French"],
    "Mexico": ["Spanish"], "Brazil": ["Portuguese"], "France": ["French"], "Germany": ["German"],
    "Spain": ["Spanish","Catalan"], "Italy": ["Italian"], "Netherlands": ["Dutch"], "Sweden": ["Swedish"],
    "Turkey": ["Turkish"], "UAE": ["Arabic","English"], "Saudi Arabia": ["Arabic"], "Egypt": ["Arabic"],
    "Nigeria": ["English","Yoruba","Hausa","Igbo"], "Kenya": ["English","Swahili"],
    "South Africa": ["English","Zulu","Xhosa","Afrikaans"], "Ethiopia": ["Amharic","Oromo","Tigrinya"],
    "Bangladesh": ["Bengali"], "Pakistan": ["Urdu","Punjabi","Pashto","Sindhi"], "Sri Lanka": ["Sinhala","Tamil"],
    "Nepal": ["Nepali"], "Japan": ["Japanese"], "South Korea": ["Korean"], "China": ["Mandarin"],
    "Hong Kong": ["Cantonese","English"], "Taiwan": ["Mandarin"], "Singapore": ["English","Mandarin","Malay","Tamil"],
    "Malaysia": ["Malay","English","Mandarin","Tamil"], "Thailand": ["Thai"], "Indonesia": ["Indonesian"],
    "Vietnam": ["Vietnamese"], "Philippines": ["Filipino","English"], "Australia": ["English"],
    "New Zealand": ["English","Māori"], "Argentina": ["Spanish"], "Chile": ["Spanish"], "Peru": ["Spanish"],
    "Colombia": ["Spanish"], "Uruguay": ["Spanish"], "Ecuador": ["Spanish"],
}

COUNTRY_RELIGION_PALETTE = {
    "India": ["Hindu","Muslim","Christian","Sikh","Buddhist","Jain","Other"],
    "USA": ["Christian","Unaffiliated","Jewish","Muslim","Hindu","Buddhist","Other"],
    "UK": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Jewish","Buddhist"],
    "Canada": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Buddhist","Jewish"],
    "Mexico": ["Christian","Unaffiliated","Other"], "Brazil": ["Christian","Spiritist","Afro-Brazilian","Unaffiliated","Other"],
    "France": ["Unaffiliated","Christian","Muslim","Jewish","Buddhist","Other"],
    "Germany": ["Christian","Unaffiliated","Muslim","Other"], "Spain": ["Christian","Unaffiliated","Other"],
    "Italy": ["Christian","Unaffiliated","Other"], "Netherlands": ["Unaffiliated","Christian","Muslim","Other"],
    "Sweden": ["Unaffiliated","Christian","Other"], "Turkey": ["Muslim","Other"], "UAE": ["Muslim","Christian","Hindu","Buddhist","Other"],
    "Saudi Arabia": ["Muslim","Other"], "Egypt": ["Muslim","Christian","Other"],
    "Nigeria": ["Christian","Muslim","Traditional","Other"], "Kenya": ["Christian","Muslim","Traditional","Other"],
    "South Africa": ["Christian","Traditional","Unaffiliated","Other"], "Ethiopia": ["Christian","Muslim","Other"],
    "Bangladesh": ["Muslim","Hindu","Other"], "Pakistan": ["Muslim","Other"], "Sri Lanka": ["Buddhist","Hindu","Muslim","Christian"],
    "Nepal": ["Hindu","Buddhist","Other"], "Japan": ["Shinto","Buddhist","Other"], "South Korea": ["Unaffiliated","Christian","Buddhist","Other"],
    "China": ["Unaffiliated","Folk/Traditional","Buddhist","Christian","Other"], "Hong Kong": ["Buddhist","Taoist","Christian","Other"],
    "Taiwan": ["Folk/Traditional","Buddhist","Taoist","Other"], "Singapore": ["Buddhist","Taoist","Muslim","Christian","Hindu","Other"],
    "Malaysia": ["Muslim","Buddhist","Christian","Hindu","Other"], "Thailand": ["Buddhist","Other"],
    "Indonesia": ["Muslim","Christian","Hindu","Buddhist","Other"], "Vietnam": ["Unaffiliated","Buddhist","Christian","Other"],
    "Philippines": ["Christian","Muslim","Other"], "Australia": ["Christian","Unaffiliated","Other"],
    "New Zealand": ["Unaffiliated","Christian","Other"], "Argentina": ["Christian","Unaffiliated","Other"],
    "Chile": ["Christian","Unaffiliated","Other"], "Peru": ["Christian","Unaffiliated","Other"],
    "Colombia": ["Christian","Unaffiliated","Other"], "Uruguay": ["Unaffiliated","Christian","Other"],
    "Ecuador": ["Christian","Other"],
}

# ================================================================
# Ethnicity mapping (coarse)
# ================================================================
ETHNICITY_LABELS = [
    "South Asian","East Asian","Southeast Asian","Middle Eastern/North African",
    "Black/African","White/European","Latino/Hispanic","Pacific Islander","Mixed/Other"
]

COUNTRY_TO_ETHNICITY = {
    "India": "South Asian", "Pakistan": "South Asian", "Bangladesh": "South Asian",
    "Sri Lanka": "South Asian", "Nepal": "South Asian",
    "Japan": "East Asian", "South Korea": "East Asian", "China": "East Asian",
    "Taiwan": "East Asian", "Hong Kong": "East Asian",
    "Singapore": "Southeast Asian", "Malaysia": "Southeast Asian", "Thailand": "Southeast Asian",
    "Indonesia": "Southeast Asian", "Vietnam": "Southeast Asian", "Philippines": "Southeast Asian",
    "UAE": "Middle Eastern/North African", "Saudi Arabia": "Middle Eastern/North African",
    "Egypt": "Middle Eastern/North African", "Türkiye": "Middle Eastern/North African", "Morocco": "Middle Eastern/North African",
    "Nigeria": "Black/African", "Kenya": "Black/African", "Ghana": "Black/African",
    "South Africa": "Black/African", "Ethiopia": "Black/African",
    "Brazil": "Latino/Hispanic", "Argentina": "Latino/Hispanic", "Chile": "Latino/Hispanic",
    "Peru": "Latino/Hispanic", "Colombia": "Latino/Hispanic", "Uruguay": "Latino/Hispanic",
    "USA": "White/European", "Canada": "White/European", "Mexico": "Latino/Hispanic",
    "UK": "White/European", "France": "White/European", "Germany": "White/European",
    "Spain": "White/European", "Italy": "White/European", "Netherlands": "White/European",
    "Austria": "White/European", "Sweden": "White/European",
    "Australia": "White/European", "New Zealand": "White/European",
}

# ================================================================
# Photo provider
# ================================================================
PHOTO_CATALOG = {
    "South Asian": [],
    "East Asian": [],
    "Southeast Asian": [],
    "Middle Eastern/North African": [],
    "Black/African": [],
    "White/European": [],
    "Latino/Hispanic": [],
    "Pacific Islander": [],
    "Mixed/Other": [],
}

def randomuser_url(pid: str, gender: str) -> str:
    idx = int(pid, 16) % 100
    if gender == "Woman":
        folder = "women"
    elif gender == "Man":
        folder = "men"
    else:
        folder = "women" if (idx % 2 == 0) else "men"
    return f"https://randomuser.me/api/portraits/{folder}/{idx}.jpg"

def photo_url_for(gender: str, ethnicity: str, pid: str) -> str:
    pool = PHOTO_CATALOG.get(ethnicity, [])
    if pool:
        return pool[int(pid, 16) % len(pool)]
    return randomuser_url(pid, gender)

# ================================================================
# Helpers
# ================================================================
def weighted_choice(items, weights):
    return rng.choices(items, weights=weights, k=1)[0]

def build_city_table(include_india=True, india_tier_bias=(0.5, 0.35, 0.15)):
    rows = []
    if include_india:
        tiers = ["Tier-1", "Tier-2", "Tier-3"]
        tier_w = dict(zip(tiers, india_tier_bias))
        for tier in tiers:
            for country, city, w in INDIA_TIERS[tier]:
                rows.append(("South Asia", country, city, w * (1 + 9 * tier_w[tier])))
    for region, cities in WORLD_REGIONS.items():
        for country, city, w in cities:
            rows.append((region, country, city, w))
    return rows

WORLD_CITY_TABLE = build_city_table()

def sample_world_city():
    weights = [w for (_, _, _, w) in WORLD_CITY_TABLE]
    choices = [(r, ctry, cty) for (r, ctry, cty, _) in WORLD_CITY_TABLE]
    return rng.choices(choices, weights=weights, k=1)[0]

def sample_gender():
    return rng.choices(GENDERS, weights=[0.47, 0.47, 0.06], k=1)[0]

def sample_name(gender):
    if gender == "Woman":
        pool = FEMALE_FIRST + UNISEX_FIRST
    elif gender == "Man":
        pool = MALE_FIRST + UNISEX_FIRST
    else:
        pool = UNISEX_FIRST + FEMALE_FIRST[:10] + MALE_FIRST[:10]
    return rng.choice(pool)

def truncated_normal(mean, sd, lo, hi):
    while True:
        x = rng.gauss(mean, sd)
        if lo <= x <= hi:
            return int(round(x))

def sample_age(region):
    mean = 27
    if region in {"Europe","North America"}: mean = 29
    if region in {"South Asia","South Asia (non-India)","Africa","Sub-Saharan Africa"}: mean = 26
    return truncated_normal(mean, 4.5, 21, 45)

def sample_distance_km(region):
    lam = 1 / 6.0
    val = int(round(min(30, max(1, rng.expovariate(lam)))))
    if region in {"Europe","North America"} and rng.random() < 0.25:
        val = min(30, val + rng.randint(2,5))
    return val

def pick_interest_cluster(age, region):
    w = {"Active":1,"Arts":1,"Geek":1,"Social":1,"Sports":1}
    if age <= 26: w["Geek"] += 0.6; w["Social"] += 0.4
    if age >= 30: w["Arts"] += 0.4; w["Active"] += 0.2
    if region in {"Europe","North America"}: w["Arts"] += 0.2
    if region in {"South Asia","South Asia (non-India)","East Asia"}: w["Geek"] += 0.3
    keys = list(INTEREST_CLUSTERS.keys())
    return rng.choices(keys, weights=[w[k] for k in keys], k=1)[0]

def sample_interests(age, region):
    k = rng.randint(3,6)
    base = pick_interest_cluster(age, region)
    alt = base if rng.random() < 0.6 else rng.choice(list(INTEREST_CLUSTERS.keys()))
    pool = list(dict.fromkeys(INTEREST_CLUSTERS[base] + INTEREST_CLUSTERS[alt]))
    if rng.random() < 0.35:
        extras = [i for i in ALL_INTERESTS if i not in pool]
        if extras:
            pool += rng.sample(extras, k=min(3, len(extras)))
    rng.shuffle(pool)
    return pool[:k]

def make_bio(name, age, city, interests):
    lead = rng.choice([
        "Powered by coffee and chaotic good energy.",
        "Part-time explorer, full-time snack enthusiast.",
        "Weekends = long walks + long playlists.",
        "Trying new things and new foods—recommendations welcome.",
        "Recovering overthinker, thriving bruncher.",
        "Swaps memes for restaurant tips.",
    ])
    hook = rng.choice([
        f"Into {interests[0].lower()} and {interests[1].lower()}",
        f"{interests[0]} > {interests[1]}? Discuss.",
        f"If you like {interests[0].lower()}, we’ll get along.",
        f"From {city}, chasing {interests[-1].lower()} vibes.",
        f"{interests[0]}, {interests[1]}, and probably {interests[-1].lower()}",
    ])
    closer = rng.choice([
        "Coffee then a walk?",
        "Open to spontaneous day trips.",
        "Here for good banter and better food.",
        "Teach me your niche skill.",
        "Playlist swaps encouraged.",
    ])
    return f"{lead} {hook}. {closer}"

def sample_languages(country, k_max=2):
    pool = COUNTRY_LANG_PALETTE.get(country, ["English"])
    k = 1 if len(pool) == 1 else rng.randint(1, min(k_max, len(pool)))
    return rng.sample(pool, k=k)

def sample_religion(country):
    pool = COUNTRY_RELIGION_PALETTE.get(country, ["Other"])
    return rng.choice(pool)

def country_to_ethnicity(country):
    return COUNTRY_TO_ETHNICITY.get(country, "Mixed/Other")

# ================================================================
# NEW: deterministic, seed-based IDs (stable across runs for same seed)
# ================================================================
def stable_profile_id(seed: int, i: int, name: str, city: str, gender: str) -> str:
    basis = f"{seed}::{i}::{name}::{city}::{gender}"
    return uuid.uuid5(uuid.NAMESPACE_URL, basis).hex[:8]

# ================================================================
# Main generator
# ================================================================
def make_world_profiles(n=N_PROFILES, seed=DEFAULT_SEED):
    rng.seed(seed)
    rows = []
    for i in range(n):
        region, country, city = sample_world_city()
        gender = sample_gender()
        name = sample_name(gender)
        age = sample_age(region)
        distance = sample_distance_km(region)
        interests = sample_interests(age, region)
        bio = make_bio(name, age, city, interests)
        languages = sample_languages(country)
        religion = sample_religion(country)
        ethnicity = country_to_ethnicity(country)

        pid = stable_profile_id(seed, i, name, city, gender)
        photo_url = photo_url_for(gender, ethnicity, pid)

        rows.append({
            "id": pid,
            "name": name,
            "age": age,
            "gender": gender,
            "region": region,
            "country": country,
            "city": city,
            "distance_km": distance,
            "ethnicity": ethnicity,
            "languages": languages,
            "religion": religion,
            "interests": interests,
            "about": bio,
            "photo_url": photo_url,
        })
    return pd.DataFrame(rows)

# ================================================================
# Optional: save with JSON-encoded list columns
# ================================================================
def save_profiles_csv(df: pd.DataFrame, path: str):
    out = df.copy()
    for col in ["languages", "interests"]:
        if col in out.columns:
            out[col] = out[col].apply(json.dumps, ensure_ascii=False)
    out.to_csv(path, index=False)

df = make_world_profiles(n=10000, seed=123)
# save_profiles_csv(df, "world_profiles.csv")


In [9]:
import pandas as pd

profiles = pd.read_csv("./data/profiles.csv")
viewers = pd.read_csv("./data/viewers.csv")
interactions = pd.read_csv("./data/interactions.csv", names=['datetime', 'viewer_id', 'viewer_name', 'profile_id','profile_name', 'status', 'score'])

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
profiles['text'] = profiles.drop(columns=['id']).astype(str).agg(" - ".join, axis=1)
embeddings = model.encode(profiles['text'], batch_size=128, convert_to_numpy=True, normalize_embeddings=True)
profiles['embedding'] = list(embeddings)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'profiles' is not defined

In [11]:
profiles[['id', 'embedding']]

Unnamed: 0,id,embedding
0,ssse1024,"[-0.005661143, -0.0029685695, 0.0457694, -0.01..."
1,d8334dc0,"[-0.002191868, -0.012887895, 0.05675405, 0.026..."
2,309e9139,"[0.030787738, 0.043839186, 0.0544222, -0.01888..."
3,7899f9e2,"[0.063246764, 0.0684222, 0.07976355, 0.0779395..."
4,fde9c9f3,"[-0.05164085, 0.0015087755, 0.010844314, 0.065..."
...,...,...
9996,c25d98dc,"[-0.061014995, -0.011367507, 0.0038794263, 0.0..."
9997,1fab6d6a,"[-0.03792872, -0.022427015, -0.032286942, 0.05..."
9998,c2ae6461,"[-0.050376404, -0.040498063, 0.023308842, 0.01..."
9999,acaeab0e,"[-0.0025407786, -0.049863584, -0.03467388, 0.0..."


In [8]:
profiles[['id', 'embedding']].to_csv("./data/profile_embedding.csv", index=False)

In [3]:
import pandas as pd
import random, uuid, json
from datetime import datetime

# ================================================================
# Config: global RNG & sizes
# ================================================================
DEFAULT_SEED = 123
N_PROFILES = 10_000
rng = random.Random(DEFAULT_SEED)

# ================================================================
# Taxonomies
# ================================================================
GENDERS = ["Woman", "Man", "Non-binary"]

INTEREST_CLUSTERS = {
    "Active": ["Hiking", "Running", "Yoga", "Dancing", "Photography"],
    "Arts": ["Art", "Theatre", "Poetry", "Movies", "Music"],
    "Geek": ["Tech", "Gaming", "Startups", "Board Games"],
    "Social": ["Foodie", "Travel", "Standup Comedy", "Volunteering"],
    "Sports": ["Cricket", "Football", "Basketball"],
}
ALL_INTERESTS = sorted({i for v in INTEREST_CLUSTERS.values() for i in v})

# Name pools (extend as you like)
FEMALE_FIRST = [
    "Aditi","Aarohi","Anaya","Diya","Isha","Myra","Sara","Siya","Tara","Zara",
    "Neha","Priya","Naina","Rhea","Meera","Anika","Kavya","Ritu","Pooja","Sana",
    "Anna","Maria","Sofia","Emma","Olivia","Mia","Aisha","Fatima","Yuna","Mei",
    "Camila","Valentina","Amara","Zainab","Helena","Elena","Giulia","Lina","Aya"
]
MALE_FIRST = [
    "Aarav","Vivaan","Aditya","Vihaan","Arjun","Sai","Krishna","Ishaan","Rohan","Kabir",
    "Raghav","Aman","Rajat","Varun","Anil","Rahul","Aakash","Nikhil","Sandeep","Yash",
    "Liam","Noah","Lucas","Mateo","Ethan","Leo","Hiro","Daichi","Minjun","Jae",
    "Luis","Diego","Andre","Omar","Youssef","Ali","Marco","Jonas","Felix","Tariq"
]
UNISEX_FIRST = ["Sam","Dev","Shiv","Arya","Sasha","Riyaan","Jai","Ray","Kiran","Alex","Charlie","Noor","Ariel","Jordan","Kai"]

# ================================================================
# Geography: India tiers + global regions
# Weights are rough, population-leaning proxies (tune freely)
# ================================================================
INDIA_TIERS = {
    "Tier-1": [
        ("India", "Mumbai", 10),
        ("India", "Delhi", 10),
        ("India", "Bengaluru", 9),
        ("India", "Hyderabad", 8),
        ("India", "Chennai", 7),
        ("India", "Kolkata", 7),
        ("India", "Pune", 6),
        ("India", "Ahmedabad", 5),
    ],
    "Tier-2": [
        ("India", "Jaipur", 4), ("India", "Surat", 4), ("India", "Lucknow", 4),
        ("India", "Kanpur", 3), ("India", "Nagpur", 3), ("India", "Indore", 3),
        ("India", "Bhopal", 3), ("India", "Chandigarh", 2), ("India", "Kochi", 2),
        ("India", "Coimbatore", 2),
    ],
    "Tier-3": [
        ("India", "Patna", 2), ("India", "Guwahati", 2), ("India", "Visakhapatnam", 2),
        ("India", "Vijayawada", 2), ("India", "Bhubaneswar", 2), ("India", "Thiruvananthapuram", 2),
        ("India", "Vadodara", 2), ("India", "Nashik", 2), ("India", "Ludhiana", 2), ("India", "Rajkot", 2),
    ],
}

WORLD_REGIONS = {
    "South Asia (non-India)": [
        ("Bangladesh", "Dhaka", 8), ("Bangladesh", "Chittagong", 3),
        ("Pakistan", "Karachi", 9), ("Pakistan", "Lahore", 6), ("Pakistan", "Islamabad", 2),
        ("Sri Lanka", "Colombo", 2), ("Nepal", "Kathmandu", 2),
    ],
    "East Asia": [
        ("Japan", "Tokyo", 10), ("Japan", "Osaka", 4),
        ("South Korea", "Seoul", 8), ("South Korea", "Busan", 3),
        ("China", "Shanghai", 10), ("China", "Beijing", 9), ("China", "Shenzhen", 7), ("China", "Guangzhou", 7),
        ("Taiwan", "Taipei", 4), ("Hong Kong", "Hong Kong", 5),
    ],
    "Southeast Asia": [
        ("Singapore", "Singapore", 6),
        ("Malaysia", "Kuala Lumpur", 4),
        ("Thailand", "Bangkok", 7),
        ("Indonesia", "Jakarta", 9), ("Vietnam", "Ho Chi Minh City", 6), ("Vietnam", "Hanoi", 5),
        ("Philippines", "Manila", 8),
    ],
    "North America": [
        ("USA", "New York", 9), ("USA", "Los Angeles", 8), ("USA", "Chicago", 6),
        ("USA", "San Francisco", 5), ("USA", "Houston", 5), ("USA", "Miami", 5),
        ("Canada", "Toronto", 6), ("Canada", "Vancouver", 4), ("Canada", "Montreal", 4),
        ("Mexico", "Mexico City", 9), ("Mexico", "Guadalajara", 4),
    ],
    "Europe": [
        ("UK", "London", 9), ("France", "Paris", 8), ("Germany", "Berlin", 6),
        ("Spain", "Madrid", 5), ("Spain", "Barcelona", 5),
        ("Italy", "Rome", 5), ("Italy", "Milan", 4),
        ("Netherlands", "Amsterdam", 4), ("Austria", "Vienna", 4), ("Sweden", "Stockholm", 3),
    ],
    "MENA": [
        ("UAE", "Dubai", 7), ("UAE", "Abu Dhabi", 4),
        ("Saudi Arabia", "Riyadh", 6), ("Saudi Arabia", "Jeddah", 5),
        ("Egypt", "Cairo", 8), ("Egypt", "Alexandria", 4),
        ("Türkiye", "Istanbul", 8), ("Morocco", "Casablanca", 4),
    ],
    "Sub-Saharan Africa": [
        ("Nigeria", "Lagos", 9), ("Nigeria", "Abuja", 4),
        ("Kenya", "Nairobi", 6), ("Kenya", "Mombasa", 3),
        ("Ghana", "Accra", 4), ("Ghana", "Kumasi", 3),
        ("South Africa", "Johannesburg", 5), ("South Africa", "Cape Town", 5), ("South Africa", "Durban", 3),
        ("Ethiopia", "Addis Ababa", 5),
    ],
    "Latin America": [
        ("Brazil", "São Paulo", 10), ("Brazil", "Rio de Janeiro", 7),
        ("Argentina", "Buenos Aires", 8), ("Chile", "Santiago", 6),
        ("Peru", "Lima", 7), ("Colombia", "Bogotá", 7), ("Colombia", "Medellín", 4),
        ("Ecuador", "Quito", 3), ("Uruguay", "Montevideo", 3),
    ],
    "Oceania": [
        ("Australia", "Sydney", 6), ("Australia", "Melbourne", 6),
        ("Australia", "Brisbane", 3), ("Australia", "Perth", 3),
        ("New Zealand", "Auckland", 3), ("New Zealand", "Wellington", 2),
    ],
}

def build_city_table(include_india=True, india_tier_bias=(0.5, 0.35, 0.15)):
    rows = []
    if include_india:
        tiers = ["Tier-1", "Tier-2", "Tier-3"]
        tier_w = dict(zip(tiers, india_tier_bias))
        for tier in tiers:
            for country, city, w in INDIA_TIERS[tier]:
                rows.append(("South Asia", country, city, w * (1 + 9 * tier_w[tier])))
    for region, cities in WORLD_REGIONS.items():
        for country, city, w in cities:
            rows.append((region, country, city, w))
    return rows

WORLD_CITY_TABLE = build_city_table()

# ================================================================
# Background attributes (coarse) — optional
# ================================================================
COUNTRY_LANG_PALETTE = {
    "India": ["Hindi","English","Bengali","Telugu","Marathi","Tamil","Urdu","Gujarati","Kannada","Malayalam","Punjabi"],
    "USA": ["English","Spanish"], "UK": ["English"], "Canada": ["English","French"],
    "Mexico": ["Spanish"], "Brazil": ["Portuguese"], "France": ["French"], "Germany": ["German"],
    "Spain": ["Spanish","Catalan"], "Italy": ["Italian"], "Netherlands": ["Dutch"], "Sweden": ["Swedish"],
    "Turkey": ["Turkish"], "UAE": ["Arabic","English"], "Saudi Arabia": ["Arabic"], "Egypt": ["Arabic"],
    "Nigeria": ["English","Yoruba","Hausa","Igbo"], "Kenya": ["English","Swahili"],
    "South Africa": ["English","Zulu","Xhosa","Afrikaans"], "Ethiopia": ["Amharic","Oromo","Tigrinya"],
    "Bangladesh": ["Bengali"], "Pakistan": ["Urdu","Punjabi","Pashto","Sindhi"], "Sri Lanka": ["Sinhala","Tamil"],
    "Nepal": ["Nepali"], "Japan": ["Japanese"], "South Korea": ["Korean"], "China": ["Mandarin"],
    "Hong Kong": ["Cantonese","English"], "Taiwan": ["Mandarin"], "Singapore": ["English","Mandarin","Malay","Tamil"],
    "Malaysia": ["Malay","English","Mandarin","Tamil"], "Thailand": ["Thai"], "Indonesia": ["Indonesian"],
    "Vietnam": ["Vietnamese"], "Philippines": ["Filipino","English"], "Australia": ["English"],
    "New Zealand": ["English","Māori"], "Argentina": ["Spanish"], "Chile": ["Spanish"], "Peru": ["Spanish"],
    "Colombia": ["Spanish"], "Uruguay": ["Spanish"], "Ecuador": ["Spanish"],
}

COUNTRY_RELIGION_PALETTE = {
    "India": ["Hindu","Muslim","Christian","Sikh","Buddhist","Jain","Other"],
    "USA": ["Christian","Unaffiliated","Jewish","Muslim","Hindu","Buddhist","Other"],
    "UK": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Jewish","Buddhist"],
    "Canada": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Buddhist","Jewish"],
    "Mexico": ["Christian","Unaffiliated","Other"], "Brazil": ["Christian","Spiritist","Afro-Brazilian","Unaffiliated","Other"],
    "France": ["Unaffiliated","Christian","Muslim","Jewish","Buddhist","Other"],
    "Germany": ["Christian","Unaffiliated","Muslim","Other"], "Spain": ["Christian","Unaffiliated","Other"],
    "Italy": ["Christian","Unaffiliated","Other"], "Netherlands": ["Unaffiliated","Christian","Muslim","Other"],
    "Sweden": ["Unaffiliated","Christian","Other"], "Turkey": ["Muslim","Other"], "UAE": ["Muslim","Christian","Hindu","Buddhist","Other"],
    "Saudi Arabia": ["Muslim","Other"], "Egypt": ["Muslim","Christian","Other"],
    "Nigeria": ["Christian","Muslim","Traditional","Other"], "Kenya": ["Christian","Muslim","Traditional","Other"],
    "South Africa": ["Christian","Traditional","Unaffiliated","Other"], "Ethiopia": ["Christian","Muslim","Other"],
    "Bangladesh": ["Muslim","Hindu","Other"], "Pakistan": ["Muslim","Other"], "Sri Lanka": ["Buddhist","Hindu","Muslim","Christian"],
    "Nepal": ["Hindu","Buddhist","Other"], "Japan": ["Shinto","Buddhist","Other"], "South Korea": ["Unaffiliated","Christian","Buddhist","Other"],
    "China": ["Unaffiliated","Folk/Traditional","Buddhist","Christian","Other"], "Hong Kong": ["Buddhist","Taoist","Christian","Other"],
    "Taiwan": ["Folk/Traditional","Buddhist","Taoist","Other"], "Singapore": ["Buddhist","Taoist","Muslim","Christian","Hindu","Other"],
    "Malaysia": ["Muslim","Buddhist","Christian","Hindu","Other"], "Thailand": ["Buddhist","Other"],
    "Indonesia": ["Muslim","Christian","Hindu","Buddhist","Other"], "Vietnam": ["Unaffiliated","Buddhist","Christian","Other"],
    "Philippines": ["Christian","Muslim","Other"], "Australia": ["Christian","Unaffiliated","Other"],
    "New Zealand": ["Unaffiliated","Christian","Other"], "Argentina": ["Christian","Unaffiliated","Other"],
    "Chile": ["Christian","Unaffiliated","Other"], "Peru": ["Christian","Unaffiliated","Other"],
    "Colombia": ["Christian","Unaffiliated","Other"], "Uruguay": ["Unaffiliated","Christian","Other"],
    "Ecuador": ["Christian","Other"],
}

# ================================================================
# Ethnicity mapping (coarse)
# ================================================================
ETHNICITY_LABELS = [
    "South Asian","East Asian","Southeast Asian","Middle Eastern/North African",
    "Black/African","White/European","Latino/Hispanic","Pacific Islander","Mixed/Other"
]

COUNTRY_TO_ETHNICITY = {
    "India": "South Asian", "Pakistan": "South Asian", "Bangladesh": "South Asian",
    "Sri Lanka": "South Asian", "Nepal": "South Asian",
    "Japan": "East Asian", "South Korea": "East Asian", "China": "East Asian",
    "Taiwan": "East Asian", "Hong Kong": "East Asian",
    "Singapore": "Southeast Asian", "Malaysia": "Southeast Asian", "Thailand": "Southeast Asian",
    "Indonesia": "Southeast Asian", "Vietnam": "Southeast Asian", "Philippines": "Southeast Asian",
    "UAE": "Middle Eastern/North African", "Saudi Arabia": "Middle Eastern/North African",
    "Egypt": "Middle Eastern/North African", "Türkiye": "Middle Eastern/North African", "Morocco": "Middle Eastern/North African",
    "Nigeria": "Black/African", "Kenya": "Black/African", "Ghana": "Black/African",
    "South Africa": "Black/African", "Ethiopia": "Black/African",
    "Brazil": "Latino/Hispanic", "Argentina": "Latino/Hispanic", "Chile": "Latino/Hispanic",
    "Peru": "Latino/Hispanic", "Colombia": "Latino/Hispanic", "Uruguay": "Latino/Hispanic",
    "USA": "White/European", "Canada": "White/European", "Mexico": "Latino/Hispanic",
    "UK": "White/European", "France": "White/European", "Germany": "White/European",
    "Spain": "White/European", "Italy": "White/European", "Netherlands": "White/European",
    "Austria": "White/European", "Sweden": "White/European",
    "Australia": "White/European", "New Zealand": "White/European",
}

# ================================================================
# Photo provider
# ================================================================
PHOTO_CATALOG = {
    "South Asian": [],
    "East Asian": [],
    "Southeast Asian": [],
    "Middle Eastern/North African": [],
    "Black/African": [],
    "White/European": [],
    "Latino/Hispanic": [],
    "Pacific Islander": [],
    "Mixed/Other": [],
}

def randomuser_url(pid: str, gender: str) -> str:
    idx = int(pid, 16) % 100
    if gender == "Woman":
        folder = "women"
    elif gender == "Man":
        folder = "men"
    else:
        folder = "women" if (idx % 2 == 0) else "men"
    return f"https://randomuser.me/api/portraits/{folder}/{idx}.jpg"

def photo_url_for(gender: str, ethnicity: str, pid: str) -> str:
    pool = PHOTO_CATALOG.get(ethnicity, [])
    if pool:
        return pool[int(pid, 16) % len(pool)]
    return randomuser_url(pid, gender)

# ================================================================
# Helpers
# ================================================================
def weighted_choice(items, weights):
    return rng.choices(items, weights=weights, k=1)[0]

def build_city_table(include_india=True, india_tier_bias=(0.5, 0.35, 0.15)):
    rows = []
    if include_india:
        tiers = ["Tier-1", "Tier-2", "Tier-3"]
        tier_w = dict(zip(tiers, india_tier_bias))
        for tier in tiers:
            for country, city, w in INDIA_TIERS[tier]:
                rows.append(("South Asia", country, city, w * (1 + 9 * tier_w[tier])))
    for region, cities in WORLD_REGIONS.items():
        for country, city, w in cities:
            rows.append((region, country, city, w))
    return rows

WORLD_CITY_TABLE = build_city_table()

def sample_world_city():
    weights = [w for (_, _, _, w) in WORLD_CITY_TABLE]
    choices = [(r, ctry, cty) for (r, ctry, cty, _) in WORLD_CITY_TABLE]
    return rng.choices(choices, weights=weights, k=1)[0]

def sample_gender():
    return rng.choices(GENDERS, weights=[0.47, 0.47, 0.06], k=1)[0]

def sample_name(gender):
    if gender == "Woman":
        pool = FEMALE_FIRST + UNISEX_FIRST
    elif gender == "Man":
        pool = MALE_FIRST + UNISEX_FIRST
    else:
        pool = UNISEX_FIRST + FEMALE_FIRST[:10] + MALE_FIRST[:10]
    return rng.choice(pool)

def truncated_normal(mean, sd, lo, hi):
    while True:
        x = rng.gauss(mean, sd)
        if lo <= x <= hi:
            return int(round(x))

def sample_age(region):
    mean = 27
    if region in {"Europe","North America"}: mean = 29
    if region in {"South Asia","South Asia (non-India)","Africa","Sub-Saharan Africa"}: mean = 26
    return truncated_normal(mean, 4.5, 21, 45)

def sample_distance_km(region):
    lam = 1 / 6.0
    val = int(round(min(30, max(1, rng.expovariate(lam)))))
    if region in {"Europe","North America"} and rng.random() < 0.25:
        val = min(30, val + rng.randint(2,5))
    return val

def pick_interest_cluster(age, region):
    w = {"Active":1,"Arts":1,"Geek":1,"Social":1,"Sports":1}
    if age <= 26: w["Geek"] += 0.6; w["Social"] += 0.4
    if age >= 30: w["Arts"] += 0.4; w["Active"] += 0.2
    if region in {"Europe","North America"}: w["Arts"] += 0.2
    if region in {"South Asia","South Asia (non-India)","East Asia"}: w["Geek"] += 0.3
    keys = list(INTEREST_CLUSTERS.keys())
    return rng.choices(keys, weights=[w[k] for k in keys], k=1)[0]

def sample_interests(age, region):
    k = rng.randint(3,6)
    base = pick_interest_cluster(age, region)
    alt = base if rng.random() < 0.6 else rng.choice(list(INTEREST_CLUSTERS.keys()))
    pool = list(dict.fromkeys(INTEREST_CLUSTERS[base] + INTEREST_CLUSTERS[alt]))
    if rng.random() < 0.35:
        extras = [i for i in ALL_INTERESTS if i not in pool]
        if extras:
            pool += rng.sample(extras, k=min(3, len(extras)))
    rng.shuffle(pool)
    return pool[:k]

def make_bio(name, age, city, interests):
    lead = rng.choice([
        "Powered by coffee and chaotic good energy.",
        "Part-time explorer, full-time snack enthusiast.",
        "Weekends = long walks + long playlists.",
        "Trying new things and new foods—recommendations welcome.",
        "Recovering overthinker, thriving bruncher.",
        "Swaps memes for restaurant tips.",
    ])
    hook = rng.choice([
        f"Into {interests[0].lower()} and {interests[1].lower()}",
        f"{interests[0]} > {interests[1]}? Discuss.",
        f"If you like {interests[0].lower()}, we’ll get along.",
        f"From {city}, chasing {interests[-1].lower()} vibes.",
        f"{interests[0]}, {interests[1]}, and probably {interests[-1].lower()}",
    ])
    closer = rng.choice([
        "Coffee then a walk?",
        "Open to spontaneous day trips.",
        "Here for good banter and better food.",
        "Teach me your niche skill.",
        "Playlist swaps encouraged.",
    ])
    return f"{lead} {hook}. {closer}"

def sample_languages(country, k_max=2):
    pool = COUNTRY_LANG_PALETTE.get(country, ["English"])
    k = 1 if len(pool) == 1 else rng.randint(1, min(k_max, len(pool)))
    return rng.sample(pool, k=k)

def sample_religion(country):
    pool = COUNTRY_RELIGION_PALETTE.get(country, ["Other"])
    return rng.choice(pool)

def country_to_ethnicity(country):
    return COUNTRY_TO_ETHNICITY.get(country, "Mixed/Other")

# ================================================================
# NEW: deterministic, seed-based IDs (stable across runs for same seed)
# ================================================================
def stable_profile_id(seed: int, i: int, name: str, city: str, gender: str) -> str:
    basis = f"{seed}::{i}::{name}::{city}::{gender}"
    return uuid.uuid5(uuid.NAMESPACE_URL, basis).hex[:8]

# ================================================================
# Main generator
# ================================================================
def make_world_profiles(n=N_PROFILES, seed=DEFAULT_SEED):
    rng.seed(seed)
    rows = []
    for i in range(n):
        region, country, city = sample_world_city()
        gender = sample_gender()
        name = sample_name(gender)
        age = sample_age(region)
        distance = sample_distance_km(region)
        interests = sample_interests(age, region)
        bio = make_bio(name, age, city, interests)
        languages = sample_languages(country)
        religion = sample_religion(country)
        ethnicity = country_to_ethnicity(country)

        pid = stable_profile_id(seed, i, name, city, gender)
        photo_url = photo_url_for(gender, ethnicity, pid)

        rows.append({
            "id": pid,
            "name": name,
            "age": age,
            "gender": gender,
            "region": region,
            "country": country,
            "city": city,
            "distance_km": distance,
            "ethnicity": ethnicity,
            "languages": languages,
            "religion": religion,
            "interests": interests,
            "about": bio,
            "photo_url": photo_url,
        })
    return pd.DataFrame(rows)

# ================================================================
# Optional: save with JSON-encoded list columns
# ================================================================
def save_profiles_csv(df: pd.DataFrame, path: str):
    out = df.copy()
    for col in ["languages", "interests"]:
        if col in out.columns:
            out[col] = out[col].apply(json.dumps, ensure_ascii=False)
    out.to_csv(path, index=False)


# save_profiles_csv(df, "world_profiles.csv")




In [2]:
import sys
from supabase import create_client, Client

url: str = "https://tpquhacpoxoschgsarie.supabase.co"
key: str = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InRwcXVoYWNwb3hvc2NoZ3NhcmllIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NjMyMTk3NjksImV4cCI6MjA3ODc5NTc2OX0.T06IB1qnCr8eL1BCvuSypVkS7Cgeu5wdnE8QrSWmb-w"

if not url or not key:
    print("Please set SUPABASE_URL and SUPABASE_KEY environment variables.", file=sys.stderr)
    sys.exit(1)

supabase: Client = create_client(url, key)



In [25]:
import os
import time
from supabase import create_client
from postgrest import APIError  # may raise; keep for clarity but we catch generic Exception too


def normalize_profile(p):
    # minimal normalizer from earlier
    if 'distance_km' in p:
        try:
            fv = float(p['distance_km'])
            p['distance_km'] = int(fv) if fv.is_integer() else fv
        except Exception:
            p.pop('distance_km', None)
    if 'languages' in p and not isinstance(p['languages'], list):
        p['languages'] = [p['languages']]
    if 'interests' in p and not isinstance(p['interests'], list):
        p['interests'] = [p['interests']]
    if 'age' in p:
        try:
            p['age'] = int(p['age'])
        except Exception:
            p['age'] = None
    return p

def safe_response_to_dict(resp):
    """
    Convert the API response to a plain dict in a way that works for
    multiple client versions (pydantic v1 .dict(), v2 .model_dump()).
    If neither exists, fall back to str(resp).
    """
    if resp is None:
        return {"raw": None}
    # try common attributes first
    for attr in ("data", "error", "status_code", "status"):
        if hasattr(resp, attr):
            # We won't return early; we'll build a dict below.
            pass
    # try pydantic model_dump (v2)
    try:
        return resp.model_dump()
    except Exception:
        pass
    # try pydantic .dict() (v1)
    try:
        return resp.dict()
    except Exception:
        pass
    # try to turn into map-like via .__dict__
    try:
        return dict(resp.__dict__)
    except Exception:
        return {"raw": str(resp)}

def insert_profiles(rows, upsert=False):
    """
    Insert or upsert rows and return a plain dict with keys like:
      { "data": ..., "error": ..., "status_code": ... }
    """
    try:
        table = supabase.table('people')
        if upsert:
            resp = table.upsert(rows).execute()
        else:
            resp = table.insert(rows).execute()
    except Exception as e:
        # This catches APIError, HTTP errors, connection issues, etc.
        return {"data": None, "error": {"message": str(e), "type": type(e).__name__}}

    resp_dict = safe_response_to_dict(resp)

    # Standardize common shapes: some clients return {"data":..., "error":...}
    # Others wrap in top-level keys. Try to detect.
    if "error" not in resp_dict and "data" not in resp_dict:
        # maybe the dict has top-level keys like 'body' or similar; just return for inspection
        # but try to detect nested pattern
        for k in ("body", "result", "response"):
            if k in resp_dict and isinstance(resp_dict[k], dict):
                nested = resp_dict[k]
                if "error" in nested or "data" in nested:
                    return nested
        # fallback
        return resp_dict

    return resp_dict


# Example chunked loader using the robust function:
def full_load(profiles, chunk_size=200, upsert=True):

    import sys
    from supabase import create_client, Client

    url: str = "https://tpquhacpoxoschgsarie.supabase.co"
    key: str = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InRwcXVoYWNwb3hvc2NoZ3NhcmllIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NjMyMTk3NjksImV4cCI6MjA3ODc5NTc2OX0.T06IB1qnCr8eL1BCvuSypVkS7Cgeu5wdnE8QrSWmb-w"

    if not url or not key:
        print("Please set SUPABASE_URL and SUPABASE_KEY environment variables.", file=sys.stderr)
        sys.exit(1)

    supabase: Client = create_client(url, key)
    # normalize in-place (or copy if you prefer)

    import pandas as pd
    import random, uuid, json
    from datetime import datetime

    # ================================================================
    # Config: global RNG & sizes
    # ================================================================
    DEFAULT_SEED = 123
    N_PROFILES = 10_000
    rng = random.Random(DEFAULT_SEED)

    # ================================================================
    # Taxonomies
    # ================================================================
    GENDERS = ["Woman", "Man", "Non-binary"]

    INTEREST_CLUSTERS = {
        "Active": ["Hiking", "Running", "Yoga", "Dancing", "Photography"],
        "Arts": ["Art", "Theatre", "Poetry", "Movies", "Music"],
        "Geek": ["Tech", "Gaming", "Startups", "Board Games"],
        "Social": ["Foodie", "Travel", "Standup Comedy", "Volunteering"],
        "Sports": ["Cricket", "Football", "Basketball"],
    }
    ALL_INTERESTS = sorted({i for v in INTEREST_CLUSTERS.values() for i in v})

    # Name pools (extend as you like)
    FEMALE_FIRST = [
        "Aditi","Aarohi","Anaya","Diya","Isha","Myra","Sara","Siya","Tara","Zara",
        "Neha","Priya","Naina","Rhea","Meera","Anika","Kavya","Ritu","Pooja","Sana",
        "Anna","Maria","Sofia","Emma","Olivia","Mia","Aisha","Fatima","Yuna","Mei",
        "Camila","Valentina","Amara","Zainab","Helena","Elena","Giulia","Lina","Aya"
    ]
    MALE_FIRST = [
        "Aarav","Vivaan","Aditya","Vihaan","Arjun","Sai","Krishna","Ishaan","Rohan","Kabir",
        "Raghav","Aman","Rajat","Varun","Anil","Rahul","Aakash","Nikhil","Sandeep","Yash",
        "Liam","Noah","Lucas","Mateo","Ethan","Leo","Hiro","Daichi","Minjun","Jae",
        "Luis","Diego","Andre","Omar","Youssef","Ali","Marco","Jonas","Felix","Tariq"
    ]
    UNISEX_FIRST = ["Sam","Dev","Shiv","Arya","Sasha","Riyaan","Jai","Ray","Kiran","Alex","Charlie","Noor","Ariel","Jordan","Kai"]

    # ================================================================
    # Geography: India tiers + global regions
    # Weights are rough, population-leaning proxies (tune freely)
    # ================================================================
    INDIA_TIERS = {
        "Tier-1": [
            ("India", "Mumbai", 10),
            ("India", "Delhi", 10),
            ("India", "Bengaluru", 9),
            ("India", "Hyderabad", 8),
            ("India", "Chennai", 7),
            ("India", "Kolkata", 7),
            ("India", "Pune", 6),
            ("India", "Ahmedabad", 5),
        ],
        "Tier-2": [
            ("India", "Jaipur", 4), ("India", "Surat", 4), ("India", "Lucknow", 4),
            ("India", "Kanpur", 3), ("India", "Nagpur", 3), ("India", "Indore", 3),
            ("India", "Bhopal", 3), ("India", "Chandigarh", 2), ("India", "Kochi", 2),
            ("India", "Coimbatore", 2),
        ],
        "Tier-3": [
            ("India", "Patna", 2), ("India", "Guwahati", 2), ("India", "Visakhapatnam", 2),
            ("India", "Vijayawada", 2), ("India", "Bhubaneswar", 2), ("India", "Thiruvananthapuram", 2),
            ("India", "Vadodara", 2), ("India", "Nashik", 2), ("India", "Ludhiana", 2), ("India", "Rajkot", 2),
        ],
    }

    WORLD_REGIONS = {
        "South Asia (non-India)": [
            ("Bangladesh", "Dhaka", 8), ("Bangladesh", "Chittagong", 3),
            ("Pakistan", "Karachi", 9), ("Pakistan", "Lahore", 6), ("Pakistan", "Islamabad", 2),
            ("Sri Lanka", "Colombo", 2), ("Nepal", "Kathmandu", 2),
        ],
        "East Asia": [
            ("Japan", "Tokyo", 10), ("Japan", "Osaka", 4),
            ("South Korea", "Seoul", 8), ("South Korea", "Busan", 3),
            ("China", "Shanghai", 10), ("China", "Beijing", 9), ("China", "Shenzhen", 7), ("China", "Guangzhou", 7),
            ("Taiwan", "Taipei", 4), ("Hong Kong", "Hong Kong", 5),
        ],
        "Southeast Asia": [
            ("Singapore", "Singapore", 6),
            ("Malaysia", "Kuala Lumpur", 4),
            ("Thailand", "Bangkok", 7),
            ("Indonesia", "Jakarta", 9), ("Vietnam", "Ho Chi Minh City", 6), ("Vietnam", "Hanoi", 5),
            ("Philippines", "Manila", 8),
        ],
        "North America": [
            ("USA", "New York", 9), ("USA", "Los Angeles", 8), ("USA", "Chicago", 6),
            ("USA", "San Francisco", 5), ("USA", "Houston", 5), ("USA", "Miami", 5),
            ("Canada", "Toronto", 6), ("Canada", "Vancouver", 4), ("Canada", "Montreal", 4),
            ("Mexico", "Mexico City", 9), ("Mexico", "Guadalajara", 4),
        ],
        "Europe": [
            ("UK", "London", 9), ("France", "Paris", 8), ("Germany", "Berlin", 6),
            ("Spain", "Madrid", 5), ("Spain", "Barcelona", 5),
            ("Italy", "Rome", 5), ("Italy", "Milan", 4),
            ("Netherlands", "Amsterdam", 4), ("Austria", "Vienna", 4), ("Sweden", "Stockholm", 3),
        ],
        "MENA": [
            ("UAE", "Dubai", 7), ("UAE", "Abu Dhabi", 4),
            ("Saudi Arabia", "Riyadh", 6), ("Saudi Arabia", "Jeddah", 5),
            ("Egypt", "Cairo", 8), ("Egypt", "Alexandria", 4),
            ("Türkiye", "Istanbul", 8), ("Morocco", "Casablanca", 4),
        ],
        "Sub-Saharan Africa": [
            ("Nigeria", "Lagos", 9), ("Nigeria", "Abuja", 4),
            ("Kenya", "Nairobi", 6), ("Kenya", "Mombasa", 3),
            ("Ghana", "Accra", 4), ("Ghana", "Kumasi", 3),
            ("South Africa", "Johannesburg", 5), ("South Africa", "Cape Town", 5), ("South Africa", "Durban", 3),
            ("Ethiopia", "Addis Ababa", 5),
        ],
        "Latin America": [
            ("Brazil", "São Paulo", 10), ("Brazil", "Rio de Janeiro", 7),
            ("Argentina", "Buenos Aires", 8), ("Chile", "Santiago", 6),
            ("Peru", "Lima", 7), ("Colombia", "Bogotá", 7), ("Colombia", "Medellín", 4),
            ("Ecuador", "Quito", 3), ("Uruguay", "Montevideo", 3),
        ],
        "Oceania": [
            ("Australia", "Sydney", 6), ("Australia", "Melbourne", 6),
            ("Australia", "Brisbane", 3), ("Australia", "Perth", 3),
            ("New Zealand", "Auckland", 3), ("New Zealand", "Wellington", 2),
        ],
    }

    def build_city_table(include_india=True, india_tier_bias=(0.5, 0.35, 0.15)):
        rows = []
        if include_india:
            tiers = ["Tier-1", "Tier-2", "Tier-3"]
            tier_w = dict(zip(tiers, india_tier_bias))
            for tier in tiers:
                for country, city, w in INDIA_TIERS[tier]:
                    rows.append(("South Asia", country, city, w * (1 + 9 * tier_w[tier])))
        for region, cities in WORLD_REGIONS.items():
            for country, city, w in cities:
                rows.append((region, country, city, w))
        return rows

    WORLD_CITY_TABLE = build_city_table()

    # ================================================================
    # Background attributes (coarse) — optional
    # ================================================================
    COUNTRY_LANG_PALETTE = {
        "India": ["Hindi","English","Bengali","Telugu","Marathi","Tamil","Urdu","Gujarati","Kannada","Malayalam","Punjabi"],
        "USA": ["English","Spanish"], "UK": ["English"], "Canada": ["English","French"],
        "Mexico": ["Spanish"], "Brazil": ["Portuguese"], "France": ["French"], "Germany": ["German"],
        "Spain": ["Spanish","Catalan"], "Italy": ["Italian"], "Netherlands": ["Dutch"], "Sweden": ["Swedish"],
        "Turkey": ["Turkish"], "UAE": ["Arabic","English"], "Saudi Arabia": ["Arabic"], "Egypt": ["Arabic"],
        "Nigeria": ["English","Yoruba","Hausa","Igbo"], "Kenya": ["English","Swahili"],
        "South Africa": ["English","Zulu","Xhosa","Afrikaans"], "Ethiopia": ["Amharic","Oromo","Tigrinya"],
        "Bangladesh": ["Bengali"], "Pakistan": ["Urdu","Punjabi","Pashto","Sindhi"], "Sri Lanka": ["Sinhala","Tamil"],
        "Nepal": ["Nepali"], "Japan": ["Japanese"], "South Korea": ["Korean"], "China": ["Mandarin"],
        "Hong Kong": ["Cantonese","English"], "Taiwan": ["Mandarin"], "Singapore": ["English","Mandarin","Malay","Tamil"],
        "Malaysia": ["Malay","English","Mandarin","Tamil"], "Thailand": ["Thai"], "Indonesia": ["Indonesian"],
        "Vietnam": ["Vietnamese"], "Philippines": ["Filipino","English"], "Australia": ["English"],
        "New Zealand": ["English","Māori"], "Argentina": ["Spanish"], "Chile": ["Spanish"], "Peru": ["Spanish"],
        "Colombia": ["Spanish"], "Uruguay": ["Spanish"], "Ecuador": ["Spanish"],
    }

    COUNTRY_RELIGION_PALETTE = {
        "India": ["Hindu","Muslim","Christian","Sikh","Buddhist","Jain","Other"],
        "USA": ["Christian","Unaffiliated","Jewish","Muslim","Hindu","Buddhist","Other"],
        "UK": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Jewish","Buddhist"],
        "Canada": ["Christian","Unaffiliated","Muslim","Hindu","Sikh","Buddhist","Jewish"],
        "Mexico": ["Christian","Unaffiliated","Other"], "Brazil": ["Christian","Spiritist","Afro-Brazilian","Unaffiliated","Other"],
        "France": ["Unaffiliated","Christian","Muslim","Jewish","Buddhist","Other"],
        "Germany": ["Christian","Unaffiliated","Muslim","Other"], "Spain": ["Christian","Unaffiliated","Other"],
        "Italy": ["Christian","Unaffiliated","Other"], "Netherlands": ["Unaffiliated","Christian","Muslim","Other"],
        "Sweden": ["Unaffiliated","Christian","Other"], "Turkey": ["Muslim","Other"], "UAE": ["Muslim","Christian","Hindu","Buddhist","Other"],
        "Saudi Arabia": ["Muslim","Other"], "Egypt": ["Muslim","Christian","Other"],
        "Nigeria": ["Christian","Muslim","Traditional","Other"], "Kenya": ["Christian","Muslim","Traditional","Other"],
        "South Africa": ["Christian","Traditional","Unaffiliated","Other"], "Ethiopia": ["Christian","Muslim","Other"],
        "Bangladesh": ["Muslim","Hindu","Other"], "Pakistan": ["Muslim","Other"], "Sri Lanka": ["Buddhist","Hindu","Muslim","Christian"],
        "Nepal": ["Hindu","Buddhist","Other"], "Japan": ["Shinto","Buddhist","Other"], "South Korea": ["Unaffiliated","Christian","Buddhist","Other"],
        "China": ["Unaffiliated","Folk/Traditional","Buddhist","Christian","Other"], "Hong Kong": ["Buddhist","Taoist","Christian","Other"],
        "Taiwan": ["Folk/Traditional","Buddhist","Taoist","Other"], "Singapore": ["Buddhist","Taoist","Muslim","Christian","Hindu","Other"],
        "Malaysia": ["Muslim","Buddhist","Christian","Hindu","Other"], "Thailand": ["Buddhist","Other"],
        "Indonesia": ["Muslim","Christian","Hindu","Buddhist","Other"], "Vietnam": ["Unaffiliated","Buddhist","Christian","Other"],
        "Philippines": ["Christian","Muslim","Other"], "Australia": ["Christian","Unaffiliated","Other"],
        "New Zealand": ["Unaffiliated","Christian","Other"], "Argentina": ["Christian","Unaffiliated","Other"],
        "Chile": ["Christian","Unaffiliated","Other"], "Peru": ["Christian","Unaffiliated","Other"],
        "Colombia": ["Christian","Unaffiliated","Other"], "Uruguay": ["Unaffiliated","Christian","Other"],
        "Ecuador": ["Christian","Other"],
    }

    # ================================================================
    # Ethnicity mapping (coarse)
    # ================================================================
    ETHNICITY_LABELS = [
        "South Asian","East Asian","Southeast Asian","Middle Eastern/North African",
        "Black/African","White/European","Latino/Hispanic","Pacific Islander","Mixed/Other"
    ]

    COUNTRY_TO_ETHNICITY = {
        "India": "South Asian", "Pakistan": "South Asian", "Bangladesh": "South Asian",
        "Sri Lanka": "South Asian", "Nepal": "South Asian",
        "Japan": "East Asian", "South Korea": "East Asian", "China": "East Asian",
        "Taiwan": "East Asian", "Hong Kong": "East Asian",
        "Singapore": "Southeast Asian", "Malaysia": "Southeast Asian", "Thailand": "Southeast Asian",
        "Indonesia": "Southeast Asian", "Vietnam": "Southeast Asian", "Philippines": "Southeast Asian",
        "UAE": "Middle Eastern/North African", "Saudi Arabia": "Middle Eastern/North African",
        "Egypt": "Middle Eastern/North African", "Türkiye": "Middle Eastern/North African", "Morocco": "Middle Eastern/North African",
        "Nigeria": "Black/African", "Kenya": "Black/African", "Ghana": "Black/African",
        "South Africa": "Black/African", "Ethiopia": "Black/African",
        "Brazil": "Latino/Hispanic", "Argentina": "Latino/Hispanic", "Chile": "Latino/Hispanic",
        "Peru": "Latino/Hispanic", "Colombia": "Latino/Hispanic", "Uruguay": "Latino/Hispanic",
        "USA": "White/European", "Canada": "White/European", "Mexico": "Latino/Hispanic",
        "UK": "White/European", "France": "White/European", "Germany": "White/European",
        "Spain": "White/European", "Italy": "White/European", "Netherlands": "White/European",
        "Austria": "White/European", "Sweden": "White/European",
        "Australia": "White/European", "New Zealand": "White/European",
    }

    # ================================================================
    # Photo provider
    # ================================================================
    PHOTO_CATALOG = {
        "South Asian": [],
        "East Asian": [],
        "Southeast Asian": [],
        "Middle Eastern/North African": [],
        "Black/African": [],
        "White/European": [],
        "Latino/Hispanic": [],
        "Pacific Islander": [],
        "Mixed/Other": [],
    }

    def randomuser_url(pid: str, gender: str) -> str:
        idx = int(pid, 16) % 100
        if gender == "Woman":
            folder = "women"
        elif gender == "Man":
            folder = "men"
        else:
            folder = "women" if (idx % 2 == 0) else "men"
        return f"https://randomuser.me/api/portraits/{folder}/{idx}.jpg"

    def photo_url_for(gender: str, ethnicity: str, pid: str) -> str:
        pool = PHOTO_CATALOG.get(ethnicity, [])
        if pool:
            return pool[int(pid, 16) % len(pool)]
        return randomuser_url(pid, gender)

    # ================================================================
    # Helpers
    # ================================================================
    def weighted_choice(items, weights):
        return rng.choices(items, weights=weights, k=1)[0]

    def build_city_table(include_india=True, india_tier_bias=(0.5, 0.35, 0.15)):
        rows = []
        if include_india:
            tiers = ["Tier-1", "Tier-2", "Tier-3"]
            tier_w = dict(zip(tiers, india_tier_bias))
            for tier in tiers:
                for country, city, w in INDIA_TIERS[tier]:
                    rows.append(("South Asia", country, city, w * (1 + 9 * tier_w[tier])))
        for region, cities in WORLD_REGIONS.items():
            for country, city, w in cities:
                rows.append((region, country, city, w))
        return rows

    WORLD_CITY_TABLE = build_city_table()

    def sample_world_city():
        weights = [w for (_, _, _, w) in WORLD_CITY_TABLE]
        choices = [(r, ctry, cty) for (r, ctry, cty, _) in WORLD_CITY_TABLE]
        return rng.choices(choices, weights=weights, k=1)[0]

    def sample_gender():
        return rng.choices(GENDERS, weights=[0.47, 0.47, 0.06], k=1)[0]

    def sample_name(gender):
        if gender == "Woman":
            pool = FEMALE_FIRST + UNISEX_FIRST
        elif gender == "Man":
            pool = MALE_FIRST + UNISEX_FIRST
        else:
            pool = UNISEX_FIRST + FEMALE_FIRST[:10] + MALE_FIRST[:10]
        return rng.choice(pool)

    def truncated_normal(mean, sd, lo, hi):
        while True:
            x = rng.gauss(mean, sd)
            if lo <= x <= hi:
                return int(round(x))

    def sample_age(region):
        mean = 27
        if region in {"Europe","North America"}: mean = 29
        if region in {"South Asia","South Asia (non-India)","Africa","Sub-Saharan Africa"}: mean = 26
        return truncated_normal(mean, 4.5, 21, 45)

    def sample_distance_km(region):
        lam = 1 / 6.0
        val = int(round(min(30, max(1, rng.expovariate(lam)))))
        if region in {"Europe","North America"} and rng.random() < 0.25:
            val = min(30, val + rng.randint(2,5))
        return val

    def pick_interest_cluster(age, region):
        w = {"Active":1,"Arts":1,"Geek":1,"Social":1,"Sports":1}
        if age <= 26: w["Geek"] += 0.6; w["Social"] += 0.4
        if age >= 30: w["Arts"] += 0.4; w["Active"] += 0.2
        if region in {"Europe","North America"}: w["Arts"] += 0.2
        if region in {"South Asia","South Asia (non-India)","East Asia"}: w["Geek"] += 0.3
        keys = list(INTEREST_CLUSTERS.keys())
        return rng.choices(keys, weights=[w[k] for k in keys], k=1)[0]

    def sample_interests(age, region):
        k = rng.randint(3,6)
        base = pick_interest_cluster(age, region)
        alt = base if rng.random() < 0.6 else rng.choice(list(INTEREST_CLUSTERS.keys()))
        pool = list(dict.fromkeys(INTEREST_CLUSTERS[base] + INTEREST_CLUSTERS[alt]))
        if rng.random() < 0.35:
            extras = [i for i in ALL_INTERESTS if i not in pool]
            if extras:
                pool += rng.sample(extras, k=min(3, len(extras)))
        rng.shuffle(pool)
        return pool[:k]

    def make_bio(name, age, city, interests):
        lead = rng.choice([
            "Powered by coffee and chaotic good energy.",
            "Part-time explorer, full-time snack enthusiast.",
            "Weekends = long walks + long playlists.",
            "Trying new things and new foods—recommendations welcome.",
            "Recovering overthinker, thriving bruncher.",
            "Swaps memes for restaurant tips.",
        ])
        hook = rng.choice([
            f"Into {interests[0].lower()} and {interests[1].lower()}",
            f"{interests[0]} > {interests[1]}? Discuss.",
            f"If you like {interests[0].lower()}, we’ll get along.",
            f"From {city}, chasing {interests[-1].lower()} vibes.",
            f"{interests[0]}, {interests[1]}, and probably {interests[-1].lower()}",
        ])
        closer = rng.choice([
            "Coffee then a walk?",
            "Open to spontaneous day trips.",
            "Here for good banter and better food.",
            "Teach me your niche skill.",
            "Playlist swaps encouraged.",
        ])
        return f"{lead} {hook}. {closer}"

    def sample_languages(country, k_max=2):
        pool = COUNTRY_LANG_PALETTE.get(country, ["English"])
        k = 1 if len(pool) == 1 else rng.randint(1, min(k_max, len(pool)))
        return rng.sample(pool, k=k)

    def sample_religion(country):
        pool = COUNTRY_RELIGION_PALETTE.get(country, ["Other"])
        return rng.choice(pool)

    def country_to_ethnicity(country):
        return COUNTRY_TO_ETHNICITY.get(country, "Mixed/Other")

    # ================================================================
    # NEW: deterministic, seed-based IDs (stable across runs for same seed)
    # ================================================================
    def stable_profile_id(seed: int, i: int, name: str, city: str, gender: str) -> str:
        basis = f"{seed}::{i}::{name}::{city}::{gender}"
        return uuid.uuid5(uuid.NAMESPACE_URL, basis).hex[:8]

    # ================================================================
    # Main generator
    # ================================================================
    def make_world_profiles(n=N_PROFILES, seed=DEFAULT_SEED):
        rng.seed(seed)
        rows = []
        for i in range(n):
            region, country, city = sample_world_city()
            gender = sample_gender()
            name = sample_name(gender)
            age = sample_age(region)
            distance = sample_distance_km(region)
            interests = sample_interests(age, region)
            bio = make_bio(name, age, city, interests)
            languages = sample_languages(country)
            religion = sample_religion(country)
            ethnicity = country_to_ethnicity(country)

            pid = stable_profile_id(seed, i, name, city, gender)
            photo_url = photo_url_for(gender, ethnicity, pid)

            rows.append({
                "id": pid,
                "name": name,
                "age": age,
                "gender": gender,
                "region": region,
                "country": country,
                "city": city,
                "distance_km": distance,
                "ethnicity": ethnicity,
                "languages": languages,
                "religion": religion,
                "interests": interests,
                "about": bio,
                "photo_url": photo_url,
            })
        return pd.DataFrame(rows)

    # ================================================================
    # Optional: save with JSON-encoded list columns
    # ================================================================
    def save_profiles_csv(df: pd.DataFrame, path: str):
        out = df.copy()
        for col in ["languages", "interests"]:
            if col in out.columns:
                out[col] = out[col].apply(json.dumps, ensure_ascii=False)
        out.to_csv(path, index=False)

    profiles = make_world_profiles(n=10000)

    # save_profiles_csv(df, "world_profiles.csv")



    profiles = [normalize_profile(dict(p)) for p in profiles]

    def chunk_list(lst, n):
        for i in range(0, len(lst), n):
            yield lst[i:i+n]

    for idx, chunk in enumerate(chunk_list(profiles, chunk_size), start=1):
        resp = insert_profiles(chunk, upsert=upsert)
        # Print a concise but informative status
        if resp.get("error"):
            print(f"Chunk {idx} FAILED — error present.")
            print("error:", resp["error"])
            # helpful debug: show the raw resp keys so you can adapt
            print("full response dump keys:", list(resp.keys()))
            # show first few rows of the chunk for inspection
            print("sample bad rows (first 5):", chunk[:5])
            # short backoff
            time.sleep(1)
            # you might want to continue or break depending on policy
            # continue
        else:
            # success path — some clients put results under 'data'
            data = resp.get("data", resp)
            # print number of rows upserted if available
            try:
                count = len(data) if isinstance(data, list) else ("1" if data else "0")
            except Exception:
                count = "unknown"
            print(f"Chunk {idx} OK — upserted rows: {count}")

    print("full_load finished.")

# USAGE:
full_load(profiles)   # call this to run


ValueError: dictionary update sequence element #0 has length 1; 2 is required