### Data preparation

To prepare the training data, follow these steps:

- Download the [Kaggle Celebrity net worth dataset](https://www.kaggle.com/datasets/monkeybusiness7/2023-celebrity-net-worth?resource=download) into the current directory (the notebook reads it as `2023_celeb_net_worth.csv`)
- Run this notebook to scrape one image per celebrity into the `images/` folder

In [None]:
import pandas as pd

# Load the raw Kaggle CSV (decoded as Latin-1).
df = pd.read_csv("2023_celeb_net_worth.csv", encoding="latin1")

# Keep only the two columns used downstream. The explicit copy gives an
# independent frame, so the column assignment below never touches `df`.
df_clean = df[["Celebrity", "Estimated Net Worth"]].copy()

# Normalise names BEFORE dropping incomplete rows. A whitespace-only name
# would otherwise survive dropna(), be written as an empty CSV field, and be
# read back as NaN (a float) by the download cell.
names = df_clean["Celebrity"].str.strip()
df_clean["Celebrity"] = names.mask(names == "")  # empty name -> missing

# Drop rows with a missing name or net worth, then exact duplicate rows.
df_clean = df_clean.dropna().drop_duplicates()

# Save cleaned file (no index column, so it round-trips with two columns).
df_clean.to_csv("celebrity_names_networth.csv", index=False)


In [None]:
import pandas as pd
from duckduckgo_search import DDGS
import requests
from pathlib import Path
import time

# Input: the cleaned name / net-worth table written by the previous cell.
df = pd.read_csv("celebrity_names_networth.csv")

# Output: downloaded images are stored under ./images (created on first run,
# reused on later runs).
output_dir = Path("images")
output_dir.mkdir(parents=False, exist_ok=True)

def download_image(query, output_path, max_results=1, timeout=10):
    """Search DuckDuckGo Images for *query* and save the first usable hit.

    Args:
        query: Search text passed to the image search.
        output_path: Destination file (str or Path) for the image bytes.
        max_results: Number of search hits to request. Hits are tried in
            order until one downloads successfully. Defaults to 1.
        timeout: Seconds to wait for each image HTTP request. Defaults to 10.

    Returns:
        True if an image was written to ``output_path``, False otherwise.
        Failures are reported with ``print`` and never raised.
    """
    try:
        with DDGS() as ddgs:
            results = ddgs.images(query, max_results=max_results)
            for result in results:
                url = result["image"]
                try:
                    response = requests.get(url, timeout=timeout)
                    if response.status_code != 200:
                        print(
                            f"Failed to download from {url}: "
                            f"HTTP {response.status_code}"
                        )
                        continue

                    # A 200 response is not necessarily an image: hotlink
                    # protection and error pages often answer with HTML.
                    # Reject anything that declares a non-image type.
                    content_type = response.headers.get("Content-Type", "")
                    if content_type and not content_type.lower().startswith("image/"):
                        print(
                            f"Failed to download from {url}: "
                            f"not an image ({content_type})"
                        )
                        continue

                    # Write to a side file and rename it into place, so an
                    # interrupted write never leaves a truncated file at
                    # output_path for callers that skip existing files.
                    tmp_path = Path(f"{output_path}.part")
                    tmp_path.write_bytes(response.content)
                    tmp_path.replace(output_path)
                    return True
                except Exception as e:
                    # Deliberate best-effort boundary: one bad URL must not
                    # abort the remaining candidates.
                    print(f"Failed to download from {url}: {e}")
    except Exception as e:
        # Best-effort boundary for the search itself (network, rate limit,
        # unexpected result shape).
        print(f"Search failed for '{query}': {e}")
    return False

# Download one image per celebrity, skipping files that already exist so the
# cell can be re-run to resume an interrupted scrape.
for name in df["Celebrity"]:
    # Empty CSV fields are read back as NaN (a float); skip anything that is
    # not a usable name instead of crashing on .replace() below.
    if not isinstance(name, str) or not name.strip():
        print(f"⚠️ Skipping invalid name: {name!r}")
        continue

    # Path separators inside a name (e.g. "AC/DC") would point into a
    # sub-directory that does not exist, so they are replaced like spaces.
    # All other characters are kept, so existing file names are unchanged.
    safe_name = name.replace(" ", "_").replace("/", "_").replace("\\", "_")
    out_path = output_dir / f"{safe_name}.jpg"

    if out_path.exists():
        print(f"✅ Skipping (already exists): {name}")
        continue

    print(f"🔍 Downloading: {name}")
    query = f"{name} face photo"
    success = download_image(query, out_path)

    if not success:
        print(f"❌ Failed to download: {name}")

    time.sleep(1.5)  # Delay to avoid rate limits
