In [1]:
import requests
import time
import os
import random
import pandas as pd

In [25]:
# files
INPUT_CSV = "artists_unique.csv"                 # source list; the loader requires an 'artist_name' column
PARTIAL_CSV = "artist_gender_lookup_partial.csv"  # append-only checkpoint file, also read back to resume a run
FINAL_CSV = "artist_gender_lookup.csv"            # de-duplicated copy of the partial file, written at the end

# checkpointing
SAVE_EVERY_SECONDS = 120  # minimum wall-clock time between appends to PARTIAL_CSV

# settings (pacing)
# The main loop sleeps SLEEP_SEARCH_BASE + SLEEP_SPARQL_BASE + U(0, JITTER) seconds
# once per artist, regardless of how many requests that artist actually needed.
SLEEP_SEARCH_BASE = 0.03
SLEEP_SPARQL_BASE = 0.08
JITTER = 0.05
LIMIT_N = None  # set to an int to process only the first N unique names; None = all

SEARCH_URL = "https://www.wikidata.org/w/api.php"  # endpoint used for wbsearchentities (name -> QID)
SPARQL_URL = "https://query.wikidata.org/sparql"   # endpoint used for the P21 (gender) lookup

# Sent in full with SPARQL requests; the search request reuses only the User-Agent.
# NOTE(review): the contact address looks like a placeholder -- replace it with a
# real one before a large run.
HEADERS = {
    "Accept": "application/sparql-results+json",
    "User-Agent": "artist-gender-research/1.0 (contact: you@example.com)"
}

# persistent HTTP session shared by both lookup functions
session = requests.Session()

# Wikidata item ids that the P21 value URI is compared against (by suffix).
MALE_QID = "Q6581097"
FEMALE_QID = "Q6581072"

# caches (in-memory only; they live as long as the kernel does)
qid_cache = {}      # name -> QID or None
gender_cache = {}   # QID -> "Male"/"Female"/None

In [22]:
# functions
def wikidata_search_qid(name: str, limit: int = 1, max_retries: int = 4) -> str | None:
    """Return top Wikidata QID for a name using the Wikidata search API."""
    params = {
        "action": "wbsearchentities",
        "search": name,
        "language": "en",
        "format": "json",
        "limit": limit,
        "type": "item",
    }

    for attempt in range(max_retries):
        try:
            r = session.get(
                SEARCH_URL,
                params=params,
                headers={"User-Agent": HEADERS["User-Agent"]},
                timeout=30
            )

            if r.status_code in (429, 500, 502, 503, 504):
                time.sleep(min(2 ** attempt, 5) + random.random())
                continue

            if r.status_code != 200:
                return None

            data = r.json()
            hits = data.get("search", [])
            return hits[0]["id"] if hits else None

        except requests.exceptions.RequestException:
            time.sleep(min(2 ** attempt, 5) + random.random())
            continue
        except Exception:
            return None

    return None

def gender_from_qid_fast(qid: str, max_retries: int = 5) -> str | None:
    """
    Returns "Male", "Female", or None using P21.
    Uses capped backoff to avoid 10s+ stalls per artist.
    """
    if qid in gender_cache:
        return gender_cache[qid]

    query = f"""
    SELECT ?gender WHERE {{
      wd:{qid} wdt:P21 ?gender .
    }} LIMIT 1
    """

    for attempt in range(max_retries):
        try:
            r = session.get(
                SPARQL_URL,
                params={"query": query},
                headers=HEADERS,
                timeout=40
            )

            if r.status_code in (429, 500, 502, 503, 504):
                time.sleep(min(2 ** attempt, 5) + random.random())
                continue

            if r.status_code != 200:
                gender_cache[qid] = None
                return None

            data = r.json()
            bindings = data.get("results", {}).get("bindings", [])
            if not bindings:
                gender_cache[qid] = None
                return None

            gender_uri = bindings[0]["gender"]["value"]
            if gender_uri.endswith(MALE_QID):
                gender_cache[qid] = "Male"
                return "Male"
            if gender_uri.endswith(FEMALE_QID):
                gender_cache[qid] = "Female"
                return "Female"

            gender_cache[qid] = None
            return None

        except requests.exceptions.RequestException:
            time.sleep(min(2 ** attempt, 5) + random.random())
            continue
        except Exception:
            gender_cache[qid] = None
            return None

    gender_cache[qid] = None
    return None

def get_gender_wikidata_robust_fast(name: str) -> str | None:
    """
    name -> QID via search API -> gender via SPARQL
    """
    if name in qid_cache:
        qid = qid_cache[name]
    else:
        qid = wikidata_search_qid(name)
        qid_cache[name] = qid

    if not qid:
        return None

    return gender_from_qid_fast(qid)

In [23]:
# load names
artists_raw = pd.read_csv(INPUT_CSV)

if "artist_name" not in artists_raw.columns:
    raise ValueError("INPUT_CSV must contain a column named 'artist_name'")

# Build the cleaned frame in one chain so re-running the cell always starts
# from the raw data instead of an already-mutated frame.
artists = (
    artists_raw
    .dropna(subset=["artist_name"])
    .assign(artist_name=lambda frame: frame["artist_name"].astype(str).str.strip())
    # Names that are empty after stripping cannot be searched; drop them
    # here rather than spending a request on each one later.
    .loc[lambda frame: frame["artist_name"] != ""]
    .drop_duplicates(subset=["artist_name"])
    .reset_index(drop=True)
)

# Optional cap for quick trial runs.
if LIMIT_N is not None:
    artists = artists.head(LIMIT_N).reset_index(drop=True)

names = artists["artist_name"].tolist()
print(f"Unique names to process: {len(names):,}")

Unique names to process: 307,286


In [24]:
# resume support
def _append_rows(pending: list, path: str) -> float:
    """Append pending result rows to the CSV at ``path``.

    The header is written only when the file does not exist yet or is empty.
    Returns the seconds spent writing (0.0 when there was nothing to write).
    """
    if not pending:
        return 0.0
    out = pd.DataFrame(pending)
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    t0 = time.time()
    out.to_csv(path, mode="a", header=needs_header, index=False)
    return time.time() - t0


# A zero-byte partial file (e.g. from an interrupted first write) is treated
# like a missing one; pd.read_csv would raise on it.
if os.path.exists(PARTIAL_CSV) and os.path.getsize(PARTIAL_CSV) > 0:
    done = pd.read_csv(PARTIAL_CSV)
    done["artist_name"] = done["artist_name"].astype(str).str.strip()
    done_set = set(done["artist_name"])
    print(f"Resuming: already have {len(done_set):,} artists")
else:
    done_set = set()
    print("Starting fresh")

rows = []            # results gathered since the last checkpoint
processed_new = 0    # names looked up during this run (excludes resumed ones)
start = time.time()
last_save = time.time()

# recent speed window
recent_processed = 0
recent_start = time.time()

try:
    for name in names:
        if name in done_set:
            continue

        # NOTE: None means "no usable gender" -- this also covers lookups that
        # failed; the name is still recorded and marked as done.
        g = get_gender_wikidata_robust_fast(name)

        rows.append({"artist_name": name, "artist_gender": g})
        done_set.add(name)
        processed_new += 1
        recent_processed += 1

        # pacing
        time.sleep(SLEEP_SEARCH_BASE + SLEEP_SPARQL_BASE + random.random() * JITTER)

        # time-based checkpoint (append only new rows)
        if rows and (time.time() - last_save) >= SAVE_EVERY_SECONDS:
            save_secs = _append_rows(rows, PARTIAL_CSV)

            rows = []
            last_save = time.time()

            elapsed = time.time() - start
            overall_sec_per = elapsed / processed_new if processed_new else float("inf")

            recent_elapsed = time.time() - recent_start
            recent_sec_per = recent_elapsed / max(recent_processed, 1)

            print(
                f"Checkpoint saved: total≈{len(done_set):,} | "
                f"recent={recent_sec_per:.2f} sec/artist | "
                f"overall={overall_sec_per:.2f} sec/artist | "
                f"save={save_secs:.2f}s"
            )

            recent_processed = 0
            recent_start = time.time()
finally:
    # final flush (append remaining). Runs on normal completion AND when the
    # loop is interrupted (KeyboardInterrupt, unexpected error), so the work
    # done since the last checkpoint is not lost.
    _append_rows(rows, PARTIAL_CSV)
    rows = []

# read partial once, dedupe, save final (only reached when the loop finished)
if os.path.exists(PARTIAL_CSV) and os.path.getsize(PARTIAL_CSV) > 0:
    done = pd.read_csv(PARTIAL_CSV)
    done["artist_name"] = done["artist_name"].astype(str).str.strip()
    done = done.drop_duplicates(subset=["artist_name"], keep="last")

    done.to_csv(FINAL_CSV, index=False)
    print(f"Saved {len(done):,} rows to {FINAL_CSV}")
else:
    print(f"No results found in {PARTIAL_CSV}; {FINAL_CSV} was not written")

Resuming: already have 22,000 artists
Checkpoint saved: total≈22,284 | recent=0.42 sec/artist | overall=0.42 sec/artist | save=0.01s
Checkpoint saved: total≈22,545 | recent=0.46 sec/artist | overall=0.44 sec/artist | save=0.00s
Checkpoint saved: total≈22,793 | recent=0.49 sec/artist | overall=0.46 sec/artist | save=0.01s
Checkpoint saved: total≈23,039 | recent=0.49 sec/artist | overall=0.46 sec/artist | save=0.00s
Checkpoint saved: total≈23,278 | recent=0.50 sec/artist | overall=0.47 sec/artist | save=0.02s
Checkpoint saved: total≈23,532 | recent=0.47 sec/artist | overall=0.47 sec/artist | save=0.01s
Checkpoint saved: total≈23,776 | recent=0.49 sec/artist | overall=0.47 sec/artist | save=0.01s
Checkpoint saved: total≈24,014 | recent=0.51 sec/artist | overall=0.48 sec/artist | save=0.01s
Checkpoint saved: total≈24,294 | recent=0.43 sec/artist | overall=0.47 sec/artist | save=0.01s
Checkpoint saved: total≈24,589 | recent=0.41 sec/artist | overall=0.46 sec/artist | save=0.00s
Checkpoint s