In [1]:
import os
import time
import random
import pandas as pd
import requests

In [2]:
# input/output files
INPUT_CSV = "artists_unique.csv"  # source list; must contain an `artist_name` column
PARTIAL_CSV = "artist_vitals_lookup_partial.csv"  # append-only checkpoint file, also read to resume a run
FINAL_CSV = "artist_vitals_lookup.csv"  # de-duplicated result written after the loop finishes

# checkpointing
SAVE_EVERY_SECONDS = 120  # minimum wall-clock seconds between checkpoint appends

# settings
SLEEP_BASE = 0.15    # fixed pause (seconds) after every artist
SLEEP_JITTER = 0.15  # upper bound (seconds) of the extra random pause added to SLEEP_BASE
LIMIT_N = None  # if not None, only the first LIMIT_N unique names are processed

In [10]:
# Wikidata Action API endpoint; used for the `wbsearchentities` name search.
SEARCH_URL = "https://www.wikidata.org/w/api.php"
# Wikidata SPARQL endpoint; used to fetch the date of death (P570) for a QID.
SPARQL_URL = "https://query.wikidata.org/sparql"

# Headers sent with SPARQL requests (JSON results are requested via Accept).
# The search request reuses only the User-Agent entry.
# NOTE(review): the contact address looks like a placeholder — replace it with
# a real one before a long crawl.
HEADERS = {
    "Accept": "application/sparql-results+json",
    "User-Agent": "artist-vitals-research/1.0 (contact: you@example.com)"
}

def wikidata_search_qid(name: str, limit: int = 1) -> str | None:
    params = {
        "action": "wbsearchentities",
        "search": name,
        "language": "en",
        "format": "json",
        "limit": limit,
        "type": "item",
    }
    try:
        r = session.get(
            SEARCH_URL,
            params=params,
            headers={"User-Agent": HEADERS["User-Agent"]},
            timeout=30
        )
        if r.status_code != 200:
            return None
        data = r.json()
        hits = data.get("search", [])
        return hits[0]["id"] if hits else None
    except Exception:
        return None

# Shared HTTP session used by both lookup functions. It is created after
# wikidata_search_qid is defined, which works because the function only
# resolves the name `session` when it is called.
session = requests.Session()

def death_date_from_qid(qid: str, max_retries: int = 5) -> pd.Timestamp | None:
    query = f"""
    SELECT ?death WHERE {{
      wd:{qid} wdt:P570 ?death .
    }} LIMIT 1
    """

    for attempt in range(max_retries):
        try:
            r = session.get(
                SPARQL_URL,
                params={"query": query},
                headers=HEADERS,
                timeout=40
            )

            if r.status_code in (429, 500, 502, 503, 504):
                # cap the backoff so one bad artist doesn't wreck averages
                time.sleep(min(2 ** attempt, 5) + random.random())
                continue

            if r.status_code != 200:
                return None

            data = r.json()
            bindings = data.get("results", {}).get("bindings", [])
            if not bindings:
                return None

            death_str = bindings[0]["death"]["value"]
            death_ts = pd.to_datetime(death_str, errors="coerce", utc=True)
            if pd.isna(death_ts):
                return None
            return death_ts.tz_convert(None)

        except requests.exceptions.RequestException:
            time.sleep(min(2 ** attempt, 5) + random.random())
            continue
        except Exception:
            return None

    return None


def get_vitals(name: str, qid_cache: dict, death_cache: dict) -> tuple[str | None, str, pd.Timestamp | None]:
    """
    Returns (qid, vital_status, death_date)
      - vital_status: "Dead" if death_date exists, else "Alive/Unknown"
    """
    if name in qid_cache:
        qid = qid_cache[name]
    else:
        qid = wikidata_search_qid(name)
        qid_cache[name] = qid

    if not qid:
        return None, "Alive/Unknown", None

    if qid in death_cache:
        death_dt = death_cache[qid]
    else:
        death_dt = death_date_from_qid(qid)
        death_cache[qid] = death_dt

    if death_dt is not None:
        return qid, "Dead", death_dt
    return qid, "Alive/Unknown", None

In [8]:
# load names
# NOTE(review): read_csv's default NA parsing converts literal names such as
# "NA", "N/A" or "null" into NaN, so artists with those names are dropped by
# the dropna below. Changing that needs keep_default_na=False here and wherever
# the result files are read back — confirm whether that is wanted.
artists = pd.read_csv(INPUT_CSV)

if "artist_name" not in artists.columns:
    raise ValueError("INPUT_CSV must contain a column named 'artist_name'")

artists = (
    artists
    .dropna(subset=["artist_name"])
    .assign(artist_name=lambda frame: frame["artist_name"].astype(str).str.strip())
    # Whitespace-only names are empty after stripping: they cannot be searched,
    # and an empty name does not round-trip through the checkpoint CSV.
    .loc[lambda frame: frame["artist_name"] != ""]
    .drop_duplicates(subset=["artist_name"])
    .reset_index(drop=True)
)

if LIMIT_N is not None:
    artists = artists.head(LIMIT_N).reset_index(drop=True)

names = artists["artist_name"].tolist()
print(f"Unique names to process: {len(names):,}")

Unique names to process: 307,286


In [11]:
# resume support
RESULT_COLUMNS = ["artist_name", "qid", "vital_status", "death_date"]


def _partial_has_data() -> bool:
    """Return True when PARTIAL_CSV exists and contains at least one byte."""
    return os.path.exists(PARTIAL_CSV) and os.path.getsize(PARTIAL_CSV) > 0


def _append_rows(pending: list) -> None:
    """Append result rows to PARTIAL_CSV, writing the header only if the file has no content yet."""
    pd.DataFrame(pending, columns=RESULT_COLUMNS).to_csv(
        PARTIAL_CSV,
        mode="a",
        header=not _partial_has_data(),
        index=False
    )


file_exists = _partial_has_data()
if file_exists:
    # dtype=str + keep_default_na=False: names must come back exactly as they
    # were written; default NA parsing would turn e.g. "NA" or "" into "nan".
    done = pd.read_csv(PARTIAL_CSV, dtype=str, keep_default_na=False)
    done_set = set(done["artist_name"])
    print(f"Resuming: already have {len(done_set):,} artists")
else:
    done_set = set()
    print("Starting fresh")

qid_cache = {}    # artist_name -> QID or None
death_cache = {}  # QID -> Timestamp or None

rows = []  # results collected since the last checkpoint
processed_new = 0
start = time.time()
last_save = time.time()

# for "recent" speed measurement
recent_processed = 0
recent_start = time.time()

try:
    for name in names:
        if name in done_set:
            continue

        qid, vital_status, death_dt = get_vitals(name, qid_cache, death_cache)

        rows.append({
            "artist_name": name,
            "qid": qid,
            "vital_status": vital_status,
            "death_date": death_dt  # Timestamp or None
        })

        done_set.add(name)
        processed_new += 1
        recent_processed += 1

        # pacing
        time.sleep(SLEEP_BASE + random.random() * SLEEP_JITTER)

        # time-based checkpoint (APPEND instead of rewrite)
        if rows and (time.time() - last_save) >= SAVE_EVERY_SECONDS:
            t0 = time.time()
            _append_rows(rows)
            save_secs = time.time() - t0

            rows = []
            last_save = time.time()

            elapsed = time.time() - start
            overall_sec_per = elapsed / processed_new if processed_new else float("inf")

            recent_elapsed = time.time() - recent_start
            recent_sec_per = recent_elapsed / max(recent_processed, 1)

            print(
                f"Checkpoint saved: total≈{len(done_set):,} | "
                f"recent={recent_sec_per:.2f} sec/artist | "
                f"overall={overall_sec_per:.2f} sec/artist | "
                f"save={save_secs:.2f}s"
            )

            # reset recent window
            recent_processed = 0
            recent_start = time.time()
finally:
    # final flush (append remaining). Running it in `finally` means a kernel
    # interrupt or an unexpected error no longer discards the rows gathered
    # since the last checkpoint; the exception still propagates afterwards.
    if rows:
        _append_rows(rows)
        rows = []

# read partial, dedupe, save final
if _partial_has_data():
    done = pd.read_csv(PARTIAL_CSV, dtype=str, keep_default_na=False)
    done["artist_name"] = done["artist_name"].str.strip()
    done = done.drop_duplicates(subset=["artist_name"], keep="last")

    # Everything was read as text to protect the names; turn empty cells in
    # the nullable columns back into missing values.
    for col in ("qid", "death_date"):
        done[col] = done[col].mask(done[col].eq(""))

    # ensure death_date is parsed cleanly
    # NOTE(review): relies on every checkpoint chunk having written dates in
    # the same textual format; mixed formats would be coerced to NaT — verify.
    done["death_date"] = pd.to_datetime(done["death_date"], errors="coerce")

    done.to_csv(FINAL_CSV, index=False)
    print(f"Saved {len(done):,} rows to {FINAL_CSV}")
else:
    # Nothing was processed and no earlier checkpoint exists.
    print(f"No rows in {PARTIAL_CSV}; {FINAL_CSV} was not written")

Resuming: already have 20,799 artists
Checkpoint saved: total≈20,893 | recent=1.29 sec/artist | overall=1.29 sec/artist | save=0.00s
Checkpoint saved: total≈21,001 | recent=1.11 sec/artist | overall=1.19 sec/artist | save=0.01s
Checkpoint saved: total≈21,226 | recent=0.54 sec/artist | overall=0.85 sec/artist | save=0.02s
Checkpoint saved: total≈21,458 | recent=0.52 sec/artist | overall=0.73 sec/artist | save=0.01s
Checkpoint saved: total≈21,681 | recent=0.54 sec/artist | overall=0.68 sec/artist | save=0.01s
Checkpoint saved: total≈21,913 | recent=0.52 sec/artist | overall=0.65 sec/artist | save=0.01s
Checkpoint saved: total≈22,144 | recent=0.52 sec/artist | overall=0.63 sec/artist | save=0.01s
Checkpoint saved: total≈22,343 | recent=0.61 sec/artist | overall=0.62 sec/artist | save=0.01s
Checkpoint saved: total≈22,544 | recent=0.60 sec/artist | overall=0.62 sec/artist | save=0.01s
Checkpoint saved: total≈22,743 | recent=0.61 sec/artist | overall=0.62 sec/artist | save=0.01s
Checkpoint s