In [24]:
import time
import requests
import pandas as pd

# =========================
# 0) API + Session settings
# =========================
BASE = "https://api.gdeltproject.org/api/v2/doc/doc"

session = requests.Session()
session.headers.update({
    # Change this string if you want (keep something informative)
    "User-Agent": "student-dataviz-project/1.0"
})

# ===========================================
# 1) Robust fetchers (handles 429 + non-JSON)
# ===========================================
def _get_json_with_backoff(params, retries=8, backoff_start=1, backoff_cap=60):
    """
    Low-level GET with:
    - 429 rate-limit handling (Retry-After if present + exponential backoff)
    - non-JSON detection (prints first chars of HTML/text errors)
    """
    backoff = backoff_start

    for attempt in range(1, retries + 1):
        r = session.get(BASE, params=params, timeout=60)

        # Rate limit: wait and retry
        if r.status_code == 429:
            retry_after = r.headers.get("Retry-After")
            sleep_s = int(retry_after) if (retry_after and retry_after.isdigit()) else backoff
            print(f"[429] Rate limited. Sleeping {sleep_s}s (attempt {attempt}/{retries})")
            time.sleep(sleep_s)
            backoff = min(backoff * 2, backoff_cap)
            continue

        # Other HTTP errors
        if r.status_code != 200:
            print(f"[HTTP {r.status_code}] First 300 chars:\n{r.text[:300]}")
            r.raise_for_status()

        # Ensure JSON
        ctype = (r.headers.get("Content-Type") or "").lower()
        if "json" not in ctype:
            print(f"[WARN] Non-JSON Content-Type: {ctype}. First 300 chars:\n{r.text[:300]}")
            raise RuntimeError("Response was not JSON (likely an HTML error page).")

        return r.json()

    raise RuntimeError("Still rate-limited after retries. Reduce calls or try later.")


def fetch_timeline(query, mode, start, end):
    """
    mode: "TimelineTone" or "TimelineVol"
    start/end: YYYYMMDDHHMMSS (UTC)
    """
    # Parameters you can/should change:
    # - query (your outlet + topic query)
    # - mode ("TimelineTone" for tone, "TimelineVol" for volume)
    # - start/end (time window; use 10 years for final run, shorter for testing)
    params = {
        "query": query,
        "mode": mode,
        "format": "json",
        "startdatetime": start,
        "enddatetime": end,
    }
    return _get_json_with_backoff(params)


# =========================================
# 1B) Timeline parser (GDELT response shape)
# =========================================
# GDELT Timeline* responses have the shape:
# data["timeline"] = [ {"series": "...", "data": [ {"date": "...", "value": ...}, ... ] } ]
# So the points live in timeline[0]["data"], NOT directly in timeline.
def extract_timeline_points(api_response):
    """Return list of {date, value} points from a GDELT Timeline* API response."""
    tl = api_response.get("timeline")
    if not isinstance(tl, list) or len(tl) == 0:
        return []

    first = tl[0]
    if not isinstance(first, dict):
        return []

    points = first.get("data")
    if not isinstance(points, list):
        return []

    clean = []
    for p in points:
        if isinstance(p, dict) and p.get("date") is not None and p.get("value") is not None:
            clean.append({"date": p["date"], "value": p["value"]})
    return clean

In [25]:
# ===========================================
# 2) Project knobs: outlets, topics, time span
# ===========================================

# (A) OUTLETS (edit list: add/remove domains)
US_OUTLETS = {
    "NYTimes": "domainis:nytimes.com",
    "FoxNews": "domainis:foxnews.com",
    "CNN": "domainis:cnn.com",
    "WashingtonPost": "domainis:washingtonpost.com",
    "NBCNews": "domainis:nbcnews.com",
    "Politico": "domainis:politico.com",
    "WSJ": "domainis:wsj.com",
    # Optional adds:
    # "NPR": "domainis:npr.org",
    # "Axios": "domainis:axios.com",
}

# (B) TOPICS (edit keywords; keep groups small & interpretable)
TOPICS = {
    "Elections": "(election OR vote OR campaign)",
    "Government": "(government OR congress OR white house OR senate)",
    "Immigration": "(immigration OR border OR migrant OR asylum)",
    "ForeignPolicy": "(war OR ukraine OR china OR israel)",
    # Optional:
    "Economy": "(inflation OR jobs OR unemployment OR recession)",
    "Political Figures": "(trump OR biden OR harris)",
}

# (C) TIME WINDOW (edit for testing vs final)
# - For a decade: e.g., 2015-01-01 to 2025-01-01
START = "20170101000000"  # <- change
END   = "20260101000000"  # <- change

# (D) OPTIONAL: add a general politics filter to keep results political.
# You can set this to "" to disable.
# POLITICS_FILTER = "(politic* OR election OR government OR congress OR senate OR president)"  # <- change/disable
POLITICS_FILTER = ""  # <- change/disable

# (E) Rate-limit friendliness: delay between successful calls (seconds)
SLEEP_BETWEEN_CALLS = 1.0  # <- increase (e.g., 2-5s) if you still get 429



In [27]:
# ===========================================
# 3) Build dataset (FIXED for long queries)
# ===========================================
# Strategy:
# - NEVER build one huge OR query across all topics
# - Run ONE QUERY PER (outlet × topic)
# - Aggregate tone later (weighted by volume)
#
# This avoids "query too long / too short" errors
# and is methodologically cleaner.

rows = []

# -------------------------------------------
# 3A) Per-outlet × per-topic TONE
# -------------------------------------------
# We compute tone separately for each topic.
# Later, "overall tone" can be computed as a
# volume-weighted average across topics.

for outlet_name, outlet_q in US_OUTLETS.items():
    for topic_name, topic_q in TOPICS.items():

        query = f"{outlet_q} {topic_q}"

        data = fetch_timeline(
            query=query,
            mode="TimelineTone",
            start=START,
            end=END
        )

        points = extract_timeline_points(data)

        for p in points:
            rows.append({
                "date": p["date"],
                "outlet": outlet_name,
                "topic": topic_name,
                "metric": "tone",
                "value": p["value"],
            })

        time.sleep(SLEEP_BETWEEN_CALLS)

# -------------------------------------------
# 3B) Per-outlet × per-topic VOLUME
# -------------------------------------------
# Used both directly for topic emphasis
# and later to weight tone.

for outlet_name, outlet_q in US_OUTLETS.items():
    for topic_name, topic_q in TOPICS.items():

        query = f"{outlet_q} {topic_q}"

        data = fetch_timeline(
            query=query,
            mode="TimelineVol",
            start=START,
            end=END
        )

        points = extract_timeline_points(data)

        for p in points:
            rows.append({
                "date": p["date"],
                "outlet": outlet_name,
                "topic": topic_name,
                "metric": "volume",
                "value": p["value"],
            })

        time.sleep(SLEEP_BETWEEN_CALLS)

# ===========================================
# 4) Assemble dataframe
# ===========================================
df = pd.DataFrame(rows)

df["value"] = pd.to_numeric(df["value"], errors="coerce")
df["year"] = df["date"].astype(str).str[:4]

df.head()

[429] Rate limited. Sleeping 1s (attempt 1/8)
[429] Rate limited. Sleeping 2s (attempt 2/8)
[429] Rate limited. Sleeping 4s (attempt 3/8)


Unnamed: 0,date,outlet,topic,metric,value,year
0,20170101T000000Z,NYTimes,Elections,tone,-2.2086,2017
1,20170102T000000Z,NYTimes,Elections,tone,-2.7811,2017
2,20170103T000000Z,NYTimes,Elections,tone,-2.149,2017
3,20170104T000000Z,NYTimes,Elections,tone,-1.7348,2017
4,20170105T000000Z,NYTimes,Elections,tone,-2.643,2017


In [28]:
# =========================
# 4) Assemble + save outputs
# =========================
df = pd.DataFrame(rows)

# Basic cleaning: ensure numeric
df["value"] = pd.to_numeric(df["value"], errors="coerce")

# Optional: create a year column for easier grouping in viz tools
df["year"] = df["date"].astype(str).str[:4]

# Optional: compute topic share per outlet-date (only for volume rows)
vol = df[df["metric"] == "volume"].copy()
tot = vol.groupby(["outlet", "date"], as_index=False)["value"].sum().rename(columns={"value": "total_volume"})
vol = vol.merge(tot, on=["outlet", "date"], how="left")
vol["topic_share"] = vol["value"] / vol["total_volume"]

# Save:
# - df: long format containing tone + volumes
# - vol: volume rows with topic_share computed
df.to_csv("gdelt_us_politics_tone_and_topics_long.csv", index=False)
vol.to_csv("gdelt_us_politics_topic_share.csv", index=False)

df.head(), vol.head()

(               date   outlet      topic metric   value  year
 0  20170101T000000Z  NYTimes  Elections   tone -2.2086  2017
 1  20170102T000000Z  NYTimes  Elections   tone -2.7811  2017
 2  20170103T000000Z  NYTimes  Elections   tone -2.1490  2017
 3  20170104T000000Z  NYTimes  Elections   tone -1.7348  2017
 4  20170105T000000Z  NYTimes  Elections   tone -2.6430  2017,
                date   outlet      topic  metric   value  year  total_volume  \
 0  20170101T000000Z  NYTimes  Elections  volume  0.0049  2017        0.0289   
 1  20170102T000000Z  NYTimes  Elections  volume  0.0049  2017        0.0264   
 2  20170103T000000Z  NYTimes  Elections  volume  0.0071  2017        0.0361   
 3  20170104T000000Z  NYTimes  Elections  volume  0.0070  2017        0.0409   
 4  20170105T000000Z  NYTimes  Elections  volume  0.0070  2017        0.0419   
 
    topic_share  
 0     0.169550  
 1     0.185606  
 2     0.196676  
 3     0.171149  
 4     0.167064  )