In [11]:
"""
wgb_scraper.py   •  tested May‑2025

Scrapes https://www.worldgovernmentbonds.com/world-credit-ratings/
and every linked country page, saving:

    world_credit_ratings_current.csv
    historical_ratings_sp.csv
    historical_ratings_moodys.csv
    historical_ratings_fitch.csv
    historical_ratings_dbrs.csv
"""

import re
import time
from collections import defaultdict

import pandas as pd
from slugify import slugify          # pip install python-slugify
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager


WORLD_URL = "https://www.worldgovernmentbonds.com/world-credit-ratings/"


# ─────────────────── Selenium helpers ────────────────────
def start_browser(headless: bool = True) -> webdriver.Chrome:
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")      # Chrome ≥118
    opts.add_argument("--disable‑gpu")
    opts.add_argument("--no‑sandbox")
    opts.add_argument("--window‑size=1920,1080")
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )


def dismiss_banners(driver: webdriver.Chrome) -> None:
    """Close cookie / consent banners if present."""
    for btn in driver.find_elements(By.TAG_NAME, "button"):
        if re.search(r"(accept|agree|ok|got it)", btn.text, re.I):
            try:
                btn.click()
            except Exception:
                pass


# ───────────────────  overview page  ─────────────────────
def scrape_overview(driver: webdriver.Chrome) -> pd.DataFrame:
    driver.get(WORLD_URL)
    dismiss_banners(driver)

    # 1️⃣  wait for the CREDIT‑RATINGS iframe to arrive …
    frame = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "iframe[src*='world-credit-ratings']")
        )
    )
    driver.switch_to.frame(frame)

    # 2️⃣  … then wait until at least one REAL rating row is present
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located(
            (By.XPATH, "//tbody/tr/td[2][matches(text(), '^[A-Z]{1,3}$')]"),
        )
    )

    rows = driver.find_elements(By.CSS_SELECTOR, "tbody tr")
    records = []
    for r in rows:
        tds = r.find_elements(By.TAG_NAME, "td")
        if len(tds) < 5:   # a defensive guard
            continue

        country_name = tds[0].text.strip()
        # some rows have <a>, some don't
        try:
            link = tds[0].find_element(By.TAG_NAME, "a").get_attribute("href")
        except Exception:
            link = f"https://www.worldgovernmentbonds.com/credit-rating/{slugify(country_name)}/"

        sp, moodys, fitch, dbrs = (c.text.strip() for c in tds[1:5])
        records.append(
            dict(
                country=country_name,
                sp=sp,
                moodys=moodys,
                fitch=fitch,
                dbrs=dbrs,
                url=link,
            )
        )

    # very important: leave the iframe context
    driver.switch_to.default_content()
    return pd.DataFrame(records)


# ─────────────────  country pages  ────────────────────────
def scrape_country_history(
    driver: webdriver.Chrome, url: str
) -> dict[str, pd.DataFrame]:
    driver.get(url)
    dismiss_banners(driver)

    # wait for any agency card to be fully populated
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located(
            (By.XPATH, "//div[contains(@class,'card')]//tbody/tr/td[2][matches(text(),'^[A-Z]{1,3}$')]")
        )
    )

    agency_frames = {}
    for card in driver.find_elements(By.CSS_SELECTOR, "div.card"):
        header = card.find_element(By.CSS_SELECTOR, "h2").text.strip()
        match = re.match(r"(S&P|Moody|Fitch|DBRS)", header)
        if not match:
            continue

        agency = match.group(1)  # 'S&P', "Moody", etc.
        dates, ratings, outlooks = [], [], []
        for tr in card.find_elements(By.CSS_SELECTOR, "tbody tr"):
            td = tr.find_elements(By.TAG_NAME, "td")
            if len(td) < 3:
                continue
            dates.append(td[0].text.strip())
            ratings.append(td[1].text.strip())
            outlooks.append(td[2].text.strip())

        df = pd.DataFrame({"date": dates, "rating": ratings, "outlook": outlooks})
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        agency_frames[agency] = df

    return agency_frames


# ────────────────────  main  ──────────────────────────────
def main() -> None:
    driver = start_browser(headless=True)

    print("Fetching world overview …", flush=True)
    overview = scrape_overview(driver)
    if overview.empty:
        raise RuntimeError("Overview table scraped empty – selectors may need adjustment.")

    overview.to_csv("world_credit_ratings_current.csv", index=False)
    print(f"Saved overview  ({len(overview)} rows) → world_credit_ratings_current.csv", flush=True)

    history: dict[str, list[pd.DataFrame]] = defaultdict(list)

    print("Scraping individual country histories …", flush=True)
    for _, row in tqdm(overview.iterrows(), total=len(overview)):
        country = row["country"]
        try:
            for agency, df in scrape_country_history(driver, row["url"]).items():
                df.insert(0, "country", country)
                history[agency].append(df)
        except Exception as exc:
            # log & continue
            print(f"⚠️  {country}: {exc}", flush=True)

    driver.quit()

    for agency, dfs in history.items():
        if not dfs:
            continue
        full = pd.concat(dfs, ignore_index=True)
        fname = f"historical_ratings_{slugify(agency)}.csv"
        full.to_csv(fname, index=False)
        print(f"Saved {agency} history → {fname}  ({len(full)} rows)", flush=True)


# ────────────────────  auto‑run inside notebooks ──────────
if __name__ == "__main__":
    main()


Fetching world overview …


InvalidSelectorException: Message: invalid selector: Unable to locate an element with the xpath expression //tbody/tr/td[2][matches(text(), '^[A-Z]{1,3}$')] because of the following error:
SyntaxError: Failed to execute 'evaluate' on 'Document': The string '//tbody/tr/td[2][matches(text(), '^[A-Z]{1,3}$')]' is not a valid XPath expression.
  (Session info: chrome=135.0.7049.115); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalid-selector-exception
Stacktrace:
0   chromedriver                        0x0000000102deaa54 cxxbridge1$str$ptr + 2803960
1   chromedriver                        0x0000000102de2cf0 cxxbridge1$str$ptr + 2771860
2   chromedriver                        0x000000010292e864 cxxbridge1$string$len + 93028
3   chromedriver                        0x0000000102934468 cxxbridge1$string$len + 116584
4   chromedriver                        0x00000001029367a0 cxxbridge1$string$len + 125600
5   chromedriver                        0x000000010293681c cxxbridge1$string$len + 125724
6   chromedriver                        0x0000000102936848 cxxbridge1$string$len + 125768
7   chromedriver                        0x0000000102974fe0 cxxbridge1$string$len + 381664
8   chromedriver                        0x00000001029b6480 cxxbridge1$string$len + 649088
9   chromedriver                        0x00000001029697ec cxxbridge1$string$len + 334572
10  chromedriver                        0x0000000102dafccc cxxbridge1$str$ptr + 2562928
11  chromedriver                        0x0000000102db2f98 cxxbridge1$str$ptr + 2575932
12  chromedriver                        0x0000000102d902c4 cxxbridge1$str$ptr + 2433384
13  chromedriver                        0x0000000102db3810 cxxbridge1$str$ptr + 2578100
14  chromedriver                        0x0000000102d812f0 cxxbridge1$str$ptr + 2371988
15  chromedriver                        0x0000000102dd357c cxxbridge1$str$ptr + 2708512
16  chromedriver                        0x0000000102dd3708 cxxbridge1$str$ptr + 2708908
17  chromedriver                        0x0000000102de293c cxxbridge1$str$ptr + 2770912
18  libsystem_pthread.dylib             0x000000018cc72034 _pthread_start + 136
19  libsystem_pthread.dylib             0x000000018cc6ce3c thread_start + 8


In [12]:
import json, pandas as pd, requests, time, pathlib

BASE = "https://www.worldgovernmentbonds.com/data"

world   = requests.get(f"{BASE}/world_credit_ratings.json").json()
world_df = pd.DataFrame(world["data"])     # inspect to confirm keys
world_df.to_csv("world_credit_ratings_current.csv", index=False)

hist_dir = pathlib.Path("history_json")
hist_dir.mkdir(exist_ok=True)

for c in world_df["country_code"]:         # whatever key they use
    url = f"{BASE}/ratings_history/{c}.json"
    r   = requests.get(url)
    if r.status_code != 200:
        print("skip", c)
        continue
    with open(hist_dir / f"{c}.json", "wb") as f:
        f.write(r.content)
    time.sleep(0.3)                        # be polite


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [5]:
main()

Fetching world overview …
Saved overview → world_credit_ratings_current.csv
Scraping individual country histories …


0it [00:00, ?it/s]


In [None]:
try:
    print("main =", main)
except NameError:
    print("main not defined")


In [None]:
print('hello')