In [10]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
WorldGovernmentBonds – sovereign credit-rating history scraper
--------------------------------------------------------------
• Works headless or visible (undetected-chromedriver preferred)
• Resumes automatically (progress + error logs)
• Output: data/credit_ratings_historical_data.csv   columns =
    Country | Agency | Date | Rating | Outlook
"""

# --------------------------------------------------------------------------- #
# 1. imports
# --------------------------------------------------------------------------- #
import os, sys, re, csv, time, random, logging
from datetime import datetime

import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException

# choose chrome driver flavour
try:
    import undetected_chromedriver as uc          # > pip install undetected-chromedriver
    ChromeImpl = "uc"
except ImportError:
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    ChromeImpl = "selenium"


# --------------------------------------------------------------------------- #
# 2. helpers
# --------------------------------------------------------------------------- #
def setup_logging():
    """Configure root logging to a timestamped file under ``logs/`` and to stdout.

    ``force=True`` replaces any handlers already attached to the root logger.
    Without it ``logging.basicConfig`` silently does nothing when the root
    logger is already configured (for example by another library or by an
    earlier run in the same notebook kernel), so neither the log file nor the
    format below would take effect.

    Returns:
        logging.Logger: the logger named after this module.
    """
    os.makedirs("logs", exist_ok=True)
    fn = f"logs/credit_rating_scraper_{datetime.now():%Y%m%d_%H%M%S}.log"
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
        handlers=[logging.FileHandler(fn, encoding="utf-8"), logging.StreamHandler(sys.stdout)],
        force=True,  # replace pre-existing root handlers instead of being ignored
    )
    return logging.getLogger(__name__)


def setup_driver(headless: bool = True):
    """Create a Chrome WebDriver using whichever driver flavour was imported.

    Args:
        headless: when true, Chrome is started with ``--headless=new``.

    Returns:
        A Chrome driver instance with a 1920x1080 window and a 60-second
        page-load timeout.
    """
    use_uc = ChromeImpl == "uc"

    # Build the options object that matches the selected implementation.
    if use_uc:
        chrome_options = uc.ChromeOptions()
    else:
        from selenium.webdriver.chrome.options import Options
        chrome_options = Options()

    # Both flavours receive the same command-line switches.
    if headless:
        chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--window-size=1920,1080")

    if use_uc:
        browser = uc.Chrome(options=chrome_options)
    else:
        chromedriver_service = Service(ChromeDriverManager().install())
        browser = webdriver.Chrome(service=chromedriver_service, options=chrome_options)

    browser.set_page_load_timeout(60)
    return browser


# --------------------------------------------------------------------------- #
# 3. country-link scraper  (FIXED VERSION)
# --------------------------------------------------------------------------- #
def get_country_links(driver, base_url: str, log):
    """Collect (country name, URL) pairs from the ratings overview page.

    The page is loaded, then the table with the most ``<tr>`` rows is taken
    as the country table and every anchor in it whose ``href`` contains
    ``/credit-rating/`` is harvested.

    Args:
        driver: Selenium-compatible WebDriver.
        base_url: URL of the overview page.
        log: Logger used for progress and warnings.

    Returns:
        A list of ``(name, href)`` tuples; empty when the page or its tables
        did not load in time, or when no table was found.
    """
    log.info(f"Loading {base_url}")

    # A page-load or table-wait timeout is reported as "no links", matching
    # how scrape_country reports a timeout, so the caller's empty-list
    # handling applies instead of an unhandled traceback.
    try:
        driver.get(base_url)
        # wait until at least one table is present (JS has rendered)
        WebDriverWait(driver, 30).until(
            lambda d: len(d.find_elements(By.TAG_NAME, "table")) > 0
        )
    except TimeoutException:
        log.warning(f"Timed out loading tables from {base_url}")
        return []
    time.sleep(2)  # breathing room

    tables = driver.find_elements(By.TAG_NAME, "table")
    log.info(f"{len(tables)} tables found")

    if not tables:
        return []

    # The country list is assumed to be the table with the most rows.
    target = max(tables, key=lambda t: len(t.find_elements(By.TAG_NAME, "tr")))
    links = target.find_elements(By.XPATH, ".//a[contains(@href,'/credit-rating/')]")

    pairs = []
    for a in links:
        name, href = a.text.strip(), a.get_attribute("href")
        # Skip anchors without visible text or without a target.
        if name and href:
            pairs.append((name, href))

    log.info(f"{len(pairs)} country links harvested")
    return pairs


# --------------------------------------------------------------------------- #
# 4. historical table parser
# --------------------------------------------------------------------------- #
def clean_date(s: str) -> str:
    """Normalise a date string to ``YYYY-MM-DD`` where the format is recognised.

    Recognised inputs (leading/trailing whitespace is ignored):
      * text starting with ``YYYY-MM-DD`` -- returned as is (stripped);
      * ``M/D/YYYY`` -- interpreted month-first;
      * ``Mon D, YYYY`` / ``Month D YYYY`` -- month name or abbreviation,
        any letter case, optional trailing dot and comma;
      * ``D Mon YYYY`` -- day-first variant of the same.

    Args:
        s: raw date text.

    Returns:
        The ISO-formatted date, or ``s`` unchanged when the format (or the
        month name) is not recognised. An unknown month is never guessed.
    """
    month_names = ("january", "february", "march", "april", "may", "june",
                   "july", "august", "september", "october", "november", "december")
    text = s.strip()

    # handle YYYY-MM-DD already fine
    if re.match(r"\d{4}-\d{2}-\d{2}", text):
        return text

    # MM/DD/YYYY or DD/MM/YYYY -> assume MDY (US site).  Use the captured
    # groups rather than re-splitting, so trailing text cannot leak into the
    # year and an extra "/" cannot raise on unpacking.
    m = re.match(r"(\d{1,2})/(\d{1,2})/(\d{4})", text)
    if m:
        month, day, year = m.groups()
        return f"{year}-{int(month):02d}-{int(day):02d}"

    # "Mon DD, YYYY" first, then the day-first "DD Mon YYYY" form.
    m = re.match(r"([A-Za-z]{3,})\.?\s+(\d{1,2}),?\s+(\d{4})", text)
    if m:
        mon_txt, day, year = m.groups()
    else:
        m = re.match(r"(\d{1,2})\s+([A-Za-z]{3,})\.?,?\s+(\d{4})", text)
        if m:
            day, mon_txt, year = m.groups()

    if m:
        # Accept any prefix (>= 3 letters) of a real month name, case-insensitively.
        key = mon_txt.lower()
        mon = next(
            (i for i, full in enumerate(month_names, 1) if full.startswith(key)),
            None,
        )
        if mon is not None:
            return f"{year}-{mon:02d}-{int(day):02d}"

    return s


def scrape_country(driver, name: str, url: str, log) -> list[dict]:
    """Scrape one country's rating-history table into flat records.

    Loads ``url``, selects the largest table whose first header cell contains
    "date", and emits one record for every (row, agency column) pair that
    holds a rating.

    Args:
        driver: Selenium-compatible WebDriver.
        name: Country name stored in each record's ``Country`` field.
        url: Country rating page to load.
        log: Logger used for progress and warnings.

    Returns:
        A list of dicts with keys ``Country``, ``Agency``, ``Date``,
        ``Rating`` and ``Outlook``.  The list is empty when the page or its
        tables time out, or when no table / agency column qualifies.
    """
    log.info(f"  ↳ {name}")

    # The page-load timeout set on the driver also surfaces as
    # TimeoutException; treat it as a failed country rather than letting a
    # single slow page abort the whole run.
    try:
        driver.get(url)
    except TimeoutException:
        log.warning("    page load timeout")
        return []

    # wait for tables (some pages slow)
    try:
        WebDriverWait(driver, 30).until(
            lambda d: len(d.find_elements(By.TAG_NAME, "table")) > 0
        )
    except TimeoutException:
        log.warning("    table timeout")
        return []

    tables = driver.find_elements(By.TAG_NAME, "table")
    # heuristic: the biggest table with first col labelled "Date"
    table = None
    for t in sorted(tables, key=lambda t: -len(t.find_elements(By.TAG_NAME, "tr"))):
        headers = [h.text.strip().lower() for h in t.find_elements(By.TAG_NAME, "th")]
        if headers and "date" in headers[0]:
            table = t
            break
    if table is None:
        log.warning("    no suitable table")
        return []

    rows = table.find_elements(By.TAG_NAME, "tr")
    hdrs = [h.text.strip() for h in rows[0].find_elements(By.TAG_NAME, "th")]
    # Map column index -> agency label (first word of the header text).
    agency_cols = {
        i: hdr.split()[0]
        for i, hdr in enumerate(hdrs)
        if any(a in hdr for a in ("S&P", "Moody", "Fitch", "DBRS"))
    }
    if not agency_cols:
        log.warning("    no agency columns")
        return []

    data = []
    for r in rows[1:]:
        cells = r.find_elements(By.TAG_NAME, "td")
        if not cells:
            continue
        date_txt = cells[0].text.strip()
        if not date_txt or date_txt == "-":
            continue
        date_txt = clean_date(date_txt)
        for idx, agency in agency_cols.items():
            if idx >= len(cells):
                continue
            val = cells[idx].text.strip()
            if not val or val == "-":
                continue
            # An arrow in the cell marks the outlook; no arrow means neutral.
            outlook = "neutral"
            if "↓" in val:
                outlook, val = "negative", val.replace("↓", "")
            elif "↑" in val:
                outlook, val = "positive", val.replace("↑", "")
            # Keep only characters that can occur in a rating symbol.
            val = re.sub(r"[^A-Za-z0-9+\-]", "", val)
            if not val:
                # Nothing rating-like left (e.g. the cell held only an arrow
                # or punctuation); do not store a blank rating.
                continue
            data.append(dict(Country=name, Agency=agency, Date=date_txt,
                             Rating=val, Outlook=outlook))
    log.info(f"    {len(data)} rows")
    return data


# --------------------------------------------------------------------------- #
# 5. main orchestrator
# --------------------------------------------------------------------------- #
def main():
    """Run the full scrape with resume support.

    Files used (all under ``data/``):
      * ``credit_ratings_historical_data.csv`` -- rows are appended per country;
      * ``credit_ratings_done.txt`` -- one successfully scraped country per line;
      * ``credit_ratings_failed.txt`` -- one line per country that yielded no rows.

    Countries listed in the progress file are skipped.  The browser is
    restarted every 15 countries, and a random 3-7 second pause follows each
    scraped country.
    """
    log = setup_logging()
    log.info("=== Scraper started ===")

    os.makedirs("data", exist_ok=True)
    OUTPUT = os.path.join("data", "credit_ratings_historical_data.csv")
    PROGRESS = os.path.join("data", "credit_ratings_done.txt")
    ERRORS = os.path.join("data", "credit_ratings_failed.txt")

    # Read state files through context managers so the handles are closed,
    # and ignore blank lines so they are not counted as countries.
    done = set()
    if os.path.exists(PROGRESS):
        with open(PROGRESS, encoding="utf-8") as f:
            done = {line.strip() for line in f if line.strip()}
        log.info(f"Resume mode: {len(done)} countries already scraped")

    failed_prior = set()
    if os.path.exists(ERRORS):
        with open(ERRORS, encoding="utf-8") as f:
            failed_prior = {line.strip() for line in f if line.strip()}
        if failed_prior:
            log.info(f"{len(failed_prior)} countries previously failed")

    driver = setup_driver(headless=True)
    base = "https://www.worldgovernmentbonds.com/world-credit-ratings/"

    try:
        links = get_country_links(driver, base, log)
        if not links:
            log.error("No country links – aborting")
            return

        failed = 0  # countries that produced no rows during this run
        for i, (country, url) in enumerate(links, 1):
            if country in done:
                log.info(f"[{i}/{len(links)}] skip {country} (done)")
                continue
            rows = scrape_country(driver, country, url, log)
            if rows:
                # Append immediately so an interrupted run keeps its results;
                # the header is written only when the file is first created.
                pd.DataFrame(rows).to_csv(
                    OUTPUT, mode="a", index=False,
                    header=not os.path.exists(OUTPUT), quoting=csv.QUOTE_ALL
                )
                with open(PROGRESS, "a", encoding="utf-8") as f:
                    f.write(country + "\n")
                done.add(country)
            else:
                failed += 1
                with open(ERRORS, "a", encoding="utf-8") as f:
                    f.write(country + "\n")
            # polite delay
            time.sleep(random.uniform(3, 7))

            # refresh driver every 15 countries (memory leak avoidance);
            # skipped after the last link, where a fresh browser would only
            # be started to be closed again.
            if i % 15 == 0 and i < len(links):
                log.info("Refreshing driver …")
                driver.quit()
                driver = setup_driver(headless=True)

        log.info("=== Finished ===")
        log.info(f"Countries done: {len(done)}   failed: {failed}")

        # The CSV does not exist if no country has ever produced rows.
        csv_rows = 0
        if os.path.exists(OUTPUT):
            with open(OUTPUT, encoding="utf-8") as f:
                csv_rows = max(sum(1 for _ in f) - 1, 0)  # minus header line
        log.info(f"CSV rows total: {csv_rows}")
    finally:
        driver.quit()


# Script entry point: start the scraper when this code is executed directly.
# Running this notebook cell also satisfies the guard (the recorded output
# begins with the scraper's start-up log line), so it launches a full scrape.
if __name__ == "__main__":
    main()


2025-05-12 00:13:05,073 - INFO - === Scraper started ===
2025-05-12 00:13:05,265 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-12 00:13:05,492 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-12 00:13:05,527 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-12 00:13:05,623 - INFO - WebDriver version 136.0.7103.92 selected
2025-05-12 00:13:05,626 - INFO - Modern chrome version https://storage.googleapis.com/chrome-for-testing-public/136.0.7103.92/mac-arm64/chromedriver-mac-arm64.zip
2025-05-12 00:13:05,626 - INFO - About to download new driver from https://storage.googleapis.com/chrome-for-testing-public/136.0.7103.92/mac-arm64/chromedriver-mac-arm64.zip
2025-05-12 00:13:05,915 - INFO - Driver downloading response is 200
2025-05-12 00:13:07,196 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-12 00:13:07,311 - INFO - Driver has been saved in cache [/Users/mac/.wdm/drivers/chromedriver/mac64/136.0.7103.92]
2025-05-

In [9]:
# NOTE(review): this cell's execution count (9) is lower than that of the
# cell defining the current main() (10), and its recorded output contains log
# messages ("=== Credit Rating Scraper Started ===") that the current main()
# does not emit -- presumably it ran against an earlier definition.  It calls
# whichever main() is defined in the kernel at run time; consider deleting it
# or re-running after Restart & Run All.
if __name__ == "__main__":
    main()


2025-05-11 23:45:35,665 - INFO - === Credit Rating Scraper Started ===
2025-05-11 23:45:35,667 - INFO - Created new output file: credit_rating_data/credit_ratings_historical_data.csv
2025-05-11 23:45:35,667 - INFO - Initializing WebDriver...
2025-05-11 23:45:36,015 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-11 23:45:36,051 - INFO - Get LATEST chromedriver version for google-chrome
2025-05-11 23:45:36,078 - INFO - Driver [/Users/mac/.wdm/drivers/chromedriver/mac64/136.0.7103.92/chromedriver-mac-arm64/chromedriver] found in cache
2025-05-11 23:45:37,125 - INFO - WebDriver successfully initialized
2025-05-11 23:45:37,126 - INFO - Retrieving country links from https://www.worldgovernmentbonds.com/world-credit-ratings/...
2025-05-11 23:45:37,126 - INFO - Navigating to main page: https://www.worldgovernmentbonds.com/world-credit-ratings/
2025-05-11 23:45:41,005 - INFO - Waiting for country table to load...
2025-05-11 23:46:11,386 - ERROR - Timed out waiting for countr