## Plan
Snopes publishes a monthly sitemap that lists the URL of every article. This code pulls the seven monthly sitemap XMLs (January–July 2025) to collect every URL for each month, then visits each link exactly once and extracts the article data.

In [1]:
import time, json, csv, re
from pathlib import Path
from datetime import datetime
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dateparser
from tqdm import tqdm

In [7]:
# Output files. CSV is the preferred format; JSONL is written alongside it.
OUT_CSV = Path("snopes_2025_JanJul.csv")
OUT_JSONL = Path("snopes_2025_JanJul.jsonl")

# Request headers sent with every HTTP call made by the scraper.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DyuthiSnopesScraper/1.0; +https://example.org/)",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# One sitemap per month; months are zero-padded ("01" .. "07" = Jan–Jul).
BASE = "https://media.snopes.com/sitemaps"
MONTHS = [f"{month_number:02d}" for month_number in range(1, 8)]
SITEMAPS = [f"{BASE}/sitemap-articles-2025-{month}.xml" for month in MONTHS]

## Functions

In [13]:
def fetch(url, kind="html", timeout=20):
    """Download *url* and return its payload.

    Issues a GET request carrying the module-level ``HEADERS`` and calls
    ``raise_for_status()`` on the response, so an HTTP error status raises.

    Args:
        url: Address to request.
        kind: ``"html"`` or ``"xml"`` returns the decoded response text;
            any other value returns the raw response bytes.
        timeout: Timeout value forwarded to ``requests.get``.

    Returns:
        The response text for the text kinds, otherwise the response content.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    if kind == "html" or kind == "xml":
        return response.text
    return response.content

def parse_sitemap(xml_text):
    """Extract the page entries listed in a sitemap XML document.

    Args:
        xml_text: The sitemap XML.

    Returns:
        A list of ``{"url": <loc text>, "lastmod": <lastmod text or None>}``
        dicts in document order. ``<url>`` elements with a missing or empty
        ``<loc>`` are skipped, so one malformed entry does not raise and
        cause the caller to discard the whole sitemap.
    """
    soup = BeautifulSoup(xml_text, "xml")
    entries = []
    for node in soup.find_all("url"):
        loc_tag = node.find("loc")
        loc = loc_tag.get_text(strip=True) if loc_tag is not None else ""
        if not loc:
            # Nothing to visit for this entry.
            continue
        lastmod_tag = node.find("lastmod")
        lastmod = lastmod_tag.get_text(strip=True) if lastmod_tag is not None else None
        entries.append({"url": loc, "lastmod": lastmod})
    return entries

def extract_jsonld(soup):
    """Return the first Article-like JSON-LD object embedded in the page.

    Every ``<script type="application/ld+json">`` tag is parsed in document
    order. Within each payload, top-level lists and ``@graph`` containers are
    searched. A node matches when its ``@type`` (a string, or a list of
    strings) contains "article" case-insensitively, which also covers
    "NewsArticle".

    Args:
        soup: Parsed page exposing ``find_all``; each returned tag must
            expose ``get_text``.

    Returns:
        The matching JSON-LD dict, or ``{}`` when no block matches. Script
        tags whose content is not valid JSON are ignored.
    """
    for tag in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(tag.get_text(strip=True))
        except json.JSONDecodeError:
            continue
        for node in _iter_jsonld_nodes(payload):
            if _is_article_type(node.get("@type")):
                return node
    return {}


def _iter_jsonld_nodes(payload):
    """Yield each dict in a JSON-LD payload, descending into lists and ``@graph``."""
    if isinstance(payload, list):
        for item in payload:
            yield from _iter_jsonld_nodes(item)
    elif isinstance(payload, dict):
        yield payload
        # Some generators nest the real objects under "@graph".
        yield from _iter_jsonld_nodes(payload.get("@graph"))
    # Scalars (strings, numbers, None) carry no node and are skipped.


def _is_article_type(type_value):
    """Return True when a JSON-LD ``@type`` value names an article type."""
    candidates = type_value if isinstance(type_value, list) else [type_value]
    return any(isinstance(c, str) and "article" in c.lower() for c in candidates)

def clean_text(txt):
    """Collapse each run of whitespace to one space and trim both ends.

    Falsy input such as ``None`` or ``""`` produces an empty string.
    """
    raw = txt if txt else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()

def extract_article(url):
    """Download one article page and extract its main fields.

    Structured JSON-LD data is preferred; the HTML is used as a fallback for
    the title (first ``<h1>``), the publication date
    (``article:published_time`` meta tag) and the body (``<p>``/``<li>`` text
    inside ``<article>``).

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``url``, ``title``, ``author``,
        ``date_published``, ``category``, ``rating`` and ``body``.
        ``title``, ``author``, ``category`` and ``rating`` are
        whitespace-normalised strings ("" when absent). ``date_published`` is
        an ISO-8601 string when the date could be parsed, otherwise the raw
        value found (or ``None``). ``body`` is ``None`` when no text was found.

    Raises:
        Whatever ``fetch`` raises for network or HTTP errors.
    """
    html = fetch(url, "html")
    soup = BeautifulSoup(html, "lxml")

    # Prefer JSON-LD for structured fields.
    ld = extract_jsonld(soup)

    title = ld.get("headline")
    if not title:
        h1 = soup.find("h1")
        title = h1.get_text(strip=True) if h1 is not None else None

    author = _author_name(ld.get("author"))

    date_published = ld.get("datePublished") or ld.get("dateCreated") or None
    if not date_published:
        # Fallback: OpenGraph/article meta tag.
        meta = soup.find("meta", attrs={"property": "article:published_time"})
        if meta is not None and meta.get("content"):
            date_published = meta["content"]

    # Body: prefer JSON-LD 'articleBody', else collect <article> paragraphs.
    body = ld.get("articleBody")
    if not isinstance(body, str) or not body:
        body = None
        article_tag = soup.find("article")
        if article_tag is not None:
            paras = []
            for node in article_tag.find_all(["p", "li"]):
                text = node.get_text(" ", strip=True)
                if text:
                    paras.append(text)
            if paras:
                body = "\n".join(paras)

    # Category comes from JSON-LD 'articleSection' (a string or a list).
    category = None
    section = ld.get("articleSection")
    if isinstance(section, str):
        category = section
    elif isinstance(section, (list, tuple)) and section:
        # str() guards against non-string items, which would break join().
        category = ", ".join(str(item) for item in section)

    # Rating is a heuristic: the first element whose class mentions
    # "rating" or "badge".
    rating = None
    badge = soup.select_one('[class*="rating"], [class*="badge"]')
    if badge is not None:
        rating = badge.get_text(" ", strip=True)

    # Normalise the date to ISO-8601; keep the raw value if parsing fails.
    pub_dt = None
    if date_published:
        try:
            pub_dt = dateparser.parse(date_published)
        except (TypeError, ValueError, OverflowError):
            pub_dt = None

    return {
        "url": url,
        "title": clean_text(title),
        "author": clean_text(author),
        "date_published": pub_dt.isoformat() if pub_dt else date_published,
        "category": clean_text(category),
        "rating": clean_text(rating),
        "body": body.strip() if body else None,
    }


def _author_name(author):
    """Return the first author's name from a JSON-LD ``author`` value, or None.

    Accepts a string, an object with a ``name`` key, or a list of either.
    """
    if isinstance(author, list):
        author = author[0] if author else None
    if isinstance(author, dict):
        return author.get("name")
    if isinstance(author, str):
        return author
    return None

def in_range_2025_jan_jul(dt_iso):
    """Tell whether a date string falls within 1 January – 31 July 2025.

    Timezone-aware values are compared by their own wall-clock time (the
    offset is dropped). Comparing an aware value against the naive bounds
    would raise ``TypeError``, and swallowing that error would let every
    offset-carrying date through unfiltered.

    Args:
        dt_iso: Date/time string, normally ISO-8601. May be ``None`` or empty.

    Returns:
        ``True`` when the date is inside the range, and also when the value
        is missing or cannot be parsed (the sitemaps are already scoped by
        month, so unknown dates are kept). ``False`` otherwise.
    """
    if not dt_iso:
        return True  # keep if unknown (sitemaps already month-scoped)
    try:
        # Fast path: the standard library handles ISO-8601 directly.
        parsed = datetime.fromisoformat(dt_iso)
    except (TypeError, ValueError):
        try:
            parsed = dateparser.parse(dt_iso)
        except (TypeError, ValueError, OverflowError):
            return True
    if parsed.tzinfo is not None:
        parsed = parsed.replace(tzinfo=None)
    # Half-open upper bound so fractional seconds on 31 July are included.
    return datetime(2025, 1, 1) <= parsed < datetime(2025, 8, 1)



## Run the functions: let's scrape!

In [None]:

def main():
    """Scrape every article listed in the Jan–Jul 2025 sitemaps.

    Steps:
      1. Download each monthly sitemap and collect its article URLs. A
         sitemap that fails is reported and skipped.
      2. De-duplicate the URLs, keeping first-seen order.
      3. Visit each URL once, pausing between requests. Each in-range
         article is written immediately to both ``OUT_JSONL`` and
         ``OUT_CSV``, so an interrupted run keeps what was scraped so far.

    A URL whose extraction fails gets a CSV row holding only ``url`` and
    ``error``, so the failure reason is preserved. Such rows are not written
    to the JSONL file.
    """
    # 1) Collect URLs from monthly sitemaps
    print("Fetching sitemaps…")
    entries = []
    for sitemap_url in SITEMAPS:
        try:
            entries.extend(parse_sitemap(fetch(sitemap_url, "xml")))
        except Exception as e:
            # Best effort: one bad sitemap must not abort the whole run.
            print(f"Failed sitemap {sitemap_url}: {e}")

    # De-duplicate while preserving the order in which URLs were first seen.
    url_list = list(dict.fromkeys(entry["url"] for entry in entries))
    print(f"Found {len(url_list)} article URLs from Jan–Jul 2025 sitemaps.")

    # 2) Visit links, extract, and save as we go
    fieldnames = ["url", "title", "author", "date_published", "category", "rating", "body", "error"]
    saved = 0
    failed = 0
    skipped = 0
    with open(OUT_JSONL, "w", encoding="utf-8") as jf:
        with open(OUT_CSV, "w", newline="", encoding="utf-8") as cf:
            writer = csv.DictWriter(cf, fieldnames=fieldnames)
            writer.writeheader()
            for url in tqdm(url_list, desc="Scraping"):
                art = None
                keep = False
                try:
                    art = extract_article(url)
                    keep = in_range_2025_jan_jul(art["date_published"])
                except Exception as e:
                    # Keep going; missing columns are left empty by DictWriter.
                    failed += 1
                    writer.writerow({"url": url, "error": str(e)})
                if art is not None and keep:
                    jf.write(json.dumps(art, ensure_ascii=False) + "\n")
                    writer.writerow(art)
                    saved += 1
                elif art is not None:
                    skipped += 1
                time.sleep(0.8)  # go slowly to respect the site's rate limits

    print(
        f"Wrote {saved} articles to {OUT_CSV} and {OUT_JSONL} "
        f"({failed} failed and listed in the CSV 'error' column, "
        f"{skipped} outside the date range)"
    )


# Start the scrape only when this code is the entry point, not when imported.
if __name__ == "__main__":
    main()

Fetching sitemaps…
Found 1809 article URLs from Jan–Jul 2025 sitemaps.


Scraping:  99%|███████████████████████████████████████████████████████████████████▌| 1797/1809 [31:10<00:11,  1.02it/s]