In [1]:
%pip install requests tqdm --quiet

import os
import json
import time
import math
import requests
from datetime import datetime, timezone
from tqdm import tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# --- Endpoints and files ---------------------------------------------------
API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
OUTPUT_PATH = "../data/wiki_content.jsonl"   # JSONL file, one {"title", "text"} record per line

# --- What to collect -------------------------------------------------------
CUTOFF_DATE = "2023-11-01T00:00:00Z"   # oldest timestamp the recent-changes walk goes back to
TARGET_ARTICLES = 1165                 # the crawl stops once this many articles are stored
MIN_WORDS = 100                        # articles with fewer whitespace-separated words are dropped

# --- Request pacing --------------------------------------------------------
REQUESTS_PER_SECOND = 2.0
SLEEP_BETWEEN = 1.0 / REQUESTS_PER_SECOND   # pause in seconds between consecutive API calls

In [3]:
# Single HTTP session shared by every API call in this notebook; each request
# it sends carries the User-Agent string set below.
session = requests.Session()
session.headers["User-Agent"] = "RAG-Replication/1.0 (research use)"

def api_get(params, retries=3, backoff=1.5):
    """
    GET ``API_ENDPOINT`` with ``params`` through the shared ``session`` and
    return the decoded JSON body.

    An attempt counts as failed when the request raises
    ``requests.RequestException`` or the response status is not 200. After a
    failed attempt number ``k`` (0-based) the function sleeps ``backoff ** k``
    seconds and tries again; no sleep happens after the final attempt.

    Args:
        params: Query-string parameters passed to ``session.get``.
        retries: Maximum number of attempts. Values below 1 are treated as 1
            so that at least one request is always made.
        backoff: Base of the exponential pause between attempts.

    Returns:
        The value of ``response.json()`` for the first 200 response.

    Raises:
        requests.RequestException: The last attempt failed at the network level.
        requests.HTTPError: The last attempt returned a non-200 status.
    """
    attempts = max(1, retries)
    response = None
    last_error = None

    for attempt in range(attempts):
        try:
            response = session.get(API_ENDPOINT, params=params, timeout=30)
        except requests.RequestException as exc:
            # Network-level failure: remember it and retry like a bad status.
            response, last_error = None, exc
        else:
            if response.status_code == 200:
                return response.json()

        # Sleeping after the last attempt would only delay the error.
        if attempt + 1 < attempts:
            time.sleep(backoff ** attempt)

    if response is None:
        raise last_error

    # Raises for 4xx/5xx. Other non-200 statuses pass through silently, so
    # raise explicitly instead of falling off the end and returning None.
    response.raise_for_status()
    raise requests.HTTPError(
        f"Unexpected HTTP status {response.status_code} from {API_ENDPOINT}",
        response=response,
    )

def iter_recent_new_pages(rcstart_iso=None, rcend_iso=CUTOFF_DATE, batch_limit=500):
    """
    Iterate 'new' page creations (namespace=0) from rcstart (now if None) going older until rcend.

    Args:
        rcstart_iso: Timestamp string to start from; the current UTC time,
            formatted as ``%Y-%m-%dT%H:%M:%SZ``, is used when None.
        rcend_iso: Timestamp string at which the listing stops.
        batch_limit: Value sent as ``rclimit`` with each request.

    Yields:
        Dicts with 'title', 'pageid' (None when the entry has no page id) and 'timestamp'.

    Raises:
        RuntimeError: The API response carries a top-level ``error`` object.
    """
    # Start from 'now' if not provided
    if rcstart_iso is None:
        rcstart_iso = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    params = {
        "action": "query",
        "list": "recentchanges",
        "rcprop": "title|timestamp|ids",
        "rctype": "new",
        "rcnamespace": 0,           # articles only
        "rcdir": "older",           # from rcstart -> older -> rcend
        "rcstart": rcstart_iso,
        "rcend": rcend_iso,
        "rclimit": batch_limit,
        "format": "json",
    }

    cont = {}
    while True:
        # Base query plus whatever continuation keys the previous batch returned.
        q = params.copy()
        q.update(cont)
        data = api_get(q)

        # MediaWiki reports request-level failures inside the JSON body. Without
        # this check such a response looks like "no more changes" and the
        # listing would end silently.
        api_error = data.get("error")
        if api_error:
            raise RuntimeError(f"MediaWiki API error while listing recent changes: {api_error}")

        changes = data.get("query", {}).get("recentchanges", [])
        if not changes:
            break
        for rc in changes:
            yield {"title": rc["title"], "pageid": rc.get("pageid"), "timestamp": rc["timestamp"]}

        cont = data.get("continue", {})
        if not cont:
            break
        # Pace successive batch requests.
        time.sleep(SLEEP_BETWEEN)

def fetch_plaintext_by_title(title):
    """
    Return the plaintext extract of the page called `title`.

    The extracts API is queried for this single title, with redirects
    followed. Only the first page entry of the response is considered.
    An empty string is returned when the response contains no pages, when
    that first entry is flagged as missing, or when it has no extract.
    """
    query = {
        "action": "query",
        "prop": "extracts",
        "explaintext": 1,
        "redirects": 1,
        "titles": title,
        "format": "json",
    }
    response = api_get(query)
    page_map = response.get("query", {}).get("pages", {})

    first_page = next(iter(page_map.values()), None)
    if first_page is None:
        return ""
    if "missing" in first_page:
        return ""

    extract = first_page.get("extract", "")
    return extract if extract else ""


In [4]:
# Make sure the output directory exists. dirname() is "" when OUTPUT_PATH is a
# bare filename, and os.makedirs("") raises, so only create it when there is one.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Counters are reset every time this cell runs, so re-running it is safe.
seen_titles = set()
total_words = 0
written = 0

# Resume: if file exists, load titles & recompute stats
if os.path.exists(OUTPUT_PATH):
    with open(OUTPUT_PATH, "r", encoding="utf-8") as f:
        for line in f:
            # A blank line is not a record; skip it instead of failing in json.loads.
            if not line.strip():
                continue
            obj = json.loads(line)
            t = obj["title"]
            seen_titles.add(t)
            total_words += len(obj["text"].split())
            written += 1

print(f"Resume status → found {written} articles already ({total_words:,} words).")

Resume status → found 0 articles already (0 words).


In [5]:
# Main crawl loop — from most recent → older until we hit TARGET_ARTICLES
to_write = TARGET_ARTICLES - written
if to_write <= 0:
    print("Target already reached; nothing to do.")
else:
    print(f"Need to add {to_write} more articles…")

    # Start from 'now' going older to CUTOFF_DATE
    start_from = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # The output file and the progress bar are both context-managed, so each
    # is closed on every exit path: target reached, listing exhausted, or an
    # exception part-way through.
    with open(OUTPUT_PATH, "a", encoding="utf-8") as fout, tqdm(
        total=to_write, desc=f"Collecting articles | {total_words:,} words", unit="art"
    ) as pbar:
        for rc in iter_recent_new_pages(rcstart_iso=start_from, rcend_iso=CUTOFF_DATE, batch_limit=500):
            title = rc["title"]
            # Already stored, either in an earlier run or earlier in this one.
            if title in seen_titles:
                continue

            # Fetch article text, then pause to respect the request rate
            txt = fetch_plaintext_by_title(title)
            time.sleep(SLEEP_BETWEEN)

            if not txt:
                continue

            # Filter out short articles (< MIN_WORDS)
            words = txt.split()
            if len(words) < MIN_WORDS:
                continue

            # Write JSONL line and flush, so finished records reach the file as we go
            rec = {"title": title, "text": txt}
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            fout.flush()

            # Update counters
            seen_titles.add(title)
            written += 1
            total_words += len(words)
            pbar.update(1)

            # Update tqdm description dynamically
            pbar.set_description(f"Collected {written} articles | {total_words:,} words")

            # Occasional summaries (every 50 articles); tqdm.write keeps the
            # message from being interleaved with the live progress bar.
            if written % 50 == 0:
                tqdm.write(f"Progress: {written} articles, {total_words:,} total words")

            # Stop when target reached
            if written >= TARGET_ARTICLES:
                break

    if written < TARGET_ARTICLES:
        print("\nReached the cutoff without hitting the target count. "
              "You can relax the cutoff, or continue from an earlier start time (older history).")

print(f"\nFinal totals → Articles: {written}, Words: {total_words:,}")

Need to add 1165 more articles…


Collected 50 articles | 113,358 words:   4%|▍         | 50/1165 [00:38<11:27,  1.62art/s]

Progress: 50 articles, 113,358 total words


Collected 100 articles | 155,553 words:   9%|▊         | 100/1165 [01:27<15:00,  1.18art/s]

Progress: 100 articles, 155,553 total words


Collected 150 articles | 192,300 words:  13%|█▎        | 150/1165 [02:02<13:12,  1.28art/s]

Progress: 150 articles, 192,300 total words


Collected 200 articles | 231,679 words:  17%|█▋        | 200/1165 [02:48<10:52,  1.48art/s]

Progress: 200 articles, 231,679 total words


Collected 250 articles | 294,600 words:  21%|██▏       | 250/1165 [03:25<11:27,  1.33art/s]

Progress: 250 articles, 294,600 total words


Collected 300 articles | 349,516 words:  26%|██▌       | 300/1165 [04:03<12:52,  1.12art/s]

Progress: 300 articles, 349,516 total words


Collected 350 articles | 404,236 words:  30%|███       | 350/1165 [04:37<09:00,  1.51art/s]

Progress: 350 articles, 404,236 total words


Collected 400 articles | 453,952 words:  34%|███▍      | 400/1165 [05:17<10:06,  1.26art/s]

Progress: 400 articles, 453,952 total words


Collected 450 articles | 504,393 words:  39%|███▊      | 450/1165 [05:56<07:35,  1.57art/s]

Progress: 450 articles, 504,393 total words


Collected 500 articles | 590,986 words:  43%|████▎     | 500/1165 [06:35<06:56,  1.60art/s]

Progress: 500 articles, 590,986 total words


Collected 550 articles | 683,431 words:  47%|████▋     | 550/1165 [07:08<07:21,  1.39art/s]

Progress: 550 articles, 683,431 total words


Collected 600 articles | 784,421 words:  52%|█████▏    | 600/1165 [07:50<08:46,  1.07art/s]

Progress: 600 articles, 784,421 total words


Collected 650 articles | 864,833 words:  56%|█████▌    | 650/1165 [08:29<05:28,  1.57art/s]

Progress: 650 articles, 864,833 total words


Collected 700 articles | 928,868 words:  60%|██████    | 700/1165 [09:13<08:08,  1.05s/art]

Progress: 700 articles, 928,868 total words


Collected 750 articles | 996,357 words:  64%|██████▍   | 750/1165 [09:52<09:22,  1.36s/art]

Progress: 750 articles, 996,357 total words


Collected 800 articles | 1,114,396 words:  69%|██████▊   | 800/1165 [10:44<03:30,  1.74art/s]

Progress: 800 articles, 1,114,396 total words


Collected 850 articles | 1,171,858 words:  73%|███████▎  | 850/1165 [11:18<03:13,  1.63art/s]

Progress: 850 articles, 1,171,858 total words


Collected 900 articles | 1,218,694 words:  77%|███████▋  | 900/1165 [12:01<03:50,  1.15art/s]

Progress: 900 articles, 1,218,694 total words


Collected 950 articles | 1,278,273 words:  82%|████████▏ | 950/1165 [12:53<03:24,  1.05art/s]

Progress: 950 articles, 1,278,273 total words


Collected 1000 articles | 1,365,206 words:  86%|████████▌ | 1000/1165 [13:28<01:37,  1.70art/s]

Progress: 1000 articles, 1,365,206 total words


Collected 1050 articles | 1,445,861 words:  90%|█████████ | 1050/1165 [14:04<01:10,  1.64art/s]

Progress: 1050 articles, 1,445,861 total words


Collected 1100 articles | 1,558,005 words:  94%|█████████▍| 1100/1165 [14:39<00:58,  1.11art/s]

Progress: 1100 articles, 1,558,005 total words


Collected 1150 articles | 1,606,880 words:  99%|█████████▊| 1150/1165 [15:14<00:08,  1.71art/s]

Progress: 1150 articles, 1,606,880 total words


Collected 1165 articles | 1,612,106 words: 100%|██████████| 1165/1165 [15:25<00:00,  1.26art/s]


Final totals → Articles: 1165, Words: 1,612,106





In [7]:
# Recap of the crawl: output location and the final counters.
print(f"Saved to: {OUTPUT_PATH}")
print(f"Total articles: {written}")
print(f"Total words: {total_words:,}")

# Preview: decode at most the first three records of the JSONL file.
preview_count = 3
with open(OUTPUT_PATH, "r", encoding="utf-8") as jsonl_file:
    sample = [json.loads(raw_line) for _, raw_line in zip(range(preview_count), jsonl_file)]
sample

Saved to: ../data/wiki_content.jsonl
Total articles: 1165
Total words: 1,612,106


[{'title': 'Their Bats Have Been Broken',
  'text': 'On 13 November 1990, Deputy Prime Minister of the United Kingdom Geoffrey Howe delivered his resignation speech in the House of Commons after resigning on 1 November. While Howe initially worded carefully his resignation letter and criticism of the Prime Minister\'s overall handling of United Kingdom relations with the European Community, 10 Downing Street claimed that his disagreement with Thatcher on Europe were differences only in style, rather than substance. Said attempts were largely successful and Howe decided to send a powerful message of dissent.\nIn his speech, he attacked Thatcher for running increasingly serious risks for the country\'s future and criticised her for undermining the policies on EMU proposed by her chancellor and governor of the Bank of England. Although Howe\'s intention was only to constrain any shift in European policy by the Cabinet under the existing prime minister, his speech is widely seen as the key