In [122]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import difflib
import json
import time
from tqdm import tqdm
from urllib.parse import quote

In [116]:
# Control group: music-related article titles whose revision histories are
# fetched from the English Wikipedia API. Titles must match the English
# article title, otherwise the query returns a missing page or a redirect stub.
control_articles = [
    "Pop music",
    "Rock and roll",
    "Eric Clapton",
    "Rolling Stone",
    "Jazz",
    "Swing",  # NOTE(review): may resolve to a disambiguation page; "Swing music" is probably intended - confirm
    "Classical music",
    "Ludwig van Beethoven",
    "Wolfgang Amadeus Mozart",
    "Joseph Haydn",
    "Country music",
    "BTS",  # was "BTS (groupe)", which is the French-Wikipedia title
    "K-Pop",  # NOTE(review): the article is presumably titled "K-pop"; this spelling relies on a redirect - confirm
    "Electronic music",
    "Daft Punk",
    "Paul Kalkbrenner",
    "Trumpet",
    "Music theory",
    "Fender",  # NOTE(review): ambiguous title (company vs. other meanings) - confirm the target page
    "Marshall Amplification",
    "Jimi Hendrix",
    "Bob Marley",
    "Edith Piaf",
    "Royal Albert Hall",
    "Piano",
    "Saxophone",
    "Pink Floyd",
    "Nirvana (band)",
    "Nina Simone",
    "Music of Africa",
    "Major scale",
    "Major chord",
    "Minor chord",
    "AC/DC",
    "Red Hot Chili Peppers",
    "Funk rock",
    "James Brown",
    "Dire Straits",
    "Mark Knopfler",  # was misspelled "Mark Knofler"
    "John Frusciante",
    "Alan Clark",  # NOTE(review): presumably the Dire Straits keyboardist; the bare title may point to a different person - confirm
    "Bob Dylan",
    "The Beatles",
    "Stevie Wonder",
    "Guitar"
]

45

In [131]:
import requests
import difflib
import json
import time
from tqdm import tqdm
from urllib.parse import quote

# Endpoint that fetch_revisions() sends its GET requests to (English Wikipedia).
WIKI_API = "https://en.wikipedia.org/w/api.php"
# Sent as the User-Agent header on every request made by fetch_revisions().
# NOTE(review): contains a personal e-mail address - confirm it is meant to be committed.
USER_AGENT = "DH_Project/1.0 (https://www.epfl.ch/labs/dhlab/; maxime.garambois@epfl.ch)"
SLEEP_BETWEEN = 0.5  # seconds to pause between successive continuation requests
# JSON Lines file that main() opens in write mode (one edit record per line).
OUTPUT_FILE = "../datas/interim/Control Analysis/control_group_diffs.jsonl"

# ----------------------------------------
# STEP 1: Fetch all revisions for an article
# ----------------------------------------
def fetch_revisions(title):
    """
    Fetch every revision (metadata plus wikitext) of one Wikipedia article.

    Pages through the MediaWiki ``action=query&prop=revisions`` endpoint,
    following the ``continue`` object until the history is exhausted.

    Args:
        title: Article title to look up. Redirects are resolved server-side,
            so a redirect title yields the history of its target article.

    Returns:
        A list of revision dicts as returned by the API, each with an added
        top-level ``"content"`` key holding the main-slot wikitext ("" when
        the slot or its content is absent). The list is sorted by
        ``(timestamp, revid)``. Empty when the page has no revisions.

    Raises:
        requests.HTTPError: on a non-2xx HTTP status.
        RuntimeError: when the API reports an error in the JSON body.
    """
    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp|user|comment|content|parentid",
        "rvslots": "main",
        # Requests that include content are capped well below 500 by the API
        # (50 for regular clients per the API docs); "max" asks for the
        # largest batch the server allows without triggering a limit warning.
        "rvlimit": "max",
        "rvdir": "newer",
        # Without this, a redirect title returns the history of the redirect
        # stub instead of the article it points to.
        "redirects": "1",
        "formatversion": "2"
    }

    all_revs = []
    cont = {}
    loop = 0

    # Context manager guarantees the pooled connections are released even if
    # a request fails or the run is interrupted.
    with requests.Session() as session:
        session.headers.update({"User-Agent": USER_AGENT})

        while True:
            loop += 1
            resp = session.get(WIKI_API, params={**params, **cont}, timeout=60)
            resp.raise_for_status()
            data = resp.json()

            # The API reports failures in the JSON body with HTTP 200;
            # surface them rather than treating them as "no revisions".
            if "error" in data:
                err = data["error"]
                raise RuntimeError(
                    f"MediaWiki API error for {title!r}: "
                    f"{err.get('code')}: {err.get('info')}"
                )

            # Guard against a missing or empty "pages" list.
            pages = data.get("query", {}).get("pages") or [{}]
            revs = pages[0].get("revisions", []) or []
            for r in revs:
                # Hoist the main-slot text to a flat key for downstream use.
                r["content"] = r.get("slots", {}).get("main", {}).get("content", "")
            all_revs.extend(revs)

            if "continue" in data:
                cont = data["continue"]  # pass the full continuation object back
                time.sleep(SLEEP_BETWEEN)
            else:
                break

    all_revs.sort(key=lambda r: (r["timestamp"], r["revid"]))
    print(f"✅ {title}: fetched {len(all_revs)} revisions across {loop} loops")
    return all_revs



# ----------------------------------------
# STEP 2: Compute diffs between successive revisions
# ----------------------------------------
def compute_diffs(title, revisions):
    """
    Build one edit record per pair of consecutive revisions.

    Each record carries the unified diff (as a list of lines without line
    terminators) between the older and newer revision text, along with the
    newer revision's metadata and both full texts.

    Args:
        title: Article title, stored in each record and used in diff headers.
        revisions: Ordered sequence of revision dicts; each must provide
            "content", "revid" and "timestamp" ("user" and "comment" are
            optional).

    Returns:
        A list of dicts, one per consecutive pair; empty when fewer than two
        revisions are given.
    """

    def _edit_event(older, newer):
        # One record describing the change from `older` to `newer`.
        old_text = older["content"]
        new_text = newer["content"]

        delta_lines = difflib.unified_diff(
            old_text.splitlines(),
            new_text.splitlines(),
            fromfile=f"{title}@{older['revid']}",
            tofile=f"{title}@{newer['revid']}",
            lineterm="",
        )

        return {
            "article_title": title,
            "prev_rev_id": older["revid"],
            "curr_rev_id": newer["revid"],
            "timestamp": newer["timestamp"],
            "user": newer.get("user", "(unknown)"),
            "comment": newer.get("comment", ""),
            "diff_chunks": list(delta_lines),
            "pre_edit_text": old_text,
            "post_edit_text": new_text,
        }

    # Pair every revision with its immediate successor.
    return [
        _edit_event(older, newer)
        for older, newer in zip(revisions, revisions[1:])
    ]


# ----------------------------------------
# STEP 3: Main pipeline
# ----------------------------------------
def main():
    """
    Fetch revisions for every control article and write their diffs to disk.

    For each title in ``control_articles`` this fetches the revision history,
    computes diffs between consecutive revisions and appends one JSON object
    per edit to ``OUTPUT_FILE`` (JSON Lines, UTF-8). Articles with fewer than
    two revisions are skipped. The output file is overwritten on each run.
    """
    import os  # local import: only needed to prepare the output directory

    # Opening the file fails with FileNotFoundError if the nested output
    # directory has not been created yet, so make sure it exists first.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:
        for title in tqdm(control_articles, desc="Processing articles"):
            print(f"\nFetching revisions for: {title}")
            revisions = fetch_revisions(title)
            print(f"  → Retrieved {len(revisions)} revisions")

            if len(revisions) < 2:
                continue  # nothing to diff

            diffs = compute_diffs(title, revisions)
            print(f"  → Computed {len(diffs)} edit diffs")

            # One JSON document per line; keep non-ASCII characters readable.
            for rec in diffs:
                out_f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print(f"\n✅ All diffs saved to {OUTPUT_FILE}")

# Run the whole fetch -> diff -> write pipeline when this cell is executed.
main()

Processing articles:   0%|                               | 0/45 [00:00<?, ?it/s]


Fetching revisions for: Pop music


Processing articles:   0%|                               | 0/45 [00:09<?, ?it/s]


KeyboardInterrupt: 