# Control Group Analysis

The idea is to build a control group (a corpus of 40 music-related articles) so that we can compare against it and assess the credibility and reliability of the results we obtained on weaponisation.

## Requirements

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import difflib
import json
import glob
import time
from tqdm import tqdm
from urllib.parse import quote
import os
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed

## Control articles 

In [5]:
# Control corpus of music-related Wikipedia page titles.
# Knopfler, Piaf and K-pop were removed from the list.
# The titles are kept in alphabetical order; sorted() guarantees it even
# if a new title is appended out of order.
pages = sorted([
    "Alan Clark",
    "BTS",
    "Bob Marley",
    "Classical music",
    "Country music",
    "Daft Punk",
    "Dire Straits",
    "Electronic music",
    "Eric Clapton",
    "Fender",
    "Funk rock",
    "Guitar",
    "James Brown",
    "Jazz",
    "Jean-Jacques Goldman",
    "Jimi Hendrix",
    "John Frusciante",
    "Joseph Haydn",
    "Ludwig van Beethoven",
    "Major chord",
    "Major scale",
    "Marshall Amplification",
    "Minor chord",
    "Music of Africa",
    "Music theory",
    "Nina Simone",
    "Nirvana (band)",
    "Paul Kalkbrenner",
    "Piano",
    "Pink Floyd",
    "Pop music",
    "Red Hot Chili Peppers",
    "Rock and roll",
    "Rolling Stone",
    "Royal Albert Hall",
    "Saxophone",
    "Stevie Wonder",
    "Swing",
    "Trumpet",
    "Wolfgang Amadeus Mozart",
])

## Compute the edit diff (Mohammed's code)

For each article in the control set, and for each edit of that article, we compute the difference between the version before the edit and the version after it. We store those differences in one `.jsonl` file per article. We can then ask ChatGPT to classify each edit diff as weaponising or not weaponising.

In [6]:
# ----------------------------------------
# CONFIG
# ----------------------------------------
# Endpoint of the English Wikipedia API queried for revisions.
WIKI_API = "https://en.wikipedia.org/w/api.php"
# Value sent as the User-Agent header on every request.
USER_AGENT = "DH_Project/1.0 (https://www.epfl.ch/labs/dhlab/; maxime.garambois@epfl.ch)"
# Pause, in seconds, between two consecutive paginated requests for one page.
SLEEP_BETWEEN = 0.5

# Output directory: one .jsonl file per page is written here.
REV_DIR = "../datas/interim/Control Analysis"
# Create the directory up front; no error if it already exists.
os.makedirs(REV_DIR, exist_ok=True)


# ----------------------------------------
# Session helper
# ----------------------------------------
def get_session():
    """Build a new requests session that sends USER_AGENT as its User-Agent header."""
    session = requests.Session()
    session.headers["User-Agent"] = USER_AGENT
    return session

# ----------------------------------------
# Fetch all revisions for a page
# ----------------------------------------
def fetch_revisions(title):
    """Fetch the full revision history of one page, content included.

    Requests revisions in ``rvdir=newer`` order and keeps following the
    ``continue`` token returned by the API until none is left, sleeping
    SLEEP_BETWEEN seconds between two consecutive requests.

    Args:
        title: Page title, sent as the ``titles`` query parameter.

    Returns:
        A list of revision dicts exactly as returned by the API
        (``formatversion=2``). Empty when the response lists no page or the
        page carries no ``revisions`` key.

    Raises:
        requests.HTTPError: if a response has an error HTTP status.
        RuntimeError: if the JSON body contains a top-level ``error`` entry.
            Without this check such a response has no ``continue`` key, so
            the loop would stop and return a silently truncated history.
    """
    params = {
        "action": "query",
        "format": "json",
        "formatversion": "2",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp|user|comment|content",
        "rvslots": "main",
        "rvlimit": "max",
        "rvdir": "newer"
    }

    all_revs = []
    cont = {}

    # The context manager closes the session (and its pooled connections)
    # on both normal exit and exceptions; one session is opened per page.
    with get_session() as S:
        while True:
            resp = S.get(WIKI_API, params={**params, **cont}, timeout=30)
            resp.raise_for_status()
            data = resp.json()

            # API-level failures are reported inside the JSON body; surface
            # them instead of treating the response as "end of history".
            if "error" in data:
                raise RuntimeError(
                    f"Wikipedia API error for {title!r}: {data['error']}"
                )

            pages_data = data.get("query", {}).get("pages", [])
            if pages_data:
                # A single title was requested, so only the first entry is read.
                page = pages_data[0]
                all_revs.extend(page.get("revisions", []))

            if "continue" not in data:
                break

            # Merge the continuation parameters into the next request.
            cont = data["continue"]
            time.sleep(SLEEP_BETWEEN)

    return all_revs


# ----------------------------------------
# Extract text content from revision
# ----------------------------------------
def get_content(r):
    """Return the text stored at r["slots"]["main"]["content"].

    This is where ``formatversion=2`` responses place the revision text.
    Any missing level along that path yields an empty string.
    """
    slots = r.get("slots", {})
    main_slot = slots.get("main", {})
    return main_slot.get("content", "")

# ----------------------------------------
# Compute unified diff
# ----------------------------------------
def compute_diff(old, new):
    """Return a unified diff between two texts, compared line by line.

    Falsy inputs (``None`` or ``""``) are treated as having no lines.
    Returns the diff lines joined with newlines, or the string
    "No changes" when the diff is empty.
    """
    before = (old or "").splitlines()
    after = (new or "").splitlines()
    diff_lines = list(difflib.unified_diff(before, after, lineterm=""))
    if not diff_lines:
        return "No changes"
    return "\n".join(diff_lines)


# ----------------------------------------
# Process one page
# ----------------------------------------
def process_page(title):
    """Fetch one page's revisions and write them to a .jsonl file in REV_DIR.

    The file is named after the title (with "/" replaced by "_") and holds,
    one JSON object per line:
      1. a "first_version" record with the content of the oldest revision,
      2. one "diff" record per revision (timestamp, user, comment and the
         diff against the previous revision; the oldest revision is marked
         "Initial revision"),
      3. a "last_version" record with the content of the newest revision.

    Nothing is written when no revisions are returned for the title.
    """
    print(f"[+] Fetching {title}")
    history = fetch_revisions(title)

    if not history:
        print(f"[!] No revisions found for {title}")
        return

    # Order by timestamp in case the API order is not strictly chronological.
    ordered = sorted(history, key=lambda rev: rev["timestamp"])

    oldest_text = get_content(ordered[0])
    newest_text = get_content(ordered[-1])

    target = os.path.join(REV_DIR, title.replace("/", "_") + ".jsonl")

    def write_record(handle, record):
        # One JSON object per line; non-ASCII characters are kept as-is.
        handle.write(json.dumps(record, ensure_ascii=False) + "\n")

    with open(target, "w", encoding="utf-8") as out:

        # First version
        write_record(out, {"version": "first_version", "Content": oldest_text})

        # All diffs, each computed against the revision just before it
        previous = None
        for rev in ordered:
            current = get_content(rev)

            write_record(out, {
                "version": "diff",
                "Timestamp": rev.get("timestamp"),
                "User": rev.get("user", "(unknown)"),
                "Comment": rev.get("comment", ""),
                "Diff": (
                    "Initial revision"
                    if previous is None
                    else compute_diff(previous, current)
                ),
            })

            previous = current

        # Last version
        write_record(out, {"version": "last_version", "Content": newest_text})

    print(f"[✓] Done: {title}")


# ----------------------------------------
# RUN IN PARALLEL
# ----------------------------------------
# Each task spends most of its time waiting on HTTP requests, so the pool is
# sized at two threads per CPU core.
# NOTE(review): this many simultaneous requests may conflict with Wikimedia's
# API etiquette (serial requests are recommended) — consider lowering it.
cpu_cores = multiprocessing.cpu_count()
max_workers = cpu_cores * 2
print(f"Using {max_workers} workers")

# Titles whose processing raised; reported together at the end because the
# per-page messages are interleaved across threads and easy to miss.
failed_pages = []

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Map each future back to its title so failures can be attributed.
    futures = {executor.submit(process_page, p): p for p in pages}
    for future in as_completed(futures):
        page = futures[future]
        try:
            future.result()
        except Exception as exc:
            # One failing page must not abort the remaining downloads.
            failed_pages.append(page)
            print(f"[!] {page} raised an exception: {exc}")

if failed_pages:
    print(f"\n{len(failed_pages)} page(s) failed: {', '.join(sorted(failed_pages))}")

# Point at the directory the files were actually written to.
print(f"\nAll done. Check the {REV_DIR} folder!")

Using 16 workers
[+] Fetching Alan Clark
[+] Fetching BTS
[+] Fetching Bob Marley
[+] Fetching Classical music
[+] Fetching Country music
[+] Fetching Daft Punk
[+] Fetching Dire Straits
[+] Fetching Electronic music
[+] Fetching Eric Clapton
[+] Fetching Fender
[+] Fetching Funk rock
[+] Fetching Guitar
[+] Fetching James Brown
[+] Fetching Jazz
[+] Fetching Jean-Jacques Goldman
[+] Fetching Jimi Hendrix
[✓] Done: Fender
[+] Fetching John Frusciante
[✓] Done: Jean-Jacques Goldman
[+] Fetching Joseph Haydn
[✓] Done: Funk rock
[+] Fetching Ludwig van Beethoven
[✓] Done: Alan Clark
[+] Fetching Major chord
[✓] Done: Major chord
[+] Fetching Major scale
[✓] Done: Major scale
[+] Fetching Marshall Amplification
[✓] Done: Marshall Amplification
[+] Fetching Minor chord
[✓] Done: Minor chord
[+] Fetching Music of Africa
[✓] Done: Dire Straits
[+] Fetching Music theory
[✓] Done: Electronic music
[+] Fetching Nina Simone
[✓] Done: Music of Africa
[+] Fetching Nirvana (band)
[✓] Done: Classical

## Metrics 

We count the total number of edits to get an idea of how costly it would be to classify them with the ChatGPT API.

In [10]:
# Read from REV_DIR, the directory the fetch step writes to, so that a fresh
# Restart & Run All counts the files that were just produced.
folder = os.path.join(REV_DIR, "*.jsonl")

total_edits = 0
per_page = {}

# glob returns files in arbitrary order; sort for a reproducible listing.
for file in sorted(glob.glob(folder)):
    with open(file, "r", encoding="utf-8") as f:
        # Only "diff" records are edits; the first/last version records
        # are full snapshots and are not counted.
        edits = sum(
            1
            for line in f
            if line.strip() and json.loads(line).get("version") == "diff"
        )

    # Strip directory and extension with os.path so this also works with
    # non-"/" path separators and titles that contain ".jsonl".
    page_name = os.path.splitext(os.path.basename(file))[0]
    per_page[page_name] = edits
    total_edits += edits

print("Total edits:", total_edits)
print("\nEdits per page:")
for page, num in per_page.items():
    print(page, num)


Total edits: 250687

Edits per page:
Alan Clark 894
James Brown 8480
Music theory 3383
Guitar 6906
Music of Africa 2733
Red Hot Chili Peppers 21645
Eric Clapton 10102
Major scale 918
Royal Albert Hall 2112
Major chord 199
Pop music 9681
Minor chord 199
Jean-Jacques Goldman 528
Wolfgang Amadeus Mozart 8547
Nirvana (band) 13978
Joseph Haydn 5337
Paul Kalkbrenner 367
Classical music 5607
Rolling Stone 4117
Trumpet 7430
Daft Punk 8954
Funk rock 911
Fender 84
BTS 8480
Saxophone 6499
Electronic music 4393
Nina Simone 3461
Pink Floyd 18111
Jazz 11820
Dire Straits 4131
Rock and roll 8832
Piano 7927
Country music 8486
Stevie Wonder 5822
Swing 406
John Frusciante 7306
Bob Marley 7142
Marshall Amplification 1483
Ludwig van Beethoven 9109
Jimi Hendrix 14167
