# Control Group Analysis

The idea is to create a control group (corpus of 42 articles related to music) so that we can compare and assess the credibility and reliability of the results we have on weaponisation.

## Requirements

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import difflib
import json
import glob
import time
from tqdm import tqdm
from urllib.parse import quote
import os
import multiprocessing
from concurrent.futures import ThreadPoolExecutor, as_completed

## Control articles 

In [1]:
pages = [
    "Pop music",
    "Rock and roll",
    "Eric Clapton",
    "Rolling Stone",
    "Jazz",
    "Swing",
    "Classical music",
    "Ludwig van Beethoven",
    "Wolfgang Amadeus Mozart",
    "Joseph Haydn",
    "Country music",
    "BTS",
    "K-Pop",
    "Electronic music",
    "Daft Punk",
    "Paul Kalkbrenner",
    "Trumpet",
    "Music theory",
    "Fender",
    "Marshall Amplification",
    "Jimi Hendrix",
    "Bob Marley",
    "Edith Piaf",
    "Royal Albert Hall",
    "Piano",
    "Saxophone",
    "Pink Floyd",
    "Nirvana (band)",
    "Nina Simone",
    "Music of Africa",
    "Major scale",
    "Major chord",
    "Minor chord",
    "Red Hot Chili Peppers",
    "Funk rock",
    "James Brown",
    "Dire Straits",
    "Mark Knofler",
    "John Frusciante",
    "Alan Clark",
    "Stevie Wonder",
    "Guitar"
]

pages = sorted(pages)

## Compute the edit diff (Mohammed's code)

For each article of the control set and for each edit among each article, we compute the difference between the old version and the new version of the edit. We store those differences in a ```.jsonl```files. We can now ask ChatGPT for each edit diff to classify as weaponising or not weaponising

In [5]:
# ----------------------------------------
# CONFIG
# ----------------------------------------
WIKI_API = "https://en.wikipedia.org/w/api.php"
USER_AGENT = "DH_Project/1.0 (https://www.epfl.ch/labs/dhlab/; maxime.garambois@epfl.ch)"
SLEEP_BETWEEN = 0.5

# WHERE TO SAVE INDIVIDUAL PAGE FILES
REV_DIR = "../datas/interim/Control Analysis"
os.makedirs(REV_DIR, exist_ok=True)


# ----------------------------------------
# Session helper
# ----------------------------------------
def get_session():
    S = requests.Session()
    S.headers.update({"User-Agent": USER_AGENT})
    return S

# ----------------------------------------
# Fetch all revisions for a page
# ----------------------------------------
def fetch_revisions(title):
    S = get_session()

    params = {
        "action": "query",
        "format": "json",
        "formatversion": "2",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp|user|comment|content",
        "rvslots": "main",
        "rvlimit": "max",
        "rvdir": "newer"
    }

    all_revs = []
    cont = {}

    while True:
        resp = S.get(WIKI_API, params={**params, **cont}, timeout=30)
        resp.raise_for_status()
        data = resp.json()

        pages_data = data.get("query", {}).get("pages", [])
        if pages_data:
            page = pages_data[0]
            revs = page.get("revisions", [])
            all_revs.extend(revs)

        if "continue" in data:
            cont = data["continue"]
            time.sleep(SLEEP_BETWEEN)
        else:
            break

    return all_revs


# ----------------------------------------
# Extract text content from revision
# ----------------------------------------
def get_content(r):
    # Formatversion=2 uses:
    # r["slots"]["main"]["content"]
    return (
        r.get("slots", {})
         .get("main", {})
         .get("content", "")
    )

# ----------------------------------------
# Compute unified diff
# ----------------------------------------
def compute_diff(old, new):
    a = old.splitlines() if old else []
    b = new.splitlines() if new else []
    d = list(difflib.unified_diff(a, b, lineterm=""))
    return "\n".join(d) if d else "No changes"


# ----------------------------------------
# Process one page
# ----------------------------------------
def process_page(title):
    print(f"[+] Fetching {title}")
    revs = fetch_revisions(title)

    if not revs:
        print(f"[!] No revisions found for {title}")
        return

    # Make sure they are sorted by timestamp (just in case)
    revs = sorted(revs, key=lambda r: r["timestamp"])

    first = get_content(revs[0])
    last = get_content(revs[-1])

    out_path = os.path.join(REV_DIR, f"{title.replace('/', '_')}.jsonl")

    with open(out_path, "w", encoding="utf-8") as jf:

        # First version
        jf.write(json.dumps(
            {"version": "first_version", "Content": first},
            ensure_ascii=False
        ) + "\n")

        # All diffs
        prev_text = None
        for r in revs:
            ts = r.get("timestamp")
            user = r.get("user", "(unknown)")
            comment = r.get("comment", "")

            text = get_content(r)

            if prev_text is None:
                diff = "Initial revision"
            else:
                diff = compute_diff(prev_text, text)

            jf.write(json.dumps({
                "version": "diff",
                "Timestamp": ts,
                "User": user,
                "Comment": comment,
                "Diff": diff
            }, ensure_ascii=False) + "\n")

            prev_text = text

        # Last version
        jf.write(json.dumps(
            {"version": "last_version", "Content": last},
            ensure_ascii=False
        ) + "\n")

    print(f"[✓] Done: {title}")


# ----------------------------------------
# RUN IN PARALLEL
# ----------------------------------------
cpu_cores = multiprocessing.cpu_count()
max_workers = cpu_cores * 2
print(f"Using {max_workers} workers")

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(process_page, p): p for p in pages}
    for future in as_completed(futures):
        page = futures[future]
        try:
            future.result()
        except Exception as exc:
            print(f"[!] {page} raised an exception: {exc}")

print("\nAll done. Check the revisions/ folder!")

Using 16 workers
[+] Fetching Eric Clapton
[+] Fetching Rolling Stone
[+] Fetching Jazz
[+] Fetching Swing
[+] Fetching Country music
[+] Fetching BTS
[+] Fetching Daft Punk
[+] Fetching Trumpet
[+] Fetching Jimi Hendrix
[+] Fetching Royal Albert Hall
[+] Fetching Piano
[+] Fetching Saxophone
[+] Fetching Pink Floyd
[+] Fetching Nirvana (band)
[+] Fetching Nina Simone
[+] Fetching AC/DC
[✓] Done: Swing
[+] Fetching Red Hot Chili Peppers
[✓] Done: Royal Albert Hall
[+] Fetching James Brown
[✓] Done: Nina Simone
[+] Fetching Dire Straits
[✓] Done: Rolling Stone
[+] Fetching John Frusciante
[✓] Done: Saxophone
[+] Fetching Alan Clark
[✓] Done: Trumpet
[+] Fetching Bob Dylan
[✓] Done: Alan Clark
[+] Fetching The Beatles
[✓] Done: Dire Straits
[+] Fetching Stevie Wonder
[✓] Done: Piano
[+] Fetching Guitar
[✓] Done: Daft Punk
[✓] Done: Country music
[✓] Done: John Frusciante
[✓] Done: BTS
[✓] Done: Eric Clapton
[✓] Done: Stevie Wonder
[✓] Done: James Brown
[✓] Done: Jazz
[✓] Done: Nirvana (b

## Metrics 

We count the total number of edits so we get an idea of how costly would it be to use ChatGPT API

In [8]:
folder = "../datas/interim/Control Analysis/*.jsonl"

total_edits = 0
per_page = {}

for file in glob.glob(folder):
    edits = 0
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            if rec.get("version") == "diff":
                edits += 1

    page_name = file.split("/")[-1].replace(".jsonl", "")
    per_page[page_name] = edits
    total_edits += edits

print("Total edits:", total_edits)
print("\nEdits per page:")
for page, num in per_page.items():
    print(page, num)

Total edits: 250083

Edits per page:
Alan Clark 890
James Brown 8471
Music theory 3378
Guitar 6906
Music of Africa 2731
Red Hot Chili Peppers 21644
Eric Clapton 10100
Major scale 918
Royal Albert Hall 2112
Major chord 199
Pop music 9666
Minor chord 199
Wolfgang Amadeus Mozart 8533
Nirvana (band) 13977
Joseph Haydn 5337
Paul Kalkbrenner 366
Classical music 5607
K-Pop 11
Rolling Stone 4117
Trumpet 7430
Daft Punk 8954
Funk rock 911
Fender 84
BTS 8480
Saxophone 6498
Electronic music 4391
Nina Simone 3461
Mark Knofler 1
Pink Floyd 18104
Edith Piaf 6
Jazz 11819
Dire Straits 4129
Rock and roll 8829
Piano 7927
Country music 8482
Stevie Wonder 5820
Swing 406
John Frusciante 7304
Bob Marley 7127
Marshall Amplification 1483
Ludwig van Beethoven 9109
Jimi Hendrix 14166


In [37]:
ukraine_keywords_small = [
    "ukraine", "ukrainian", "kyiv", "kiev", "crimea", "crimean", "kuban", "donbas", "donetsk", "luhansk",
    "maidan", "yanukovych", "yushchenko", "zelenskyy", "poroshenko", "catherine", "bukovina", "bessarabia", "eastern",
    "euromaidan", "dnipro", "odessa", "sevastopol", "putin", "rus'", "russia", "russian", "moscow", "kremlin", "soviet"
]

ukraine_keywords_large = [
    # core country and people
    "ukraine", "ukrainian", "kyiv", "kiev", "crimea", "crimean", "kuban", "donbas", "donetsk", "luhansk",
    "maidan", "yanukovych", "yushchenko", "zelenskyy", "poroshenko", "catherine", "bukovina", "bessarabia",
    "eastern", "euromaidan", "dnipro", "odessa", "sevastopol", "putin", "rus'", "russia", "russian", "moscow",
    "kremlin", "soviet",

    # politics & government
    "verkhovna rada", "president", "prime minister", "parliament", "government", "cabinet", "federation",
    "referendum", "annexation", "independence", "revolution", "reforms", "corruption", "sanctions",
    "occupation", "treaty", "agreement", "ceasefire", "negotiations", "elections", "coup", "unification",

    # geography & regions
    "zaporizhzhia", "mariupol", "kharkiv", "kherson", "mykolaiv", "chernihiv", "sumy", "poltava", "vinnytsia",
    "lviv", "ivano-frankivsk", "ternopil", "lutsk", "uzhhorod", "dnipropetrovsk", "donetsk oblast",
    "luhansk oblast", "transcarpathia", "prykarpattia", "galicia", "novorossiya", "black sea", "azov sea",

    # historical references
    "kyivan rus", "tsar", "imperial", "empire", "ussr", "communist", "lenin", "stalin", "bolshevik",
    "cold war", "perestroika", "glasnost", "collapse", "partition", "catherine the great", "brezhnev",
    "chernobyl", "orange revolution", "revolution of dignity", "holodomor", "soviet union",

    # war and military
    "invasion", "occupation", "annexed", "frontline", "offensive", "defense", "army", "forces", "military",
    "russian troops", "ukrainian forces", "separatist", "rebels", "paramilitary", "nato", "eu", "un", "war",
    "conflict", "shelling", "bombing", "airstrike", "occupation forces", "mobilization", "martial law",

    # culture, identity & language
    "language", "identity", "heritage", "culture", "orthodox", "church", "patriarch", "ukrainian language",
    "russian language", "minority", "bilingual", "autonomy", "nationalism", "independence day", "flag",
    "anthem", "symbol", "national identity", "sovereignty",

    # current / modern references
    "donbas war", "russian invasion", "ukrainian front", "crimea bridge", "moskva cruiser", "ukrainian army",
    "russian army", "zelensky", "kremlin propaganda", "occupation administration", "territorial defense",
    "european union", "eu membership", "nato membership", "nato expansion", "eu sanctions", "ukraine war",
    "full-scale invasion", "special military operation", "mobilisation", "referendum in crimea",

    # other
    "gas pipeline", "north stream", "energy crisis", "grain corridor", "black sea fleet", "peace talks",
    "donetsk people's republic", "luhansk people's republic", "kyiv oblast", "liberation", "resistance",
    "occupation zone", "ukrainian refugees", "mariupol steel plant", "azovstal", "bucha", "irpin", "kharkiv offensive"
]

top_user_contribs_df = pd.read_csv('../datas/interim/All Users Analysis/top_user_contribs.csv')

def is_ukraine_related(title):
    if not isinstance(title, str):
        return False
    title_lower = title.lower()
    return any(kw in title_lower for kw in ukraine_keywords_small)

top_user_contribs_df["ukraine_related"] = top_user_contribs_df["title"].apply(is_ukraine_related)

user_distribution = (
    top_user_contribs_df
    .groupby(["username", "ukraine_related"])
    .size()
    .reset_index(name="count")
)

top_user_contribs_df = top_user_contribs_df[top_user_contribs_df['ukraine_related'] == True]
test = top_user_contribs_df['title'].unique().tolist() 

bad_substrings = ['template:', 'talk:', 'wikipedia:']

test2 = [
    s for s in test
    if not any(bad in s.lower() for bad in bad_substrings)
]

print(len(test), len(test2))

11854 9122


In [47]:
API = "https://en.wikipedia.org/w/api.php"
USER_AGENT = "DH_Project/1.0 (https://www.epfl.ch/labs/dhlab/; maxime.garambois@epfl.ch)"
HEADERS = {
    "User-Agent": USER_AGENT
}


def get_revision_id(title, direction):
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvlimit": 1,
        "rvdir": direction,
        "rvprop": "ids",
        "format": "json"
    }

    response = requests.get(API, params=params, headers=HEADERS)

    # If Wikipedia returns HTML instead of JSON
    if "application/json" not in response.headers.get("Content-Type", ""):
        return None

    data = response.json()

    try:
        pages = data["query"]["pages"]
        page = next(iter(pages.values()))
        return page["revisions"][0]["revid"]
    except Exception as e:
        return None


def estimate_revision_count(title):
    oldest = get_revision_id(title, "newer")
    newest = get_revision_id(title, "older")

    if oldest is None or newest is None:
        return None

    estimated = newest - oldest + 1
    return estimated


total = 0
for title in test2[:1000]:
    est = estimate_revision_count(title)
    if est is not None:
        total += est

print(total)

682963819096


In [54]:
import requests
import time

for t in test2[:10]:
    url = f"https://xtools.wmcloud.org/api/page/pageinfo/en.wikipedia.org/{t}"
    params = {"format": "json"}
    headers = {"accept": "application/json"}
    
    response = requests.get(url, params=params, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        print(data)
    else:
        print("Error:", response.status_code, response.text)


{'project': 'en.wikipedia.org', 'page': 'COVID-19 pandemic in Russia', 'format': 'json', 'watchers': 106, 'pageviews': 2872, 'pageviews_offset': 30, 'revisions': 3164, 'editors': 473, 'anon_edits': 358, 'minor_edits': 278, 'creator': 'Lopifalko', 'creator_editcount': 87542, 'created_at': '2020-03-02T16:12:54Z', 'created_rev_id': 943564845, 'modified_at': '2025-09-06T01:42:23Z', 'secs_since_last_edit': 6271130, 'modified_rev_id': 1309795870, 'assessment': {'value': 'C', 'color': '#FFFF66', 'category': 'Category:C-Class articles', 'badge': 'https://upload.wikimedia.org/wikipedia/commons/e/e6/Symbol_c_class.svg'}, 'elapsed_time': 0.631}
{'project': 'en.wikipedia.org', 'page': 'COVID-19 pandemic in Ukraine', 'format': 'json', 'watchers': 77, 'pageviews': 598, 'pageviews_offset': 30, 'revisions': 1700, 'editors': 185, 'anon_edits': 76, 'minor_edits': 164, 'creator': 'Dying', 'creator_editcount': 11773, 'created_at': '2020-03-03T23:46:58Z', 'created_rev_id': 943796970, 'modified_at': '2025-0