In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import requests
import time
import csv

In [76]:
# Articles List
control_articles = [
    "Pop music",
    "Rock and roll",
    "Eric Clapton",
    "Rolling Stone",
    "Jazz",
    "Swing",
    "Classical music",
    "Ludwig van Beethoven",
    "Wolfgang Amadeus Mozart",
    "Joseph Haydn",
    "Country music",
    "BTS (groupe)",
    "K-Pop",
    "Electronic music",
    "Daft Punk",
    "Paul Kalkbrenner",
    "Trumpet",
    "Music theory",
    "Fender",
    "Marshall Amplification",
    "Jimi Hendrix",
    "Bob Marley",
    "Edith Piaf",
    "Royal Albert Hall",
    "Piano",
    "Saxophone",
    "Pink Floyd",
    "Nirvana (band)",
    "Nina Simone",
    "Music of Africa",
    "Major scale",
    "Major chord",
    "Minor chord",
    "AC/DC",
    "Red Hot Chili Peppers",
    "Funk rock",
    "James Brown",
    "Dire Straits",
    "Mark Knofler",
    "John Frusciante",
    "Alan Clark",
    "Bob Dylan",
    "The Beatles",
    "Stevie Wonder",
    "Guitar"
]

articles = [
    "COVID-19 pandemic in Ukraine",
    "History of Ukraine",
    "Crimea",
    "Russian annexation of Crimea",
    "2004 Ukrainian presidential election",
    "Football in Ukraine",
    "Bessarabia",
    "2014 pro-Russian unrest in Ukraine",
    "Communist Party of the Soviet Union",
    "English Civil War",
    "Christianity in Russia",
    "History of Christianity in Ukraine",
    "Flag of Ukraine",
    "Alexander II of Russia",
    "Eastern Front (World War II)",
    "Bukovina",
    "Epiphanius I of Ukraine",
    "History of Crimea",
    "Dissolution of the Soviet Union",
    "Crimean Tatars",
    "Catherine the Great",
    "Culture of Ukraine",
    "Abortion in Ukraine",
    "Christmas in Ukraine",
    "Armed Forces of Ukraine",
    "Demographics of Ukraine",
    "History of Kyiv",
    "Foreign relations of Ukraine",
    "Eastern Front (World War I)",
    "Economy of Ukraine",
    "Galicia (Eastern Europe)",
    "Euromaidan",
    "History of the Russian Orthodox Church",
    "Government of Ukraine",
    "Geography of Ukraine",
    "Censuses in Ukraine",
    "Administrative divisions of Ukraine",
    "Government of the Ukrainian People's Republic in exile",
    "Education in Ukraine",
    "2022 Russian invasion of Ukraine",
    "Buddhism in Ukraine"
]

In [97]:
import requests
import mwparserfromhell
import re

API = "https://en.wikipedia.org/w/api.php"
HEADERS = {
    "User-Agent": "DH_Project/1.0 (maxime.garambois@epfl.ch)"
}

CONTENTIOUS_KEYWORDS = ["contentious topics/"]

def get_protection_status(title):
    params = {
        "action": "query",
        "titles": "title",
        "prop": "info",
        "inprop": "protection",
        "format": "json",
        "formatversion" : "2"
    }

    RES = requests.get(API, params=params, headers=HEADERS)
    DATA = RES.json()

    page = data["query"]["pages"][0]

    if "missing" in page:
        return {"error": "Page does not exist"}

    protection = page.get("protection", [])

    # Clean up into simpler dict form
    protection_clean = [
        {
            "type": p["type"],
            "level": p["level"],
            "expiry": p["expiry"]
        }
        for p in protection
    ]

    return {
        "title": page["title"],
        "protection": protection_clean
    }

def get_talk_wikitext(title):
    """Retrieve raw wikitext of the Talk page."""
    params = {
        "action": "query",
        "titles": f"Talk:{title}",
        "prop": "revisions",
        "rvslots": "main",
        "rvprop": "content",
        "formatversion": "2",
        "format": "json"
    }
    response = requests.get(API, params=params, headers=HEADERS)
    data = response.json()

    page = data["query"]["pages"][0]
    if "missing" in page:
        return None  # talk page doesn't exist

    return page["revisions"][0]["slots"]["main"]["content"]


def parse_assessments(wikitext):
    """Parse class, importance values, and contentious-topic status from wikitext."""
    code = mwparserfromhell.parse(wikitext)

    results = {
        "class": None,
        "importance": {},
        "contentious": False
    }

    for template in code.filter_templates():
        name = template.name.strip().lower()

        # GLOBAL CLASS (from banner shell)
        if "banner shell" in name:
            if template.has("class"):
                results["class"] = str(template.get("class").value).strip()

        # PER-WIKIPROJECT IMPORTANCE
        if "wikiproject" in name and not "banner shell" in name:
            project = template.name.strip().replace("WikiProject", "").strip()

            # look for either "importance" or "priority"
            if template.has("importance"):
                imp = str(template.get("importance").value).strip()
                results["importance"][project] = imp
            elif template.has("priority"):
                # Some projects use "priority" (e.g., Mathematics)
                imp = str(template.get("priority").value).strip()
                results["importance"][project] = imp

        # CONTENTIOUS TOPICS DETECTION
        temp_text = str(template).lower()
        if any(keyword in name for keyword in CONTENTIOUS_KEYWORDS):
            results["contentious"] = True

    return results


def get_article_assessment(title):
    """Main wrapper: fetch talk page and parse assessment."""
    wikitext = get_talk_wikitext(title)
    if not wikitext:
        return {"error": "Talk page does not exist"}

    return parse_assessments(wikitext)

def extract_relevant_importance(importance_dict):
    """
    From all WikiProject importance values:
    - If 'Ukraine' exists -> return that value.
    - Else -> return the first value in the dict.
    - If dict empty -> return None.
    """
    if not importance_dict:
        return None

    # Prefer Ukraine rating if present
    if "Ukraine" in importance_dict:
        return importance_dict["Ukraine"]

    # Otherwise take the first key in the dict
    first_key = next(iter(importance_dict))
    return importance_dict[first_key]

def get_data(articles_list, out_csv):
    rows = []

    for article in articles_list:
        assessment = get_article_assessment(article)

        # Extract global class
        article_class = assessment.get("class")

        # Extract importance from rules
        importance = extract_relevant_importance(assessment.get("importance", {}))

        # Contentious topic boolean
        contentious = assessment.get("contentious")

        rows.append({
            "article": article,
            "class": article_class,
            "importance": importance,
            "contentious": contentious
        })

    # Save CSV
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["article", "class", "importance", "contentious"])
        writer.writeheader()
        writer.writerows(rows)

    print(f"✅ CSV saved at: {out_csv}")

OUT_CSV = '../datas/interim/Policy Analysis/policy_analysis_articles.csv'
get_data(articles, OUT_CSV)
status = []
for article in articles:
    d = get_protection_status(article)
    status.append(d["protection"])

✅ CSV saved at: ../datas/interim/Policy Analysis/policy_analysis_articles.csv


In [74]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re

API = "https://en.wikipedia.org/w/api.php"
HEADERS = {"User-Agent": "DH_Project/1.0 (maxime.garambois@epfl.ch)"}

def mw_normalize_and_redirects(title):
    """Return canonical title + any redirects (all with underscores)."""
    params = {
        "action": "query",
        "titles": title,
        "redirects": "1",
        "format": "json",
        "formatversion": "2",
    }
    r = requests.get(API, params=params, headers=HEADERS).json()
    pages = r.get("query", {}).get("pages", [])
    if not pages or "missing" in pages[0]:
        # fall back to the provided title
        return {title.replace(" ", "_")}
    canonical = pages[0]["title"].replace(" ", "_")
    candidates = {canonical}
    for redir in r["query"].get("redirects", []):
        candidates.add(redir["from"].replace(" ", "_"))
        candidates.add(redir["to"].replace(" ", "_"))
    return candidates

def parse_page_html(title):
    """Fetch parsed HTML for a wiki page title."""
    params = {"action": "parse", "page": title, "prop": "text", "format": "json"}
    r = requests.get(API, params=params, headers=HEADERS).json()
    if "error" in r:
        return None
    return r["parse"]["text"]["*"]

def collect_level_subpages(level):
    """
    From the root VA page for a level, collect all subpages like:
    Wikipedia:Vital articles/Level/<level>/People, /History, etc.
    Include the root too (some levels have direct links).
    """
    root = f"Wikipedia:Vital articles/Level/{level}"
    html = parse_page_html(root)
    subpages = set()
    if html:
        soup = BeautifulSoup(html, "html.parser")
        for a in soup.find_all("a"):
            href = a.get("href", "")
            title = a.get("title", "")
            # Prefer title (cleaner), but fall back to href if needed
            if title.startswith(f"Wikipedia:Vital articles/Level/{level}/"):
                subpages.add(title)
            elif href.startswith("/wiki/Wikipedia:Vital_articles/Level/"):
                # Extract after /wiki/
                target = href[len("/wiki/"):]
                if re.match(rf"Wikipedia:Vital_articles/Level/{level}\b", target):
                    subpages.add(target)
    subpages.add(root)
    return subpages

def vital_level_via_lists(article_title):
    """
    Search Vital Articles lists (levels 1..5) and return the level number
    where the article appears, or None if not found.
    """
    acceptable = {t.lower() for t in mw_normalize_and_redirects(article_title)}

    for level in range(1, 6):
        for subpage in collect_level_subpages(level):
            html = parse_page_html(subpage)
            if not html:
                continue
            soup = BeautifulSoup(html, "html.parser")
            for a in soup.find_all("a"):
                # Use the title attribute: it's the canonical page title
                if a.has_attr("title"):
                    link_title = a["title"].replace(" ", "_").lower()
                    if link_title in acceptable:
                        return level
                else:
                    # Fallback to href if no title (rare)
                    href = a.get("href", "")
                    if href.startswith("/wiki/"):
                        target = href[len("/wiki/"):].split("#", 1)[0]
                        target = unquote(target).replace(" ", "_").lower()
                        if target in acceptable:
                            return level
    return None


for article in articles:
    level = get_vital_level_by_list(article)
    print(article, level)

COVID-19 pandemic in Ukraine None
History of Ukraine None
Crimea None
Annexation of Crimea by the Russian Federation None
2004 Ukrainian presidential election None
Football in Ukraine None
Bessarabia None
2014 pro-Russian unrest in Ukraine None
Communist Party of the Soviet Union None


KeyboardInterrupt: 

In [99]:
df = pd.read_csv('../datas/interim/Policy Analysis/policy_analysis_articles.csv')
df.loc[26, 'contentious'] = True
df.loc[26, 'importance'] = 'High'
df.loc[16, 'importance'] = None
df.loc[37, 'importance'] = None
df.loc[37, 'contentious'] = False
df.loc[37, 'class'] = 'Start'
df.loc[40, 'importance'] = None
df['protection'] = status
df

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [None]:
df.to_csv('../datas/interim/Policy Analysis/policy_analysis_articles.csv', index=False)

In [104]:

API = "https://en.wikipedia.org/w/api.php"
HEADERS = {
    "User-Agent": "DH_Project/1.0 (maxime.garambois@epfl.ch)"
}

CONTENTIOUS_KEYWORDS = ["contentious topics/"]

def get_protection_status(title):
    params = {
        "action": "query",
        "titles": title,
        "prop": "info",
        "inprop": "protection",
        "format": "json",
        "formatversion" : "2"
    }

    RES = requests.get(API, params=params, headers=HEADERS)
    DATA = RES.json()

    page = DATA["query"]["pages"][0]

    if "missing" in page:
        return {"error": "Page does not exist"}

    protection = page.get("protection", [])

    # Clean up into simpler dict form
    protection_clean = [
        {
            "type": p["type"],
            "level": p["level"],
            "expiry": p["expiry"]
        }
        for p in protection
    ]

    return {
        "title": page["title"],
        "protection": protection_clean
    }

title = 'Albert Einstein'
get_protection_status(title)
d

{'title': 'Talk:Albert Einstein', 'protection': []}