# Set up session

In [1]:
import bz2
from collections import defaultdict
import csv
import json
import os
import re
import time
import yaml

import requests

## Functions

In [2]:
def exec_mariadb_stat2(query, db, filename=None, verbose=True):
    """Run a SQL query against a MariaDB analytics replica via the `mysql` CLI.

    Args:
        query: SQL text; it is placed inside double quotes on the shell command line.
        db: database name; must be a key of DB_METADATA, whose 'node' value
            selects the replica host (s<node>-analytics-replica) and port (331<node>).
        filename: if truthy, stdout of the command is redirected to this path.
        verbose: if truthy, print the whitespace-collapsed command before running it.

    Returns:
        The value returned by os.system for the assembled command.

    Raises:
        NotImplementedError: if db is not a key of DB_METADATA.
    """
    if db not in DB_METADATA:
        raise NotImplementedError("Don't know mapping of db {0} to mysql node.".format(db))
    node = DB_METADATA[db]['node']

    client = 'mysql --defaults-extra-file=/etc/mysql/conf.d/analytics-research-client.cnf '
    target = '-h s{0}-analytics-replica.eqiad.wmnet -P 331{0} -A --database {1} -e "{2}"'
    cmd = client + target.format(node, db, query)

    if filename:
        cmd = cmd + " > " + filename
    if verbose:
        # collapse newlines / indentation of multi-line queries for a readable one-line log
        print(' '.join(cmd.split()))
    return os.system(cmd)

def exec_hive_stat2(query, filename=None, priority=False, verbose=True, nice=False, large=False):
    """Run a query through the `hive -e` command line.

    At most one settings prefix is prepended to the query, chosen in this
    order of precedence: priority, then large, then nice.

    Args:
        query: HiveQL text; it is placed inside double quotes on the shell command line.
        filename: if truthy, stdout of the command is redirected to this path.
        priority: use the 'priority' job queue.
        verbose: if truthy, print the whitespace-collapsed command before running it.
        nice: use the 'nice' job queue.
        large: use the 'nice' job queue and set mapreduce.map.memory.mb=4096.

    Returns:
        The value returned by os.system for the assembled command.
    """
    if priority:
        settings = "SET mapreduce.job.queuename=priority;"
    elif large:
        settings = "SET mapreduce.job.queuename=nice; SET mapreduce.map.memory.mb=4096;"
    elif nice:
        # if issues: SET mapred.job.queue.name=nice;
        settings = "SET mapreduce.job.queuename=nice;"
    else:
        settings = ""

    cmd = 'hive -e " ' + settings + query + ' "'
    if filename:
        cmd = cmd + " > " + filename
    if verbose:
        print(' '.join(cmd.split()))
    return os.system(cmd)

In [3]:
def norm_wp_name_ar(wp):
    """Normalize an Arabic Wikipedia WikiProject name for matching.

    Lowercases the name, removes the local project-namespace prefix
    (ns_local + ':') and the WikiProject phrase, strips surrounding
    whitespace, and collapses runs of two or more whitespace characters
    into a single space.
    """
    ns_local = 'ويكيبيديا'
    cleaned = wp.lower().replace(ns_local + ":", "").replace('مشروع ويكي', '').strip()
    # raw string: "\s" in a plain literal is an invalid escape sequence
    return re.sub(r"\s\s+", " ", cleaned)

def norm_wp_name_en(wp):
    """Normalize an English Wikipedia WikiProject name for matching.

    Lowercases the name, removes every 'wikipedia:' and 'wikiproject'
    occurrence, strips surrounding whitespace, and collapses runs of two or
    more whitespace characters into a single space.
    """
    ns_local = 'wikipedia'
    wp_prefix = 'wikiproject'
    cleaned = wp.lower().replace(ns_local + ":", "").replace(wp_prefix, "").strip()
    # raw string: "\s" in a plain literal is an invalid escape sequence
    return re.sub(r"\s\s+", " ", cleaned)

def norm_wp_name_hu(wp):
    """Normalize a Hungarian Wikipedia WikiProject name for matching.

    A few exact page titles are first swapped for hard-coded replacement
    phrases. The result is lowercased, each fragment in to_strip is replaced
    by a space (in list order), and whitespace is stripped and collapsed.
    """
    ns_local = 'wikipédia'
    # order matters: longer fragments ('műhelyek', '-es ', 'országgal') come before their substrings
    to_strip = [ns_local + ":", 'témájú', 'kapcsolatos', 'műhelyek', 'műhely', '-es ', '-', 'országgal', 'ország']
    hardcoded_matches = {'Wikipédia:Harry Potter-műhely':'Harry Potterrel kapcsolatos',
                         'Wikipédia:USA-műhely':'USA-val kapcsolatos',
                         'Wikipédia:Anime- és mangaműhely':'anime-manga témájú',
                         'Wikipédia:Első világháború műhely':'első világháborús témájú'}
    # exact-title overrides; any other name passes through unchanged
    wp = hardcoded_matches.get(wp, wp)
    wp = wp.lower()
    for s in to_strip:
        wp = wp.replace(s, ' ')
    # raw string: "\s" in a plain literal is an invalid escape sequence
    return re.sub(r"\s\s+", " ", wp.strip())

def norm_wp_name_fr(wp):
    """Normalize a French Wikipedia WikiProject name for matching.

    Lowercases the name, removes every 'projet:' occurrence, strips
    surrounding whitespace, and collapses runs of two or more whitespace
    characters into a single space.
    """
    ns_local = 'projet'
    cleaned = wp.lower().replace(ns_local + ':', "").strip()
    # raw string: "\s" in a plain literal is an invalid escape sequence
    return re.sub(r"\s\s+", " ", cleaned)

def norm_wp_name_tr(wp):
    """Normalize a Turkish Wikipedia WikiProject name for matching.

    Lowercases the name, removes every 'vikipedi' and 'vikiproje'
    occurrence and every colon, strips surrounding whitespace, and collapses
    runs of two or more whitespace characters into a single space.
    """
    ns_local = 'vikiproje'
    wp_prefix = 'vikipedi'
    cleaned = wp.lower().replace(wp_prefix, "").replace(ns_local, '').replace(':', '').strip()
    # raw string: "\s" in a plain literal is an invalid escape sequence
    return re.sub(r"\s\s+", " ", cleaned)

In [4]:
def generate_wp_to_labels(wp_taxonomy):
    """Build a mapping from normalized WikiProject name to its set of topic labels.

    Each (WikiProject name, label) pair produced by _invert_wp_taxonomy is
    filed under the name as normalized by norm_wp_name_en.

    Returns:
        A defaultdict(set) keyed by normalized WikiProject name.
    """
    labels_by_project = defaultdict(set)
    for project_name, topic_label in _invert_wp_taxonomy(wp_taxonomy):
        normalized = norm_wp_name_en(project_name)
        labels_by_project[normalized].add(topic_label)
    return labels_by_project


def _invert_wp_taxonomy(wp_taxonomy, path=None):
    """Yield (WikiProject name, dotted label) pairs from a nested taxonomy dict.

    Walks wp_taxonomy recursively. A key whose value is a list labels each
    listed WikiProject with the '.'-joined path of keys leading to it. A key
    ending in '*' is a catch-all: after the whole level has been walked,
    every WikiProject listed directly at that level (including the
    catch-all's own entries) is additionally yielded with the catch-all's
    path. WikiProjects found in nested dicts are not added to the catch-all.
    """
    prefix = path or []
    wildcard_path = None
    level_projects = []
    for name, node in wp_taxonomy.items():
        here = prefix + [name]
        if name[-1] == "*":
            # catch-all entry: remember its path, defer yielding until the level is done
            wildcard_path = here
            level_projects.extend(node)
        elif isinstance(node, list):
            level_projects.extend(node)
            yield from ((project, ".".join(here)) for project in node)
        else:
            yield from _invert_wp_taxonomy(node, path=here)
    if wildcard_path is not None:
        yield from ((project, ".".join(wildcard_path)) for project in level_projects)

def get_topics(wikiprojects, topics_taxonomy, topic_counts):
    """Collect the topic labels for a list of WikiProject names.

    Each name is split on '/' and every part is looked up in topics_taxonomy
    under its norm_wp_name_en form.

    Args:
        wikiprojects: iterable of WikiProject name strings.
        topics_taxonomy: mapping of normalized WikiProject name -> iterable of topic labels.
        topic_counts: dict updated in place; every (un-normalized) name part
            gets an entry, incremented once per label it matched.

    Returns:
        Sorted list of the distinct topic labels found.
    """
    matched = set()
    for project in wikiprojects:
        for part in project.split('/'):
            # register the part even when it maps to no topic, so zero counts are visible
            topic_counts.setdefault(part, 0)
            lookup_key = norm_wp_name_en(part)
            for label in topics_taxonomy.get(lookup_key, {}):
                matched.add(label)
                topic_counts[part] += 1
    return sorted(matched)

def chunk(pageids, batch_size=50):
    """Split pageids into consecutive batches for the MediaWiki API.

    Returns a list of lists; each inner list holds at most batch_size items,
    converted to str, in their original order.
    """
    starts = range(0, len(pageids), batch_size)
    return [list(map(str, pageids[start:start + batch_size])) for start in starts]

def get_sitelinks_wikiprojects(wikiproject_tsv, output_json):
    """Mapping of WikiProjects to go from enwiki -> other languages
    This allows for limited groundtruth topic building in other languages.
    To generate wikiproject_tsv via SPARQL, start with: https://w.wiki/x3c

    Reads QIDs from wikiproject_tsv (header row must be exactly 'WikiProject';
    the QID is the part of the first column after the last '/'), fetches
    their sitelinks from the Wikidata wbgetentities API in batches of 50, and
    writes one JSON object per line to output_json with keys 'qid' and
    'sitelinks' (site -> title).
    """
    with open(wikiproject_tsv, 'r') as fin:
        rows = csv.reader(fin, delimiter='\t')
        assert next(rows) == ['WikiProject']
        qids = {row[0].split('/')[-1] for row in rows}
    print("{0} WikiProject QIDs".format(len(qids)))

    api_url = 'https://wikidata.org/w/api.php'
    sitelinks = {}
    with requests.session() as session:
        for batch in chunk(list(qids), 50):
            params = {"action": "wbgetentities",
                      "props": "sitelinks",
                      "format": "json",
                      "formatversion": 2,
                      "ids": '|'.join(batch)}
            entities = session.get(url=api_url, params=params).json()['entities']
            for entity in entities.values():
                qid = entity['id']
                # entities without a 'sitelinks' key get an empty mapping
                titles = {site: entity['sitelinks'][site]['title'] for site in entity.get('sitelinks', {})}
                sitelinks[qid] = {'qid': qid, 'sitelinks': titles}
            time.sleep(1)  # be kind to API

    with open(output_json, 'w') as fout:
        for record in sitelinks.values():
            fout.write(json.dumps(record) + '\n')

# Build Groundtruth Dataset

In [12]:
# parameters
# Per-wiki settings: 'node' selects the analytics-replica host/port used by
# exec_mariadb_stat2; 'norm' is the WikiProject-name normalizer for that wiki.
DB_METADATA = {'enwiki':{'node':1, 'norm':norm_wp_name_en},  # 21M pages
               'frwiki':{'node':6, 'norm':norm_wp_name_fr},  #  2.7M pages
               'arwiki':{'node':7, 'norm':norm_wp_name_ar},  #  2.8M pages
               'huwiki':{'node':7, 'norm':norm_wp_name_hu},  #    330K pages
               'trwiki':{'node':2, 'norm':norm_wp_name_tr}   #    280K pages
               }

page_assessments_tsv = './groundtruth/page_assessments.tsv'
db = 'enwiki'
dbname_snapshot = '2020-12'
pid_to_qid_snapshot = '2021-01-04'
pid_to_qid_tsv = './groundtruth/pid_to_qid.tsv'
topics_yaml = './groundtruth/wikiproject_taxonomy.yaml'
# TSV of WikiProject QIDs (single 'WikiProject' header column) consumed by
# get_sitelinks_wikiprojects; only read when db != 'enwiki' and the sitelinks
# JSON below does not exist yet. Previously undefined, which raised NameError on that path.
# NOTE(review): placeholder location -- point this at the actual SPARQL export.
wikiproject_tsv = './groundtruth/wikiprojects.tsv'
wikiprojects_sitelinks_json = './groundtruth/wikiprojects_sitelinks.json'
output_json = f'./groundtruth/labeled_{db}_with_topics_metadata.json.bz2'
norm_fn = DB_METADATA[db]['norm']
# separator used by GROUP_CONCAT in the page-assessments query and when splitting wp_templates
sep = '||'

In [9]:
# Download the WikiProject taxonomy YAML from the wikimedia/wikitax repository to the path in topics_yaml.
!wget -O {topics_yaml} https://raw.githubusercontent.com/wikimedia/wikitax/master/datasets/wikiproject_taxonomy.halfak_20191202.yaml

--2021-01-26 02:25:18--  https://raw.githubusercontent.com/wikimedia/wikitax/master/datasets/wikiproject_taxonomy.halfak_20191202.yaml
Resolving webproxy.eqiad.wmnet (webproxy.eqiad.wmnet)... 2620:0:861:1:208:80:154:32, 208.80.154.32
Connecting to webproxy.eqiad.wmnet (webproxy.eqiad.wmnet)|2620:0:861:1:208:80:154:32|:8080... connected.
Proxy request sent, awaiting response... 200 OK
Length: 41350 (40K) [text/plain]
Saving to: ‘./groundtruth/wikiproject_taxonomy.yaml’


2021-01-26 02:25:19 (35.8 MB/s) - ‘./groundtruth/wikiproject_taxonomy.yaml’ saved [41350/41350]



In [10]:
# get mapping of pageID to list of all associated WikiProjects via page_assessments table in MariaDB
# The TSV acts as a cache: the query only runs when the file does not exist yet.
if not os.path.exists(page_assessments_tsv):
    print("Gathering page assessments data and writing to:", page_assessments_tsv)
    start_time = time.time()
    query = """
    SELECT pa.pa_page_id AS article_pid,
           GROUP_CONCAT(DISTINCT pap.pap_project_title SEPARATOR '{0}') AS wp_templates,
           MAX(p.page_latest) AS article_revid,
           MAX(p.page_title) AS title,
           MAX(ptalk.page_id) AS talk_pid,
           MAX(ptalk.page_latest) AS talk_revid
      FROM page_assessments pa
     INNER JOIN page_assessments_projects pap
           ON (pa.pa_project_id = pap.pap_project_id)
     INNER JOIN page p
           ON (pa.pa_page_id = p.page_id AND p.page_namespace = 0 and p.page_is_redirect = 0)
     INNER JOIN page ptalk
           ON (p.page_title = ptalk.page_title AND ptalk.page_namespace = 1)
     GROUP BY pa.pa_page_id
    """.format(sep)
    ret = exec_mariadb_stat2(query=query, db=db, filename=page_assessments_tsv, verbose=True)
    if ret != 0:
        # The shell redirect creates the file even when mysql fails; remove it so
        # the existence check above does not treat a failed run as a valid cache.
        if os.path.exists(page_assessments_tsv):
            os.remove(page_assessments_tsv)
        raise RuntimeError("Page assessments query failed with status {0}".format(ret))
    print("Page assessments complete after {0:.1f} minutes!".format((time.time() - start_time) / 60))

Gathering page assessments data and writing to: ./groundtruth/page_assessments.tsv
mysql --defaults-extra-file=/etc/mysql/conf.d/analytics-research-client.cnf -h s1-analytics-replica.eqiad.wmnet -P 3311 -A --database enwiki -e " SELECT pa.pa_page_id AS article_pid, GROUP_CONCAT(DISTINCT pap.pap_project_title SEPARATOR '||') AS wp_templates, MAX(p.page_latest) AS article_revid, MAX(p.page_title) AS title, MAX(ptalk.page_id) AS talk_pid, MAX(ptalk.page_latest) AS talk_revid FROM page_assessments pa INNER JOIN page_assessments_projects pap ON (pa.pa_project_id = pap.pap_project_id) INNER JOIN page p ON (pa.pa_page_id = p.page_id AND p.page_namespace = 0 and p.page_is_redirect = 0) INNER JOIN page ptalk ON (p.page_title = ptalk.page_title AND ptalk.page_namespace = 1) GROUP BY pa.pa_page_id " > ./groundtruth/page_assessments.tsv
Page assessments complete after 3.6 minutes!


In [11]:
# Parse the page-assessments TSV into {article page ID: metadata dict}.
pids_to_metadata = {}
with open(page_assessments_tsv, 'r') as fin:
    # The mysql client writes plain tab-separated text without CSV quoting, so
    # a '"' in a title or project name is literal data. With the csv default
    # (QUOTE_MINIMAL) a field starting with '"' would be unquoted or merged
    # with following rows; QUOTE_NONE disables that processing.
    tsvreader = csv.reader(fin, delimiter='\t', quoting=csv.QUOTE_NONE)
    assert next(tsvreader) == ['article_pid', 'wp_templates', 'article_revid', 'title', 'talk_pid', 'talk_revid']
    for line in tsvreader:
        pid = int(line[0])
        # GROUP_CONCAT output: WikiProject titles joined by sep
        wp_templates = line[1].split(sep)
        rid = int(line[2])
        title = line[3]
        tpid = int(line[4])
        trid = int(line[5])
        pids_to_metadata[pid] = {'wp_templates':wp_templates,
                                 'article_revid':rid,
                                 'title':title,
                                 'talk_pid':tpid,
                                 'talk_revid':trid}
print("{0} pages with WikiProject assessments in {1}.".format(len(pids_to_metadata), db))

5970598 pages with WikiProject assessments in enwiki.


In [13]:
# get data for QIDs / sitelinks
# The TSV acts as a cache: the query only runs when the file does not exist yet.
if not os.path.exists(pid_to_qid_tsv):
    print("Gathering PID / QID mapping and writing to:", pid_to_qid_tsv)
    start_time = time.time()
    query = f"""
    WITH wikipedia_projects AS (
        SELECT DISTINCT
          dbname
        FROM wmf_raw.mediawiki_project_namespace_map
        WHERE
          snapshot = '{dbname_snapshot}'
          AND hostname LIKE '%wikipedia%'
    )
    SELECT
      item_id,
      page_id,
      wiki_db
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN wikipedia_projects wp
      ON (wd.wiki_db = wp.dbname)
    WHERE
      snapshot = '{pid_to_qid_snapshot}'
      AND page_namespace = 0
    """
    ret = exec_hive_stat2(query, filename=pid_to_qid_tsv, priority=False, verbose=True, nice=True, large=False)
    if ret != 0:
        # The shell redirect creates the file even when hive fails; remove it so
        # the existence check above does not treat a failed run as a valid cache.
        if os.path.exists(pid_to_qid_tsv):
            os.remove(pid_to_qid_tsv)
        raise RuntimeError("PID / QID mapping query failed with status {0}".format(ret))
    print("PID / QID mapping complete after {0:.1f} minutes!".format((time.time() - start_time) / 60))

Gathering PID / QID mapping and writing to: ./groundtruth/pid_to_qid.tsv
hive -e " SET mapreduce.job.queuename=nice; WITH wikipedia_projects AS ( SELECT DISTINCT dbname FROM wmf_raw.mediawiki_project_namespace_map WHERE snapshot = '2020-12' AND hostname LIKE '%wikipedia%' ) SELECT item_id, page_id, wiki_db FROM wmf.wikidata_item_page_link wd INNER JOIN wikipedia_projects wp ON (wd.wiki_db = wp.dbname) WHERE snapshot = '2021-01-04' AND page_namespace = 0 " > ./groundtruth/pid_to_qid.tsv
PID / QID mapping complete after 5.7 minutes!


In [14]:
# Parse the PID / QID TSV into two lookups:
#   qid_to_pids: QID -> {wiki_db: page ID} across all wikis in the file
#   pid_to_qid:  page ID -> QID, restricted to the wiki selected by db
qid_to_pids = {}
pid_to_qid = {}
with open(pid_to_qid_tsv, 'r') as fin:
    tsvreader = csv.reader(fin, delimiter='\t')
    header = next(tsvreader)
    assert header == ['item_id', 'page_id', 'wiki_db']
    for line in tsvreader:
        qid, pid, wiki_db = line[0], int(line[1]), line[2]
        qid_to_pids.setdefault(qid, {})[wiki_db] = pid
        if wiki_db == db:
            pid_to_qid[pid] = qid
print("{0} pages in {1} with Wikidata IDs".format(len(pid_to_qid), db))

6493524 pages in enwiki with Wikidata IDs


In [15]:
# Attach the Wikidata ID and per-wiki page IDs to every assessed page that has a QID.
found = 0
for pid, page_metadata in pids_to_metadata.items():
    if pid not in pid_to_qid:
        continue
    qid = pid_to_qid[pid]
    # key order ('sitelinks' before 'qid') is the order later written to JSON
    page_metadata['sitelinks'] = qid_to_pids[qid]
    page_metadata['qid'] = qid
    found += 1
print("{0} sitelink sets found out of {1}".format(found, len(pids_to_metadata)))

5930971 sitelink sets found out of 5970598


In [16]:
# Load the topic taxonomy and invert it to normalized WikiProject name -> topic labels.
with open(topics_yaml, 'r') as fin:
    taxonomy = yaml.safe_load(fin)

wikiproject_to_topic = generate_wp_to_labels(taxonomy)
# distinct topic labels across all WikiProjects
topics = set()
for wp_labels in wikiproject_to_topic.values():
    topics.update(wp_labels)

if db != 'enwiki':
    # Other wikis: map local WikiProject names to their enwiki titles via Wikidata sitelinks.
    if not os.path.exists(wikiprojects_sitelinks_json):
        get_sitelinks_wikiprojects(wikiproject_tsv, wikiprojects_sitelinks_json)
    db_to_enwiki = {}
    with open(wikiprojects_sitelinks_json, 'r') as fin:
        for line in fin:
            links = json.loads(line)['sitelinks']
            if 'enwiki' in links and db in links:
                db_to_enwiki[norm_fn(links[db])] = links['enwiki']
print("{0} WikiProjects and {1} topics".format(len(wikiproject_to_topic), len(topics)))

1139 WikiProjects and 64 topics


In [17]:
# dump articles to bzipped JSON with metadata and associated topics
topic_counts = {}  # WikiProject name part -> number of topic labels it contributed (filled by get_topics)
topic_dist = {}    # number of topics on an article -> number of articles
with bz2.open(output_json, 'wt') as fout:
    for pid, page_metadata in pids_to_metadata.items():
        wp_templates = page_metadata['wp_templates']
        if db != 'enwiki':
            # translate local WikiProject names to enwiki titles, dropping those with no mapping
            normed_names = (norm_fn(t) for t in wp_templates)
            wp_templates = [db_to_enwiki[name] for name in normed_names if name in db_to_enwiki]
        topics = get_topics(wp_templates, wikiproject_to_topic, topic_counts)
        n_topics = len(topics)
        topic_dist[n_topics] = topic_dist.get(n_topics, 0) + 1
        page_metadata['topics'] = topics
        fout.write(json.dumps(page_metadata) + "\n")

In [18]:
# Report WikiProject name parts by whether they contributed any topic label.
# For enwiki the interesting list is names that matched nothing; for other
# wikis it is names that did match. Names containing 'task' or 'force' are skipped.
# topic_counts is left untouched (it used to be rebound to a list here, which
# made the cell fail when run a second time).
report_matched = db != 'enwiki'
wikiprojects_reported = sorted(
    name for name, count in topic_counts.items()
    if (count > 0) == report_matched
    and 'task' not in name.lower() and 'force' not in name.lower()
)
if report_matched:
    print("WikiProjects w/ topics:", wikiprojects_reported)
else:
    print("WikiProjects w/o topics:", wikiprojects_reported)

print("Topic distribution:", topic_dist)

WikiProjects w/o topics: ['1000 Women in Religion', 'A Cappella', 'A Song of Ice and Fire', 'Abandoned Articles', 'Abkhazia', 'Abortion', 'Accessibility', 'Addictions and recovery', 'Adele', 'Adelie Land work group', 'Adoption, fostering, orphan care and displacement', 'Aerosmith', 'Aesthetics', 'Air sports project', 'Airsoft', 'Ajman work group', 'Alexandra Stan', 'Alien', 'Alphabet', 'Alternate Reality Gaming', 'American Football League', 'American Samoa', 'American Samoa work group', 'Americas', 'Anabaptist work group', 'Analytic philosophy', 'Anarchism', 'Ancient philosophy', 'Anglo-Saxon Kingdoms', 'Anguilla work group', 'Animation', 'Antarctic Chilean Territory work group', 'Aramea', 'Archives of American Art', 'Arena Football League', 'Argentine Antarctica work group', 'Ariana Grande', 'Artemis Fowl', 'Article Rescue Squadron', 'Articles for creation', 'Articles for improvement', 'Arts', 'Artsakh', 'Artsakh Republic', 'Aruba work group', 'Australian Antarctic Territory work grou

In [23]:
!bzless {output_json} | head

------> ./groundtruth/labeled_enwiki_with_topics_metadata.json.bz2 <------
{"wp_templates": ["Alternative Views", "Anarchism", "Libertarianism", "Philosophy", "Philosophy/Anarchism", "Philosophy/Contemporary philosophy", "Philosophy/Modern philosophy", "Philosophy/Social and political philosophy", "Politics", "Politics/Liberalism task force", "Socialism", "Sociology", "Wikipedia 1.0", "Wikipedia 1.0/Version 0.5", "Wikipedia 1.0/Vital articles"], "article_revid": 1001500220, "title": "Anarchism", "talk_pid": 416541, "talk_revid": 1001050151, "sitelinks": {"huwiki": 23360, "xmfwiki": 25422, "vepwiki": 26233, "jvwiki": 1826, "ltwiki": 9660, "bewiki": 1307, "mwlwiki": 804, "jamwiki": 111, "jbowiki": 5591, "bawiki": 66139, "azwiki": 116205, "mywiki": 24051, "alswiki": 36011, "hrwiki": 12913, "hywiki": 83723, "pmswiki": 87742, "mswiki": 8448, "kawiki": 30705, "bswiki": 4522, "fawiki": 61138, "mgwiki": 190863, "srwiki": 1911, "vecwiki": 28405, "pswiki": 9824, "zawiki": 9976, "yiwiki": 20972, 

^C


In [27]:
# Total number of sitelinks (articles across all language editions) belonging to
# pages that received at least one topic; pages without 'sitelinks' contribute 0.
num_articles_with_topics = sum(
    len(page_metadata.get('sitelinks', []))
    for page_metadata in pids_to_metadata.values()
    if page_metadata.get('topics')
)

In [28]:
# Display the sitelink total for pages that have at least one topic.
num_articles_with_topics

30581076