# Set up session

In [39]:
import bz2
from collections import defaultdict
import csv
import json
import os
import re
import sys
import time
import yaml

import requests

In [40]:
# Raise the csv module's per-field size limit as far as the platform allows.
# field_size_limit() returns the limit that was in force before the call; it is
# kept as the cell's last expression so the notebook still displays it.
try:
    previous_field_size_limit = csv.field_size_limit(sys.maxsize)
except OverflowError:
    # sys.maxsize does not fit in a C long on platforms where long is 32-bit;
    # fall back to the largest value that always fits.
    previous_field_size_limit = csv.field_size_limit(2**31 - 1)
previous_field_size_limit

131072

## Functions

In [2]:
def exec_mariadb_stat2(query, db, filename=None, verbose=True):
    """Run a SQL query through the ``mysql`` command-line client.

    The replica host and port are derived from the ``node`` number stored for
    ``db`` in the module-level ``DB_METADATA`` mapping.

    Args:
        query: SQL text; it is placed inside double quotes on the command line.
        db: Database name; must be a key of ``DB_METADATA``.
        filename: If truthy, the client's stdout is redirected (``>``) to this path.
        verbose: If true, print the command with whitespace runs collapsed.

    Returns:
        The value returned by ``os.system`` for the command.

    Raises:
        NotImplementedError: If ``db`` is not a key of ``DB_METADATA``.
    """
    if db not in DB_METADATA:
        raise NotImplementedError("Don't know mapping of db {0} to mysql node.".format(db))
    node = DB_METADATA[db]['node']

    template = ('mysql --defaults-extra-file=/etc/mysql/conf.d/analytics-research-client.cnf '
                '-h s{0}-analytics-replica.eqiad.wmnet -P 331{0} -A --database {1} -e "{2}"')
    shell_cmd = template.format(node, db, query)
    if filename:
        shell_cmd = shell_cmd + " > " + filename

    if verbose:
        # Collapse newlines/indentation so a multi-line query prints on one line.
        print(' '.join(shell_cmd.split()))
    return os.system(shell_cmd)

def exec_hive_stat2(query, filename=None, priority=False, verbose=True, nice=False, large=False):
    """Run a query through the ``hive -e`` command-line client.

    At most one block of ``SET`` statements is prepended to the query, chosen
    by the first truthy flag in the order ``priority``, ``large``, ``nice``.

    Args:
        query: HiveQL text; it is placed inside double quotes on the command line.
        filename: If truthy, the client's stdout is redirected (``>``) to this path.
        priority: Prepend the ``priority`` queue setting.
        verbose: If true, print the command with whitespace runs collapsed.
        nice: Prepend the ``nice`` queue setting.
        large: Prepend the ``nice`` queue setting plus a 4096 MB map-memory setting.

    Returns:
        The value returned by ``os.system`` for the command.
    """
    if priority:
        settings = "SET mapreduce.job.queuename=priority;"
    elif large:
        settings = "SET mapreduce.job.queuename=nice; SET mapreduce.map.memory.mb=4096;"
    elif nice:
        settings = "SET mapreduce.job.queuename=nice;"
    else:
        settings = ""

    hive_call = 'hive -e " ' + (settings + query) + ' "'
    if filename:
        hive_call = hive_call + " > " + filename

    if verbose:
        # Collapse newlines/indentation so a multi-line query prints on one line.
        print(' '.join(hive_call.split()))
    return os.system(hive_call)

In [3]:
def norm_wp_name_ar(wp):
    """Normalize an Arabic WikiProject name.

    Lowercases ``wp``, removes every occurrence of the local namespace followed
    by a colon and of the local WikiProject phrase, trims the ends, and
    collapses each run of two or more whitespace characters into one space.

    Args:
        wp: WikiProject name as a string.

    Returns:
        The normalized name.
    """
    ns_local = 'ويكيبيديا'
    stripped = wp.lower().replace(ns_local + ":", "").replace('مشروع ويكي', '').strip()
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", stripped)

def norm_wp_name_en(wp):
    """Normalize an English WikiProject name.

    Lowercases ``wp``, removes every occurrence of ``wikipedia:`` and of
    ``wikiproject``, trims the ends, and collapses each run of two or more
    whitespace characters into one space.

    Args:
        wp: WikiProject name as a string.

    Returns:
        The normalized name.
    """
    ns_local = 'wikipedia'
    wp_prefix = 'wikiproject'
    stripped = wp.lower().replace(ns_local + ":", "").replace(wp_prefix, "").strip()
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", stripped)

def norm_wp_name_hu(wp):
    """Normalize a Hungarian WikiProject name.

    A few exact names are first mapped to hard-coded replacements (the match is
    done before lowercasing, so it is case-sensitive). The result is then
    lowercased, each fragment in ``to_strip`` is replaced by a space, the ends
    are trimmed, and each run of two or more whitespace characters is collapsed
    into one space.

    Args:
        wp: WikiProject name as a string.

    Returns:
        The normalized name.
    """
    ns_local = 'wikipédia'
    # Applied in order: longer fragments come before the shorter ones they
    # contain ('műhelyek' before 'műhely', '-es ' before '-', 'országgal'
    # before 'ország').
    to_strip = [ns_local + ":", 'témájú', 'kapcsolatos', 'műhelyek', 'műhely', '-es ', '-', 'országgal', 'ország']
    hardcoded_matches = {'Wikipédia:Harry Potter-műhely':'Harry Potterrel kapcsolatos',
                         'Wikipédia:USA-műhely':'USA-val kapcsolatos',
                         'Wikipédia:Anime- és mangaműhely':'anime-manga témájú',
                         'Wikipédia:Első világháború műhely':'első világháborús témájú'}
    wp = hardcoded_matches.get(wp, wp).lower()
    for fragment in to_strip:
        wp = wp.replace(fragment, ' ')
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", wp.strip())

def norm_wp_name_fr(wp):
    """Normalize a French WikiProject name.

    Lowercases ``wp``, removes every occurrence of ``projet:``, trims the ends,
    and collapses each run of two or more whitespace characters into one space.

    Args:
        wp: WikiProject name as a string.

    Returns:
        The normalized name.
    """
    ns_local = 'projet'
    stripped = wp.lower().replace(ns_local + ':', "").strip()
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", stripped)

def norm_wp_name_tr(wp):
    """Normalize a Turkish WikiProject name.

    Lowercases ``wp``, removes every occurrence of ``vikipedi``, then of
    ``vikiproje``, then of ``:``, trims the ends, and collapses each run of two
    or more whitespace characters into one space.

    Args:
        wp: WikiProject name as a string.

    Returns:
        The normalized name.
    """
    ns_local = 'vikiproje'
    wp_prefix = 'vikipedi'
    stripped = wp.lower().replace(wp_prefix, "").replace(ns_local, '').replace(':', '').strip()
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", stripped)

# Build Groundtruth Dataset

In [15]:
# parameters

# Per-wiki settings: MariaDB replica node number and the WikiProject-name normalizer.
DB_METADATA = dict(
    enwiki=dict(node=1, norm=norm_wp_name_en),  # 21M pages
    frwiki=dict(node=6, norm=norm_wp_name_fr),  #  2.7M pages
    arwiki=dict(node=7, norm=norm_wp_name_ar),  #  2.8M pages
    huwiki=dict(node=7, norm=norm_wp_name_hu),  #    330K pages
    trwiki=dict(node=2, norm=norm_wp_name_tr),  #    280K pages
)

# Canonical importance label -> raw labels that map onto it.
_IMPORTANCE_ALIASES = (
    ('Top', ('top', 'Top')),
    ('High', ('High', 'high')),
    ('Mid', ('mid', 'Mid')),
    ('Low', ('Related', 'Bottom', 'low', 'Low')),
)
# Raw importance label -> canonical importance label.
STANDARDIZE = {alias: label for label, aliases in _IMPORTANCE_ALIASES for alias in aliases}

# Value stored for rows whose importance label is not in STANDARDIZE.
IMP_PLACEHOLDER = None
# Sort rank per canonical importance (lower sorts first); the placeholder ranks last.
IMP_RANKING = {label: rank
               for rank, label in enumerate(('Top', 'High', 'Mid', 'Low', IMP_PLACEHOLDER), start=1)}

# Wiki to process and the snapshots used in the Hive query.
db = 'enwiki'
dbname_snapshot = '2020-12'
pid_to_qid_snapshot = '2021-01-04'

# Input / cache / output files.
page_assessments_tsv = './wikiproject_data.tsv'
pid_to_qid_tsv = './groundtruth/pid_to_qid.tsv'
output_tsv = './list_building_en_groundtruth.tsv.bz2'

norm_fn = DB_METADATA[db]['norm']

In [8]:
# get mapping of pageID to list of all associated WikiProjects via page_assessments table in MariaDB
# The result is cached on disk: the query only runs when the TSV does not exist yet.
if not os.path.exists(page_assessments_tsv):
    print("Gathering page assessments data and writing to:", page_assessments_tsv)
    start_time = time.time()
    # Rows are ordered by project title; the cell that writes the final dataset
    # groups consecutive rows that share the same wp_template.
    query = """
    SELECT pa.pa_page_id AS article_pid,
           p.page_title AS title,
           pap.pap_project_title AS wp_template,
           pa.pa_importance AS importance
      FROM page_assessments pa
     INNER JOIN page_assessments_projects pap
           ON (pa.pa_project_id = pap.pap_project_id)
     INNER JOIN page p
           ON (pa.pa_page_id = p.page_id AND p.page_namespace = 0 and p.page_is_redirect = 0)
     ORDER BY pap.pap_project_title ASC
    """
    ret = exec_mariadb_stat2(query=query, db=db, filename=page_assessments_tsv, verbose=True)
    if ret != 0:
        # The shell redirect creates the output file even when the client fails.
        # Remove it so the os.path.exists() guard above does not mistake an
        # empty or partial file for a valid cache on the next run.
        if os.path.exists(page_assessments_tsv):
            os.remove(page_assessments_tsv)
        raise RuntimeError("Page assessments query failed with status {0}; removed {1}".format(
            ret, page_assessments_tsv))
    print("Page assessments complete after {0:.1f} minutes!".format((time.time() - start_time) / 60))

In [5]:
# get data for QIDs / sitelinks
# The result is cached on disk: the query only runs when the TSV does not exist yet.
# NOTE: the hostname filter in the query is hard-coded to 'en.wikipedia' while `db`
# is configurable; if `db` is changed, the filter (and the cached file) must change
# with it, because the loading cell keeps only rows whose wiki_db equals `db`.
if not os.path.exists(pid_to_qid_tsv):
    print("Gathering PID / QID mapping and writing to:", pid_to_qid_tsv)
    start_time = time.time()
    # The shell redirect cannot create missing parent directories.
    out_dir = os.path.dirname(pid_to_qid_tsv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    query = f"""
    WITH wikipedia_projects AS (
        SELECT DISTINCT
          dbname
        FROM wmf_raw.mediawiki_project_namespace_map
        WHERE
          snapshot = '{dbname_snapshot}'
          AND hostname = 'en.wikipedia'
    )
    SELECT
      item_id,
      page_id,
      wiki_db
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN wikipedia_projects wp
      ON (wd.wiki_db = wp.dbname)
    WHERE
      snapshot = '{pid_to_qid_snapshot}'
      AND page_namespace = 0
    """
    ret = exec_hive_stat2(query, filename=pid_to_qid_tsv, priority=False, verbose=True, nice=True, large=False)
    if ret != 0:
        # The shell redirect creates the output file even when the client fails.
        # Remove it so the os.path.exists() guard above does not mistake an
        # empty or partial file for a valid cache on the next run.
        if os.path.exists(pid_to_qid_tsv):
            os.remove(pid_to_qid_tsv)
        raise RuntimeError("PID / QID query failed with status {0}; removed {1}".format(
            ret, pid_to_qid_tsv))
    print("PID / QID mapping complete after {0:.1f} minutes!".format((time.time() - start_time) / 60))

In [12]:
# Load the page ID -> Wikidata item ID mapping for the configured wiki.
pid_to_qid = {}
with open(pid_to_qid_tsv, 'r') as fin:
    tsvreader = csv.reader(fin, delimiter='\t')
    # Consume the header outside the assert: assert statements are skipped
    # under `python -O`, which would leave the header to be parsed as data.
    header = next(tsvreader)
    assert header == ['item_id', 'page_id', 'wiki_db'], "Unexpected header: {0}".format(header)
    for line in tsvreader:
        wiki_db = line[2]
        # Keep only rows that belong to the wiki being processed.
        if wiki_db == db:
            qid = line[0]
            pid = int(line[1])
            pid_to_qid[pid] = qid
print("{0} pages in {1} with Wikidata IDs".format(len(pid_to_qid), db))

6493524 pages in enwiki with Wikidata IDs


In [13]:
# Peek at the first lines of the page-assessments TSV (header row plus a few data rows).
!head {page_assessments_tsv}

article_pid	title	wp_template	importance
29305	Sojourner_Truth	1000 Women in Religion	
391183	Frances_Harper	1000 Women in Religion	
478677	Eadburh_of_Bicester	1000 Women in Religion	
1607259	Vida_Goldstein	1000 Women in Religion	
1803216	Aldobrandesca	1000 Women in Religion	
3379384	Columba_of_Cornwall	1000 Women in Religion	
3388407	Columba_of_Sens	1000 Women in Religion	
3397671	Columba_of_Spain	1000 Women in Religion	
4566276	Áurea_of_San_Millán	1000 Women in Religion	


In [23]:
# Check that float() parses '+inf'; sort_order builds this value for rows without a QID.
float('+inf')

inf

In [34]:
def sort_order(output_row):
    """Build the sort key for one output row.

    Args:
        output_row: Sequence whose item 0 is the QID (a string with a one-character
            prefix before the number, or None), item 1 the page ID, and item 4
            the importance label (a key of ``IMP_RANKING``).

    Returns:
        A tuple ``(importance rank, numeric QID, page ID)``. Rows without a QID
        get ``inf`` as their numeric QID, so they sort after rows that have one
        within the same importance rank.
    """
    imp_rank = IMP_RANKING[output_row[4]]
    item_id = output_row[0]
    # Drop the leading letter and compare QIDs numerically.
    qid_rank = float('+inf') if item_id is None else float(item_id[1:])
    return (imp_rank, qid_rank, output_row[1])

In [43]:
# Join page assessments with QIDs and write the dataset, one WikiProject at a time.
# Within each project, rows are ordered by sort_order().
expected_header = ['article_pid', 'title', 'wp_template', 'importance']
nonstandard = {}  # raw importance label not in STANDARDIZE -> number of rows carrying it
pids_matched = 0  # rows whose page ID has a QID
written = 0       # data rows written to the output file


def _write_project_rows(writer, rows):
    """Write one WikiProject's rows in sort_order() order; return how many were written."""
    for row in sorted(rows, key=sort_order):
        writer.writerow(row)
    return len(rows)


with bz2.open(output_tsv, 'wt') as fout:
    tsvwriter = csv.writer(fout, delimiter='\t')
    tsvwriter.writerow(['article_qid'] + expected_header)
    with open(page_assessments_tsv, 'r') as fin:
        tsvreader = csv.reader(fin, delimiter='\t')
        # Consume the header outside the assert: assert statements are skipped
        # under `python -O`, which would leave the header to be parsed as data.
        header = next(tsvreader)
        assert header == expected_header, "Unexpected header: {0}".format(header)
        current_wp = None
        data = []  # rows buffered for the WikiProject currently being read
        for line in tsvreader:
            pid = int(line[0])
            title = line[1]
            wp_template = line[2]
            importance = line[3]
            if importance in STANDARDIZE:
                importance = STANDARDIZE[importance]
            else:
                nonstandard[importance] = nonstandard.get(importance, 0) + 1
                importance = IMP_PLACEHOLDER
            qid = pid_to_qid.get(pid, None)
            if qid:
                pids_matched += 1

            # A change of wp_template marks the end of the previous project's rows;
            # this relies on the input keeping each project's rows consecutive.
            if wp_template != current_wp:
                written += _write_project_rows(tsvwriter, data)
                data = []
                current_wp = wp_template

            data.append([qid, pid, title, wp_template, importance])

        # Flush the last project's rows.
        written += _write_project_rows(tsvwriter, data)

In [44]:
# Report the row counts and the unrecognised importance labels, most frequent first.
print(f"{written} written. {pids_matched} had QIDs.")
print("\nNon-standard importance classes:")
by_count_desc = sorted(nonstandard.items(), key=lambda item: item[1], reverse=True)
for raw_label, count in by_count_desc:
    print(raw_label, count)

16364421 written. 16252875 had QIDs.

Non-standard importance classes:
 4908597
Unknown 2886755
NA 113221
{{<span class="error 1
