# Language-Agnostic Quality Model
Building off a couple of papers:
* https://grouplens.org/site-content/uploads/2013/09/wikisym2013_warnckewang-cosley-riedl.pdf
* http://lewoniewski.info/files/bis2017_measure.pdf and https://www.mdpi.com/2227-9709/4/4/43/htm#B21-informatics-04-00043

## Imports / settings / etc.

In [1]:
import bz2
from datetime import datetime, timedelta
import os
import re
import sys

import mwparserfromhell
import wmfdata

In [2]:
!which python

/home/isaacj/.conda/envs/2021-03-18T15.28.24_isaacj/bin/python


In [3]:
spark = wmfdata.spark.get_session(app_name='pyspark large; quality-model-features',
                                  type='yarn-large', # local, yarn-regular, yarn-large
                                  )  

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [4]:
spark.sql("SET hive.exec.dynamic.partition.mode = nonstrict")

DataFrame[key: string, value: string]

In [5]:
print("Wikidata:")
spark.sql('show partitions wmf.wikidata_item_page_link').show(50, False)
print("\nMediawiki:")
spark.sql("show partitions wmf_raw.mediawiki_project_namespace_map").show(50, False)

Wikidata:
+-------------------+
|partition          |
+-------------------+
|snapshot=2021-03-22|
|snapshot=2021-03-29|
|snapshot=2021-04-05|
|snapshot=2021-04-12|
|snapshot=2021-04-26|
|snapshot=2021-05-03|
|snapshot=2021-05-10|
+-------------------+


Mediawiki:
+------------------------+
|partition               |
+------------------------+
|snapshot=2016-12_private|
|snapshot=2017-07_private|
|snapshot=2020-11        |
|snapshot=2020-12        |
|snapshot=2021-01        |
|snapshot=2021-02        |
|snapshot=2021-03        |
|snapshot=2021-04        |
+------------------------+



## Parameters

In [4]:
snapshot = '2021-04'  # data will be current to this date -- e.g., 2020-07 means data is up to 30 June 2020 (at least)
wd_snapshot = '2021-04-05'
year = 2021
last_month = 4
allwikis_tablename = 'isaacj.qual_features'
qual_preds_tablename = 'isaacj.qual_preds'
norm_pvs_tablename = 'isaacj.normed_pvs'

## Utils

In [7]:
sfn_templates = [t.lower() for t in ["Shortened footnote template", "sfn", "Sfnp", "Sfnm", "Sfnmp"]]
# NOTE: don't include citation templates like Cite or Harv because they are (or should be) wrapped in ref tags

def getNumRefs(wikitext):
    """Extract list of links from wikitext for an article via mwparserfromhell."""
    try:
        wt = mwparserfromhell.parse(wikitext)
        num_ref_tags = len([t.tag for t in wt.filter_tags() if t.tag == 'ref'])
        num_sftn_templates = len([t.name for t in wt.filter_templates() if t.name.lower() in sfn_templates])
        return num_ref_tags + num_sftn_templates
    except Exception:
        return None
    
spark.udf.register('getNumRefs', getNumRefs, 'Int')

<function __main__.getNumRefs(wikitext)>

In [8]:
def getNumHeadings(wikitext, max_level=None):
    """Extract list of headings from wikitext for an article."""
    try:
        wt = mwparserfromhell.parse(wikitext)
        if max_level is None:
            return len([1 for l in wt.filter_headings()])
        else:
            return len([1 for l in wt.filter_headings() if l.level <= max_level])
    except Exception:
        return None
    
spark.udf.register('getNumHeadings', getNumHeadings, 'Int')

<function __main__.getNumHeadings(wikitext, max_level=None)>

## Create Table for Data

In [9]:
do_execute = True
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {allwikis_tablename} (
        page_id          INT     COMMENT 'Article page ID',
        page_len         INT     COMMENT 'Number of bytes in article wikitext',
        num_images       INT     COMMENT 'Number of unique images in article',
        num_refs         INT     COMMENT 'Number of references in article',
        num_headings     INT     COMMENT 'Number of Level-2 and Level-3 headings in article'
    )
    PARTITIONED BY (
        wiki_db          STRING  COMMENT 'Wiki dbname -- e.g., enwiki for English Wikipedia'
    )
    """

if do_execute:
    spark.sql(create_table_query)

## Generate feature data for all wikis

In [None]:
"""
ALL WIKIS!!

The quality model requires the following attributes:
* # bytes (page length)
* # of references
* # of images
* # of headers (just levels 2 + 3)

Explanation of CTEs:
* 
"""

print_for_hive = False
do_execute = True

query = f"""
WITH wikipedia_projects AS (
    SELECT DISTINCT
      dbname
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '{snapshot}'
      AND hostname LIKE '%wikipedia%'
),
pages AS (
    SELECT
      wiki_db,
      page_id,
      COALESCE(page_len, 0) AS page_len
    FROM wmf_raw.mediawiki_page p
    INNER JOIN wikipedia_projects wp
      ON (p.wiki_db = wp.dbname)
    WHERE
      snapshot = '{snapshot}'
      AND page_namespace = 0
      AND NOT page_is_redirect
),
num_images AS (
    SELECT
      i.wiki_db,
      il_from AS page_id,
      COUNT(DISTINCT(il_to)) AS num_images
    FROM wmf_raw.mediawiki_imagelinks i
    INNER JOIN pages p
      ON (i.il_from = p.page_id
          AND i.wiki_db = p.wiki_db)
    WHERE
      snapshot = '{snapshot}'
      AND il_from_namespace = 0
    GROUP BY
      i.wiki_db,
      il_from
),
wikitext_stats AS (
    SELECT
      wt.wiki_db,
      wt.page_id,
      getNumRefs(revision_text) AS num_refs,
      getNumHeadings(revision_text, 3) AS num_headings
    FROM wmf.mediawiki_wikitext_current wt
    INNER JOIN pages p
      ON (wt.page_id = p.page_id
          AND wt.wiki_db = p.wiki_db)
    WHERE
      snapshot = '{snapshot}'
      AND page_namespace = 0
)
INSERT OVERWRITE TABLE {allwikis_tablename}
SELECT
  p.page_id,
  page_len,
  COALESCE(num_images, 0) AS num_images,
  COALESCE(num_refs, 0) AS num_refs,
  COALESCE(num_headings, 0) AS num_headings,
  p.wiki_db AS wiki_db
FROM pages p
LEFT JOIN num_images i
  ON (p.wiki_db = i.wiki_db
      AND p.page_id = i.page_id)
LEFT JOIN wikitext_stats wt
  ON (p.wiki_db = wt.wiki_db
      AND p.page_id = wt.page_id)
"""

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    result = spark.sql(query)


WITH wikipedia_projects AS (
    SELECT DISTINCT
      dbname
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '2021-04'
      AND hostname LIKE '%wikipedia%'
      AND dbname <> 'enwiki'
),
pages AS (
    SELECT
      wiki_db,
      page_id,
      COALESCE(page_len, 0) AS page_len
    FROM wmf_raw.mediawiki_page p
    INNER JOIN wikipedia_projects wp
      ON (p.wiki_db = wp.dbname)
    WHERE
      snapshot = '2021-04'
      AND page_namespace = 0
      AND NOT page_is_redirect
),
num_images AS (
    SELECT
      i.wiki_db,
      il_from AS page_id,
      COUNT(DISTINCT(il_to)) AS num_images
    FROM wmf_raw.mediawiki_imagelinks i
    INNER JOIN pages p
      ON (i.il_from = p.page_id
          AND i.wiki_db = p.wiki_db)
    WHERE
      snapshot = '2021-04'
      AND il_from_namespace = 0
    GROUP BY
      i.wiki_db,
      il_from
),
wikitext_stats AS (
    SELECT
      wt.wiki_db,
      wt.page_id,
      getNumRefs(revision_text) AS num_refs,
      getNum

## Misalignment for All Wikis

### Article Quality Scores

In [17]:
def predictQuality(loglength, num_images, num_headings, num_refs):
    """Predict quality of article."""
        
    coef_len = 0.258
    coef_img = 0.015
    coef_hea = 0.241
    coef_ref = 0.486
    try:
        pred = (coef_len * loglength) + (coef_img * num_images) + (coef_hea * num_headings) + (coef_ref * num_refs)
        return pred
    except Exception:
        return None
        
spark.udf.register('predictQuality', predictQuality, 'Float')

<function __main__.predictQuality(loglength, num_images, num_headings, num_refs)>

In [21]:
do_execute = True
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {qual_preds_tablename} (
        page_id          INT     COMMENT 'Article page ID',
        item_id          STRING  COMMENT 'Associated Wikidata QID',
        pred_qual        FLOAT   COMMENT 'Predicted quality score [0-1]; 0 = no content; 1 = highest quality'
    )
    PARTITIONED BY (
        wiki_db          STRING  COMMENT 'Wiki dbname -- e.g., enwiki for English Wikipedia'
    )
    """

if do_execute:
    spark.sql(create_table_query)

In [None]:
print_for_hive = False
do_execute = True
qual_pctile = 0.95
MIN_IMAGES = 5
MIN_HEADINGS = 5
MIN_REFS = 10

query = f"""
WITH wikipedia_projects AS (
    SELECT DISTINCT
      dbname,
      SUBSTR(hostname, 1, LENGTH(hostname)-4) AS project
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '{snapshot}'
      AND hostname LIKE '%wikipedia%'
),
all_pages AS (
    SELECT
      wiki_db,
      page_id
    FROM wmf_raw.mediawiki_page p
    INNER JOIN wikipedia_projects wp
      ON (p.wiki_db = wp.dbname)
    WHERE
      snapshot = '{snapshot}'
      AND page_namespace = 0
      AND NOT page_is_redirect
),
wikidata_ids AS (
    SELECT
      wiki_db,
      page_id,
      item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN wikipedia_projects wp
      ON (wd.wiki_db = wp.dbname)
    WHERE
      snapshot = '{wd_snapshot}'
      AND page_namespace = 0
),
max_vals AS (
    SELECT
      wiki_db,
      PERCENTILE_APPROX(LOG10(1 + page_len), {qual_pctile}) AS max_length,
      LEAST({MIN_IMAGES}, PERCENTILE(num_images, {qual_pctile})) AS max_images,
      LEAST({MIN_HEADINGS}, PERCENTILE(num_headings, {qual_pctile})) AS max_headings,
      LEAST({MIN_REFS}, PERCENTILE(num_refs, {qual_pctile})) AS max_refs
    FROM {allwikis_tablename} qf
    INNER JOIN wikipedia_projects wp
      ON (qf.wiki_db = wp.dbname)
    WHERE
      page_len IS NOT NULL
      AND num_images IS NOT NULL
      AND num_headings IS NOT NULL
      AND num_refs IS NOT NULL
    GROUP BY
      wiki_db
),
qual_features_trimmed AS (
    SELECT
      page_id,
      qf.wiki_db,
      COALESCE(LEAST(LOG10(1 + page_len), max_length), 0) / max_length AS len_x,
      COALESCE(LEAST(num_images, max_images), 0) / max_images AS images_x,
      COALESCE(LEAST(num_headings, max_headings), 0) / max_headings AS headings_x,
      COALESCE(LEAST(num_refs, max_refs), 0) / max_refs AS refs_x
    FROM {allwikis_tablename} qf
    INNER JOIN max_vals mv
      ON (qf.wiki_db = mv.wiki_db)
),
qual_predictions AS (
    SELECT
      wiki_db,
      page_id,
      COALESCE(predictQuality(len_x, images_x, headings_x, refs_x), 0) AS pred_qual
    FROM qual_features_trimmed
)
INSERT OVERWRITE TABLE {qual_preds_tablename}
SELECT
  ap.page_id,
  wd.item_id,
  COALESCE(pred_qual, 0),
  ap.wiki_db
FROM all_pages ap
LEFT JOIN qual_predictions qp
  ON (ap.wiki_db = qp.wiki_db
      AND ap.page_id = qp.page_id)
LEFT JOIN wikidata_ids wd
  ON (ap.wiki_db = wd.wiki_db
      AND ap.page_id = wd.page_id)
"""

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    result = spark.sql(query)
    #result.write.csv(path="/user/isaacj/quality-preds-allwikis", compression='gzip', header=True, sep="\t")


WITH wikipedia_projects AS (
    SELECT DISTINCT
      dbname,
      SUBSTR(hostname, 1, LENGTH(hostname)-4) AS project
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '2021-04'
      AND hostname LIKE '%wikipedia%'
),
all_pages AS (
    SELECT
      wiki_db,
      page_id
    FROM wmf_raw.mediawiki_page p
    INNER JOIN wikipedia_projects wp
      ON (p.wiki_db = wp.dbname)
    WHERE
      snapshot = '2021-04'
      AND page_namespace = 0
      AND NOT page_is_redirect
),
wikidata_ids AS (
    SELECT
      wiki_db,
      page_id,
      item_id
    FROM wmf.wikidata_item_page_link wd
    INNER JOIN wikipedia_projects wp
      ON (wd.wiki_db = wp.dbname)
    WHERE
      snapshot = '2021-04-05'
      AND page_namespace = 0
),
max_vals AS (
    SELECT
      wiki_db,
      PERCENTILE_APPROX(LOG10(1 + page_len), 0.95) AS max_length,
      LEAST(5, PERCENTILE(num_images, 0.95)) AS max_images,
      LEAST(5, PERCENTILE(num_headings, 0.95)) AS max_headings,
      L