## Setup
* For tracking progress: https://yarn.wikimedia.org/cluster/scheduler
* Style guide: https://gist.github.com/fredbenenson/7bb92718e19138c20591

In [1]:
spark

In [2]:
# Spark settings
spark.sql("SET spark.sql.shuffle.partitions = 512")
spark.sql("SET hive.exec.dynamic.partition.mode = nonstrict")  # necessary for dynamic wiki_db partition

DataFrame[key: string, value: string]

In [3]:
# Python packages
import re

In [4]:
# Important parameters
snapshot = '2020-11'
wiki_db = 'enwiki'
wikidata_snapshot = '2020-11-02'
se_captions_subset_tablename = 'isaacj.suggested_edits_commons'
se_wikidata_subset_tablename = 'isaacj.suggested_edits_wikidata'
start_date = '2020-05-18'

## Commons captions / tags

### Generate Image Edit Data

In [6]:
create_table_query = """
    CREATE TABLE IF NOT EXISTS {0} (
        page_id                         BIGINT        COMMENT 'Commons page ID',
        page_title                      STRING        COMMENT 'Page title (ignoring redirects)',
        user_id                         BIGINT        COMMENT 'User ID; -1 if anonymous',
        user_text                       STRING        COMMENT 'User text; IP address if anonymous',
        revision_id                     BIGINT        COMMENT 'Revision ID',
        revision_timestamp              TIMESTAMP     COMMENT 'Revision timestamp (UTC)',
        revision_comment                STRING        COMMENT 'User-generated edit summary',
        revision_is_identity_reverted   BOOLEAN       COMMENT 'Was this revision reverted via identity revert?',
        revision_tags                   ARRAY<STRING> COMMENT 'See Special:Tags',
        suggested_edit                  INT           COMMENT '1 if suggestededit',
        image_tag                       INT           COMMENT '1 if image tag',
        image_caption                   INT           COMMENT '1 if image caption',
        se_add                          INT           COMMENT '1 if add task',
        se_translate                    INT           COMMENT '1 if translate task',
        articles_in_use                 ARRAY<BIGINT> COMMENT 'List of enwiki page IDs using the file',
        items_in_use                    ARRAY<STRING> COMMENT 'List of corresponding QIDs for pages',
        bio_genders                     ARRAY<STRING> COMMENT 'List of gender values associated with enwiki pages',
        regions                         ARRAY<STRING> COMMENT 'List of regions associated with enwiki pages'
    )
""".format(se_captions_subset_tablename)

print(create_table_query)
spark.sql(create_table_query)


    CREATE TABLE IF NOT EXISTS isaacj.suggested_edits_commons (
        page_id                         BIGINT        COMMENT 'Commons page ID',
        page_title                      STRING        COMMENT 'Page title (ignoring redirects)',
        user_id                         BIGINT        COMMENT 'User ID; -1 if anonymous',
        user_text                       STRING        COMMENT 'User text; IP address if anonymous',
        revision_id                     BIGINT        COMMENT 'Revision ID',
        revision_timestamp              TIMESTAMP     COMMENT 'Revision timestamp (UTC)',
        revision_comment                STRING        COMMENT 'User-generated edit summary',
        revision_is_identity_reverted   BOOLEAN       COMMENT 'Was this revision reverted via identity revert?',
        revision_tags                   ARRAY<STRING> COMMENT 'See Special:Tags',
        suggested_edit                  INT           COMMENT '1 if suggestededit',
        image_tag           

DataFrame[]

In [8]:
print_for_hive = False
do_execute = True

# Relevant criteria:
# Only users -- no anon edits / bot edits (shouldn't exist but...)
# All android app edits -- some will be suggested edits and others won't (but are useful controls)
# Commonswiki:
# - tag/caption edits happen on File namespace (6)
# - tag: `add-depict` automatically added to edit comment (August 2020 on) and wbsetclaim automatically added before August
# - caption: `wbsetlabel-` automatically added to edit comment
# - caption: `#suggestededit-add` for add task; `#suggestededit-translate` for translate task

query = """
WITH se_edits AS (
  SELECT
    page_id,
    page_title,
    event_user_id AS user_id,
    REPLACE(event_user_text, ' ', '_') AS user_text,
    revision_id,
    CAST(event_timestamp AS TIMESTAMP) as revision_timestamp,
    event_comment AS revision_comment,
    revision_is_identity_reverted,
    revision_tags,
    IF(ARRAY_CONTAINS(revision_tags, 'apps-suggested-edits'), 1, 0) AS suggested_edit,
    IF(event_comment LIKE '%add-depict%' OR event_comment LIKE '%wbsetclaim-%', 1, 0) AS image_tag,
    IF(event_comment LIKE '%wbsetlabel-%', 1, 0) AS image_caption,
    IF(event_comment LIKE '%#suggestededit-add%', 1, 0) AS se_add,
    IF(event_comment LIKE '%#suggestededit-translate%', 1, 0) AS se_translate
  FROM wmf.mediawiki_history
  WHERE
    snapshot = '{0}'
    AND wiki_db = 'commonswiki'
    AND page_namespace = 6
    AND NOT event_user_is_anonymous
    AND NOT SIZE(event_user_is_bot_by) > 0
    AND event_type = 'create'
    AND event_entity = 'revision'
    AND CAST(event_timestamp AS DATE) > '{1}'
    AND ARRAY_CONTAINS(revision_tags, 'android app edit')
),
relevant_wikidata_items AS (
  SELECT
    item_id,
    page_id
  FROM wmf.wikidata_item_page_link
  WHERE
    snapshot = '{2}'
    AND wiki_db = '{3}'
    AND page_namespace = 0
),
relevant_wikidata_gender AS (
  SELECT
    g.item_id AS item_id,
    w.page_id AS page_id,
    g.value AS gender_value
  FROM isaacj.gender_wikidata g
  INNER JOIN relevant_wikidata_items w
    ON (g.item_id = w.item_id)
),
relevant_wikidata_regions AS (
  SELECT
    q.qid AS item_id,
    w.page_id AS page_id,
    q.region AS region
  FROM isaacj.qid_to_country q
  INNER JOIN relevant_wikidata_items w
    ON (q.qid = w.item_id)
),
relevant_imagelinks AS (
  SELECT
    il_to AS commons_image_title,
    COLLECT_LIST(il_from) AS wiki_page_ids,
    COLLECT_LIST(g.item_id) AS item_ids,
    COLLECT_LIST(g.gender_value) AS genders,
    COLLECT_LIST(r.region) AS regions
  FROM wmf_raw.mediawiki_imagelinks il
  LEFT JOIN relevant_wikidata_gender g
    ON (il.il_from = g.page_id)
  LEFT JOIN relevant_wikidata_regions r
    ON (il.il_from = r.page_id)
  WHERE
    snapshot = '{0}'
    AND wiki_db = '{3}'
    AND il_from_namespace = 0
  GROUP BY
    il_to
)
INSERT OVERWRITE TABLE {4}
  SELECT
    page_id,
    page_title,
    user_id,
    user_text,
    revision_id,
    revision_timestamp,
    revision_comment,
    revision_is_identity_reverted,
    revision_tags,
    suggested_edit,
    image_tag,
    image_caption,
    se_add,
    se_translate,
    wiki_page_ids AS articles_in_use,
    item_ids AS items_in_use,
    genders AS bio_genders,
    regions AS regions
  FROM se_edits se
  LEFT JOIN relevant_imagelinks il
    ON (se.page_title = il.commons_image_title)
""".format(snapshot, start_date, wikidata_snapshot, wiki_db, se_captions_subset_tablename)

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    result = spark.sql(query)


WITH se_edits AS (
  SELECT
    page_id,
    page_title,
    event_user_id AS user_id,
    REPLACE(event_user_text, ' ', '_') AS user_text,
    revision_id,
    CAST(event_timestamp AS TIMESTAMP) as revision_timestamp,
    event_comment AS revision_comment,
    revision_is_identity_reverted,
    revision_tags,
    IF(ARRAY_CONTAINS(revision_tags, 'apps-suggested-edits'), 1, 0) AS suggested_edit,
    IF(event_comment LIKE '%add-depict%' OR event_comment LIKE '%wbsetclaim-%', 1, 0) AS image_tag,
    IF(event_comment LIKE '%wbsetlabel-%', 1, 0) AS image_caption,
    IF(event_comment LIKE '%#suggestededit-add%', 1, 0) AS se_add,
    IF(event_comment LIKE '%#suggestededit-translate%', 1, 0) AS se_translate
  FROM wmf.mediawiki_history
  WHERE
    snapshot = '2020-11'
    AND wiki_db = 'commonswiki'
    AND page_namespace = 6
    AND NOT event_user_is_anonymous
    AND NOT SIZE(event_user_is_bot_by) > 0
    AND event_type = 'create'
    AND event_entity = 'revision'
    AND CAST(event_times

### Image Analyses

In [9]:
# Summary statistics about edits / images
print_for_hive = False
do_execute = True

query = """
WITH in_use AS (
    SELECT DISTINCT
      suggested_edit AS se,
      page_id,
      IF(FIRST_VALUE(articles_in_use, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS page_in_use,
      IF(SIZE(FIRST_VALUE(bio_genders, true) OVER (PARTITION BY page_id, suggested_edit)) > 0, 1, 0) AS genders,
      IF(SIZE(FIRST_VALUE(regions, true) OVER (PARTITION BY page_id, suggested_edit)) > 0, 1, 0) AS regional
    FROM {0}
),
page_counts AS (
    SELECT
      se,
      COUNT(page_id) AS num_pages,
      SUM(page_in_use) AS count_in_use,
      SUM(genders) AS is_gendered,
      SUM(regional) AS is_geographic
    FROM in_use
    GROUP BY
      se
)
SELECT
  s.suggested_edit AS se,
  COUNT(DISTINCT(s.user_id)) AS users,
  MAX(p.num_pages) AS pages,
  MAX(p.count_in_use) AS pages_in_use,
  MAX(p.is_gendered) AS pages_bios,
  MAX(p.is_geographic) AS pages_geo,
  COUNT(1) AS edits,
  SUM(s.image_tag) AS image_tag,
  SUM(s.image_caption) AS captions,
  SUM(s.se_add) AS add,
  SUM(s.se_translate) AS translate,
  SUM(IF(articles_in_use IS NOT NULL, 1, 0)) AS edits_in_use,
  SUM(IF(SIZE(bio_genders) > 0, 1, 0)) AS edits_to_bios,
  SUM(IF(SIZE(regions) > 0, 1, 0)) AS edits_to_geo
FROM {0} s
LEFT JOIN page_counts p
  ON (s.suggested_edit = p.se)
GROUP BY s.suggested_edit
""".format(se_captions_subset_tablename)

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    spark.sql(query).show(50, False)


WITH in_use AS (
    SELECT DISTINCT
      suggested_edit AS se,
      page_id,
      IF(FIRST_VALUE(articles_in_use, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS page_in_use,
      IF(SIZE(FIRST_VALUE(bio_genders, true) OVER (PARTITION BY page_id, suggested_edit)) > 0, 1, 0) AS genders,
      IF(SIZE(FIRST_VALUE(regions, true) OVER (PARTITION BY page_id, suggested_edit)) > 0, 1, 0) AS regional
    FROM isaacj.suggested_edits_commons
),
page_counts AS (
    SELECT
      se,
      COUNT(page_id) AS num_pages,
      SUM(page_in_use) AS count_in_use,
      SUM(genders) AS is_gendered,
      SUM(regional) AS is_geographic
    FROM in_use
    GROUP BY
      se
)
SELECT
  s.suggested_edit AS se,
  COUNT(DISTINCT(s.user_id)) AS users,
  MAX(p.num_pages) AS pages,
  MAX(p.count_in_use) AS pages_in_use,
  MAX(p.is_gendered) AS pages_bios,
  MAX(p.is_geographic) AS pages_geo,
  COUNT(1) AS edits,
  SUM(s.image_tag) AS image_tag,
  SUM(s.image_caption) AS captions,
  S

In [15]:
# Determine value that occurs the most in a list
# For example, if an article is associated with three articles about men and one article about a woman, this will return man
# The assumptions here should be further tested:
# * What types of images are associated with both men and women?
# * Should they be include in this analysis?
def mode(value_list):
    counts = {}
    for v in value_list:
        counts[v] = counts.get(v, 0) + 1
    mode = sorted(counts, key=counts.get, reverse=True)[0]
    return mode
    
spark.udf.register('mode', mode, 'string')

def list_to_set(value_list):
    return list(set([str(v) for v in value_list]))

spark.udf.register('list_to_set', list_to_set, 'ARRAY<STRING>')

<function __main__.list_to_set>

In [48]:
# gender breakdown of edits
query = """
WITH gender_of_contrib AS (
    SELECT
      suggested_edit AS is_suggested_edit,
      page_id,
      MODE(bio_genders) AS gender
    FROM {0}
    WHERE
      bio_genders IS NOT NULL
      AND SIZE(bio_genders) > 0
)
SELECT
  is_suggested_edit,
  gender,
  COUNT(1) AS num_edits,
  COUNT(DISTINCT(page_id)) AS num_pages
FROM gender_of_contrib
GROUP BY
  is_suggested_edit,
  gender
  """.format(se_captions_subset_tablename)

spark.sql(query).show(200, False)


+-----------------+---------+---------+---------+
|is_suggested_edit|gender   |num_edits|num_pages|
+-----------------+---------+---------+---------+
|0                |Q6581097 |142      |140      |
|1                |Q6581097 |8972     |6312     |
|1                |Q18116794|1        |1        |
|0                |Q6581072 |44       |44       |
|1                |Q6581072 |1659     |1158     |
|1                |Q12964198|1        |1        |
|1                |Q48270   |2        |2        |
|1                |Q1052281 |5        |4        |
+-----------------+---------+---------+---------+



In [24]:
# revert rate by gender
query = """
WITH gender_of_contrib AS (
    SELECT
      suggested_edit AS is_suggested_edit,
      IF(revision_is_identity_reverted, 1, 0) AS reverted,
      MODE(bio_genders) AS gender
    FROM {0}
    WHERE
      bio_genders IS NOT NULL
      AND SIZE(bio_genders) > 0
)
SELECT
  is_suggested_edit,
  gender,
  COUNT(1) AS num_edits,
  SUM(reverted) AS num_reverted,
  SUM(reverted) / COUNT(1) AS pct_reverted
FROM gender_of_contrib
GROUP BY
  is_suggested_edit,
  gender
  """.format(se_captions_subset_tablename)

spark.sql(query).show(200, False)


+-----------------+---------+---------+------------+-------------------+
|is_suggested_edit|gender   |num_edits|num_reverted|pct_reverted       |
+-----------------+---------+---------+------------+-------------------+
|0                |Q6581097 |142      |6           |0.04225352112676056|
|1                |Q6581097 |8972     |555         |0.06185911725367811|
|1                |Q18116794|1        |0           |0.0                |
|0                |Q6581072 |44       |0           |0.0                |
|1                |Q6581072 |1659     |117         |0.0705244122965642 |
|1                |Q12964198|1        |0           |0.0                |
|1                |Q48270   |2        |0           |0.0                |
|1                |Q1052281 |5        |0           |0.0                |
+-----------------+---------+---------+------------+-------------------+



In [12]:
# geographic breakdown of edits
query = """
WITH regions_of_contrib AS (
    SELECT
      suggested_edit AS is_suggested_edit,
      page_id,
      revision_id,
      EXPLODE(regions) AS region
    FROM {0}
    WHERE
      regions IS NOT NULL
      AND SIZE(regions) > 0
)
SELECT
  is_suggested_edit,
  region,
  COUNT(DISTINCT(revision_id)) AS num_edits,
  COUNT(DISTINCT(page_id)) AS num_pages
FROM regions_of_contrib
GROUP BY
  is_suggested_edit,
  region
ORDER BY
  num_edits DESC
  """.format(se_captions_subset_tablename)

spark.sql(query).show(500, False)


+-----------------+--------------------------------------------+---------+---------+-----------+
|is_suggested_edit|region                                      |num_edits|num_pages|num_touches|
+-----------------+--------------------------------------------+---------+---------+-----------+
|1                |United States of America                    |5811     |4074     |986553     |
|1                |India                                       |2797     |1686     |179668     |
|1                |United Kingdom                              |2549     |1627     |356819     |
|1                |France                                      |2183     |1438     |229874     |
|1                |Germany                                     |1805     |1167     |287478     |
|1                |Italy                                       |1653     |1003     |247485     |
|1                |Canada                                      |1224     |712      |227499     |
|1                |Russia     

In [16]:
# revert rate by geography
query = """
WITH gender_of_contrib AS (
    SELECT
      suggested_edit AS is_suggested_edit,
      IF(revision_is_identity_reverted, 1, 0) AS reverted,
      EXPLODE(list_to_set(regions)) AS region
    FROM {0}
    WHERE
      regions IS NOT NULL
      AND SIZE(regions) > 0
)
SELECT
  is_suggested_edit,
  region,
  COUNT(1) AS num_edits,
  SUM(reverted) AS num_reverted,
  SUM(reverted) / COUNT(1) AS pct_reverted
FROM gender_of_contrib
GROUP BY
  is_suggested_edit,
  region
ORDER BY
  num_edits DESC
  """.format(se_captions_subset_tablename)

spark.sql(query).show(500, False)


+-----------------+--------------------------------------------+---------+------------+--------------------+
|is_suggested_edit|region                                      |num_edits|num_reverted|pct_reverted        |
+-----------------+--------------------------------------------+---------+------------+--------------------+
|1                |United States of America                    |5811     |512         |0.08810875924969885 |
|1                |India                                       |2797     |221         |0.07901322845906328 |
|1                |United Kingdom                              |2549     |187         |0.07336210278540604 |
|1                |France                                      |2183     |104         |0.04764086120018323 |
|1                |Germany                                     |1805     |114         |0.06315789473684211 |
|1                |Italy                                       |1653     |81          |0.04900181488203267 |
|1                |

## Wikidata Descriptions

### Generate Wikidata Edit Data

In [5]:
create_table_query = """
    CREATE TABLE IF NOT EXISTS {0} (
        page_id                         BIGINT        COMMENT 'Wikidata page ID',
        page_title                      STRING        COMMENT 'Page title (QID)',
        user_id                         BIGINT        COMMENT 'User ID; -1 if anonymous',
        user_text                       STRING        COMMENT 'User text; IP address if anonymous',
        revision_id                     BIGINT        COMMENT 'Revision ID',
        revision_timestamp              TIMESTAMP     COMMENT 'Revision timestamp (UTC)',
        revision_comment                STRING        COMMENT 'User-generated edit summary',
        description_language            STRING        COMMENT 'Language of description',
        revision_is_identity_reverted   BOOLEAN       COMMENT 'Was this revision reverted via identity revert?',
        revision_tags                   ARRAY<STRING> COMMENT 'See Special:Tags',
        suggested_edit                  INT           COMMENT '1 if suggestededit',
        desc_add                        INT           COMMENT '1 if new description',
        desc_change                     INT           COMMENT '1 if changed description',
        se_add                          INT           COMMENT '1 if add task',
        se_translate                    INT           COMMENT '1 if translate task',
        sitelinks                       ARRAY<STRING> COMMENT 'List of languages using with Wikipedia articles for the item',
        gender                          STRING        COMMENT 'Gender value if exists and is human (QID)',
        regions                         ARRAY<STRING> COMMENT 'List of regions associated with items'
    )
""".format(se_wikidata_subset_tablename)

print(create_table_query)
spark.sql(create_table_query)


    CREATE TABLE IF NOT EXISTS isaacj.suggested_edits_wikidata (
        page_id                         BIGINT        COMMENT 'Wikidata page ID',
        page_title                      STRING        COMMENT 'Page title (QID)',
        user_id                         BIGINT        COMMENT 'User ID; -1 if anonymous',
        user_text                       STRING        COMMENT 'User text; IP address if anonymous',
        revision_id                     BIGINT        COMMENT 'Revision ID',
        revision_timestamp              TIMESTAMP     COMMENT 'Revision timestamp (UTC)',
        revision_comment                STRING        COMMENT 'User-generated edit summary',
        description_language            STRING        COMMENT 'Language of description',
        revision_is_identity_reverted   BOOLEAN       COMMENT 'Was this revision reverted via identity revert?',
        revision_tags                   ARRAY<STRING> COMMENT 'See Special:Tags',
        suggested_edit              

DataFrame[]

In [6]:
# Determine what language the description was edited in -- this assumes a single language was edited
def extract_language(revision_comment):
    if type(revision_comment) == str:
        for word in revision_comment.split():
            if word.startswith('wbsetdescription-'):
                try:
                    lang = word.split('|')[1]
                    return lang
                except IndexError:
                    continue
    return None
    
spark.udf.register('extract_language', extract_language, 'string')

<function __main__.extract_language>

In [8]:
print_for_hive = False
do_execute = True

# Relevant criteria:
# Only users -- no anon edits / bot edits (shouldn't exist but...)
# All android app edits -- some will be suggested edits and others won't (but are useful controls)
# Wikidatawik:
# - description edits happen on article namespace
# - `#suggestededit-add` for add task; `#suggestededit-translate` for translate task
# - `wbsetdescription-add` when description didn't exist prior in that language
# - `wbsetdescription-set` when there already was a description in that language


query = """
WITH se_edits AS (
  SELECT
    page_id,
    page_title,
    event_user_id AS user_id,
    REPLACE(event_user_text, ' ', '_') AS user_text,
    revision_id,
    CAST(event_timestamp AS TIMESTAMP) as revision_timestamp,
    event_comment AS revision_comment,
    extract_language(event_comment) AS description_language,
    revision_is_identity_reverted,
    revision_tags,
    IF(ARRAY_CONTAINS(revision_tags, 'apps-suggested-edits'), 1, 0) AS suggested_edit,
    IF(event_comment LIKE '%wbsetdescription-add%', 1, 0) AS desc_add,
    IF(event_comment LIKE '%wbsetdescription-set%', 1, 0) AS desc_change,
    IF(event_comment LIKE '%#suggestededit-add%', 1, 0) AS se_add,
    IF(event_comment LIKE '%#suggestededit-translate%', 1, 0) AS se_translate
  FROM wmf.mediawiki_history
  WHERE
    snapshot = '{0}'
    AND wiki_db = 'wikidatawiki'
    AND page_namespace = 0
    AND NOT event_user_is_anonymous
    AND NOT SIZE(event_user_is_bot_by) > 0
    AND event_type = 'create'
    AND event_entity = 'revision'
    AND CAST(event_timestamp AS DATE) > '{1}'
    AND ARRAY_CONTAINS(revision_tags, 'android app edit')
),
wikipedia_projects AS (
    SELECT DISTINCT
      dbname
    FROM wmf_raw.mediawiki_project_namespace_map
    WHERE
      snapshot = '{0}'
      AND hostname LIKE '%wikipedia%'
),
languages_in_use AS (
  SELECT
    item_id,
    COLLECT_LIST(wiki_db) AS sitelinks
  FROM wmf.wikidata_item_page_link w
  INNER JOIN wikipedia_projects m
    ON (w.wiki_db = m.dbname)
  WHERE
    snapshot = '{2}'
  GROUP BY
    item_id
),
regions AS (
  SELECT
    qid AS qid,
    COLLECT_LIST(region) AS regions
  FROM isaacj.qid_to_country
  GROUP BY
    qid
),
se_edits_with_gender_region AS (
  SELECT
    se.*,
    sl.sitelinks AS sitelinks,
    g.value AS gender,
    r.regions AS regions
  FROM se_edits se
  LEFT JOIN languages_in_use sl
    ON (se.page_title = sl.item_id)
  LEFT JOIN isaacj.gender_wikidata g
    ON (se.page_title = g.item_id)
  LEFT JOIN regions r
    ON (se.page_title = r.qid)
)
INSERT OVERWRITE TABLE {3}
  SELECT
    page_id,
    page_title,
    user_id,
    user_text,
    revision_id,
    revision_timestamp,
    revision_comment,
    description_language,
    revision_is_identity_reverted,
    revision_tags,
    suggested_edit,
    desc_add,
    desc_change,
    se_add,
    se_translate,
    sitelinks,
    gender,
    regions
  FROM se_edits_with_gender_region
""".format(snapshot, start_date, wikidata_snapshot, se_wikidata_subset_tablename)

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    result = spark.sql(query)


WITH se_edits AS (
  SELECT
    page_id,
    page_title,
    event_user_id AS user_id,
    REPLACE(event_user_text, ' ', '_') AS user_text,
    revision_id,
    CAST(event_timestamp AS TIMESTAMP) as revision_timestamp,
    event_comment AS revision_comment,
    extract_language(event_comment) AS description_language,
    revision_is_identity_reverted,
    revision_tags,
    IF(ARRAY_CONTAINS(revision_tags, 'apps-suggested-edits'), 1, 0) AS suggested_edit,
    IF(event_comment LIKE '%wbsetdescription-add%', 1, 0) AS desc_add,
    IF(event_comment LIKE '%wbsetdescription-set%', 1, 0) AS desc_change,
    IF(event_comment LIKE '%#suggestededit-add%', 1, 0) AS se_add,
    IF(event_comment LIKE '%#suggestededit-translate%', 1, 0) AS se_translate
  FROM wmf.mediawiki_history
  WHERE
    snapshot = '2020-11'
    AND wiki_db = 'wikidatawiki'
    AND page_namespace = 0
    AND NOT event_user_is_anonymous
    AND NOT SIZE(event_user_is_bot_by) > 0
    AND event_type = 'create'
    AND event_enti

### Wikidata Edit Analyses

In [10]:
# summary stats for all wikis
print_for_hive = False
do_execute = True

query = """
WITH in_use AS (
    SELECT DISTINCT
      suggested_edit AS se,
      page_title,
      IF(FIRST_VALUE(sitelinks, true) OVER (PARTITION BY page_title, suggested_edit) IS NOT NULL, 1, 0) AS item_in_use,
      IF(FIRST_VALUE(gender, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS gendered,
      IF(FIRST_VALUE(regions, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS geographic
    FROM {0}
),
page_counts AS (
    SELECT
      se,
      COUNT(page_title) AS num_items,
      SUM(item_in_use) AS count_in_use,
      SUM(gendered) AS is_gendered,
      SUM(geographic) AS is_geographic
    FROM in_use
    GROUP BY
      se
)
SELECT
  s.suggested_edit AS se,
  COUNT(DISTINCT(s.user_id)) AS users,
  MAX(p.num_items) AS pages,
  MAX(p.count_in_use) AS items_in_use,
  MAX(p.is_gendered) AS item_is_bio,
  MAX(p.is_geographic) AS item_is_geo,
  COUNT(1) AS edits,
  SUM(s.desc_add) AS desc_add,
  SUM(s.desc_change) AS desc_change,
  SUM(s.se_add) AS add,
  SUM(s.se_translate) AS translate,
  SUM(IF(ARRAY_CONTAINS(sitelinks, CONCAT(description_language, 'wiki')), 1, 0)) as edits_in_use,
  SUM(IF(gender IS NOT NULL, 1, 0)) AS edits_to_bios,
  SUM(IF(regions IS NOT NULL, 1, 0)) AS edits_to_geos
FROM {0} s
LEFT JOIN page_counts p
  ON (s.suggested_edit = p.se)
GROUP BY s.suggested_edit
""".format(se_wikidata_subset_tablename)

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    spark.sql(query).show(50, False)


WITH in_use AS (
    SELECT DISTINCT
      suggested_edit AS se,
      page_title,
      IF(FIRST_VALUE(sitelinks, true) OVER (PARTITION BY page_title, suggested_edit) IS NOT NULL, 1, 0) AS item_in_use,
      IF(FIRST_VALUE(gender, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS gendered,
      IF(FIRST_VALUE(regions, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS geographic
    FROM isaacj.suggested_edits_wikidata
),
page_counts AS (
    SELECT
      se,
      COUNT(page_title) AS num_items,
      SUM(item_in_use) AS count_in_use,
      SUM(gendered) AS is_gendered,
      SUM(geographic) AS is_geographic
    FROM in_use
    GROUP BY
      se
)
SELECT
  s.suggested_edit AS se,
  COUNT(DISTINCT(s.user_id)) AS users,
  MAX(p.num_items) AS pages,
  MAX(p.count_in_use) AS items_in_use,
  MAX(p.is_gendered) AS item_is_bio,
  MAX(p.is_geographic) AS item_is_geo,
  COUNT(1) AS edits,
  SUM(s.desc_add) AS desc_add,
  SUM(s.desc_change) AS desc_

In [11]:
# summary stats for just english
print_for_hive = False
do_execute = True

query = """
WITH in_use AS (
    SELECT DISTINCT
      suggested_edit AS se,
      page_title,
      IF(FIRST_VALUE(sitelinks, true) OVER (PARTITION BY page_title, suggested_edit) IS NOT NULL, 1, 0) AS item_in_use,
      IF(FIRST_VALUE(gender, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS gendered,
      IF(FIRST_VALUE(regions, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS geographic
    FROM {0}
    WHERE
      description_language = 'en'
),
page_counts AS (
    SELECT
      se,
      COUNT(page_title) AS num_items,
      SUM(item_in_use) AS count_in_use,
      SUM(gendered) AS is_gendered,
      SUM(geographic) AS is_geographic
    FROM in_use
    GROUP BY
      se
)
SELECT
  s.suggested_edit AS se,
  COUNT(DISTINCT(s.user_id)) AS users,
  MAX(p.num_items) AS pages,
  MAX(p.count_in_use) AS items_in_use,
  MAX(p.is_gendered) AS item_is_bio,
  MAX(p.is_geographic) AS item_is_geo,
  COUNT(1) AS edits,
  SUM(s.desc_add) AS desc_add,
  SUM(s.desc_change) AS desc_change,
  SUM(s.se_add) AS add,
  SUM(s.se_translate) AS translate,
  SUM(IF(ARRAY_CONTAINS(sitelinks, CONCAT(description_language, 'wiki')), 1, 0)) as edits_in_use,
  SUM(IF(gender IS NOT NULL, 1, 0)) as edits_to_bios,
  SUM(IF(regions IS NOT NULL, 1, 0)) AS edits_to_geos
FROM {0} s
LEFT JOIN page_counts p
  ON (s.suggested_edit = p.se)
WHERE
  description_language = 'en'
GROUP BY s.suggested_edit
""".format(se_wikidata_subset_tablename)

if print_for_hive:
    print(re.sub(' +', ' ', re.sub('\n', ' ', query)).strip())
else:
    print(query)

if do_execute:
    spark.sql(query).show(50, False)


WITH in_use AS (
    SELECT DISTINCT
      suggested_edit AS se,
      page_title,
      IF(FIRST_VALUE(sitelinks, true) OVER (PARTITION BY page_title, suggested_edit) IS NOT NULL, 1, 0) AS item_in_use,
      IF(FIRST_VALUE(gender, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS gendered,
      IF(FIRST_VALUE(regions, true) OVER (PARTITION BY page_id, suggested_edit) IS NOT NULL, 1, 0) AS geographic
    FROM isaacj.suggested_edits_wikidata
    WHERE
      description_language = 'en'
),
page_counts AS (
    SELECT
      se,
      COUNT(page_title) AS num_items,
      SUM(item_in_use) AS count_in_use,
      SUM(gendered) AS is_gendered,
      SUM(geographic) AS is_geographic
    FROM in_use
    GROUP BY
      se
)
SELECT
  s.suggested_edit AS se,
  COUNT(DISTINCT(s.user_id)) AS users,
  MAX(p.num_items) AS pages,
  MAX(p.count_in_use) AS items_in_use,
  MAX(p.is_gendered) AS item_is_bio,
  MAX(p.is_geographic) AS item_is_geo,
  COUNT(1) AS edits,
  SUM(s.desc_add

In [64]:
# gender data for all languages
query = """
SELECT
  suggested_edit,
  gender,
  COUNT(1) AS num_edits,
  COUNT(DISTINCT(page_title)) AS num_items
FROM {0}
WHERE
  gender IS NOT NULL
GROUP BY
  suggested_edit,
  gender
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(200, False)


+--------------+---------+---------+---------+
|suggested_edit|gender   |num_edits|num_items|
+--------------+---------+---------+---------+
|1             |Q1097630 |3        |1        |
|1             |Q2449503 |4        |4        |
|1             |Q179294  |2        |2        |
|0             |Q6581097 |161      |139      |
|1             |Q6581097 |40457    |35251    |
|1             |Q18116794|7        |3        |
|1             |Q96000630|1        |1        |
|0             |Q6581072 |27       |26       |
|1             |Q15145778|1        |1        |
|1             |Q6581072 |11322    |10000    |
|1             |Q48270   |10       |9        |
|1             |Q1052281 |79       |62       |
+--------------+---------+---------+---------+



In [63]:
# gender data for English
query = """
SELECT
  suggested_edit,
  gender,
  COUNT(1) AS num_edits,
  COUNT(DISTINCT(page_title)) AS num_items
FROM {0}
WHERE
  gender IS NOT NULL
  AND description_language = 'en'
GROUP BY
  suggested_edit,
  gender
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(200, False)


+--------------+--------+---------+---------+
|suggested_edit|gender  |num_edits|num_items|
+--------------+--------+---------+---------+
|0             |Q6581097|23       |21       |
|1             |Q6581097|5091     |4831     |
|0             |Q6581072|1        |1        |
|1             |Q6581072|1260     |1195     |
|1             |Q1052281|1        |1        |
+--------------+--------+---------+---------+



In [66]:
# revert rate for all languages
query = """
SELECT
  suggested_edit,
  gender,
  COUNT(1) AS num_edits,
  SUM(IF(revision_is_identity_reverted, 1, 0)) AS num_reverted,
  SUM(IF(revision_is_identity_reverted, 1, 0)) / COUNT(1) AS pct_reverted
FROM {0}
WHERE
  gender IS NOT NULL
GROUP BY
  suggested_edit,
  gender
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(200, False)


+--------------+---------+---------+------------+-------------------+
|suggested_edit|gender   |num_edits|num_reverted|pct_reverted       |
+--------------+---------+---------+------------+-------------------+
|1             |Q1097630 |3        |0           |0.0                |
|1             |Q2449503 |4        |0           |0.0                |
|1             |Q179294  |2        |0           |0.0                |
|0             |Q6581097 |161      |17          |0.10559006211180125|
|1             |Q6581097 |40457    |1036        |0.02560743505450231|
|1             |Q18116794|7        |2           |0.2857142857142857 |
|1             |Q96000630|1        |0           |0.0                |
|0             |Q6581072 |27       |4           |0.14814814814814814|
|1             |Q15145778|1        |0           |0.0                |
|1             |Q6581072 |11322    |258         |0.02278749337572867|
|1             |Q48270   |10       |0           |0.0                |
|1             |Q105

In [65]:
# revert rate for English
query = """
SELECT
  suggested_edit,
  gender,
  COUNT(1) AS num_edits,
  SUM(IF(revision_is_identity_reverted, 1, 0)) AS num_reverted,
  SUM(IF(revision_is_identity_reverted, 1, 0)) / COUNT(1) AS pct_reverted
FROM {0}
WHERE
  gender IS NOT NULL
  AND description_language = 'en'
GROUP BY
  suggested_edit,
  gender
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(200, False)


+--------------+--------+---------+------------+--------------------+
|suggested_edit|gender  |num_edits|num_reverted|pct_reverted        |
+--------------+--------+---------+------------+--------------------+
|0             |Q6581097|23       |3           |0.13043478260869565 |
|1             |Q6581097|5091     |245         |0.04812414064034571 |
|0             |Q6581072|1        |0           |0.0                 |
|1             |Q6581072|1260     |64          |0.050793650793650794|
|1             |Q1052281|1        |0           |0.0                 |
+--------------+--------+---------+------------+--------------------+



In [12]:
# geographic breakdown of edits for all wikis
query = """
WITH regions_of_contrib AS (
    SELECT
      suggested_edit AS is_suggested_edit,
      page_id,
      revision_id,
      EXPLODE(regions) AS region
    FROM {0}
    WHERE
      regions IS NOT NULL
      AND SIZE(regions) > 0
)
SELECT
  is_suggested_edit,
  region,
  COUNT(DISTINCT(revision_id)) AS num_edits,
  COUNT(DISTINCT(page_id)) AS num_pages
FROM regions_of_contrib
GROUP BY
  is_suggested_edit,
  region
ORDER BY
  num_edits DESC
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(500, False)


+-----------------+--------------------------------------------+---------+---------+
|is_suggested_edit|region                                      |num_edits|num_pages|
+-----------------+--------------------------------------------+---------+---------+
|1                |United States of America                    |23953    |20277    |
|1                |Japan                                       |8135     |7076     |
|1                |United Kingdom                              |7283     |6315     |
|1                |Germany                                     |6869     |6203     |
|1                |France                                      |5973     |5301     |
|1                |India                                       |5532     |4477     |
|1                |Italy                                       |4342     |3770     |
|1                |Turkey                                      |4207     |3292     |
|1                |Spain                                       |3

In [14]:
# geographic breakdown of edits for English Wikipedia
query = """
WITH regions_of_contrib AS (
    SELECT
      suggested_edit AS is_suggested_edit,
      page_id,
      revision_id,
      EXPLODE(regions) AS region
    FROM {0}
    WHERE
      regions IS NOT NULL
      AND SIZE(regions) > 0
      AND description_language = 'en'
)
SELECT
  is_suggested_edit,
  region,
  COUNT(DISTINCT(revision_id)) AS num_edits,
  COUNT(DISTINCT(page_id)) AS num_pages
FROM regions_of_contrib
GROUP BY
  is_suggested_edit,
  region
ORDER BY
  num_edits DESC
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(500, False)


+-----------------+--------------------------------------------+---------+---------+
|is_suggested_edit|region                                      |num_edits|num_pages|
+-----------------+--------------------------------------------+---------+---------+
|1                |United States of America                    |5814     |5520     |
|1                |United Kingdom                              |1965     |1862     |
|1                |India                                       |1764     |1647     |
|1                |Canada                                      |797      |765      |
|1                |Australia                                   |688      |658      |
|1                |England                                     |579      |554      |
|1                |France                                      |579      |556      |
|1                |Japan                                       |500      |464      |
|1                |Italy                                       |4

In [17]:
# revert rate by geography for all wikis
query = """
WITH region_of_contrib AS (
    SELECT DISTINCT
      suggested_edit AS is_suggested_edit,
      revision_id,
      IF(revision_is_identity_reverted, 1, 0) AS reverted,
      EXPLODE(regions) AS region
    FROM {0}
    WHERE
      regions IS NOT NULL
      AND SIZE(regions) > 0
)
SELECT
  is_suggested_edit,
  region,
  COUNT(1) AS num_edits,
  SUM(reverted) AS num_reverted,
  SUM(reverted) / COUNT(1) AS pct_reverted
FROM region_of_contrib
GROUP BY
  is_suggested_edit,
  region
ORDER BY
  num_edits DESC
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(500, False)


+-----------------+--------------------------------------------+---------+------------+---------------------+
|is_suggested_edit|region                                      |num_edits|num_reverted|pct_reverted         |
+-----------------+--------------------------------------------+---------+------------+---------------------+
|1                |United States of America                    |23953    |505         |0.021082954118482027 |
|1                |Japan                                       |8135     |104         |0.012784265519360787 |
|1                |United Kingdom                              |7283     |138         |0.01894823561719072  |
|1                |Germany                                     |6869     |120         |0.017469791818314165 |
|1                |France                                      |5973     |75          |0.012556504269211451 |
|1                |India                                       |5532     |120         |0.021691973969631236 |
|1        

In [18]:
# revert rate by geography for English only
query = """
WITH region_of_contrib AS (
    SELECT DISTINCT
      suggested_edit AS is_suggested_edit,
      IF(revision_is_identity_reverted, 1, 0) AS reverted,
      revision_id,
      EXPLODE(regions) AS region
    FROM {0}
    WHERE
      regions IS NOT NULL
      AND SIZE(regions) > 0
      AND description_language = 'en'
)
SELECT
  is_suggested_edit,
  region,
  COUNT(1) AS num_edits,
  SUM(reverted) AS num_reverted,
  SUM(reverted) / COUNT(1) AS pct_reverted
FROM region_of_contrib
GROUP BY
  is_suggested_edit,
  region
ORDER BY
  num_edits DESC
  """.format(se_wikidata_subset_tablename)

spark.sql(query).show(500, False)


+-----------------+--------------------------------------------+---------+------------+---------------------+
|is_suggested_edit|region                                      |num_edits|num_reverted|pct_reverted         |
+-----------------+--------------------------------------------+---------+------------+---------------------+
|1                |United States of America                    |5814     |170         |0.029239766081871343 |
|1                |United Kingdom                              |1965     |66          |0.03358778625954199  |
|1                |India                                       |1764     |68          |0.03854875283446712  |
|1                |Canada                                      |797      |23          |0.028858218318695106 |
|1                |Australia                                   |688      |25          |0.036337209302325583 |
|1                |France                                      |579      |19          |0.03281519861830743  |
|1        