# examples using anonymized parquet data
2019-05-31

In [1]:
# basic setup
import pyspark
import re
import pyspark.sql
from pyspark.sql import *
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import os.path
from pyspark.sql.functions import desc
from datetime import timedelta, date

%matplotlib inline
# Hive-enabled SQL context built on `sc`. `sc` is not defined in this notebook; presumably it is
# the SparkContext injected by the kernel — TODO confirm.
# NOTE(review): the query cells visible below all go through `spark`, not `spark_hive`, and
# HiveContext appears to be deprecated in Spark 2.x in favour of SparkSession — confirm whether
# this handle is still needed.
spark_hive = pyspark.sql.HiveContext(sc)

In [5]:
## basic data defaults

# study window shared by every query in this notebook; the SQL filters use >= / <= so both
# bounds are inclusive
date_format = '%Y-%m-%d'
start_date = date(2019, 3, 29)
end_date = date(2019, 4, 22)

# 'YYYY-MM-DD' renderings of the two bounds, substituted into the SQL templates
start_date_string, end_date_string = (
    bound.strftime(date_format) for bound in (start_date, end_date)
)

def daterange(start_date, end_date):
    """Yield every date from start_date through end_date, one day at a time.

    Both endpoints are included. If end_date is earlier than start_date nothing
    is yielded. Used for iterating over the range of study dates (daily count
    of events queries).
    """
    # +1 so that end_date itself is produced
    total_days = (end_date - start_date).days + 1
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(days=offset)
        offset += 1

def date_to_dt(date):
    """Render a date as the string 'YYYY-MM-DD%' for use in SQL queries.

    Month and day are zero-padded to two digits; the year is written as-is.
    The trailing '%' is part of the returned string.
    """
    return '{0}-{1:02d}-{2:02d}%'.format(date.year, date.month, date.day)

## common exclusion SQL
#
# Both fragments begin with AND, so they are written to be appended to an existing WHERE clause.
#
# event_exclusion_sql excludes event rows that either:
# - have a page or revision ID of zero (pages not yet created as per bmansurov https://phabricator.wikimedia.org/T213969#4998281), or
# - are 'extClick' events whose link_url starts with an English Wikipedia prefix, i.e. internal links
#   improperly coded as external (as per bmansurov https://phabricator.wikimedia.org/T213969#5003710)
event_exclusion_sql = """
AND (citationusage.page_id = 0 OR citationusage.revision_id = 0) = FALSE
AND (citationusage.action = 'extClick' AND 
    (citationusage.link_url LIKE 'https://en.wikipedia.org%' 
    OR citationusage.link_url LIKE 'https://en.m.wikipedia.org%')) = FALSE
"""
# pageload_exclusion_sql excludes pageload rows that:
# - have a page or revision ID of zero (pages not yet created as per bmansurov https://phabricator.wikimedia.org/T213969#4998281)
pageload_exclusion_sql = """
AND (citationusagepageload.page_id = 0 OR citationusagepageload.revision_id = 0) = FALSE
"""


In [6]:
# read the two anonymized parquet datasets
parquetFilePageloads = spark.read.parquet("/user/piccardi/anonymous_pageloads_april.parquet")
parquetFileCitationusage = spark.read.parquet("/user/piccardi/anonymous_citationusage_april.parquet")

# expose them to spark.sql() as temp views; queries against the unqualified names
# `citationusagepageload` / `citationusage` read these views, while the `event.`-qualified
# names used elsewhere refer to the original tables
parquetFilePageloads.createOrReplaceTempView("citationusagepageload")
parquetFileCitationusage.createOrReplaceTempView("citationusage")

## pageloads data

In [20]:
# enwiki pageload count from the original event.citationusagepageload table,
# keeping only rows where useragent.is_bot is FALSE (bots excluded)
orig_pageloads_query = """
SELECT count(*) count
FROM event.citationusagepageload
WHERE wiki = 'enwiki'
AND (citationusagepageload.event.page_id = 0 OR citationusagepageload.event.revision_id = 0) = FALSE
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
"""
# the two placeholders are the inclusive start / end of the study window
spark.sql(
    orig_pageloads_query.format(start_date_string, end_date_string)
).show()

+----------+
|     count|
+----------+
|1401158765|
+----------+



In [22]:
# enwiki pageload count from the original event.citationusagepageload table,
# with no useragent.is_bot filter (bots included)
bots_pageloads_query = """
SELECT count(*) count
FROM event.citationusagepageload
WHERE wiki = 'enwiki'
AND (citationusagepageload.event.page_id = 0 OR citationusagepageload.event.revision_id = 0) = FALSE
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
"""
# the two placeholders are the inclusive start / end of the study window
spark.sql(
    bots_pageloads_query.format(start_date_string, end_date_string)
).show()

+----------+
|     count|
+----------+
|1402190827|
+----------+



In [23]:
# enwiki pageload count from the anonymized data (the `citationusagepageload` temp view),
# filtered on event_time rather than dt
anon_pageloads_query = """
SELECT count(*) count
FROM citationusagepageload
WHERE wiki = 'enwiki'
AND (citationusagepageload.page_id = 0 OR citationusagepageload.revision_id = 0) = FALSE
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
"""
# the two placeholders are the inclusive start / end of the study window
spark.sql(
    anon_pageloads_query.format(start_date_string, end_date_string)
).show()

+----------+
|     count|
+----------+
|1401424489|
+----------+



## events data

In [14]:
# per-action enwiki event counts from the anonymized data (the `citationusage` temp view),
# excluding zero page/revision IDs and 'extClick' rows whose link_url points at
# en.wikipedia.org / en.m.wikipedia.org
anon_events_query = """
SELECT action, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND (citationusage.page_id = 0 OR citationusage.revision_id = 0) = FALSE
AND (citationusage.action = 'extClick' AND 
    (citationusage.link_url LIKE 'https://en.wikipedia.org%' 
    OR citationusage.link_url LIKE 'https://en.m.wikipedia.org%')) = FALSE
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
GROUP BY action
"""
# keep the DataFrame itself in `events`: DataFrame.show() only prints and returns None,
# so assigning its result would leave `events` unusable
events = spark.sql(anon_events_query.format(start_date_string, end_date_string))
events.show()

+--------+--------+
|  action|   count|
+--------+--------+
| fnHover|29259814|
| fnClick|18977045|
| upClick|  813130|
|extClick|39515529|
+--------+--------+



In [17]:
# per-action enwiki event counts from the original event.citationusage table,
# keeping only rows where useragent.is_bot is FALSE (bots excluded)
orig_events_query = """
SELECT event.action, COUNT(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND (citationusage.event.page_id = 0 OR citationusage.event.revision_id = 0) = FALSE
AND (citationusage.event.action = 'extClick' AND 
    (citationusage.event.link_url LIKE 'https://en.wikipedia.org%' 
    OR citationusage.event.link_url LIKE 'https://en.m.wikipedia.org%')) = FALSE
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
GROUP BY event.action
"""
# the two placeholders are the inclusive start / end of the study window
spark.sql(
    orig_events_query.format(start_date_string, end_date_string)
).show()

+--------+--------+
|  action|   count|
+--------+--------+
| fnHover|29259567|
| fnClick|18976858|
| upClick|  813101|
|extClick|39515221|
+--------+--------+



In [18]:
# per-action enwiki event counts from the original event.citationusage table,
# with no useragent.is_bot filter (bots included)
bots_events_query = """
SELECT event.action, COUNT(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND (citationusage.event.page_id = 0 OR citationusage.event.revision_id = 0) = FALSE
AND (citationusage.event.action = 'extClick' AND 
    (citationusage.event.link_url LIKE 'https://en.wikipedia.org%' 
    OR citationusage.event.link_url LIKE 'https://en.m.wikipedia.org%')) = FALSE
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
GROUP BY event.action
"""
# the two placeholders are the inclusive start / end of the study window
spark.sql(
    bots_events_query.format(start_date_string, end_date_string)
).show()

+--------+--------+
|  action|   count|
+--------+--------+
| fnHover|29259814|
| fnClick|18977045|
| upClick|  813130|
|extClick|39515529|
+--------+--------+



## count of events for WikiProject Medicine (WP:M) pages with external links, limited to events whose session also appears in the pageload data

In [25]:
# per-action enwiki event counts from the original event.citationusage table (bots excluded),
# restricted to pages listed in ryanmax.projmed_with_extlinks and to session tokens that
# also occur in non-bot event.citationusagepageload rows
sampled_wpm_events_query = """
SELECT event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    AND to_date(dt) >= '{}' 
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    )
AND to_date(dt) >= '{}' 
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
GROUP BY event.action
"""

# the same (start, end) pair fills all three date filters, in order:
# page list subquery, pageload subquery, outer event filter
spark.sql(
    sampled_wpm_events_query.format(*([start_date_string, end_date_string] * 3))
).show()

+--------+------+
|  action| count|
+--------+------+
| fnHover|396944|
| fnClick|228205|
| upClick| 12667|
|extClick|213878|
+--------+------+



In [26]:
# per-action enwiki event counts from the original event.citationusage table (bots included),
# restricted to pages listed in ryanmax.projmed_with_extlinks and to session tokens that
# also occur in event.citationusagepageload
bots_sampled_wpm_events_query = """
SELECT event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    AND to_date(dt) >= '{}' 
    AND to_date(dt) <= '{}'
    )
AND to_date(dt) >= '{}' 
AND to_date(dt) <= '{}'
GROUP BY event.action
"""

# the same (start, end) pair fills all three date filters, in order:
# page list subquery, pageload subquery, outer event filter
spark.sql(
    bots_sampled_wpm_events_query.format(*([start_date_string, end_date_string] * 3))
).show()

+--------+------+
|  action| count|
+--------+------+
| fnHover|396948|
| fnClick|228206|
| upClick| 12667|
|extClick|213878|
+--------+------+



In [28]:
# per-action enwiki event counts from the anonymized data (the `citationusage` temp view),
# restricted to pages listed in ryanmax.projmed_with_extlinks and to session_ids that
# also occur in the `citationusagepageload` temp view
anon_sampled_wpm_events_query = """
SELECT action, count(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    AND to_date(event_time) >= '{}' 
    AND to_date(event_time) <= '{}'
    )
AND to_date(event_time) >= '{}' 
AND to_date(event_time) <= '{}'
GROUP BY action
"""

# the same (start, end) pair fills all three date filters, in order:
# page list subquery, pageload subquery, outer event filter
spark.sql(
    anon_sampled_wpm_events_query.format(*([start_date_string, end_date_string] * 3))
).show()

+--------+------+
|  action| count|
+--------+------+
| fnHover|396734|
| fnClick|228125|
| upClick| 12661|
|extClick|213813|
+--------+------+

