## Pageload & Event Queries
**NOTE**: Data for all pages with external links began 3/29 whereas WPM pages with external links began 3/27 meaning a simple subtraction of WPM from all pages will not produce the same summary total as NOT WPM. Possible solutions: either limit study period to >= 3/29 OR use all page IDs found after we started collecting data and assume the deltas are minor (likely true for PWM, likely relatively true for W but numerically a greater difference).
#### Draft 1 2019-04-08

In [1]:
# basic setup
# use PySpark YARN kernel
import pyspark
import re
import pyspark.sql
from pyspark.sql import *
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import os.path
from pyspark.sql.functions import desc
from datetime import timedelta, date

%matplotlib inline
spark_hive = pyspark.sql.HiveContext(sc)

# required to iterate the range of dates
def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)

In [3]:
# set date ranges for all queries
start_date = date(2019, 3, 21)
end_date = date(2019, 4, 22)

## Citation Usage Overview

In [6]:
# show citationusage events by date and type

events_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    daily_events = spark.sql(events_query.format(d.day, d.month, d.year))
    events_rdd = events_rdd.union(daily_events.rdd)

events_merged = sqlContext.createDataFrame(events_rdd)
events = events_merged.toPandas()

In [8]:
events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-3-21,450241,221977,310889,91402
2019-3-22,1590925,725377,1131859,149622
2019-3-23,1389404,667724,883147,20342
2019-3-24,1531353,750329,1018411,23969
2019-3-25,1774686,782508,1365524,82990
2019-3-26,1729764,737057,1342285,28909
2019-3-27,1693519,728824,1321898,27761
2019-3-28,1639009,705904,1267787,25736
2019-3-29,1522480,666005,1122572,23467
2019-3-30,1354622,649300,865928,19934


## Pageload Data
#### source: event.citationusagepageload

In [10]:
# basic overview
pageloads_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusagepageload
WHERE wiki = 'enwiki'
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

pageloads_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    daily_pageloads = spark.sql(pageloads_query.format(d.day, d.month, d.year))
    pageloads_rdd = pageloads_rdd.union(daily_pageloads.rdd)

pageloads_merged = sqlContext.createDataFrame(pageloads_rdd)
pageloads = pageloads_merged.toPandas()

In [12]:
pageloads.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-3-21,14270583
2019-3-22,52417604
2019-3-23,51588547
2019-3-24,56845081
2019-3-25,58315949
2019-3-26,55937993
2019-3-27,55018474
2019-3-28,53933565
2019-3-29,51413672
2019-3-30,49759656


### data refresh required
The queries below use data from daily SQL queries. Data from ryanmax.pages_with_extlinks and ryanmax.projmed_with_extlinks will need to be refreshed before running these queries again.

In [33]:
# daily count of pageloads of WP:M pages with external links
pgload_wpm_extl_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusagepageload
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

pgload_wpm_extl_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    daily_wpm_extl = spark.sql(pgload_wpm_extl_query.format(dt, d.day, d.month, d.year))
    pgload_wpm_extl_rdd = pgload_wpm_extl_rdd.union(daily_wpm_extl.rdd)

wpm_extl_pageloads_merged = sqlContext.createDataFrame(pgload_wpm_extl_rdd)
wpm_extl_pageloads = wpm_extl_pageloads_merged.toPandas()

In [34]:
wpm_extl_pageloads.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-3-27,1625817
2019-3-28,1562904
2019-3-29,1409126
2019-3-30,1204098
2019-3-31,1282420
2019-4-1,1580307
2019-4-2,1614142
2019-4-3,1582776
2019-4-4,1574178
2019-4-5,1438065


In [35]:
# daily count of pageloads of all pages with external links
pgload_extl_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusagepageload
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

pgload_extl_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    daily_extl = spark.sql(pgload_extl_query.format(dt, d.day, d.month, d.year))
    pgload_extl_rdd = pgload_extl_rdd.union(daily_extl.rdd)

extl_pageloads_merged = sqlContext.createDataFrame(pgload_extl_rdd)
extl_pageloads = extl_pageloads_merged.toPandas()

In [36]:
extl_pageloads.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-3-29,50242541
2019-3-30,48728118
2019-3-31,54094263
2019-4-1,56399098
2019-4-2,54304072
2019-4-3,54024178
2019-4-4,53771787
2019-4-5,52672342
2019-4-6,50732159
2019-4-7,56048378


In [5]:
# top 1000 most visited WP:M pages with external links
top1k_query = """
SELECT event.page_id, page.page_title as title, count(*) count
FROM event.citationusagepageload
LEFT JOIN enwiki.page on event.page_id = page.page_id
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.projmed_with_extlinks)
AND useragent.is_bot = FALSE
AND (day >= {} AND month >= {} AND year >= {})
AND (day <= {} AND month <= {} AND year <= {})
GROUP BY event.page_id, page.page_title
ORDER BY count(*) desc
LIMIT 1000
"""

top1k_rdd = sc.emptyRDD()
top1k = spark.sql(top1k_query.format(start_date.day, start_date.month, start_date.year, end_date.day, end_date.month, end_date.year))

# write top1k data to a table for later use
top1k.createOrReplaceTempView("temp_top1k")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top1k_med");
sqlContext.sql("CREATE TABLE ryanmax.top1k_med AS SELECT * FROM temp_top1k");

top1k.limit(20).toPandas()

# NOTE: Pages may appear with a title of "None" because the page title data source used here is infrequently updated.
# Use the page_id to look up a title: https://en.wikipedia.org/?curid=56880920


Unnamed: 0,page_id,title,count
0,43573275,Elizabeth_Holmes,34193
1,41779862,Theranos,12632
2,56880920,,7698
3,4488176,Munchausen_syndrome_by_proxy,4552
4,322553,Aneurysm,4359
5,27546,Sexual_intercourse,3931
6,8303,Down_syndrome,3875
7,18079,Leonardo_da_Vinci,3690
8,37556,Asperger_syndrome,3687
9,4501,Black_Death,3250


## Event Data

#### source: event.citationusage (not limited to sampled pageloads)

In [26]:
# Total count of events for WP:M pages with ext links over study period
wpm_events_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    daily_wpm_events = spark.sql(wpm_events_query.format(dt, d.day, d.month, d.year))
    wpm_events_rdd = wpm_events_rdd.union(daily_wpm_events.rdd)

wpm_events_merged = sqlContext.createDataFrame(wpm_events_rdd)
wpm_events = wpm_events_merged.toPandas()

In [27]:
# Total count of events for NOT WP:M pages with ext links over study period
not_wpm_events_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND event.page_id NOT IN (SELECT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

not_wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    not_daily_wpm_events = spark.sql(not_wpm_events_query.format(dt, dt, d.day, d.month, d.year))
    not_wpm_events_rdd = not_wpm_events_rdd.union(not_daily_wpm_events.rdd)

not_wpm_events_merged = sqlContext.createDataFrame(not_wpm_events_rdd)
not_wpm_events = not_wpm_events_merged.toPandas()

In [28]:
# Total count of events for pages with ext links over study period
extl_events_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

extl_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    extl_daily_wpm_events = spark.sql(extl_events_query.format(dt, d.day, d.month, d.year))
    extl_events_rdd = extl_events_rdd.union(extl_daily_wpm_events.rdd)

extl_events_merged = sqlContext.createDataFrame(extl_events_rdd)
extl_events = extl_events_merged.toPandas()

#### Event Summaries (not limited to sampled pageloads)

In [42]:
# all pages with external links
extl_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,15500183
fnClick,7194951
fnHover,11411696
upClick,290014


In [41]:
# WPM:M pages with external links
wpm_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,308836
fnClick,326011
fnHover,594126
upClick,11078


In [36]:
# NOT WPM:M pages with external links
not_wpm_events.groupby(['action']).sum()


Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,15250923
fnClick,6928947
fnHover,10933687
upClick,280826


#### Event Daily Counts (not limited to sampled pageloads)

In [37]:
# all pages with external links
extl_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-3-29,1503226,664333,1120120,23145
2019-3-30,1339988,647826,863890,19687
2019-3-31,1463198,734601,996934,22884
2019-4-1,1701876,784342,1345135,63736
2019-4-2,1687856,745116,1344704,39821
2019-4-3,1675255,736686,1385406,26609
2019-4-4,1651703,727856,1293235,25381
2019-4-5,1574435,696939,1165235,23301
2019-4-6,1388346,684512,882118,20061
2019-4-7,1514300,772740,1014919,25389


In [40]:
# WPM:M pages with external links
wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-3-27,29793,30716,59502,967
2019-3-28,29783,29291,56615,923
2019-3-29,24641,25739,46785,828
2019-3-30,18737,22598,36678,659
2019-3-31,21787,25259,42935,798
2019-4-1,29278,29581,56823,1390
2019-4-2,30467,30041,58525,1442
2019-4-3,29030,29090,57357,864
2019-4-4,28618,28030,54019,860
2019-4-5,25342,25858,47196,730


In [39]:
# NOT WPM:M pages with external links
not_wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-3-29,1478585,638594,1073335,22317
2019-3-30,1321251,625228,827212,19028
2019-3-31,1441411,709342,953999,22086
2019-4-1,1672598,754761,1288312,62346
2019-4-2,1657389,715075,1286179,38379
2019-4-3,1646225,707596,1328049,25745
2019-4-4,1623085,699826,1239216,24521
2019-4-5,1549093,671081,1118039,22571
2019-4-6,1368857,661002,846461,19361
2019-4-7,1492429,746442,972885,24472


#### source: event.citationusage (limited to sampled pageloads from citationusagepageload)

In [18]:
# daily count of events for WP:M pages with ext links over study period
# limited to events w/ pageload data
sampled_wpm_events_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent.is_bot = FALSE
    )
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

sampled_wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    sampled_daily_wpm_events = spark.sql(sampled_wpm_events_query.format(dt, d.day, d.month, d.year,d.day, d.month, d.year))
    sampled_wpm_events_rdd = sampled_wpm_events_rdd.union(sampled_daily_wpm_events.rdd)

sampled_wpm_events_merged = sqlContext.createDataFrame(sampled_wpm_events_rdd)
sampled_wpm_events = sampled_wpm_events_merged.toPandas()

In [23]:
# daily count of events for NOT WP:M pages with ext links over study period
# limited to events w/ pageload data
sampled_not_wpm_events_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND event.page_id NOT IN (SELECT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent.is_bot = FALSE
    )
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

sampled_not_wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    sampled_daily_not_wpm_events = spark.sql(sampled_not_wpm_events_query.format(dt, dt, d.day, d.month, d.year,d.day, d.month, d.year))
    sampled_not_wpm_events_rdd = sampled_not_wpm_events_rdd.union(sampled_daily_not_wpm_events.rdd)

sampled_not_wpm_events_merged = sqlContext.createDataFrame(sampled_not_wpm_events_rdd)
sampled_not_wpm_events = sampled_not_wpm_events_merged.toPandas()

In [24]:
# daily count of events for all pages with ext links over study period
# limited to events w/ pageload data
sampled_events_query = """
SELECT CONCAT(year, '-', month, '-', day) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent.is_bot = FALSE
    )
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY year, month, day, event.action
ORDER BY year, month, day
"""

sampled_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = str(d.year) + '-' + '{0:02d}'.format(d.month) + '-' + '{0:02d}'.format(d.day) + '%'
    sampled_daily_events = spark.sql(sampled_events_query.format(dt, d.day, d.month, d.year,d.day, d.month, d.year))
    sampled_events_rdd = sampled_events_rdd.union(sampled_daily_events.rdd)

sampled_events_merged = sqlContext.createDataFrame(sampled_events_rdd)
sampled_events = sampled_events_merged.toPandas()

#### Event Summaries (limited to sampled pageloads)

In [26]:
# WPM:M pages with external links
sampled_wpm_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,102932
fnClick,107050
fnHover,192466
upClick,3599


In [27]:
# NOT WPM:M pages with external links
sampled_not_wpm_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,5084314
fnClick,2301546
fnHover,3578965
upClick,91104


In [28]:
# all pages with external links
sampled_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,5167703
fnClick,2388613
fnHover,3734104
upClick,94069


#### Event Daily Counts (limited to sampled pageloads)

In [29]:
# WPM:M pages with external links
sampled_wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-3-27,9734,10240,19020,307
2019-3-28,9809,9743,18307,327
2019-3-29,8429,8511,14972,313
2019-3-30,6266,7357,11734,220
2019-3-31,7322,8365,14018,253
2019-4-1,9749,9649,18655,524
2019-4-2,10176,9622,19335,291
2019-4-3,9625,9796,18947,253
2019-4-4,9665,9188,17144,288
2019-4-5,8314,8446,15222,228


In [30]:
# NOT WPM:M pages with external links
sampled_not_wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-3-29,493381,213568,351867,7586
2019-3-30,439327,208318,273163,6158
2019-3-31,481997,236050,314065,7503
2019-4-1,554641,248511,423358,18490
2019-4-2,552139,237538,423598,13549
2019-4-3,548462,234237,417269,8347
2019-4-4,541504,232591,408057,7906
2019-4-5,514797,223549,369539,7685
2019-4-6,458952,219654,277847,6284
2019-4-7,499114,247530,320202,7596


In [31]:
# all pages with external links
sampled_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-3-29,501810,222079,366839,7899
2019-3-30,445593,215675,284897,6378
2019-3-31,489319,244415,328083,7756
2019-4-1,564390,258160,442013,19014
2019-4-2,562315,247160,442933,13840
2019-4-3,558087,244033,436216,8600
2019-4-4,551169,241779,425201,8194
2019-4-5,523111,231995,384761,7913
2019-4-6,465448,227149,289209,6530
2019-4-7,506461,256168,333952,7945
