## Pageload & Event Queries
**NOTE**: Data for all pages with external links begins on 3/29, whereas data for WPM pages with external links begins on 3/27. As a result, simply subtracting WPM from all pages will not produce the same summary total as NOT WPM. Possible solutions: either limit the study period to >= 3/29, OR use all page IDs found after we started collecting data and assume the deltas are minor (likely true for WPM; likely relatively true for W, though the numerical difference is greater).
- v. 1.0.0 2019-04-08
- v. 1.0.1 2019-04-17
- v. 1.0.2 2019-04-23

In [1]:
# basic setup
# use PySpark YARN kernel
import pyspark
import re
import pyspark.sql
from pyspark.sql import *
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import os.path
from pyspark.sql.functions import desc
from datetime import timedelta, date

%matplotlib inline
# Hive-enabled SQL context wrapping `sc`. `sc` is not defined in this notebook
# (presumably injected by the PySpark YARN kernel -- TODO confirm).
# NOTE(review): `spark_hive` is not referenced by any cell visible here; the
# queries below go through `spark` and `sqlContext` instead.
spark_hive = pyspark.sql.HiveContext(sc)


In [2]:
## basic data defaults

# required to iterate the range of dates
def daterange(start_date, end_date):
    """Yield every date from start_date up to, but not including, end_date.

    Yields nothing when end_date is on or before start_date.
    """
    total_days = int((end_date - start_date).days)
    day_offset = 0
    while day_offset < total_days:
        yield start_date + timedelta(day_offset)
        day_offset += 1

# Study window shared by every query in this notebook, plus the string forms
# of its bounds that get interpolated into the SQL templates.
date_format = '%Y-%m-%d'
start_date, end_date = date(2019, 3, 21), date(2019, 4, 22)
start_date_string, end_date_string = (
    bound.strftime(date_format) for bound in (start_date, end_date)
)

# convenience method for converting dates to 'YYYY-MM-DD%' for SQL queries
def date_to_dt(date):
    """Return `date` as a 'YYYY-MM-DD%' string for use as a SQL LIKE pattern.

    Month and day are zero-padded to two digits; the trailing '%' is the SQL
    wildcard, so the pattern matches any value that starts with that date.
    """
    parts = [str(date.year), '{0:02d}'.format(date.month), '{0:02d}'.format(date.day)]
    return '-'.join(parts) + '%'

## common exclusion SQL
#
# Both fragments start with "AND" so they can be dropped straight into the WHERE
# clause of a query template via str.format().
#
# event_exclusion_sql excludes event rows that either:
# - have a page or revision ID of zero (pages not yet created as per bmansurov https://phabricator.wikimedia.org/T213969#4998281)
# - are 'extClick' events on an internal link improperly coded as external as per bmansurov https://phabricator.wikimedia.org/T213969#5003710
event_exclusion_sql = """
AND (citationusage.event.page_id = 0 OR citationusage.event.revision_id = 0) = FALSE
AND (citationusage.event.action = 'extClick' AND 
    (citationusage.event.link_url LIKE 'https://en.wikipedia.org%' 
    OR citationusage.event.link_url LIKE 'https://en.m.wikipedia.org%')) = FALSE
"""
# pageload_exclusion_sql excludes pageload rows that:
# - have a page or revision ID of zero (pages not yet created as per bmansurov https://phabricator.wikimedia.org/T213969#4998281)
pageload_exclusion_sql = """
AND (citationusagepageload.event.page_id = 0 OR citationusagepageload.event.revision_id = 0) = FALSE
"""


## Citation Usage Overview

In [11]:
# show citationusage events by date and type
#
# Template slots, in order: exclusion SQL fragment, first date, last date.
# Both date bounds are inclusive (>= and <=).
events_query = """
SELECT to_date(dt) date, event.action, COUNT(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
GROUP BY to_date(dt), event.action
ORDER BY to_date(dt)
"""

# BUG FIX: the first argument was misspelled `event_exlusion_sql`, a name that
# is never defined, so this cell raised NameError. The fragment defined in the
# data-defaults cell is `event_exclusion_sql`.
events = spark.sql(events_query.format(event_exclusion_sql, start_date_string, end_date_string))
# Round-trip through the RDD to rebuild a DataFrame, then collect to pandas
# (same pattern as the other query cells in this notebook).
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
events_pandas = events_df.toPandas()

In [12]:
# one row per date, one column per event action; cells hold the event count
events_pandas.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-21,441641,221977,310889,91402
2019-03-22,1564276,725377,1131859,149622
2019-03-23,1369397,667724,883147,20342
2019-03-24,1509326,750329,1018411,23969
2019-03-25,1744801,782508,1365524,82990
2019-03-26,1701660,737057,1342285,28909
2019-03-27,1666356,728824,1321898,27761
2019-03-28,1611850,705904,1267787,25736
2019-03-29,1498888,666005,1122572,23467
2019-03-30,1336032,649300,865928,19934


## Pageload Data
#### source: event.citationusagepageload

In [14]:
# basic pageloads overview
#
# Template slots, in order: exclusion SQL fragment, first date, last date.
# Both date bounds are inclusive (>= and <=).
pageloads_query = """
SELECT to_date(dt) date, event.action, count(*) count
FROM event.citationusagepageload
WHERE wiki = 'enwiki'
{}
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
GROUP BY to_date(dt), event.action
ORDER BY to_date(dt)
"""

# run the query, rebuild a DataFrame from its RDD, then collect to pandas
pageloads = spark.sql(pageloads_query.format(pageload_exclusion_sql,start_date_string, end_date_string))
pageloads_rdd = pageloads.rdd
pageloads_df = sqlContext.createDataFrame(pageloads_rdd)
pageloads_pandas = pageloads_df.toPandas()


In [15]:
# one row per date, one column per action; cells hold the pageload count
pageloads_pandas.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-03-21,14252491
2019-03-22,52351262
2019-03-23,51534585
2019-03-24,56785088
2019-03-25,58238950
2019-03-26,55863494
2019-03-27,54945634
2019-03-28,53862676
2019-03-29,51350113
2019-03-30,49709557


### data refresh required
The queries below use data from daily SQL queries whose results are then imported into Hive. Tables that require refreshing, and the date each was last refreshed:
- ryanmax.pages_with_extlinks: 2019-04-15
- ryanmax.projmed_categories: 2019-04-12
- ryanmax.projmed_with_extlinks: 2019-04-15

In [16]:
# daily count of pageloads of WP:M pages with external links
## verified against day-at-a-time version of same query
#
# Each pageload is joined to the projmed_with_extlinks row that has the same
# page_id and the same calendar date.
# Template slots, in order: exclusion SQL fragment, first date, last date (both inclusive).
# NOTE(review): if projmed_with_extlinks can hold more than one row per
# (page_id, date), the join would count a pageload once per matching row --
# confirm the table is unique on that pair.
pgload_wpm_extl_query = """
SELECT to_date(citationusagepageload.dt) date, event.action, count(*) count
FROM event.citationusagepageload, ryanmax.projmed_with_extlinks
WHERE wiki = 'enwiki'
AND event.page_id = projmed_with_extlinks.page_id
AND to_date(citationusagepageload.dt) = to_date(projmed_with_extlinks.dt)
{}
AND to_date(citationusagepageload.dt) >= '{}'
AND to_date(citationusagepageload.dt) <= '{}'
AND useragent.is_bot = FALSE
GROUP BY to_date(citationusagepageload.dt), event.action
ORDER BY to_date(citationusagepageload.dt)
"""

# run the query, rebuild a DataFrame from its RDD, then collect to pandas
pgload_wpm_extl = spark.sql(pgload_wpm_extl_query.format(pageload_exclusion_sql,start_date_string, end_date_string))
pgload_wpm_extl_rdd = pgload_wpm_extl.rdd
pgload_wpm_extl_df = sqlContext.createDataFrame(pgload_wpm_extl_rdd)
pgload_wpm_extl_pandas = pgload_wpm_extl_df.toPandas()

In [18]:
# one row per date, one column per action; cells hold the pageload count
pgload_wpm_extl_pandas.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-03-27,1625817
2019-03-28,1562904
2019-03-29,1409126
2019-03-30,1204096
2019-03-31,1282419
2019-04-01,1580307
2019-04-02,1614142
2019-04-03,1582776
2019-04-04,1574177
2019-04-05,1438065


In [19]:
# daily count of pageloads of all pages with external links
## verified against day-at-a-time version of same query
#
# Each pageload is joined to the pages_with_extlinks row that has the same
# page_id and the same calendar date.
# Template slots, in order: exclusion SQL fragment, first date, last date (both inclusive).
# NOTE(review): if pages_with_extlinks can hold more than one row per
# (page_id, date), the join would count a pageload once per matching row --
# confirm the table is unique on that pair.
pgload_extl_query = """
SELECT to_date(citationusagepageload.dt) date, event.action, count(*) count
FROM event.citationusagepageload, ryanmax.pages_with_extlinks
WHERE wiki = 'enwiki'
AND event.page_id = pages_with_extlinks.page_id
AND to_date(citationusagepageload.dt) = to_date(pages_with_extlinks.dt)
{}
AND to_date(citationusagepageload.dt) >= '{}'
AND to_date(citationusagepageload.dt) <= '{}'
AND useragent.is_bot = FALSE
GROUP BY to_date(citationusagepageload.dt), event.action
ORDER BY to_date(citationusagepageload.dt)
"""

# run the query, rebuild a DataFrame from its RDD, then collect to pandas
pgload_extl = spark.sql(pgload_extl_query.format(pageload_exclusion_sql,start_date_string, end_date_string))
pgload_extl_rdd = pgload_extl.rdd
pgload_extl_df = sqlContext.createDataFrame(pgload_extl_rdd)
pgload_extl_pandas = pgload_extl_df.toPandas()

In [20]:
# one row per date, one column per action; cells hold the pageload count
pgload_extl_pandas.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-03-29,50242532
2019-03-30,48728115
2019-03-31,54094252
2019-04-01,56399087
2019-04-02,54304069
2019-04-03,54024174
2019-04-04,53771778
2019-04-05,52672333
2019-04-06,50732149
2019-04-07,56048368


In [21]:
# top 1000 most visited WP:M pages with external links
#
# Counts pageloads per (page_id, title) over the inclusive date window and keeps
# the 1000 largest. Template slots, in order: exclusion SQL fragment, first
# date, last date.
# NOTE(review): the join to enwiki.page is an implicit inner join, so a page_id
# with no matching row in enwiki.page would be left out of the result rather
# than listed without a title -- confirm this is acceptable.
top1k_query = """
SELECT event.page_id, page.page_title as title, count(*) count
FROM event.citationusagepageload, enwiki.page
WHERE event.page_id = page.page_id
AND wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks)
AND useragent.is_bot = FALSE
{}
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
GROUP BY event.page_id, page.page_title
ORDER BY count(*) desc
LIMIT 1000
"""

# NOTE(review): `top1k_rdd` is assigned here but not used by any cell visible in this notebook.
top1k_rdd = sc.emptyRDD()
top1k = spark.sql(top1k_query.format(pageload_exclusion_sql, start_date_string, end_date_string))

# write top1k data to a table for later use
# (side effect: drops and recreates ryanmax.top1k_med every time this cell runs)
top1k.createOrReplaceTempView("temp_top1k")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top1k_med");
sqlContext.sql("CREATE TABLE ryanmax.top1k_med AS SELECT * FROM temp_top1k");

# display only the first 20 rows
top1k.limit(20).toPandas()

# NOTE: Pages may appear with a title of "None" because the page title data source used here is infrequently updated.
# Use the page_id to look up a title: https://en.wikipedia.org/?curid=56880920


Unnamed: 0,page_id,title,count
0,43573275,Elizabeth_Holmes,326878
1,41779862,Theranos,132637
2,18079,Leonardo_da_Vinci,105264
3,58911,Measles,98451
4,27546,Sexual_intercourse,96251
5,37556,Asperger_syndrome,85467
6,791546,Ketogenic_diet,82855
7,4501,Black_Death,79951
8,4488176,Munchausen_syndrome_by_proxy,79782
9,52135,Pneumonia,76293


## Event Data

#### source: event.citationusage (limited to sampled pageloads from citationusagepageload)

In [26]:
# daily count of events for WP:M pages with ext links over study period
# limited to events w/ pageload data
#
# The template is filled and run once per day. Slots, in order:
#   1. LIKE pattern ('YYYY-MM-DD%') selecting that day's projmed_with_extlinks rows
#   2. pageload exclusion SQL, then day, month, year for the session-token subquery
#   3. event exclusion SQL, then day, month, year for the outer query
sampled_wpm_events_query = """
SELECT to_date(dt) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent.is_bot = FALSE
    )
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY to_date(dt), event.action
ORDER BY to_date(dt)
"""

# Accumulate each day's result rows into one RDD.
# NOTE(review): daterange() stops before end_date, so end_date itself is not
# queried here, whereas the range-based queries in this notebook use
# `<= end_date_string` -- confirm which window is intended.
sampled_wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    sampled_daily_wpm_events = spark.sql(
        sampled_wpm_events_query.format(dt, pageload_exclusion_sql, d.day, d.month, d.year, 
                                        event_exclusion_sql, d.day, d.month, d.year))
    sampled_wpm_events_rdd = sampled_wpm_events_rdd.union(sampled_daily_wpm_events.rdd)

# build a single DataFrame from the combined rows and collect it to pandas
sampled_wpm_events_merged = sqlContext.createDataFrame(sampled_wpm_events_rdd)
sampled_wpm_events = sampled_wpm_events_merged.toPandas()

In [29]:
# daily count of events for NOT WP:M pages with ext links over study period
# limited to events w/ pageload data
#
# The template is filled and run once per day. Slots, in order:
#   1. LIKE pattern ('YYYY-MM-DD%') selecting that day's pages_with_extlinks rows
#   2. the same LIKE pattern selecting that day's projmed_with_extlinks rows (excluded)
#   3. pageload exclusion SQL, then day, month, year for the session-token subquery
#   4. event exclusion SQL, then day, month, year for the outer query
# NOTE(review): `NOT IN (subquery)` matches no rows at all if the subquery
# returns a NULL page_id -- confirm projmed_with_extlinks.page_id is never NULL.
sampled_not_wpm_events_query = """
SELECT to_date(dt) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND event.page_id NOT IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent.is_bot = FALSE
    )
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY to_date(dt), event.action
ORDER BY to_date(dt)
"""

# Accumulate each day's result rows into one RDD.
# NOTE(review): daterange() stops before end_date, so end_date itself is not
# queried here, whereas the range-based queries in this notebook use
# `<= end_date_string` -- confirm which window is intended.
sampled_not_wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    sampled_daily_not_wpm_events = spark.sql(
        sampled_not_wpm_events_query.format(dt, dt, 
                                            pageload_exclusion_sql, d.day, d.month, d.year,
                                            event_exclusion_sql, d.day, d.month, d.year))
    sampled_not_wpm_events_rdd = sampled_not_wpm_events_rdd.union(sampled_daily_not_wpm_events.rdd)

# build a single DataFrame from the combined rows and collect it to pandas
sampled_not_wpm_events_merged = sqlContext.createDataFrame(sampled_not_wpm_events_rdd)
sampled_not_wpm_events = sampled_not_wpm_events_merged.toPandas()

In [30]:
# daily count of events for all pages with ext links over study period
# limited to events w/ pageload data
#
# The template is filled and run once per day. Slots, in order:
#   1. LIKE pattern ('YYYY-MM-DD%') selecting that day's pages_with_extlinks rows
#   2. pageload exclusion SQL, then day, month, year for the session-token subquery
#   3. event exclusion SQL, then day, month, year for the outer query
sampled_events_query = """
SELECT to_date(dt) date, event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent.is_bot = FALSE
    )
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent.is_bot = FALSE
GROUP BY to_date(dt), event.action
ORDER BY to_date(dt)
"""

# Accumulate each day's result rows into one RDD.
# NOTE(review): daterange() stops before end_date, so end_date itself is not
# queried here, whereas the range-based queries in this notebook use
# `<= end_date_string` -- confirm which window is intended.
sampled_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    sampled_daily_events = spark.sql(
        sampled_events_query.format(dt, 
                                    pageload_exclusion_sql, d.day, d.month, d.year,
                                    event_exclusion_sql, d.day, d.month, d.year))
    sampled_events_rdd = sampled_events_rdd.union(sampled_daily_events.rdd)

# build a single DataFrame from the combined rows and collect it to pandas
sampled_events_merged = sqlContext.createDataFrame(sampled_events_rdd)
sampled_events = sampled_events_merged.toPandas()

#### Event Summaries (limited to sampled pageloads)

In [27]:
# WP:M pages with external links: total per event action, summed over all queried days
sampled_wpm_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,163929
fnClick,171781
fnHover,307654
upClick,5547


In [31]:
# NOT WP:M pages with external links: total per event action, summed over all queried days
sampled_not_wpm_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,9354609
fnClick,4286525
fnHover,6681416
upClick,154455


In [32]:
# all pages with external links: total per event action, summed over all queried days
sampled_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,9499087
fnClick,4438323
fnHover,6951743
upClick,159368


#### Event Daily Counts (limited to sampled pageloads)

In [28]:
# WP:M pages with external links: one row per date, one column per event action
sampled_wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-27,9692,10240,19020,307
2019-03-28,9759,9743,18307,327
2019-03-29,8396,8511,14972,313
2019-03-30,6233,7357,11734,220
2019-03-31,7290,8365,14018,253
2019-04-01,9703,9649,18655,524
2019-04-02,10126,9622,19334,291
2019-04-03,9573,9796,18947,253
2019-04-04,9620,9186,17143,287
2019-04-05,8267,8446,15222,228


In [33]:
# NOT WP:M pages with external links: one row per date, one column per event action
sampled_not_wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-29,491913,213567,351866,7586
2019-03-30,438044,208317,273162,6154
2019-03-31,480530,236050,314065,7503
2019-04-01,552810,248511,423350,18490
2019-04-02,550348,237537,423597,13549
2019-04-03,546818,234237,417267,8347
2019-04-04,539924,232588,408053,7906
2019-04-05,513166,223548,369539,7684
2019-04-06,457702,219654,277844,6284
2019-04-07,497672,247528,320200,7596


In [34]:
# all pages with external links: one row per date, one column per event action
sampled_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-29,500309,222078,366838,7899
2019-03-30,444277,215674,284896,6374
2019-03-31,487820,244415,328083,7756
2019-04-01,562513,258160,442005,19014
2019-04-02,560474,247159,442931,13840
2019-04-03,556391,244033,436214,8600
2019-04-04,549544,241774,425196,8193
2019-04-05,521433,231994,384761,7912
2019-04-06,464154,227149,289206,6530
2019-04-07,504983,256166,333950,7945


In [35]:
# Total event count for top 1000 viewed WP:M pages with ext links by event type
#
# Reads the page list from ryanmax.top1k_med, so that table must already exist
# (it is created by the top-1000 cell earlier in this notebook).
# Only events whose session_token also appears in a pageload within the same
# inclusive date window are counted.
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
sum_top1k_events_query = """
SELECT event.action, count(*) count
FROM event.citationusage
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT page_id FROM ryanmax.top1k_med)
{}
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    )
GROUP BY event.action
"""

sum_top1k_events = spark.sql(
    sum_top1k_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD, collect to pandas, and display it
sum_top1k_events_rdd = sum_top1k_events.rdd
sum_top1k_events_df = sqlContext.createDataFrame(sum_top1k_events_rdd)
sum_top1k_events_pandas = sum_top1k_events_df.toPandas()
sum_top1k_events_pandas

Unnamed: 0,action,count
0,fnHover,252154
1,fnClick,136034
2,upClick,8994
3,extClick,73721


In [36]:
# Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type
#
# The category filter is LIKE '%Class_medicine_articles%', so every category
# whose name contains that text is included, not only the classes listed above
# (note that `_` is a single-character wildcard in LIKE).
# NOTE(review): a page that belongs to more than one matching category would
# have its events counted once under each category -- confirm whether that can occur.
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
pm_category_events_query = """
SELECT projmed_categories.category, event.action, count(*) count
FROM 
    event.citationusage, 
    (SELECT DISTINCT page_id, category 
    FROM ryanmax.projmed_categories 
    WHERE projmed_categories.category LIKE '%Class_medicine_articles%') 
    AS projmed_categories
WHERE event.page_id = projmed_categories.page_id
    AND wiki = 'enwiki'
    AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks)
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    AND event.session_token in (
        SELECT event.session_token 
        FROM event.citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        AND useragent.is_bot = FALSE
        )
GROUP BY projmed_categories.category, event.action
ORDER BY projmed_categories.category, event.action
"""

pm_category_events = spark.sql(
    pm_category_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD and collect to pandas
pm_category_events_rdd = pm_category_events.rdd
pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd)
pm_category_events_pandas = pm_category_events_df.toPandas()
# set precision of count values so they don't appear with a decimal place ... likely an easier way to do this
# (this overwrites the `count` column in place with zero-decimal strings)
pm_category_events_pandas['count'] = pm_category_events_pandas['count'].map(lambda x: '{0:.0f}'.format(x))
# one row per category, one column per event action
pm_category_events_pandas.pivot(index='category', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B-Class_medicine_articles,64780,108742,200929,6515.0
C-Class_medicine_articles,72021,81224,139554,6682.0
Category-Class_medicine_articles,5,3,10,
Disambig-Class_medicine_articles,55,8,9,1.0
FA-Class_medicine_articles,4784,9641,23282,276.0
FL-Class_medicine_articles,144,309,465,16.0
GA-Class_medicine_articles,8850,21617,46466,2551.0
List-Class_medicine_articles,4165,3569,5896,50.0
Redirect-Class_medicine_articles,47,38,43,1.0
Start-Class_medicine_articles,97977,64066,95878,2261.0


In [7]:
# count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)
# numbers will not match [1] because we're limiting to namespace 0 pages with external links
# [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics
#
# The category filter is LIKE '%Class_medicine_articles%', so every category
# whose name contains that text is included, not only the classes listed above.
# NOTE(review): rows are ordered by COUNT(*) (row count per category), not by
# the COUNT(DISTINCT page_id) value that is displayed -- the two orderings
# agree only if page_id is unique within a category.
pm_category_pages = """
SELECT category, COUNT(DISTINCT page_id) AS pages_w_links 
FROM ryanmax.projmed_categories 
WHERE category LIKE '%Class_medicine_articles%' 
AND page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks)
GROUP BY category
ORDER BY COUNT(*) DESC
"""
# run the query, rebuild a DataFrame from its RDD, and display it as pandas
pm_cat_counts = spark.sql(pm_category_pages)
cats = sqlContext.createDataFrame(pm_cat_counts.rdd)
cats.toPandas()

Unnamed: 0,category,pages_w_links
0,Start-Class_medicine_articles,14493
1,Stub-Class_medicine_articles,9824
2,C-Class_medicine_articles,5308
3,B-Class_medicine_articles,2166
4,List-Class_medicine_articles,456
5,GA-Class_medicine_articles,240
6,FA-Class_medicine_articles,62
7,Disambig-Class_medicine_articles,17
8,Redirect-Class_medicine_articles,14
9,FL-Class_medicine_articles,12


In [4]:
# Total event count for top 1K hostnames (e.g., DOI.org / ncbi…) by event type
# limited to W pages with external links
# NOTE: limited to 4k which should produce more data than necessary
#
# Each result row is a (host, action) pair, so the LIMIT of 4000 applies to
# pairs rather than to distinct hostnames.
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
top_hosts_query = """
SELECT parse_url(event.link_url,'HOST') AS host, event.action, COUNT(*) AS count 
FROM event.citationusage 
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks)
{}
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    )
GROUP BY host, event.action
ORDER BY COUNT(*) DESC
LIMIT 4000
"""
top_hosts_events = spark.sql(
    top_hosts_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD and collect to pandas
top_hosts_events_rdd = top_hosts_events.rdd
top_hosts_events_df = sqlContext.createDataFrame(top_hosts_events_rdd)
top_hosts_events_pandas = top_hosts_events_df.toPandas()

# write data to hive for later use in R
# (side effect: drops and recreates ryanmax.top_hosts_w_events every time this cell runs)
top_hosts_events.createOrReplaceTempView("temp_top_hosts_w_events")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top_hosts_w_events");
sqlContext.sql("CREATE TABLE ryanmax.top_hosts_w_events AS SELECT * FROM temp_top_hosts_w_events");

# display the first 20 rows (taken from the Spark DataFrame, not from top_hosts_events_pandas)
top_hosts_events.limit(20).toPandas()


Unnamed: 0,host,action,count
0,en.wikipedia.org,fnHover,11727961
1,en.m.wikipedia.org,fnClick,5334434
2,en.wikipedia.org,fnClick,2609309
3,www.imdb.com,extClick,1021641
4,en.m.wikipedia.org,fnHover,700456
5,web.archive.org,extClick,680688
6,tools.wmflabs.org,extClick,356619
7,en.wikipedia.org,upClick,355358
8,www.youtube.com,extClick,201009
9,books.google.com,extClick,197913


In [None]:
# Total event count for top 1K hostnames (e.g., DOI.org / ncbi…) by event type
# limited to W:PM pages with external links
# NOTE: limited to 4k which should produce more data than necessary
#
# Each result row is a (host, action) pair, so the LIMIT of 4000 applies to
# pairs rather than to distinct hostnames.
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
top_hosts_wpm_query = """
SELECT parse_url(event.link_url,'HOST') AS host, event.action, COUNT(*) AS count 
FROM event.citationusage 
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks)
{}
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    )
GROUP BY host, event.action
ORDER BY COUNT(*) DESC
LIMIT 4000
"""
top_hosts_wpm_events = spark.sql(
    top_hosts_wpm_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD and collect to pandas
top_hosts_wpm_events_rdd = top_hosts_wpm_events.rdd
top_hosts_wpm_events_df = sqlContext.createDataFrame(top_hosts_wpm_events_rdd)
top_hosts_wpm_events_pandas = top_hosts_wpm_events_df.toPandas()

# write data to hive for later use in R
# (side effect: drops and recreates ryanmax.top_hosts_wpm_events every time this cell runs)
top_hosts_wpm_events.createOrReplaceTempView("temp_top_hosts_wpm_events")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top_hosts_wpm_events");
sqlContext.sql("CREATE TABLE ryanmax.top_hosts_wpm_events AS SELECT * FROM temp_top_hosts_wpm_events");

# display the first 20 rows (taken from the Spark DataFrame, not from top_hosts_wpm_events_pandas)
top_hosts_wpm_events.limit(20).toPandas()



Unnamed: 0,host,action,count
0,en.wikipedia.org,fnHover,486934
1,en.m.wikipedia.org,fnClick,176709
2,en.wikipedia.org,fnClick,116782
3,en.m.wikipedia.org,fnHover,30211
4,www.ncbi.nlm.nih.gov,extClick,28856
5,doi.org,extClick,25016
6,web.archive.org,extClick,17508
7,en.wikipedia.org,upClick,16083
8,books.google.com,extClick,6387
9,www.drugs.com,extClick,5391


In [4]:
# Total event count for top 1K hostnames (e.g., DOI.org / ncbi…) by event type
# limited to NOT W:PM pages with external links
# NOTE: limited to 4k which should produce more data than necessary
#
# Each result row is a (host, action) pair, so the LIMIT of 4000 applies to
# pairs rather than to distinct hostnames.
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
# NOTE(review): `NOT IN (subquery)` matches no rows at all if the subquery
# returns a NULL page_id -- confirm projmed_with_extlinks.page_id is never NULL.
top_hosts_notwpm_query = """
SELECT parse_url(event.link_url,'HOST') AS host, event.action, COUNT(*) AS count 
FROM event.citationusage 
WHERE wiki = 'enwiki'
AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks)
AND event.page_id NOT IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks)
{}
AND to_date(dt) >= '{}'
AND to_date(dt) <= '{}'
AND useragent.is_bot = FALSE
AND event.session_token in (
    SELECT event.session_token 
    FROM event.citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    )
GROUP BY host, event.action
ORDER BY COUNT(*) DESC
LIMIT 4000
"""
top_hosts_notwpm_events = spark.sql(
    top_hosts_notwpm_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD and collect to pandas
top_hosts_notwpm_events_rdd = top_hosts_notwpm_events.rdd
top_hosts_notwpm_events_df = sqlContext.createDataFrame(top_hosts_notwpm_events_rdd)
top_hosts_notwpm_events_pandas = top_hosts_notwpm_events_df.toPandas()

# write data to hive for later use in R
# (side effect: drops and recreates ryanmax.top_hosts_notwpm_events every time this cell runs)
top_hosts_notwpm_events.createOrReplaceTempView("temp_top_hosts_notwpm_events")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top_hosts_notwpm_events");
sqlContext.sql("CREATE TABLE ryanmax.top_hosts_notwpm_events AS SELECT * FROM temp_top_hosts_notwpm_events");

# display the first 20 rows (taken from the Spark DataFrame, not from top_hosts_notwpm_events_pandas)
top_hosts_notwpm_events.limit(20).toPandas()


Unnamed: 0,host,action,count
0,en.wikipedia.org,fnHover,11361299
1,en.m.wikipedia.org,fnClick,5212881
2,en.wikipedia.org,fnClick,2521723
3,www.imdb.com,extClick,1032661
4,en.m.wikipedia.org,fnHover,676892
5,web.archive.org,extClick,669861
6,tools.wmflabs.org,extClick,359762
7,en.wikipedia.org,upClick,343324
8,www.youtube.com,extClick,201590
9,books.google.com,extClick,193475


In [4]:
# Total count of events (by all event types) for each Section ID for WP:M pages only
## data capture issue: data is NOT limited to top-level section IDs
## example: https://en.wikipedia.org/wiki/Hepatitis#Signs_and_symptoms
## clicks on links under "Acute hepatitis" are captured with section_id Acute_hepatitis, not Signs_and_symptoms
#
# Returns only the 100 largest (section_id, action) pairs by count.
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
pm_section_events_query = """
SELECT event.section_id, event.action, count(*) count
FROM 
    event.citationusage 
WHERE event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks)
    AND wiki = 'enwiki'
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    AND event.session_token in (
        SELECT event.session_token 
        FROM event.citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        AND useragent.is_bot = FALSE
        )
GROUP BY event.section_id, event.action
ORDER BY count desc
LIMIT 100
"""

pm_section_events = spark.sql(
    pm_section_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD, collect to pandas, and display it
pm_section_events_rdd = pm_section_events.rdd
pm_section_events_df = sqlContext.createDataFrame(pm_section_events_rdd)
pm_section_events_pandas = pm_section_events_df.toPandas()
pm_section_events_pandas
#pm_section_events_pandas.pivot(index='section_id', columns='action', values='count')

Unnamed: 0,section_id,action,count
0,,fnHover,174550
1,References,extClick,163066
2,,fnClick,120610
3,,extClick,48624
4,External_links,extClick,38400
5,References,upClick,16553
6,History,fnHover,16146
7,Signs_and_symptoms,fnHover,10174
8,History,fnClick,7959
9,Causes,fnHover,6218


In [3]:
# Total count of events (by all event types) for WP:M pages only, split by
# event.in_infobox and event action.
# in_infobox = True rows are events inside InfoBoxes; the False rows are
# presumably the "Main section" events -- TODO confirm.
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
pm_infobox_events_query = """
SELECT event.in_infobox, event.action, count(*) count
FROM 
    event.citationusage 
WHERE wiki = 'enwiki'
    AND event.page_id IN (SELECT page_id FROM ryanmax.projmed_with_extlinks)
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    AND event.session_token in (
        SELECT event.session_token 
        FROM event.citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        AND useragent.is_bot = FALSE
        )
GROUP BY event.in_infobox, event.action
"""

pm_infobox_events = spark.sql(
    pm_infobox_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD, collect to pandas, and display it
pm_infobox_events_rdd = pm_infobox_events.rdd
pm_infobox_events_df = sqlContext.createDataFrame(pm_infobox_events_rdd)
pm_infobox_events_pandas = pm_infobox_events_df.toPandas()
pm_infobox_events_pandas

Unnamed: 0,in_infobox,action,count
0,False,fnHover,513866
1,False,extClick,230772
2,True,extClick,48239
3,False,fnClick,273298
4,False,upClick,17459
5,True,fnClick,23620
6,True,fnHover,8852


In [4]:
# Total count of extClick events, split by event.freely_accessible, for all
# pages with external links (W)
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
freely_events_query = """
SELECT event.freely_accessible, event.action, count(*) count
FROM 
    event.citationusage 
WHERE wiki = 'enwiki'
    AND event.action = 'extClick'
    AND event.page_id IN (SELECT page_id FROM ryanmax.pages_with_extlinks)
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    AND event.session_token in (
        SELECT event.session_token 
        FROM event.citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        AND useragent.is_bot = FALSE
        )
GROUP BY event.freely_accessible, event.action
"""

freely_events = spark.sql(
    freely_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD, collect to pandas, and display it
freely_events_rdd = freely_events.rdd
freely_events_df = sqlContext.createDataFrame(freely_events_rdd)
freely_events_pandas = freely_events_df.toPandas()
freely_events_pandas

Unnamed: 0,freely_accessible,action,count
0,False,extClick,17088312
1,True,extClick,12616


In [None]:
# Total count of extClick events, split by event.freely_accessible, for WP:M
# pages with external links
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
pm_freely_events_query = """
SELECT event.freely_accessible, event.action, count(*) count
FROM 
    event.citationusage 
WHERE wiki = 'enwiki'
    AND event.action = 'extClick'
    AND event.page_id IN (SELECT page_id FROM ryanmax.projmed_with_extlinks)
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    AND event.session_token in (
        SELECT event.session_token 
        FROM event.citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        AND useragent.is_bot = FALSE
        )
GROUP BY event.freely_accessible, event.action
"""

pm_freely_events = spark.sql(
    pm_freely_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD, collect to pandas, and display it
pm_freely_events_rdd = pm_freely_events.rdd
pm_freely_events_df = sqlContext.createDataFrame(pm_freely_events_rdd)
pm_freely_events_pandas = pm_freely_events_df.toPandas()
pm_freely_events_pandas

Unnamed: 0,freely_accessible,action,count
0,False,extClick,278151
1,True,extClick,860


In [3]:
# Total count of extClick events, split by event.freely_accessible, for NOT WP:M
# pages with external links (pages in pages_with_extlinks that are not in
# projmed_with_extlinks)
# Template slots, in order: event exclusion SQL, first date, last date
# (outer query); pageload exclusion SQL, first date, last date (subquery).
# NOTE(review): `NOT IN (subquery)` matches no rows at all if the subquery
# returns a NULL page_id -- confirm projmed_with_extlinks.page_id is never NULL.
not_pm_freely_events_query = """
SELECT event.freely_accessible, event.action, count(*) count
FROM 
    event.citationusage 
WHERE wiki = 'enwiki'
    AND event.action = 'extClick'
    AND event.page_id IN (SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks)
    AND event.page_id NOT IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks)
    {}
    AND to_date(dt) >= '{}'
    AND to_date(dt) <= '{}'
    AND useragent.is_bot = FALSE
    AND event.session_token in (
        SELECT event.session_token 
        FROM event.citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        AND useragent.is_bot = FALSE
        )
GROUP BY event.freely_accessible, event.action
"""

not_pm_freely_events = spark.sql(
    not_pm_freely_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# rebuild a DataFrame from the result's RDD, collect to pandas, and display it
not_pm_freely_events_rdd = not_pm_freely_events.rdd
not_pm_freely_events_df = sqlContext.createDataFrame(not_pm_freely_events_rdd)
not_pm_freely_events_pandas = not_pm_freely_events_df.toPandas()
not_pm_freely_events_pandas

Unnamed: 0,freely_accessible,action,count
0,False,extClick,16810163
1,True,extClick,11756
