## Pageload & Event Queries
- v. 1.0.0 2019-04-08: basic pageload and event data
- v. 1.0.1 2019-04-17: add category, hostname queries; improved date handling; remove unsampled event queries
- v. 1.0.2 2019-04-23: exclude miscoded events
- v. 1.0.3 2019-04-24: updated through study period close
- v. 1.0.4 2019-04-26: refreshed section query and added pointer to section ID notebook
- v. 1.0.5 2019-05-09: study period set to 2019-03-29 to 2019-04-22
- v. 1.0.6 2019-06-01: re-run against anonymized data
- v. 1.0.7 2019-06-14: add time-to-event queries
- v. 1.0.8 2019-07-26: add standard deviation and interquartile range to time-to-event queries


In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# The -i flag runs the script in this notebook's own namespace, so the names it defines
# stay available to every later cell.
# NOTE(review): later cells use spark / sc / sqlContext, start_date / end_date,
# start_date_string / end_date_string, event_exclusion_sql, pageload_exclusion_sql,
# daterange() and date_to_dt() without defining them -- presumably they all come from
# this script (or the kernel); confirm before running on a fresh kernel.
%run -i 'data-defaults.py'

## Citation Usage Overview

In [4]:
# show citationusage events by date and type
# Template slots, in order: extra exclusion SQL, first day (inclusive), last day (inclusive).
# Only enwiki rows not flagged as bots are counted.

events_query = """
SELECT to_date(event_time) date, action, COUNT(*) count
FROM citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), action
ORDER BY to_date(event_time)
"""

# Fill the template with the shared event exclusions and the study-period bounds, then run it.
events = spark.sql(events_query.format(event_exclusion_sql,start_date_string, end_date_string))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
events_pandas = events_df.toPandas()

In [5]:
# Reshape to one row per date and one column per action; cell values are the daily counts.
events_pandas.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-29,1498888,666005,1122572,23467
2019-03-30,1336032,649300,865928,19934
2019-03-31,1458753,736367,999087,23209
2019-04-01,1696205,786290,1347917,64128
2019-04-02,1682662,747037,1347737,40231
2019-04-03,1670454,738633,1388282,27035
2019-04-04,1646892,729689,1295965,25781
2019-04-05,1569885,698560,1167598,23560
2019-04-06,1384602,686494,884076,20321
2019-04-07,1509870,774512,1017164,25706


## Pageload Data
#### source: citationusagepageload

In [6]:
# basic pageloads overview
# Daily row counts from citationusagepageload, grouped by action.
# Template slots, in order: extra exclusion SQL, first day (inclusive), last day (inclusive).
pageloads_query = """
SELECT to_date(event_time) date, action, count(*) count
FROM citationusagepageload
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), action
ORDER BY to_date(event_time)
"""

# Fill the template with the shared pageload exclusions and the study-period bounds, then run it.
pageloads = spark.sql(pageloads_query.format(pageload_exclusion_sql,start_date_string, end_date_string))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
pageloads_rdd = pageloads.rdd
pageloads_df = sqlContext.createDataFrame(pageloads_rdd)
pageloads_pandas = pageloads_df.toPandas()


In [7]:
# Reshape to one row per date and one column per action; cell values are the daily counts.
pageloads_pandas.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-03-29,51320562
2019-03-30,49684008
2019-03-31,55126933
2019-04-01,57571733
2019-04-02,55483014
2019-04-03,55199994
2019-04-04,54940584
2019-04-05,53775886
2019-04-06,51721813
2019-04-07,57142779


### data refresh required
The queries below use data from daily SQL queries that are then imported into Hive. Tables that require refreshing, and the date each was last refreshed:
- ryanmax.pages_with_extlinks: 2019-04-24
- ryanmax.projmed_categories: 2019-04-24
- ryanmax.projmed_with_extlinks: 2019-04-24

In [8]:
# daily count of pageloads of WP:M pages with external links
## verified against day-at-a-time version of same query
# A pageload is counted only when ryanmax.projmed_with_extlinks has a row with the same
# page_id whose dt falls on the same calendar day as the pageload's event_time.
# Template slots, in order: extra exclusion SQL, first day (inclusive), last day (inclusive).
pgload_wpm_extl_query = """
SELECT to_date(citationusagepageload.event_time) date, action, count(*) count
FROM citationusagepageload, ryanmax.projmed_with_extlinks
WHERE wiki = 'enwiki'
AND citationusagepageload.page_id = projmed_with_extlinks.page_id
AND to_date(citationusagepageload.event_time) = to_date(projmed_with_extlinks.dt)
{}
AND to_date(citationusagepageload.event_time) >= '{}'
AND to_date(citationusagepageload.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY to_date(citationusagepageload.event_time), action
ORDER BY to_date(citationusagepageload.event_time)
"""

# Fill the template with the shared pageload exclusions and the study-period bounds, then run it.
pgload_wpm_extl = spark.sql(pgload_wpm_extl_query.format(pageload_exclusion_sql,start_date_string, end_date_string))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
pgload_wpm_extl_rdd = pgload_wpm_extl.rdd
pgload_wpm_extl_df = sqlContext.createDataFrame(pgload_wpm_extl_rdd)
pgload_wpm_extl_pandas = pgload_wpm_extl_df.toPandas()

In [9]:
# Reshape to one row per date and one column per action; cell values are the daily counts.
pgload_wpm_extl_pandas.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-03-29,1408875
2019-03-30,1204016
2019-03-31,1282270
2019-04-01,1580067
2019-04-02,1613902
2019-04-03,1582605
2019-04-04,1573991
2019-04-05,1437823
2019-04-06,1217936
2019-04-07,1324835


In [10]:
# daily count of pageloads of all pages with external links
## verified against day-at-a-time version of same query
# A pageload is counted only when ryanmax.pages_with_extlinks has a row with the same
# page_id whose dt falls on the same calendar day as the pageload's event_time.
# Template slots, in order: extra exclusion SQL, first day (inclusive), last day (inclusive).
pgload_extl_query = """
SELECT to_date(citationusagepageload.event_time) date, action, count(*) count
FROM citationusagepageload, ryanmax.pages_with_extlinks
WHERE wiki = 'enwiki'
AND citationusagepageload.page_id = pages_with_extlinks.page_id
AND to_date(citationusagepageload.event_time) = to_date(pages_with_extlinks.dt)
{}
AND to_date(citationusagepageload.event_time) >= '{}'
AND to_date(citationusagepageload.event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY to_date(citationusagepageload.event_time), action
ORDER BY to_date(citationusagepageload.event_time)
"""

# Fill the template with the shared pageload exclusions and the study-period bounds, then run it.
pgload_extl = spark.sql(pgload_extl_query.format(pageload_exclusion_sql,start_date_string, end_date_string))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
pgload_extl_rdd = pgload_extl.rdd
pgload_extl_df = sqlContext.createDataFrame(pgload_extl_rdd)
pgload_extl_pandas = pgload_extl_df.toPandas()

In [11]:
# Reshape to one row per date and one column per action; cell values are the daily counts.
pgload_extl_pandas.pivot(index='date', columns='action', values='count')

action,pageLoad
date,Unnamed: 1_level_1
2019-03-29,50213939
2019-03-30,48703397
2019-03-31,54066113
2019-04-01,56369207
2019-04-02,54275348
2019-04-03,53993951
2019-04-04,53741696
2019-04-05,52642685
2019-04-06,50701812
2019-04-07,56020786


In [12]:
# top 1000 most visited WP:M pages with external links
# Counts citationusagepageload rows per page, joined to enwiki.page for the title, and keeps
# the 1000 pages with the highest counts. A page qualifies if it appears in
# ryanmax.projmed_with_extlinks on any day within the date bounds.
# Template slots, in order: first/last day for the projmed_with_extlinks subquery,
# extra pageload exclusion SQL, first/last day for event_time (all bounds inclusive).
top1k_query = """
SELECT citationusagepageload.page_id, page.page_title as title, count(*) count
FROM citationusagepageload, enwiki.page
WHERE citationusagepageload.page_id = page.page_id
AND wiki = 'enwiki'
AND citationusagepageload.page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
AND useragent_is_bot = FALSE
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
GROUP BY citationusagepageload.page_id, page.page_title
ORDER BY count(*) desc
LIMIT 1000
"""

# NOTE(review): top1k_rdd is not referenced again in this cell -- confirm whether any
# later cell still needs it.
top1k_rdd = sc.emptyRDD()
top1k = spark.sql(top1k_query.format(start_date_string, end_date_string, pageload_exclusion_sql, start_date_string, end_date_string))

# Preview only the first 20 rows; the full result stays in the Spark DataFrame `top1k`.
top1k.limit(20).toPandas()

# NOTE: Pages may appear with a title of "None" because the page title data source used here is infrequently updated.
# Use the page_id to look up a title: https://en.wikipedia.org/?curid=56880920


Unnamed: 0,page_id,title,count
0,43573275,Elizabeth_Holmes,180826
1,18079,Leonardo_da_Vinci,85207
2,58911,Measles,79840
3,41779862,Theranos,76707
4,27546,Sexual_intercourse,75459
5,7188999,Wiggers_diagram,72130
6,37556,Asperger_syndrome,67746
7,791546,Ketogenic_diet,66024
8,4501,Black_Death,60670
9,52135,Pneumonia,59182


In [13]:
# write top1k data to a table for later use in most-visited-WPM-pages.ipynb
# Register the query result as a temp view so it can be selected from in SQL, then
# drop and recreate ryanmax.top1k_med_anon from it (any previous contents are replaced).
top1k.createOrReplaceTempView("temp_top1k")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top1k_med_anon")
sqlContext.sql("CREATE TABLE ryanmax.top1k_med_anon AS SELECT * FROM temp_top1k")
# total pageload counts for these pages: 14908141
# select sum(count) from ryanmax.top1k_med_anon

DataFrame[]

## Event Data

#### source: citationusage (limited to sampled pageloads from citationusagepageload)

In [14]:
# daily count of events for WP:M pages with ext links over study period
# limited to events w/ pageload data
# The query covers a single day and is run once per day of the study period.
# Template slots, in order:
#   1. dt pattern matched with LIKE against ryanmax.projmed_with_extlinks.dt
#   2. extra pageload exclusion SQL, then day, month, year for the pageload subquery
#   3. extra event exclusion SQL, then day, month, year for the citationusage rows
# An event is kept only if its session_id also appears in that day's citationusagepageload rows.
sampled_wpm_events_query = """
SELECT to_date(event_time) date, action, count(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent_is_bot = FALSE
    )
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), action
ORDER BY to_date(event_time)
"""

# Start from an empty RDD and append each day's result rows to it.
sampled_wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    # NOTE(review): date_to_dt is defined outside this cell; presumably it produces the
    # string form used by the dt column -- confirm.
    dt = date_to_dt(d)
    sampled_daily_wpm_events = spark.sql(
        sampled_wpm_events_query.format(dt, pageload_exclusion_sql, d.day, d.month, d.year, 
                                        event_exclusion_sql, d.day, d.month, d.year))
    sampled_wpm_events_rdd = sampled_wpm_events_rdd.union(sampled_daily_wpm_events.rdd)

# Turn the accumulated rows back into a DataFrame and collect them into pandas.
sampled_wpm_events_merged = sqlContext.createDataFrame(sampled_wpm_events_rdd)
sampled_wpm_events = sampled_wpm_events_merged.toPandas()
# summary and daily counts appear below

In [15]:
# daily count of events for NOT WP:M pages with ext links over study period
# limited to events w/ pageload data
# The query covers a single day and is run once per day of the study period.
# Template slots, in order:
#   1. dt pattern for ryanmax.pages_with_extlinks (pages to include)
#   2. dt pattern for ryanmax.projmed_with_extlinks (pages to exclude)
#   3. extra pageload exclusion SQL, then day, month, year for the pageload subquery
#   4. extra event exclusion SQL, then day, month, year for the citationusage rows
# An event is kept only if its session_id also appears in that day's citationusagepageload rows.
sampled_not_wpm_events_query = """
SELECT to_date(event_time) date, action, count(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND page_id NOT IN (SELECT DISTINCT page_id FROM ryanmax.projmed_with_extlinks WHERE dt LIKE '{}')
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent_is_bot = FALSE
    )
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), action
ORDER BY to_date(event_time)
"""

# Start from an empty RDD and append each day's result rows to it.
sampled_not_wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    # The same dt value fills both the include and the exclude subquery.
    dt = date_to_dt(d)
    sampled_daily_not_wpm_events = spark.sql(
        sampled_not_wpm_events_query.format(dt, dt, 
                                            pageload_exclusion_sql, d.day, d.month, d.year,
                                            event_exclusion_sql, d.day, d.month, d.year))
    sampled_not_wpm_events_rdd = sampled_not_wpm_events_rdd.union(sampled_daily_not_wpm_events.rdd)

# Turn the accumulated rows back into a DataFrame and collect them into pandas.
sampled_not_wpm_events_merged = sqlContext.createDataFrame(sampled_not_wpm_events_rdd)
sampled_not_wpm_events = sampled_not_wpm_events_merged.toPandas()
# summary and daily counts appear below

In [16]:
# daily count of events for all pages with ext links over study period
# limited to events w/ pageload data
# The query covers a single day and is run once per day of the study period.
# Template slots, in order:
#   1. dt pattern matched with LIKE against ryanmax.pages_with_extlinks.dt
#   2. extra pageload exclusion SQL, then day, month, year for the pageload subquery
#   3. extra event exclusion SQL, then day, month, year for the citationusage rows
# An event is kept only if its session_id also appears in that day's citationusagepageload rows.
sampled_events_query = """
SELECT to_date(event_time) date, action, count(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks WHERE dt LIKE '{}')
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND day = {}
    AND month = {}
    AND year = {}
    AND useragent_is_bot = FALSE
    )
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), action
ORDER BY to_date(event_time)
"""

# Start from an empty RDD and append each day's result rows to it.
sampled_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    dt = date_to_dt(d)
    sampled_daily_events = spark.sql(
        sampled_events_query.format(dt, 
                                    pageload_exclusion_sql, d.day, d.month, d.year,
                                    event_exclusion_sql, d.day, d.month, d.year))
    sampled_events_rdd = sampled_events_rdd.union(sampled_daily_events.rdd)

# Turn the accumulated rows back into a DataFrame and collect them into pandas.
sampled_events_merged = sqlContext.createDataFrame(sampled_events_rdd)
sampled_events = sampled_events_merged.toPandas()
# summary and daily counts appear below

#### Event Summaries (limited to sampled pageloads)

In [17]:
# WP:M pages with external links
# Sum the per-day rows into one total per action.
sampled_wpm_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,211372
fnClick,226332
fnHover,389094
upClick,12528


In [18]:
# NOT WP:M pages with external links
# Sum the per-day rows into one total per action.
sampled_not_wpm_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,12965475
fnClick,6026747
fnHover,9205508
upClick,248972


In [19]:
# all pages with external links
# Sum the per-day rows into one total per action.
sampled_events.groupby(['action']).sum()

Unnamed: 0_level_0,count
action,Unnamed: 1_level_1
extClick,13176847
fnClick,6253079
fnHover,9594602
upClick,261500


#### Event Daily Counts (limited to sampled pageloads)

In [20]:
# WP:M pages with external links
# Reshape to one row per date and one column per action; cell values are the daily counts.
sampled_wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-29,8395,8511,14962,313
2019-03-30,6232,7354,11733,220
2019-03-31,7290,8355,13996,253
2019-04-01,9698,9643,18640,523
2019-04-02,10124,9620,19325,291
2019-04-03,9571,9794,18942,252
2019-04-04,9617,9186,17136,287
2019-04-05,8266,8442,15199,228
2019-04-06,6449,7491,11343,245
2019-04-07,7305,8636,13747,349


In [21]:
# NOT WP:M pages with external links
# Reshape to one row per date and one column per action; cell values are the daily counts.
sampled_not_wpm_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-29,491535,213114,351367,7574
2019-03-30,437730,207961,272632,6124
2019-03-31,480191,235201,313472,7490
2019-04-01,552479,248145,422689,18464
2019-04-02,549984,237161,423018,13516
2019-04-03,546398,233918,416634,8320
2019-04-04,539540,232288,407407,7876
2019-04-05,512805,223166,369008,7662
2019-04-06,457382,219334,277318,6265
2019-04-07,497332,247197,319717,7575


In [22]:
# all pages with external links
# Reshape to one row per date and one column per action; cell values are the daily counts.
sampled_events.pivot(index='date', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-29,499930,221625,366329,7887
2019-03-30,443962,215315,284365,6344
2019-03-31,487481,243556,327468,7743
2019-04-01,562177,257788,441329,18987
2019-04-02,560108,246781,442343,13807
2019-04-03,555969,243712,435576,8572
2019-04-04,549157,241474,424543,8163
2019-04-05,521071,231608,384207,7890
2019-04-06,463831,226825,288661,6510
2019-04-07,504637,255833,333464,7924


In [23]:
# Total event count for top 1000 viewed WP:M pages with ext links by event type
# The page set is read from ryanmax.top1k_med_anon, the table this notebook drops and
# recreates from the top1k query, so the totals describe the pages selected by the current
# run rather than whatever an older ryanmax.top1k_med table happens to contain.
# An event is kept only if its session_id also appears in citationusagepageload within the
# same date bounds.
# Template slots, in order: extra event exclusion SQL, first/last day for the events,
# extra pageload exclusion SQL, first/last day for the pageload subquery (bounds inclusive).
sum_top1k_events_query = """
SELECT action, count(*) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.top1k_med_anon)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY action
"""

sum_top1k_events = spark.sql(
    sum_top1k_events_query.format(
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
sum_top1k_events_rdd = sum_top1k_events.rdd
sum_top1k_events_df = sqlContext.createDataFrame(sum_top1k_events_rdd)
sum_top1k_events_pandas = sum_top1k_events_df.toPandas()
sum_top1k_events_pandas

Unnamed: 0,action,count
0,fnHover,191297
1,fnClick,105164
2,upClick,6569
3,extClick,55319


In [24]:
# Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type
# Events are joined to the distinct (page_id, category) pairs in ryanmax.projmed_categories
# whose category name contains "Class_medicine_articles"; a page listed under several such
# categories contributes its events to each of them.
# An event is kept only if its session_id also appears in citationusagepageload within the
# same date bounds.
# Template slots, in order: first/last day for the projmed_with_extlinks subquery,
# extra event exclusion SQL, first/last day for the events,
# extra pageload exclusion SQL, first/last day for the pageload subquery (bounds inclusive).
pm_category_events_query = """
SELECT projmed_categories.category, action, count(*) count
FROM 
    citationusage, 
    (SELECT DISTINCT page_id, category 
    FROM ryanmax.projmed_categories 
    WHERE projmed_categories.category LIKE '%Class_medicine_articles%') 
    AS projmed_categories
WHERE citationusage.page_id = projmed_categories.page_id
    AND wiki = 'enwiki'
    AND citationusage.page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY projmed_categories.category, action
ORDER BY projmed_categories.category, action
"""

pm_category_events = spark.sql(
    pm_category_events_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
pm_category_events_rdd = pm_category_events.rdd
pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd)
pm_category_events_pandas = pm_category_events_df.toPandas()
# set precision of count values so they don't appear with a decimal place ... likely an easier way to do this
# NOTE: this overwrites the 'count' column with formatted strings, so from here on
# pm_category_events_pandas['count'] is text, not a number.
pm_category_events_pandas['count'] = pm_category_events_pandas['count'].map(lambda x: '{0:.0f}'.format(x))
# One row per category, one column per action.
pm_category_events_pandas.pivot(index='category', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B-Class_medicine_articles,48644,84512,153735,4683.0
C-Class_medicine_articles,56338,63619,108421,4671.0
Category-Class_medicine_articles,3,2,10,
Disambig-Class_medicine_articles,46,6,8,1.0
FA-Class_medicine_articles,3464,7497,17935,204.0
FL-Class_medicine_articles,116,239,385,14.0
GA-Class_medicine_articles,7043,16781,35509,2341.0
List-Class_medicine_articles,3245,2820,4459,43.0
Redirect-Class_medicine_articles,39,33,29,1.0
Start-Class_medicine_articles,76414,50193,74532,1775.0


In [25]:
# count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)
# numbers will not match [1] because we're limiting to namespace 0 pages with external links
# [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics
# Template slots, in order: first day and last day (both inclusive) for the
# projmed_with_extlinks subquery.
# Rows are sorted by the reported metric (distinct pages), not by the raw row count, so
# repeated (page_id, category) rows in projmed_categories cannot reorder the table;
# ties are broken by category name to keep the output order stable between runs.
pm_category_pages = """
SELECT category, COUNT(DISTINCT page_id) AS pages_w_links 
FROM ryanmax.projmed_categories 
WHERE category LIKE '%Class_medicine_articles%' 
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}' 
            AND to_date(dt) <= '{}'
            )
GROUP BY category
ORDER BY pages_w_links DESC, category
"""
pm_cat_counts = spark.sql(pm_category_pages.format(start_date_string, end_date_string))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas for display.
cats = sqlContext.createDataFrame(pm_cat_counts.rdd)
cats.toPandas()

Unnamed: 0,category,pages_w_links
0,Start-Class_medicine_articles,14572
1,Stub-Class_medicine_articles,9858
2,C-Class_medicine_articles,5368
3,B-Class_medicine_articles,2178
4,List-Class_medicine_articles,456
5,GA-Class_medicine_articles,241
6,FA-Class_medicine_articles,62
7,Disambig-Class_medicine_articles,17
8,Redirect-Class_medicine_articles,17
9,FL-Class_medicine_articles,12


In [26]:
# Total event count for top 1K hostnames (e.g., DOI.org / ncbi…) by event type
# limited to all pages with external links (ryanmax.pages_with_extlinks)
# NOTE: limited to 4k which should produce more data than necessary
# The LIMIT applies to (host, action) rows, ordered by descending count.
# An event is kept only if its session_id also appears in citationusagepageload within the
# same date bounds.
# Template slots, in order: first/last day for the pages_with_extlinks subquery,
# extra event exclusion SQL, first/last day for the events,
# extra pageload exclusion SQL, first/last day for the pageload subquery (bounds inclusive).
top_hosts_query = """
SELECT parse_url(link_url,'HOST') AS host, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host, action
ORDER BY COUNT(*) DESC
LIMIT 4000
"""
top_hosts_events = spark.sql(
    top_hosts_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
top_hosts_events_rdd = top_hosts_events.rdd
top_hosts_events_df = sqlContext.createDataFrame(top_hosts_events_rdd)
top_hosts_events_pandas = top_hosts_events_df.toPandas()

# Preview only the first 20 rows of the Spark result.
top_hosts_events.limit(20).toPandas()


Unnamed: 0,host,action,count
0,en.wikipedia.org,fnHover,9124324
1,en.m.wikipedia.org,fnClick,4253813
2,en.wikipedia.org,fnClick,2008427
3,www.imdb.com,extClick,803002
4,en.m.wikipedia.org,fnHover,554234
5,web.archive.org,extClick,528876
6,tools.wmflabs.org,extClick,282577
7,en.wikipedia.org,upClick,234937
8,www.youtube.com,extClick,157707
9,books.google.com,extClick,153265


In [27]:
# write data to hive for later use in R
# Register the query result as a temp view so it can be selected from in SQL, then
# drop and recreate ryanmax.top_hosts_w_events_anon from it (previous contents are replaced).
top_hosts_events.createOrReplaceTempView("temp_top_hosts_w_events")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top_hosts_w_events_anon")
sqlContext.sql("CREATE TABLE ryanmax.top_hosts_w_events_anon AS SELECT * FROM temp_top_hosts_w_events")

DataFrame[]

In [28]:
# Total event count for top 1K hostnames (e.g., DOI.org / ncbi…) by event type
# limited to WP:M pages with external links
# NOTE: limited to 4k which should produce more data than necessary
# The LIMIT applies to (host, action) rows, ordered by descending count.
# An event is kept only if its session_id also appears in citationusagepageload within the
# same date bounds.
# Template slots, in order: first/last day for the projmed_with_extlinks subquery,
# extra event exclusion SQL, first/last day for the events,
# extra pageload exclusion SQL, first/last day for the pageload subquery (bounds inclusive).
top_hosts_wpm_query = """
SELECT parse_url(link_url,'HOST') AS host, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}' 
            AND to_date(dt) <= '{}'
            )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host, action
ORDER BY COUNT(*) DESC
LIMIT 4000
"""
top_hosts_wpm_events = spark.sql(
    top_hosts_wpm_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
top_hosts_wpm_events_rdd = top_hosts_wpm_events.rdd
top_hosts_wpm_events_df = sqlContext.createDataFrame(top_hosts_wpm_events_rdd)
top_hosts_wpm_events_pandas = top_hosts_wpm_events_df.toPandas()

# Preview only the first 20 rows of the Spark result.
top_hosts_wpm_events.limit(20).toPandas()

Unnamed: 0,host,action,count
0,en.wikipedia.org,fnHover,372114
1,en.m.wikipedia.org,fnClick,138233
2,en.wikipedia.org,fnClick,88974
3,en.m.wikipedia.org,fnHover,23331
4,www.ncbi.nlm.nih.gov,extClick,22206
5,doi.org,extClick,19184
6,web.archive.org,extClick,12715
7,en.wikipedia.org,upClick,11660
8,books.google.com,extClick,4854
9,www.drugs.com,extClick,4195


In [29]:
# write data to hive for later use in R
# Register the query result as a temp view so it can be selected from in SQL, then
# drop and recreate ryanmax.top_hosts_wpm_events_anon from it (previous contents are replaced).
top_hosts_wpm_events.createOrReplaceTempView("temp_top_hosts_wpm_events")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top_hosts_wpm_events_anon")
sqlContext.sql("CREATE TABLE ryanmax.top_hosts_wpm_events_anon AS SELECT * FROM temp_top_hosts_wpm_events")

DataFrame[]

In [30]:
# Total event count for top 1K hostnames (e.g., DOI.org / ncbi…) by event type
# limited to NOT WP:M pages with external links
# NOTE: limited to 4k which should produce more data than necessary
# The LIMIT applies to (host, action) rows, ordered by descending count.
# Pages must appear in ryanmax.pages_with_extlinks and must not appear in
# ryanmax.projmed_with_extlinks within the date bounds.
# An event is kept only if its session_id also appears in citationusagepageload within the
# same date bounds.
# Template slots, in order: first/last day for the pages_with_extlinks subquery,
# first/last day for the projmed_with_extlinks subquery,
# extra event exclusion SQL, first/last day for the events,
# extra pageload exclusion SQL, first/last day for the pageload subquery (bounds inclusive).
top_hosts_notwpm_query = """
SELECT parse_url(link_url,'HOST') AS host, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
AND page_id NOT IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host, action
ORDER BY COUNT(*) DESC
LIMIT 4000
"""
top_hosts_notwpm_events = spark.sql(
    top_hosts_notwpm_query.format(
        start_date_string, end_date_string,
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
top_hosts_notwpm_events_rdd = top_hosts_notwpm_events.rdd
top_hosts_notwpm_events_df = sqlContext.createDataFrame(top_hosts_notwpm_events_rdd)
top_hosts_notwpm_events_pandas = top_hosts_notwpm_events_df.toPandas()

# Preview only the first 20 rows of the Spark result.
top_hosts_notwpm_events.limit(20).toPandas()

Unnamed: 0,host,action,count
0,en.wikipedia.org,fnHover,8752210
1,en.m.wikipedia.org,fnClick,4115580
2,en.wikipedia.org,fnClick,1919453
3,www.imdb.com,extClick,802787
4,en.m.wikipedia.org,fnHover,530903
5,web.archive.org,extClick,516161
6,tools.wmflabs.org,extClick,282278
7,en.wikipedia.org,upClick,223277
8,www.youtube.com,extClick,156644
9,books.google.com,extClick,148411


In [31]:
# write data to hive for later use in R
# Register the query result as a temp view so it can be selected from in SQL, then
# drop and recreate ryanmax.top_hosts_notwpm_events_anon from it (previous contents are replaced).
top_hosts_notwpm_events.createOrReplaceTempView("temp_top_hosts_notwpm_events")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.top_hosts_notwpm_events_anon")
sqlContext.sql("CREATE TABLE ryanmax.top_hosts_notwpm_events_anon AS SELECT * FROM temp_top_hosts_notwpm_events")

DataFrame[]

In [32]:
# Total count of events (by all event types) for each Section ID for WP:M pages only
## data capture issue: data is NOT limited to top-level section IDs
## example: https://en.wikipedia.org/wiki/Hepatitis#Signs_and_symptoms
## clicks on links under "Acute hepatitis" are captured with section_id Acute_hepatitis, not Signs_and_symptoms
## see section_ids.ipynb for post-processing results mapped to H2 section IDs
# Only the 100 (section_id, action) rows with the highest counts are returned.
# An event is kept only if its session_id also appears in citationusagepageload within the
# same date bounds.
# Template slots, in order: first/last day for the projmed_with_extlinks subquery,
# extra event exclusion SQL, first/last day for the events,
# extra pageload exclusion SQL, first/last day for the pageload subquery (bounds inclusive).
pm_section_events_query = """
SELECT section_id, action, count(*) count
FROM 
    citationusage 
WHERE page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    AND wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY section_id, action
ORDER BY count desc
LIMIT 100
"""

pm_section_events = spark.sql(
    pm_section_events_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# Rebuild the result as a new DataFrame from its RDD, then collect it into pandas.
pm_section_events_rdd = pm_section_events.rdd
pm_section_events_df = sqlContext.createDataFrame(pm_section_events_rdd)
pm_section_events_pandas = pm_section_events_df.toPandas()
pm_section_events_pandas
#pm_section_events_pandas.pivot(index='section_id', columns='action', values='count')

Unnamed: 0,section_id,action,count
0,,fnHover,133043
1,References,extClick,125487
2,,fnClick,93425
3,,extClick,36996
4,External_links,extClick,29687
5,History,fnHover,12327
6,References,upClick,12018
7,Signs_and_symptoms,fnHover,7843
8,History,fnClick,6078
9,Causes,fnHover,4731


In [33]:
# Event counts by action for WP:M pages only, split on the in_infobox flag:
#   in_infobox = True  -> events inside an infobox
#   in_infobox = False -> events elsewhere (the original header calls this the
#                         "Main section"; confirm against the event schema)
pm_infobox_events_query = """
SELECT in_infobox, action, count(*) count
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND page_id IN 
            (SELECT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY in_infobox, action
"""

# Positional placeholder values, in the order the {} markers appear above.
pm_infobox_events_args = (
    start_date_string, end_date_string,                          # WP:M page_id subquery
    event_exclusion_sql, start_date_string, end_date_string,     # citationusage filters
    pageload_exclusion_sql, start_date_string, end_date_string,  # pageload session subquery
)
pm_infobox_events_sql = pm_infobox_events_query.format(*pm_infobox_events_args)

# Run the query, then convert Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
pm_infobox_events = spark.sql(pm_infobox_events_sql)
pm_infobox_events_rdd = pm_infobox_events.rdd
pm_infobox_events_df = sqlContext.createDataFrame(pm_infobox_events_rdd)
pm_infobox_events_pandas = pm_infobox_events_df.toPandas()
pm_infobox_events_pandas

Unnamed: 0,in_infobox,action,count
0,False,fnHover,390041
1,False,extClick,176410
2,True,extClick,36319
3,False,fnClick,209682
4,False,upClick,12661
5,True,fnClick,18440
6,True,fnHover,6688


In [34]:
# External-link click (extClick) counts split by the freely_accessible flag,
# for all pages listed in ryanmax.pages_with_extlinks ("W" in this notebook's
# shorthand, as opposed to the WP:M subset).
freely_events_query = """
SELECT freely_accessible, action, count(*) count
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND action = 'extClick'
    AND page_id IN 
            (SELECT page_id 
            FROM ryanmax.pages_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id 
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY freely_accessible, action
"""

# Positional placeholder values, in the order the {} markers appear above.
freely_events_args = (
    start_date_string, end_date_string,                          # pages_with_extlinks subquery
    event_exclusion_sql, start_date_string, end_date_string,     # citationusage filters
    pageload_exclusion_sql, start_date_string, end_date_string,  # pageload session subquery
)
freely_events_sql = freely_events_query.format(*freely_events_args)

# Run the query, then convert Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
freely_events = spark.sql(freely_events_sql)
freely_events_rdd = freely_events.rdd
freely_events_df = sqlContext.createDataFrame(freely_events_rdd)
freely_events_pandas = freely_events_df.toPandas()
freely_events_pandas

Unnamed: 0,freely_accessible,action,count
0,False,extClick,13203834
1,True,extClick,9627


In [35]:
# External-link click (extClick) counts split by the freely_accessible flag,
# restricted to WP:M pages (ryanmax.projmed_with_extlinks).
pm_freely_events_query = """
SELECT freely_accessible, action, count(*) count
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND action = 'extClick'
    AND page_id IN 
            (SELECT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY freely_accessible, action
"""

# Positional placeholder values, in the order the {} markers appear above.
pm_freely_events_args = (
    start_date_string, end_date_string,                          # WP:M page_id subquery
    event_exclusion_sql, start_date_string, end_date_string,     # citationusage filters
    pageload_exclusion_sql, start_date_string, end_date_string,  # pageload session subquery
)
pm_freely_events_sql = pm_freely_events_query.format(*pm_freely_events_args)

# Run the query, then convert Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
pm_freely_events = spark.sql(pm_freely_events_sql)
pm_freely_events_rdd = pm_freely_events.rdd
pm_freely_events_df = sqlContext.createDataFrame(pm_freely_events_rdd)
pm_freely_events_pandas = pm_freely_events_df.toPandas()
pm_freely_events_pandas

Unnamed: 0,freely_accessible,action,count
0,False,extClick,212091
1,True,extClick,638


In [36]:
# External-link click (extClick) counts split by the freely_accessible flag,
# for pages that are in ryanmax.pages_with_extlinks but NOT in the WP:M set
# (ryanmax.projmed_with_extlinks).
not_pm_freely_events_query = """
SELECT freely_accessible, action, count(*) count
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND action = 'extClick'
    AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.pages_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    AND page_id NOT IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY freely_accessible, action
"""

# Positional placeholder values, in the order the {} markers appear above.
# This query has one more date pair than the WP:M-only variant because of the
# extra NOT IN subquery.
not_pm_freely_events_args = (
    start_date_string, end_date_string,                          # pages_with_extlinks subquery (IN)
    start_date_string, end_date_string,                          # projmed_with_extlinks subquery (NOT IN)
    event_exclusion_sql, start_date_string, end_date_string,     # citationusage filters
    pageload_exclusion_sql, start_date_string, end_date_string,  # pageload session subquery
)
not_pm_freely_events_sql = not_pm_freely_events_query.format(*not_pm_freely_events_args)

# Run the query, then convert Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
not_pm_freely_events = spark.sql(not_pm_freely_events_sql)
not_pm_freely_events_rdd = not_pm_freely_events.rdd
not_pm_freely_events_df = sqlContext.createDataFrame(not_pm_freely_events_rdd)
not_pm_freely_events_pandas = not_pm_freely_events_df.toPandas()
not_pm_freely_events_pandas

Unnamed: 0,freely_accessible,action,count
0,False,extClick,12991743
1,True,extClick,8989


### Time to event

In [37]:
# Time-to-event summary per action for all pages in ryanmax.pages_with_extlinks.
# For each action the query reports, over event_offset_time:
#   - the mean, cast to DECIMAL(10,2)
#   - the median (PERCENTILE at 0.5)
#   - STDDEV (in Spark SQL this is the sample standard deviation)
#   - the interquartile range (75th percentile minus 25th percentile)
# The column aliases label the values as milliseconds.
w_event_time_query = """
SELECT action, 
    CAST(AVG(event_offset_time) AS DECIMAL(10,2)) AS average_time_to_event_in_ms, 
    PERCENTILE(event_offset_time,0.5) AS median_time_to_event_in_ms,
    STDDEV(event_offset_time) as stddev,
    (PERCENTILE(event_offset_time,0.75) - PERCENTILE(event_offset_time,0.25)) as iqr

FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.pages_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY action
"""

# Positional placeholder values, in the order the {} markers appear above.
w_event_time_args = (
    start_date_string, end_date_string,                          # pages_with_extlinks subquery
    event_exclusion_sql, start_date_string, end_date_string,     # citationusage filters
    pageload_exclusion_sql, start_date_string, end_date_string,  # pageload session subquery
)
w_event_time_sql = w_event_time_query.format(*w_event_time_args)

# Print the result table straight from Spark (no pandas conversion).
spark.sql(w_event_time_sql).show()

+--------+---------------------------+--------------------------+--------------------+--------+
|  action|average_time_to_event_in_ms|median_time_to_event_in_ms|              stddev|     iqr|
+--------+---------------------------+--------------------------+--------------------+--------+
| fnHover|                 1807272.55|                   92214.0| 6.574003456480339E7|291359.0|
| fnClick|                  705897.00|                   70568.0|1.0894299972106414E7|171204.0|
| upClick|                 1834085.03|                  141331.0| 1.960274042415305E7|388910.0|
|extClick|                  455426.66|                   36100.0| 9.790809181493093E7| 97436.0|
+--------+---------------------------+--------------------------+--------------------+--------+



In [38]:
# Time-to-event summary per action, restricted to WP:M pages
# (ryanmax.projmed_with_extlinks).
# For each action the query reports, over event_offset_time:
#   - the mean, cast to DECIMAL(10,2)
#   - the median (PERCENTILE at 0.5)
#   - STDDEV (in Spark SQL this is the sample standard deviation)
#   - the interquartile range (75th percentile minus 25th percentile)
# The column aliases label the values as milliseconds.
pm_event_time_query = """
SELECT action, 
    CAST(AVG(event_offset_time) AS DECIMAL(10,2)) AS average_time_to_event_in_ms, 
    PERCENTILE(event_offset_time,0.5) AS median_time_to_event_in_ms,
    STDDEV(event_offset_time) as stddev,
    (PERCENTILE(event_offset_time,0.75) - PERCENTILE(event_offset_time,0.25)) as iqr
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY action
"""

# Positional placeholder values, in the order the {} markers appear above.
pm_event_time_args = (
    start_date_string, end_date_string,                          # WP:M page_id subquery
    event_exclusion_sql, start_date_string, end_date_string,     # citationusage filters
    pageload_exclusion_sql, start_date_string, end_date_string,  # pageload session subquery
)
pm_event_time_sql = pm_event_time_query.format(*pm_event_time_args)

# Print the result table straight from Spark (no pandas conversion).
spark.sql(pm_event_time_sql).show()

+--------+---------------------------+--------------------------+--------------------+--------+
|  action|average_time_to_event_in_ms|median_time_to_event_in_ms|              stddev|     iqr|
+--------+---------------------------+--------------------------+--------------------+--------+
| fnHover|                 3163081.42|                  124711.0| 2.651887131517461E7|460894.0|
| fnClick|                 1228410.22|                   91914.0| 1.322917933871596E7|237350.0|
| upClick|                 1890737.84|                  230660.0|1.7945569278417144E7|338096.0|
|extClick|                  881263.11|                   54187.0|1.1105105478290163E7|171779.0|
+--------+---------------------------+--------------------------+--------------------+--------+



In [39]:
# Time-to-event summary per action for pages that are in
# ryanmax.pages_with_extlinks but NOT in the WP:M set
# (ryanmax.projmed_with_extlinks).
# For each action the query reports, over event_offset_time:
#   - the mean, cast to DECIMAL(10,2)
#   - the median (PERCENTILE at 0.5)
#   - STDDEV (in Spark SQL this is the sample standard deviation)
#   - the interquartile range (75th percentile minus 25th percentile)
# The column aliases label the values as milliseconds.
not_pm_event_time_query = """
SELECT action, 
    CAST(AVG(event_offset_time) AS DECIMAL(10,2)) AS average_time_to_event_in_ms, 
    PERCENTILE(event_offset_time,0.5) AS median_time_to_event_in_ms,
    STDDEV(event_offset_time) as stddev,
    (PERCENTILE(event_offset_time,0.75) - PERCENTILE(event_offset_time,0.25)) as iqr
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.pages_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    AND page_id NOT IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}'
            AND to_date(dt) <= '{}'
            )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY action
"""

# Positional placeholder values, in the order the {} markers appear above.
# This query has one more date pair than the WP:M-only variant because of the
# extra NOT IN subquery.
not_pm_event_time_args = (
    start_date_string, end_date_string,                          # pages_with_extlinks subquery (IN)
    start_date_string, end_date_string,                          # projmed_with_extlinks subquery (NOT IN)
    event_exclusion_sql, start_date_string, end_date_string,     # citationusage filters
    pageload_exclusion_sql, start_date_string, end_date_string,  # pageload session subquery
)
not_pm_event_time_sql = not_pm_event_time_query.format(*not_pm_event_time_args)

# Print the result table straight from Spark (no pandas conversion).
spark.sql(not_pm_event_time_sql).show()

+--------+---------------------------+--------------------------+--------------------+--------+
|  action|average_time_to_event_in_ms|median_time_to_event_in_ms|              stddev|     iqr|
+--------+---------------------------+--------------------------+--------------------+--------+
| fnHover|                 1749494.33|                   91131.5|6.6901973983033575E7|285842.0|
| fnClick|                  686207.37|                   69894.0|1.0795954795933383E7|168957.0|
| upClick|                 1831242.67|                  136000.0|1.9682236331132613E7|389904.5|
|extClick|                  448458.76|                   35886.0| 9.869563159435776E7| 96541.0|
+--------+---------------------------+--------------------------+--------------------+--------+

