# Citation Usage Event Data

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

def get_stats(df, groupby, value_col='count'):
    """Summarise the values of ``value_col`` for every group in ``groupby``.

    Only ``value_col`` is aggregated. Other columns of ``df`` (for example a
    ``date`` column) are ignored instead of being fed to numeric aggregations,
    so the result does not depend on pandas dropping columns it cannot
    aggregate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the grouping column(s) and ``value_col``.
    groupby : str or list of str
        Column name(s) to group by.
    value_col : str, default 'count'
        Name of the numeric column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per group with the grouping column(s) followed by:
        ``days`` (number of non-null rows in the group),
        ``pageloads_with_events`` (sum), ``mean``, ``median``, ``min``,
        ``max``, ``std``, the ``25%``/``50%``/``75%`` quantiles and ``perc``
        (the group's share of the total of ``value_col`` over all of ``df``).
    """
    # Denominator for 'perc': the total over the whole frame, not per group.
    all_events_count = df[value_col].sum()
    # Selecting the column first yields flat result columns, so no
    # MultiIndex level has to be dropped afterwards.
    stats = df.groupby(groupby)[value_col].agg([
        ('days', 'count'),
        ('pageloads_with_events', 'sum'),
        ('mean', 'mean'),
        ('median', 'median'),
        ('min', 'min'),
        ('max', 'max'),
        ('std', 'std'),
        ('25%', lambda x: x.quantile(.25)),
        ('50%', lambda x: x.quantile(.5)),
        ('75%', lambda x: x.quantile(.75)),
        ('perc', lambda x: x.sum() / all_events_count),
    ])
    return stats.reset_index()

## Overview

In these results, multiple events of the same type belonging to the same page load are counted only once.

### Session count

In [2]:
# Distinct citationusage sessions on enwiki over the whole study period.
# The query has no GROUP BY, so it returns a single total rather than one row per date.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), then the
# inclusive start and end dates of the study period.
# NOTE(review): event_exclusion_sql presumably comes from data-defaults.py and is
# expected to supply its own leading "AND" -- confirm.
sessions_query = """
SELECT COUNT(DISTINCT session_id) AS distinct_sessions
FROM citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
"""
sessions = spark.sql(sessions_query.format(event_exclusion_sql,start_date_string, end_date_string))
# The result is brought to pandas and the distinct_sessions column is summed for printing.
print('Distinct sessions: ', sessions.toPandas()['distinct_sessions'].sum())

Distinct sessions:  72953065


### Events by date and type

In [3]:
# Daily citationusage counts on enwiki per event type (action).
# COUNT(DISTINCT page_token) means repeated events of one type within the same
# page_token are counted once per day.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), then the
# inclusive start and end dates of the study period.
events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(DISTINCT page_token) count
FROM citationusage
WHERE wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""

events = spark.sql(events_query.format(event_exclusion_sql,start_date_string, end_date_string))
# The Spark DataFrame is converted to an RDD and back before going to pandas.
# NOTE(review): the round-trip looks redundant -- confirm whether events.toPandas() would suffice.
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
# events_pandas has one row per (date, eventType) with columns date, eventType, count
events_pandas = events_df.toPandas()

In [4]:
# Grand total of the daily per-event-type counts, then summary statistics per event type.
print('Total events: ', events_pandas['count'].sum())
get_stats(events_pandas,['eventType'])

Total events:  91772856


Unnamed: 0,eventType,days,pageloads_with_events,mean,median,min,max,std,25%,50%,75%,perc
0,extClick,32,46615900,1456747.0,1475226,1228205,1642758,122713.037987,1372526.5,1475226,1565200,0.507949
1,fnClick,32,18406094,575190.4,575488,496345,651380,37232.255559,558817.5,575488,598890,0.200561
2,fnHover,32,26130775,816586.7,855757,600465,972751,132438.255327,698424.25,855757,935459,0.284733
3,upClick,32,620087,19377.72,19866,15399,22594,2205.649791,17983.5,19866,21170,0.006757


In [5]:
# Wide view of the daily counts: one row per date, one column per event type.
events_pandas.pivot(index='date', columns='eventType', values='count')

eventType,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-22,1410330,520185,804278,18765
2019-03-23,1255146,508790,617506,15571
2019-03-24,1385769,575095,706213,18441
2019-03-25,1603639,593025,963849,22594
2019-03-26,1576963,576684,944257,22202
2019-03-27,1545937,568514,929313,21457
2019-03-28,1495966,551856,891895,20317
2019-03-29,1390021,516702,792048,18287
2019-03-30,1228205,496345,601760,15555
2019-03-31,1351799,562292,689464,17985


### Events for WP:M pages with external links

In [6]:
# Daily count of events for WP:M pages with external links over the study period.
# Each day is queried separately through the day/month/year columns, and the
# daily results are concatenated afterwards.
# COUNT(DISTINCT page_token) counts each page_token once per event type per day.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), day, month, year.
wpm_events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(DISTINCT page_token) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""

# Accumulate the rows of every daily result in a single RDD.
wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    daily_wpm_events = spark.sql(
        wpm_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    wpm_events_rdd = wpm_events_rdd.union(daily_wpm_events.rdd)

# Rebuild a Spark DataFrame from the accumulated rows and bring it to pandas:
# one row per (date, eventType) with columns date, eventType, count.
wpm_events_merged = sqlContext.createDataFrame(wpm_events_rdd)
wpm_events = wpm_events_merged.toPandas()

#### WP:M event summary

In [7]:
# Summary of events for WP:M pages with external links.
# NOTE: the printed total sums the per-day, per-event-type COUNT(DISTINCT page_token)
# values of wpm_events, so a page load that produced several event types
# contributes once per type to this total.
print('Total pages with events: ', wpm_events['count'].sum())
get_stats(wpm_events,['eventType'])

Total pages with events:  2460353


Unnamed: 0,eventType,days,pageloads_with_events,mean,median,min,max,std,25%,50%,75%,perc
0,extClick,32,745441,23295.03125,23935,16756,28059,3898.642936,19607.25,23935,26460.0,0.302981
1,fnClick,32,688073,21502.28125,22061,17642,24630,2195.65188,19934.5,22061,23412.75,0.279664
2,fnHover,32,1005545,31423.28125,31893,21113,39107,5873.784158,26426.5,31893,36823.5,0.408699
3,upClick,32,21294,665.4375,663,504,823,93.40855,598.0,663,746.0,0.008655


#### WP:M daily events

In [8]:
# Daily event counts for WP:M pages with external links:
# one row per date, one column per event type.
wpm_events.pivot(index='date', columns='eventType', values='count')

eventType,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-22,23378,20418,31555,639
2019-03-23,18694,18311,24259,528
2019-03-24,21412,20660,28186,649
2019-03-25,27827,23469,38683,779
2019-03-26,28059,23850,39107,809
2019-03-27,27225,24027,38333,771
2019-03-28,26637,23140,36642,740
2019-03-29,22473,20186,30374,661
2019-03-30,17051,17642,22947,524
2019-03-31,19545,19497,26757,625


### Events for W pages with external links

In [9]:
# Daily count of events for W pages with external links over the study period.
# Each day is queried separately through the day/month/year columns, and the
# daily results are concatenated afterwards.
# COUNT(DISTINCT page_token) counts each page_token once per event type per day.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), day, month, year.
w_events_query = """
SELECT to_date(event_time) date, action AS eventType, COUNT(DISTINCT page_token) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY to_date(event_time), eventType
ORDER BY to_date(event_time)
"""

# Accumulate the rows of every daily result in a single RDD.
w_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    daily_w_events = spark.sql(
        w_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    w_events_rdd = w_events_rdd.union(daily_w_events.rdd)

# Rebuild a Spark DataFrame from the accumulated rows and bring it to pandas:
# one row per (date, eventType) with columns date, eventType, count.
w_events_merged = sqlContext.createDataFrame(w_events_rdd)
w_events = w_events_merged.toPandas()

#### W event summary

In [10]:
# Summary of events for W pages with external links.
# NOTE: the printed total sums the per-day, per-event-type COUNT(DISTINCT page_token)
# values of w_events, so a page load that produced several event types
# contributes once per type to this total.
print('Total pages with events: ', w_events['count'].sum())
get_stats(w_events,['eventType'])

Total pages with events:  89148238


Unnamed: 0,eventType,days,pageloads_with_events,mean,median,min,max,std,25%,50%,75%,perc
0,extClick,32,45842949,1432592.0,1449503,1210401,1614845,118882.764276,1352835.5,1449503,1537707,0.514233
1,fnClick,32,17659961,551873.8,551124,477427,621618,35359.646859,537810.5,551124,573586,0.198097
2,fnHover,32,25054780,782961.9,820425,577351,932791,126416.422145,670321.5,820425,896259,0.281046
3,upClick,32,590548,18454.62,18965,14698,21477,2081.304399,17129.75,18965,20157,0.006624


#### W daily events

In [11]:
# Daily event counts for W pages with external links:
# one row per date, one column per event type.
w_events.pivot(index='date', columns='eventType', values='count')

eventType,extClick,fnClick,fnHover,upClick
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-03-22,1385789,498287,770697,17910
2019-03-23,1235462,489042,591724,14825
2019-03-24,1363380,552712,676142,17564
2019-03-25,1574746,567770,922561,21477
2019-03-26,1547844,551155,902864,21079
2019-03-27,1517826,542858,888682,20402
2019-03-28,1468376,527117,853116,19303
2019-03-29,1366748,495043,759793,17376
2019-03-30,1210401,477427,577351,14842
2019-03-31,1331158,541375,661052,17108


## Mobile vs Desktop

### mobile vs desktop events for W pages with external links

In [12]:
# Mobile vs desktop events for W pages with external links.
# Daily COUNT(DISTINCT page_token) per (mode, event type) over the study period.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), then the
# inclusive start and end dates.
w_mode_events_query = """
SELECT to_date(event_time) AS date, mode, action AS eventType, COUNT(DISTINCT page_token) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY date, mode, eventType
ORDER BY date, mode, eventType
"""

# NOTE: this rebinds the notebook-level names events, events_rdd and events_df.
events = spark.sql(w_mode_events_query.format(event_exclusion_sql,start_date_string, end_date_string))
# NOTE(review): the DataFrame -> RDD -> DataFrame round-trip looks redundant -- confirm.
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
w_events_pandas = events_df.toPandas()
# Summary statistics of the daily counts per (mode, eventType).
get_stats(w_events_pandas,['mode','eventType'])

Unnamed: 0,mode,eventType,days,pageloads_with_events,mean,median,min,max,std,25%,50%,75%,perc
0,desktop,extClick,32,27239064,851220.75,894458.5,606742,1030854,151086.553065,698524.5,894458.5,980671.75,0.305548
1,desktop,fnClick,32,6301025,196907.03125,207605.0,136672,239675,36643.423427,165509.5,207605.0,229652.0,0.07068
2,desktop,fnHover,32,23558456,736201.75,771796.5,529543,883770,126814.39361,619519.0,771796.5,850357.5,0.264262
3,desktop,upClick,32,495785,15493.28125,16259.5,11733,18344,2144.698343,13928.0,16259.5,17351.0,0.005561
4,mobile,extClick,32,18603885,581371.40625,567234.0,514540,687071,50628.565349,540496.0,567234.0,617475.5,0.208685
5,mobile,fnClick,32,11358936,354966.75,349187.5,302919,434966,35584.420283,321935.5,349187.5,378114.25,0.127416
6,mobile,fnHover,32,1496324,46760.125,46553.5,41844,52208,2624.315871,45157.5,46553.5,48323.0,0.016785
7,mobile,upClick,32,94763,2961.34375,2928.0,2579,3488,212.964121,2801.0,2928.0,3138.25,0.001063


### mobile vs desktop events for WP:M pages with external links

In [13]:
# Mobile vs desktop events for WP:M pages with external links.
# Daily COUNT(DISTINCT page_token) per (mode, event type) over the study period.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), then the
# inclusive start and end dates.
wpm_mode_events_query = """
SELECT to_date(event_time) AS date, mode, action AS eventType, COUNT(DISTINCT page_token) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY date, mode, eventType
ORDER BY date, mode, eventType
"""

# NOTE: this rebinds the notebook-level names events, events_rdd and events_df.
events = spark.sql(wpm_mode_events_query.format(event_exclusion_sql,start_date_string, end_date_string))
# NOTE(review): the DataFrame -> RDD -> DataFrame round-trip looks redundant -- confirm.
events_rdd = events.rdd
events_df = sqlContext.createDataFrame(events_rdd)
wpm_events_pandas = events_df.toPandas()
# Summary statistics of the daily counts per (mode, eventType).
get_stats(wpm_events_pandas,['mode','eventType'])

Unnamed: 0,mode,eventType,days,pageloads_with_events,mean,median,min,max,std,25%,50%,75%,perc
0,desktop,extClick,32,515856,16120.5,16714.5,9957,20492,3612.649738,12715.0,16714.5,19327.0,0.209667
1,desktop,fnClick,32,286003,8937.59375,9199.5,5765,11185,1746.85411,7549.5,9199.5,10529.25,0.116245
2,desktop,fnHover,32,936468,29264.625,29706.5,19282,36660,5708.07563,24283.75,29706.5,34502.25,0.380623
3,desktop,upClick,32,18112,566.0,563.0,429,697,85.156403,514.0,563.0,642.0,0.007362
4,mobile,extClick,32,229585,7174.53125,7278.0,6354,8043,388.443045,6848.0,7278.0,7401.75,0.093314
5,mobile,fnClick,32,402070,12564.6875,12600.5,11293,13823,645.647534,12222.25,12600.5,12910.5,0.16342
6,mobile,fnHover,32,69077,2158.65625,2146.0,1831,2455,188.642965,2019.5,2146.0,2305.75,0.028076
7,mobile,upClick,32,3182,99.4375,97.5,75,132,12.273595,90.75,97.5,106.75,0.001293


------

# Statistical significance of the difference

## Get events count

In [14]:
# Page loads with events per (mode, event type) for W pages with external links,
# totalled over the study period.
# Step 1: query each day separately (day/month/year columns) and collect the rows.
# Step 2: sum the daily counts per (mode, eventType) and tag the rows with group 'W'.
#
# NOTE: this cell rebinds w_events_query, w_events_rdd, w_events_merged and
# w_events. After it runs, w_events is a Spark DataFrame rather than a pandas
# one, so any cell that calls pandas methods such as .pivot on w_events must be
# run before this cell.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), day, month, year.
w_events_query = """
SELECT mode, action AS eventType, COUNT(DISTINCT page_token) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY mode, action
"""

# Accumulate the rows of every daily result in a single RDD.
w_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    daily_w_events = spark.sql(
        w_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    w_events_rdd = w_events_rdd.union(daily_w_events.rdd)

w_events_merged = sqlContext.createDataFrame(w_events_rdd)

# Expose the daily rows to Spark SQL. createOrReplaceTempView replaces the
# deprecated registerTempTable and is safe to re-run.
w_events_merged.createOrReplaceTempView("w_events_merged")
merge_events_query = """
SELECT mode, eventType, SUM(count) as pageloads_with_event, 'W' as group
FROM w_events_merged
GROUP BY mode, eventType
"""

w_events = spark.sql(merge_events_query)
w_events

DataFrame[mode: string, eventType: string, pageloads_with_event: bigint, group: string]

In [15]:
# Page loads with events per (mode, event type) for WP:M pages with external
# links, totalled over the study period.
# Step 1: query each day separately (day/month/year columns) and collect the rows.
# Step 2: sum the daily counts per (mode, eventType) and tag the rows with group 'WPM'.
#
# NOTE: this cell rebinds wpm_events_query, wpm_events_rdd, wpm_events_merged
# and wpm_events. After it runs, wpm_events is a Spark DataFrame rather than a
# pandas one, so any cell that calls pandas methods such as .pivot on
# wpm_events must be run before this cell.
# Placeholders, in order: extra exclusion clauses (event_exclusion_sql), day, month, year.
wpm_events_query = """
SELECT mode, action AS eventType, COUNT(DISTINCT page_token) count
FROM citationusage
WHERE wiki = 'enwiki'
AND page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND day = {}
AND month = {}
AND year = {}
AND useragent_is_bot = FALSE
GROUP BY mode, action
"""

# Accumulate the rows of every daily result in a single RDD.
wpm_events_rdd = sc.emptyRDD()
for d in daterange(start_date, end_date):
    daily_wpm_events = spark.sql(
        wpm_events_query.format(event_exclusion_sql, d.day, d.month, d.year))
    wpm_events_rdd = wpm_events_rdd.union(daily_wpm_events.rdd)

wpm_events_merged = sqlContext.createDataFrame(wpm_events_rdd)

# Expose the daily rows to Spark SQL. createOrReplaceTempView replaces the
# deprecated registerTempTable and is safe to re-run.
wpm_events_merged.createOrReplaceTempView("wpm_events_merged")
merge_events_query = """
SELECT mode, eventType, SUM(count) as pageloads_with_event, 'WPM' as group
FROM wpm_events_merged
GROUP BY mode, eventType
"""

wpm_events = spark.sql(merge_events_query)
wpm_events

DataFrame[mode: string, eventType: string, pageloads_with_event: bigint, group: string]

In [16]:
# Stack the WPM rows on top of the W rows (WPM first) and bring the result to pandas.
# Columns: mode, eventType, pageloads_with_event, group.
all_events = wpm_events.union(w_events).toPandas()
all_events

Unnamed: 0,mode,eventType,pageloads_with_event,group
0,desktop,fnClick,286003,WPM
1,mobile,extClick,229585,WPM
2,mobile,fnHover,69077,WPM
3,mobile,fnClick,402070,WPM
4,desktop,extClick,515856,WPM
5,desktop,fnHover,936468,WPM
6,desktop,upClick,18112,WPM
7,mobile,upClick,3182,WPM
8,desktop,fnClick,6301025,W
9,mobile,extClick,18603885,W


## Get page loads

In [17]:
from pyspark.sql import functions as fn

# Total user page views of W pages with external links over the study period,
# split into 'desktop' and 'mobile' (every access_method other than 'desktop'
# is labelled 'mobile').
# Placeholders, in order: inclusive start and end dates, compared against a date
# assembled from the year/month/day columns.
w_query = """
SELECT CASE WHEN access_method = 'desktop' THEN 'desktop' ELSE 'mobile' END as mode,
     SUM(view_count) AS total_pageviews, 'W' as group
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
    (SELECT DISTINCT page_id 
    FROM ryanmax.population_w_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""


# The SQL groups by the raw access_method, so several non-desktop methods can each
# yield their own 'mobile' row; the groupBy below collapses them into one row per
# (mode, group) by summing total_pageviews.
w_pageviews = spark.sql(w_query.format(start_date_string, end_date_string))\
                .groupBy("mode", "group").agg(fn.sum("total_pageviews").alias("total_pageviews"))

In [18]:
# Total user page views of WP:M pages with external links over the study period,
# split into 'desktop' and 'mobile' (every access_method other than 'desktop'
# is labelled 'mobile').
# Placeholders, in order: inclusive start and end dates, compared against a date
# assembled from the year/month/day columns.
wpm_query = """
SELECT CASE WHEN access_method = 'desktop' THEN 'desktop' ELSE 'mobile' END as mode,
            SUM(view_count) AS total_pageviews, 'WPM' as group
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
    (SELECT DISTINCT page_id 
    FROM ryanmax.population_wpm_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY access_method
"""
# The SQL groups by the raw access_method, so several non-desktop methods can each
# yield their own 'mobile' row; the groupBy below collapses them into one row per
# (mode, group) by summing total_pageviews.
wpm_pageviews = spark.sql(wpm_query.format(start_date_string, end_date_string))\
                .groupBy("mode", "group").agg(fn.sum("total_pageviews").alias("total_pageviews"))

In [19]:
# Stack the WPM and W page-view totals and bring them to pandas.
# Columns: mode, group, total_pageviews.
all_pageviews = wpm_pageviews.union(w_pageviews).toPandas()
all_pageviews

Unnamed: 0,mode,group,total_pageviews
0,mobile,WPM,125364759
1,desktop,WPM,62650292
2,mobile,W,4175643369
3,desktop,W,3134600739


Join page loads and events summary:

In [20]:
# Attach the page-view total of each (mode, group) to its event rows, then derive:
#   pageloads_without_event - page views that did not produce the event
#   event_ratio             - share of page views that produced the event
#   pages_per_event         - page views per page load with the event
# The three assign steps are kept separate so the new columns are appended in this order.
events_summary = (
    all_events
    .merge(all_pageviews, on=['mode', 'group'])
    .assign(pageloads_without_event=lambda s: s['total_pageviews'] - s['pageloads_with_event'])
    .assign(event_ratio=lambda s: s['pageloads_with_event'] / s['total_pageviews'])
    .assign(pages_per_event=lambda s: s['total_pageviews'] / s['pageloads_with_event'])
)

events_summary

Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
0,desktop,fnClick,286003,WPM,62650292,62364289,0.004565,219.054667
1,desktop,extClick,515856,WPM,62650292,62134436,0.008234,121.449187
2,desktop,fnHover,936468,WPM,62650292,61713824,0.014948,66.900622
3,desktop,upClick,18112,WPM,62650292,62632180,0.000289,3459.048807
4,mobile,extClick,229585,WPM,125364759,125135174,0.001831,546.049433
5,mobile,fnHover,69077,WPM,125364759,125295682,0.000551,1814.855292
6,mobile,fnClick,402070,WPM,125364759,124962689,0.003207,311.798341
7,mobile,upClick,3182,WPM,125364759,125361577,2.5e-05,39398.101508
8,desktop,fnClick,6301025,W,3134600739,3128299714,0.00201,497.474735
9,desktop,extClick,27239064,W,3134600739,3107361675,0.00869,115.077403


In [21]:
from IPython.core import display as ICD
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# scipy.stats is loaded. The name `scipy` is still bound as before.
import scipy.stats

# For every (mode, eventType) pair, compare WPM against W with Fisher's exact test
# on the 2x2 table [pageloads_with_event, pageloads_without_event] x [group].
# Rows keep the order they have in events_summary; in the recorded outputs the
# WPM row precedes the W row, so the odds ratio reads as WPM relative to W.
for mode in events_summary['mode'].unique():
    for eventType in events_summary['eventType'].unique():
        print("\n-------------")
        df = events_summary[(events_summary['mode']==mode)
                       & (events_summary['group'].isin(["WPM", "W"]))
                       & (events_summary['eventType']==eventType)]
        ICD.display(df)
        print("Contingency table:")
        # .values replaces DataFrame.as_matrix(), which was deprecated and then
        # removed from pandas.
        cm = df[['pageloads_with_event', 'pageloads_without_event']].values
        print(cm)
        # fisher_exact needs exactly one row per group; report and skip any
        # combination missing from one of the groups instead of aborting the loop.
        if cm.shape != (2, 2):
            print("\nSkipping Fisher's exact test: expected 2 rows (WPM, W), found {}".format(len(df)))
            continue
        oddsratio, pvalue = scipy.stats.fisher_exact(cm)
        if pvalue < 0.001:
            print("\nOddsRatio: {}, p-value < 0.001".format(oddsratio))
        else:
            print("\nOddsRatio: {}, p-value = {}".format(oddsratio, pvalue))


-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
0,desktop,fnClick,286003,WPM,62650292,62364289,0.004565,219.054667
8,desktop,fnClick,6301025,W,3134600739,3128299714,0.00201,497.474735


Contingency table:
[[    286003   62364289]
 [   6301025 3128299714]]

OddsRatio: 2.2768360831203105, p-value < 0.001

-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
1,desktop,extClick,515856,WPM,62650292,62134436,0.008234,121.449187
9,desktop,extClick,27239064,W,3134600739,3107361675,0.00869,115.077403


Contingency table:
[[    515856   62134436]
 [  27239064 3107361675]]

OddsRatio: 0.9470998124648808, p-value < 0.001

-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
2,desktop,fnHover,936468,WPM,62650292,61713824,0.014948,66.900622
10,desktop,fnHover,23558456,W,3134600739,3111042283,0.007516,133.056289


Contingency table:
[[    936468   61713824]
 [  23558456 3111042283]]

OddsRatio: 2.0038701411178237, p-value < 0.001

-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
3,desktop,upClick,18112,WPM,62650292,62632180,0.000289,3459.048807
11,desktop,upClick,495785,W,3134600739,3134104954,0.000158,6322.500154


Contingency table:
[[     18112   62632180]
 [    495785 3134104954]]

OddsRatio: 1.8280540577494713, p-value < 0.001

-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
6,mobile,fnClick,402070,WPM,125364759,124962689,0.003207,311.798341
14,mobile,fnClick,11358936,W,4175643369,4164284433,0.00272,367.608671


Contingency table:
[[    402070  124962689]
 [  11358936 4164284433]]

OddsRatio: 1.1795708751281577, p-value < 0.001

-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
4,mobile,extClick,229585,WPM,125364759,125135174,0.001831,546.049433
12,mobile,extClick,18603885,W,4175643369,4157039484,0.004455,224.450074


Contingency table:
[[    229585  125135174]
 [  18603885 4157039484]]

OddsRatio: 0.4099629516591732, p-value < 0.001

-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
5,mobile,fnHover,69077,WPM,125364759,125295682,0.000551,1814.855292
13,mobile,fnHover,1496324,W,4175643369,4174147045,0.000358,2790.601079


Contingency table:
[[     69077  125295682]
 [   1496324 4174147045]]

OddsRatio: 1.537940259881809, p-value < 0.001

-------------


Unnamed: 0,mode,eventType,pageloads_with_event,group,total_pageviews,pageloads_without_event,event_ratio,pages_per_event
7,mobile,upClick,3182,WPM,125364759,125361577,2.5e-05,39398.101508
15,mobile,upClick,94763,W,4175643369,4175548606,2.3e-05,44064.068983


Contingency table:
[[      3182  125361577]
 [     94763 4175548606]]

OddsRatio: 1.1184342831192702, p-value < 0.001
