## Event Data: Most Visited WP:M pages
#### Relies on data in `ryanmax.top1k_med_anon`, previously written by [pageload-event-anonymizedData.ipynb](pageload-event-anonymizedData.ipynb)

In [1]:
# Load the notebook-wide defaults: study dates, common SQL exclusions and
# parquet files for anonymized data.
# `%run -i` executes the script inside this notebook's interactive namespace,
# so the names it defines stay available to the cells below.
# NOTE(review): presumably this is where `spark`, `sc`,
# `pageload_exclusion_sql`, `event_exclusion_sql`, `start_date_string` and
# `end_date_string` come from -- confirm against data-defaults.py.
%run -i 'data-defaults.py'

In [2]:
# Top 20 most visited WP:M pages with external links: every column of
# ryanmax.top1k_med_anon, sorted by `count` descending, first 20 rows only.
top20_rdd = sc.emptyRDD()  # empty placeholder RDD; not read again in this cell

top20_query = """
SELECT * FROM ryanmax.top1k_med_anon
ORDER BY count desc
LIMIT 20
"""
top20 = spark.sql(top20_query)

# Keep the pandas conversion as the cell's final expression so the notebook
# renders the resulting table.
top20.toPandas()

Unnamed: 0,page_id,title,count
0,43573275,Elizabeth_Holmes,180826
1,18079,Leonardo_da_Vinci,85207
2,58911,Measles,79840
3,41779862,Theranos,76707
4,27546,Sexual_intercourse,75459
5,7188999,Wiggers_diagram,72130
6,37556,Asperger_syndrome,67746
7,791546,Ketogenic_diet,66024
8,4501,Black_Death,60670
9,52135,Pneumonia,59182


In [3]:
# Event counts for the top 20 most visited WP:M pages with external links.
# Each result row carries the page's stored `count` (as pgload_count) plus the
# number of matching citationusage rows (as event_count).
top20_event_rdd = sc.emptyRDD()  # empty placeholder RDD; not read again in this cell

# The six `{}` slots are filled positionally, in this order:
#   1. extra filter SQL for the citationusagepageload sub-select
#   2-3. start / end date bounding the sub-select
#   4. extra filter SQL for the outer citationusage query
#   5-6. start / end date bounding the outer query
# Only events whose session_id also appears in a non-bot enwiki row of
# citationusagepageload inside the date window are counted.
top20_event_query = """
SELECT top1k_med_anon.page_id, top1k_med_anon.title, top1k_med_anon.count as pgload_count, count(*) as event_count
FROM citationusage, ryanmax.top1k_med_anon
WHERE citationusage.page_id IN (SELECT page_id FROM ryanmax.top1k_med_anon ORDER BY count desc LIMIT 20)
AND citationusage.page_id = top1k_med_anon.page_id
AND wiki = 'enwiki'
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY top1k_med_anon.page_id, top1k_med_anon.title, top1k_med_anon.count
ORDER BY pgload_count DESC
"""

top20_event = spark.sql(
    top20_event_query.format(
        pageload_exclusion_sql,  # slot 1: sub-select filters
        start_date_string,       # slot 2
        end_date_string,         # slot 3
        event_exclusion_sql,     # slot 4: outer-query filters
        start_date_string,       # slot 5
        end_date_string,         # slot 6
    )
)

# Final expression of the cell, so the notebook renders the table.
top20_event.toPandas()

Unnamed: 0,page_id,title,pgload_count,event_count
0,43573275,Elizabeth_Holmes,180826,7549
1,18079,Leonardo_da_Vinci,85207,2446
2,58911,Measles,79840,2771
3,41779862,Theranos,76707,4295
4,27546,Sexual_intercourse,75459,1261
5,7188999,Wiggers_diagram,72130,5
6,37556,Asperger_syndrome,67746,1684
7,791546,Ketogenic_diet,66024,712
8,4501,Black_Death,60670,1541
9,52135,Pneumonia,59182,1057


In [4]:
# extClick event counts for the top 20 most visited WP:M pages with external
# links. Same shape as the all-events query, with the extra predicate
# `action = 'extClick'`; the count is exposed as extClick_count.
top20_extClick_rdd = sc.emptyRDD()  # empty placeholder RDD; not read again in this cell

# The six `{}` slots are filled positionally, in this order:
#   1. extra filter SQL for the citationusagepageload sub-select
#   2-3. start / end date bounding the sub-select
#   4. extra filter SQL for the outer citationusage query
#   5-6. start / end date bounding the outer query
# Only events whose session_id also appears in a non-bot enwiki row of
# citationusagepageload inside the date window are counted.
top20_extClick_query = """
SELECT top1k_med_anon.page_id, top1k_med_anon.title, top1k_med_anon.count as pgload_count, count(*) as extClick_count
FROM citationusage, ryanmax.top1k_med_anon
WHERE citationusage.page_id IN (SELECT page_id FROM ryanmax.top1k_med_anon ORDER BY count desc LIMIT 20)
AND citationusage.page_id = top1k_med_anon.page_id
AND wiki = 'enwiki'
AND action = 'extClick'
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY top1k_med_anon.page_id, top1k_med_anon.title, top1k_med_anon.count
ORDER BY pgload_count DESC
"""

top20_extClick = spark.sql(
    top20_extClick_query.format(
        pageload_exclusion_sql,  # slot 1: sub-select filters
        start_date_string,       # slot 2
        end_date_string,         # slot 3
        event_exclusion_sql,     # slot 4: outer-query filters
        start_date_string,       # slot 5
        end_date_string,         # slot 6
    )
)

# Final expression of the cell, so the notebook renders the table.
top20_extClick.toPandas()

Unnamed: 0,page_id,title,pgload_count,extClick_count
0,43573275,Elizabeth_Holmes,180826,1396
1,18079,Leonardo_da_Vinci,85207,142
2,58911,Measles,79840,292
3,41779862,Theranos,76707,2454
4,27546,Sexual_intercourse,75459,188
5,7188999,Wiggers_diagram,72130,4
6,37556,Asperger_syndrome,67746,110
7,791546,Ketogenic_diet,66024,232
8,4501,Black_Death,60670,199
9,52135,Pneumonia,59182,309
