# Most Visited WP:M pages
- Includes event and pageview data for the most visited WP:M pages

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

## Top 20 most visited WP:M pages with external links

In [2]:
# Top 20 most visited WP:M pages with external links.
# Sums 'user'-agent pageviews on en.wikipedia per (page_id, page_title),
# restricted to pages listed in ryanmax.population_wpm_pages_with_extlinks.
# The two {} placeholders take the inclusive start and end dates of the window
# (start_date_string / end_date_string -- presumably set by data-defaults.py).
top_wpm_query = """
SELECT page_id, page_title, SUM(view_count) AS view_count
FROM wmf.pageview_hourly
WHERE project = 'en.wikipedia'
AND page_id IN
    (SELECT page_id
    FROM ryanmax.population_wpm_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY page_id, page_title
ORDER BY view_count DESC
LIMIT 20
"""

# A temp view is only a named query plan: without caching, the aggregation
# over wmf.pageview_hourly would be recomputed for toPandas() below and again
# by every query that reads temp_top_wpm. cache() keeps the 20-row result after
# its first evaluation, which also guarantees every reader sees the same rows
# even if pages tie on view_count at the LIMIT boundary.
top_wpm = spark.sql(
    top_wpm_query.format(start_date_string, end_date_string)
).cache()
top_wpm.createOrReplaceTempView("temp_top_wpm")
top_wpm.toPandas()

Unnamed: 0,page_id,page_title,view_count
0,43573275,Elizabeth_Holmes,1349128
1,41779862,Theranos,546943
2,58911,Measles,386040
3,18079,Leonardo_da_Vinci,372784
4,27546,Sexual_intercourse,346756
5,37556,Asperger_syndrome,321686
6,791546,Ketogenic_diet,317814
7,56880920,Ramesh_Balwani,305086
8,1232575,Suicide_methods,280846
9,44990,Ryan_White,260539


### Event counts for top 20 most visited WP:M pages with external links

In [3]:
# Event counts for the top 20 most visited WP:M pages with external links.
# Joins temp_top_wpm to citationusage on page_id and, per page, counts the
# enwiki events with useragent_is_bot = FALSE inside the date window, then
# divides that count by the page's view_count to get events per pageview.
# Placeholders, in order:
#   1. extra filter SQL (event_exclusion_sql -- presumably the common
#      exclusions loaded by data-defaults.py; it is spliced into the WHERE
#      clause as-is)
#   2. inclusive start date
#   3. inclusive end date
top_event_query = """
SELECT temp_top_wpm.page_id, temp_top_wpm.page_title, view_count, COUNT(*) as event_count, COUNT(*)/view_count AS events_per_pageview
FROM temp_top_wpm, citationusage
WHERE temp_top_wpm.page_id = citationusage.page_id
AND citationusage.wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY temp_top_wpm.page_id, temp_top_wpm.page_title, view_count
ORDER BY view_count DESC
"""

top_event = spark.sql(
    top_event_query.format(
        event_exclusion_sql,
        start_date_string,
        end_date_string,
    )
)
# Last expression of the cell: renders the result as a pandas DataFrame.
top_event.toPandas()

Unnamed: 0,page_id,page_title,view_count,event_count,events_per_pageview
0,43573275,Elizabeth_Holmes,1349128,39237,0.029083
1,41779862,Theranos,546943,22599,0.041319
2,58911,Measles,386040,10390,0.026914
3,18079,Leonardo_da_Vinci,372784,9660,0.025913
4,27546,Sexual_intercourse,346756,5020,0.014477
5,37556,Asperger_syndrome,321686,6287,0.019544
6,791546,Ketogenic_diet,317814,2695,0.00848
7,56880920,Ramesh_Balwani,305086,5743,0.018824
8,1232575,Suicide_methods,280846,3103,0.011049
9,44990,Ryan_White,260539,3401,0.013054


### extClick event counts for top 20 most visited WP:M pages with external links

In [4]:
# extClick event counts for the top 20 most visited WP:M pages with external
# links. Same shape as the all-events query, but only rows with
# action = 'extClick' are counted: joins temp_top_wpm to citationusage on
# page_id, keeps enwiki events with useragent_is_bot = FALSE inside the date
# window, and divides the per-page count by view_count.
# Placeholders, in order:
#   1. extra filter SQL (event_exclusion_sql -- presumably the common
#      exclusions loaded by data-defaults.py; it is spliced into the WHERE
#      clause as-is)
#   2. inclusive start date
#   3. inclusive end date
top_extClick_query = """
SELECT temp_top_wpm.page_id, temp_top_wpm.page_title, view_count, COUNT(*) as event_count, COUNT(*)/view_count AS events_per_pageview
FROM temp_top_wpm, citationusage
WHERE temp_top_wpm.page_id = citationusage.page_id
AND citationusage.wiki = 'enwiki'
AND action = 'extClick'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY temp_top_wpm.page_id, temp_top_wpm.page_title, view_count
ORDER BY view_count DESC
"""

top_extClick = spark.sql(
    top_extClick_query.format(
        event_exclusion_sql,
        start_date_string,
        end_date_string,
    )
)
# Last expression of the cell: renders the result as a pandas DataFrame.
top_extClick.toPandas()

Unnamed: 0,page_id,page_title,view_count,event_count,events_per_pageview
0,43573275,Elizabeth_Holmes,1349128,6778,0.005024
1,41779862,Theranos,546943,12967,0.023708
2,58911,Measles,386040,1035,0.002681
3,18079,Leonardo_da_Vinci,372784,535,0.001435
4,27546,Sexual_intercourse,346756,627,0.001808
5,37556,Asperger_syndrome,321686,402,0.00125
6,791546,Ketogenic_diet,317814,846,0.002662
7,56880920,Ramesh_Balwani,305086,1661,0.005444
8,1232575,Suicide_methods,280846,274,0.000976
9,44990,Ryan_White,260539,1858,0.007131
