# Most Visited WP:M pages
- Includes pageview counts and citation-usage event counts for the most-visited WP:M pages that have external links.

In [1]:
# Load the shared notebook defaults: study dates, common SQL exclusions, and parquet files
# for anonymized data.
# `-i` runs the script in this notebook's own namespace (not a fresh one), so the names it
# defines remain available to the cells below.
# NOTE(review): later cells use `spark`, `start_date_string`, `end_date_string`, and
# `event_exclusion_sql` without defining them — presumably they are set by this script; confirm.
%run -i 'data-defaults.py'

## Top 20 most visited WP:M pages with external links

In [2]:
# Top 20 most-viewed WP:M pages with external links.
#
# Sums hourly 'user' (non-bot agent) pageviews on en.wikipedia for every page listed in
# ryanmax.population_wpm_pages_with_extlinks, restricted to the inclusive date range
# [start_date_string, end_date_string], and keeps the 20 pages with the highest totals.
# Titles come from a LEFT JOIN, so a page without a title row is still counted
# (its page_title will be NULL).
# The two '{}' placeholders are filled, in order, with the start and end date strings,
# which must already be defined when this cell runs.
top_wpm_query = """
SELECT pageview_hourly.page_id, 
    population_page_titles_20190420.page_title, 
    SUM(pageview_hourly.view_count) AS view_count
FROM wmf.pageview_hourly
LEFT JOIN ryanmax.population_page_titles_20190420 
    ON pageview_hourly.page_id = population_page_titles_20190420.page_id
WHERE project = 'en.wikipedia'
AND pageview_hourly.page_id IN
    (SELECT page_id 
    FROM ryanmax.population_wpm_pages_with_extlinks)
AND agent_type = 'user'
AND to_date(CONCAT(year,'-',month,'-',day)) >= '{}'
AND to_date(CONCAT(year,'-',month,'-',day)) <= '{}'
GROUP BY pageview_hourly.page_id, population_page_titles_20190420.page_title
ORDER BY view_count DESC
LIMIT 20
"""

# Cache the 20-row result. The temp view registered below is lazy: without caching, every
# later query that reads temp_top_wpm would re-run this whole aggregation over
# wmf.pageview_hourly, and (because ORDER BY ... LIMIT has no tie-breaker) could in
# principle select a different set of pages on ties.
top_wpm = spark.sql(top_wpm_query.format(start_date_string, end_date_string)).cache()

# Expose the result to subsequent SQL cells under the name temp_top_wpm.
top_wpm.createOrReplaceTempView("temp_top_wpm")

# Materialize (this also populates the cache) and display the result.
top_wpm.toPandas()

Unnamed: 0,page_id,page_title,view_count
0,43573275,Elizabeth Holmes,1349527
1,41779862,Theranos,546969
2,58911,Measles,388340
3,18079,Leonardo da Vinci,380421
4,27546,Sexual intercourse,371290
5,37556,Asperger syndrome,336547
6,791546,Ketogenic diet,319326
7,4488176,Factitious disorder imposed on another,314542
8,4501,Black Death,308057
9,56880920,Ramesh Balwani,305280


### Event counts for top 20 most visited WP:M pages with external links

In [3]:
# Event counts for the top 20 most-visited WP:M pages with external links.
#
# Joins the temp_top_wpm view (page_id, page_title, view_count) to citationusage on page_id
# and, per page, reports the number of matching events and the events-per-pageview ratio.
# Only enwiki, non-bot events whose event_time date falls in the inclusive range
# [start_date_string, end_date_string] are counted.
# Because this is an inner join, a page with no matching events does not appear in the output.
#
# Placeholders, in order:
#   1. event_exclusion_sql - extra predicate text spliced into the WHERE clause
#      (assumed to be zero or more "AND ..." conditions defined outside this cell - confirm)
#   2. start date string
#   3. end date string
top_event_query = """
SELECT temp_top_wpm.page_id, 
    temp_top_wpm.page_title, 
    view_count, 
    COUNT(*) as event_count, 
    COUNT(*)/view_count AS events_per_pageview
FROM temp_top_wpm
INNER JOIN citationusage
    ON temp_top_wpm.page_id = citationusage.page_id
WHERE citationusage.wiki = 'enwiki'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY temp_top_wpm.page_id, temp_top_wpm.page_title, view_count
ORDER BY view_count DESC
"""

top_event = spark.sql(top_event_query
                      .format(event_exclusion_sql,
                              start_date_string, end_date_string))
top_event.toPandas()

Unnamed: 0,page_id,page_title,view_count,event_count,events_per_pageview
0,43573275,Elizabeth Holmes,1349527,39237,0.029075
1,41779862,Theranos,546969,22599,0.041317
2,58911,Measles,388340,10390,0.026755
3,18079,Leonardo da Vinci,380421,9660,0.025393
4,27546,Sexual intercourse,371290,5020,0.01352
5,37556,Asperger syndrome,336547,6287,0.018681
6,791546,Ketogenic diet,319326,2695,0.00844
7,4488176,Factitious disorder imposed on another,314542,1385,0.004403
8,4501,Black Death,308057,6208,0.020152
9,56880920,Ramesh Balwani,305280,5743,0.018812


### extClick event counts for top 20 most visited WP:M pages with external links

In [4]:
# extClick event counts for the top 20 most-visited WP:M pages with external links.
#
# Joins the temp_top_wpm view (page_id, page_title, view_count) to citationusage on page_id
# and, per page, reports the number of events with action = 'extClick' and the
# extClick-events-per-pageview ratio.
# Only enwiki, non-bot events whose event_time date falls in the inclusive range
# [start_date_string, end_date_string] are counted.
# Because this is an inner join, a page with no matching extClick events does not appear
# in the output.
#
# Placeholders, in order:
#   1. event_exclusion_sql - extra predicate text spliced into the WHERE clause
#      (assumed to be zero or more "AND ..." conditions defined outside this cell - confirm)
#   2. start date string
#   3. end date string
top_extClick_query = """
SELECT temp_top_wpm.page_id, 
    temp_top_wpm.page_title, 
    view_count, COUNT(*) as event_count, 
    COUNT(*)/view_count AS events_per_pageview
FROM temp_top_wpm
INNER JOIN citationusage
    ON temp_top_wpm.page_id = citationusage.page_id
WHERE citationusage.wiki = 'enwiki'
AND action = 'extClick'
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY temp_top_wpm.page_id, temp_top_wpm.page_title, view_count
ORDER BY view_count DESC
"""

top_extClick = spark.sql(top_extClick_query
                           .format(event_exclusion_sql,
                                   start_date_string, end_date_string))
top_extClick.toPandas()

Unnamed: 0,page_id,page_title,view_count,event_count,events_per_pageview
0,43573275,Elizabeth Holmes,1349527,6778,0.005023
1,41779862,Theranos,546969,12967,0.023707
2,58911,Measles,388340,1035,0.002665
3,18079,Leonardo da Vinci,380421,535,0.001406
4,27546,Sexual intercourse,371290,627,0.001689
5,37556,Asperger syndrome,336547,402,0.001194
6,791546,Ketogenic diet,319326,846,0.002649
7,4488176,Factitious disorder imposed on another,314542,128,0.000407
8,4501,Black Death,308057,858,0.002785
9,56880920,Ramesh Balwani,305280,1661,0.005441
