## WP:M Page Classes/Categories

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

In [2]:
# Total event count for each WP:M class (FA, A, GA, B, C, Start, Stub) by event type
pm_category_events_query = """
SELECT projmed_categories.category, action, count(*) count
FROM 
    citationusage, 
    (SELECT DISTINCT page_id, category 
    FROM ryanmax.projmed_categories 
    WHERE projmed_categories.category LIKE '%Class_medicine_articles%') 
    AS projmed_categories
WHERE citationusage.page_id = projmed_categories.page_id
    AND wiki = 'enwiki'
    AND citationusage.page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY projmed_categories.category, action
ORDER BY projmed_categories.category, action
"""

pm_category_events = spark.sql(
    pm_category_events_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
pm_category_events_rdd = pm_category_events.rdd
pm_category_events_df = sqlContext.createDataFrame(pm_category_events_rdd)
pm_category_events_pandas = pm_category_events_df.toPandas()
# set precision of count values so they don't appear with a decimal place ... likely an easier way to do this
pm_category_events_pandas['count'] = pm_category_events_pandas['count'].map(lambda x: '{0:.0f}'.format(x))
pm_category_events_pandas.pivot(index='category', columns='action', values='count')

action,extClick,fnClick,fnHover,upClick
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B-Class_medicine_articles,48644,84512,153735,4683.0
C-Class_medicine_articles,56338,63619,108421,4671.0
Category-Class_medicine_articles,3,2,10,
Disambig-Class_medicine_articles,46,6,8,1.0
FA-Class_medicine_articles,3464,7497,17935,204.0
FL-Class_medicine_articles,116,239,385,14.0
GA-Class_medicine_articles,7043,16781,35509,2341.0
List-Class_medicine_articles,3245,2820,4459,43.0
Redirect-Class_medicine_articles,39,33,29,1.0
Start-Class_medicine_articles,76414,50193,74532,1775.0


In [3]:
# count of pages with external links for each WP:M class (FA, A, GA, B, C, Start, Stub)
# numbers will not match [1] because we're limiting to namespace 0 pages with external links
# [1] https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Medicine/Assessment#Statistics
pm_category_pages = """
SELECT category, COUNT(DISTINCT page_id) AS pages_w_links 
FROM ryanmax.projmed_categories 
WHERE category LIKE '%Class_medicine_articles%' 
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}' 
            AND to_date(dt) <= '{}'
            )
GROUP BY category
ORDER BY COUNT(*) DESC
"""
pm_cat_counts = spark.sql(pm_category_pages.format(start_date_string, end_date_string))
cats = sqlContext.createDataFrame(pm_cat_counts.rdd)
cats.toPandas()

Unnamed: 0,category,pages_w_links
0,Start-Class_medicine_articles,14572
1,Stub-Class_medicine_articles,9858
2,C-Class_medicine_articles,5368
3,B-Class_medicine_articles,2178
4,List-Class_medicine_articles,456
5,GA-Class_medicine_articles,241
6,FA-Class_medicine_articles,62
7,Disambig-Class_medicine_articles,17
8,Redirect-Class_medicine_articles,17
9,FL-Class_medicine_articles,12


In [4]:
# pageloads for each WP:M class (FA, A, GA, B, C, Start, Stub)
# numbers will be higher than overall WP:M pageloads since one page may have more than one category
pm_category_pageloads_query = """
SELECT projmed_categories.category, count(*) as pageloads
FROM 
    citationusagepageload, 
    (SELECT DISTINCT page_id, category 
    FROM ryanmax.projmed_categories 
    WHERE projmed_categories.category LIKE '%Class_medicine_articles%') 
    AS projmed_categories
WHERE citationusagepageload.page_id = projmed_categories.page_id
    AND wiki = 'enwiki'
    AND citationusagepageload.page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    GROUP BY projmed_categories.category
    ORDER BY projmed_categories.category
"""

pm_category_pageloads = spark.sql(
    pm_category_pageloads_query.format(
        start_date_string, end_date_string,
        pageload_exclusion_sql,
        start_date_string, end_date_string,
    ))
pm_category_pageloads_rdd = pm_category_pageloads.rdd
pm_category_pageloads_df = sqlContext.createDataFrame(pm_category_pageloads_rdd)
pm_category_pageloads_df.toPandas()

Unnamed: 0,category,pageloads
0,B-Class_medicine_articles,11198987
1,C-Class_medicine_articles,10140558
2,Category-Class_medicine_articles,49
3,Disambig-Class_medicine_articles,7214
4,FA-Class_medicine_articles,829491
5,FL-Class_medicine_articles,25891
6,GA-Class_medicine_articles,1816339
7,List-Class_medicine_articles,351651
8,Redirect-Class_medicine_articles,4866
9,Start-Class_medicine_articles,10728110
