## Infobox and Section Events

In [1]:
# Load the shared notebook defaults: study dates, common SQL exclusions and
# parquet files for anonymized data.
# `-i` makes IPython run the script in this notebook's own namespace rather than
# an empty one. Later cells presumably get `spark`, `sqlContext`, the
# `*_exclusion_sql` fragments and the `*_date_string` values from this script --
# NOTE(review): confirm against data-defaults.py.
%run -i 'data-defaults.py'

In [2]:
# Count of enwiki infobox events from non-bot user agents, grouped by the
# section_id they were recorded under (e.g. External_links) and by action,
# most frequent first. Rows with a null section_id are included.
# No date or page limits are applied; the only additional filter is whatever
# clause `event_exclusion_sql` contributes to the WHERE block.
infobox_section_events_query = """
SELECT section_id, action, count(*) count
FROM
    citationusage
WHERE
    wiki = 'enwiki'
    AND in_infobox = TRUE
    {}
    AND useragent_is_bot = FALSE
GROUP BY section_id, action
ORDER BY count desc
"""

# Fill the single `{}` placeholder with the shared exclusion clause and run it.
infobox_section_events = spark.sql(
    infobox_section_events_query.format(event_exclusion_sql)
)

# Rebuild a DataFrame from the result's RDD, keeping both intermediate names
# available to later cells.
# NOTE(review): spark.sql() already returns a DataFrame, so this round-trip looks
# redundant -- confirm whether it is a deliberate workaround before removing it.
infobox_section_events_rdd = infobox_section_events.rdd
infobox_section_events_df = sqlContext.createDataFrame(infobox_section_events_rdd)

# Print the top rows (show() defaults: 20 rows, long strings truncated).
infobox_section_events_df.show()

+--------------------+--------+--------+
|          section_id|  action|   count|
+--------------------+--------+--------+
|                null|extClick|14979994|
|                null| fnClick| 1849201|
|                null| fnHover| 1107570|
|           Reception| fnClick|   28389|
|           Reception| fnHover|   13881|
|      External_links|extClick|    7268|
|            Timeline|extClick|    4907|
|            Rankings| fnClick|    4270|
|            Rankings| fnHover|    3925|
|  In_popular_culture|extClick|    3107|
|       Bonnie_Parker|extClick|    2648|
|          Soundtrack|extClick|    2208|
|      Elevator_video|extClick|    2046|
|              Career|extClick|    2020|
|             History|extClick|    1918|
|       Assassination|extClick|    1774|
| Professional_career|extClick|    1671|
|Gesundheit!_Insti...|extClick|    1492|
| Body_camera_footage|extClick|    1425|
|              Events|extClick|    1392|
+--------------------+--------+--------+
only showing top 20 rows

In [3]:
# Infobox event totals per top-level (H2) section and action, for WP:M pages only.
#
# How the query narrows citationusage:
#   * enwiki events flagged in_infobox, from non-bot user agents;
#   * page_id must appear in ryanmax.projmed_with_extlinks within the date window;
#   * event_time must fall within the same date window;
#   * session_id must appear in citationusagepageload (enwiki, non-bot, same window).
# The LEFT JOIN to ryanmax.wpm_sections maps each event's section_id to its H2
# heading; events without a matching row are kept with a null section_h2.
pm_section_events_query = """
SELECT wpm_sections.section_h2, action, count(*) count
FROM 
    citationusage
    LEFT JOIN ryanmax.wpm_sections 
        ON 
        wpm_sections.page_id = citationusage.page_id 
        AND wpm_sections.section_id = citationusage.section_id
WHERE
    wiki = 'enwiki'
    AND in_infobox = TRUE
    AND citationusage.page_id IN (
                            SELECT DISTINCT page_id 
                            FROM ryanmax.projmed_with_extlinks 
                            WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}'
                        )
    {}
    AND to_date(citationusage.event_time) >= '{}'
    AND to_date(citationusage.event_time) <= '{}'
    AND useragent_is_bot = FALSE
    AND session_id in (
        SELECT session_id
        FROM citationusagepageload
        WHERE wiki = 'enwiki'
        {}
        AND to_date(event_time) >= '{}'
        AND to_date(event_time) <= '{}'
        AND useragent_is_bot = FALSE
        )
GROUP BY wpm_sections.section_h2, action
ORDER BY count desc
"""

# The template's placeholders are positional, so the argument order below must
# mirror the order in which the `{}` markers appear in the query text.
_pm_section_events_sql = pm_section_events_query.format(
    # page_id subquery: projmed_with_extlinks.dt window
    start_date_string, end_date_string,
    # outer citationusage filters: exclusion clause, then event_time window
    event_exclusion_sql, start_date_string, end_date_string,
    # session_id subquery: pageload exclusion clause, then event_time window
    pageload_exclusion_sql, start_date_string, end_date_string,
)
pm_section_events = spark.sql(_pm_section_events_sql)

# Rebuild a DataFrame from the result's RDD, then collect it to pandas for the
# pivot in the next cell. Both intermediate names stay defined for later cells.
pm_section_events_rdd = pm_section_events.rdd
pm_section_events_df = sqlContext.createDataFrame(pm_section_events_rdd)
pm_section_events_pandas = pm_section_events_df.toPandas()


### Count of Infobox events (by all event types) occurring under each top-level (H2) section ID
**Limits: WP:M pages and >= 5 events**

In [4]:
# Pivot the per-(section_h2, action) event counts into a section x action table,
# keeping only combinations with at least MIN_EVENT_COUNT events.
MIN_EVENT_COUNT = 5

# Work on a copy so re-running this cell never alters the collected query result.
section_pda = pm_section_events_pandas.copy()

# Events with a null section_h2 get a readable label.
# Assign the filled column back instead of calling fillna(inplace=True) on
# `section_pda.section_h2`: that form mutates a Series obtained by attribute
# access, which pandas warns about and which does not update the frame when
# copy-on-write is enabled.
section_pda['section_h2'] = section_pda['section_h2'].fillna(
    '-- Infobox event outside of a section --'
)

# Make sure counts are plain integers before comparing and formatting them.
section_pda['count'] = section_pda['count'].astype(int)

# limit to counts of MIN_EVENT_COUNT (5) or more
df_filtered = section_pda[section_pda['count'] >= MIN_EVENT_COUNT].copy()

# Render counts as whole-number strings before the pivot, so cells are shown
# without decimals once missing combinations introduce NaN.
df_filtered['count'] = df_filtered['count'].map('{0:.0f}'.format)

# One row per section_h2, one column per action; the cell's displayed output.
df_filtered.pivot(index='section_h2', columns='action', values='count')

action,extClick,fnClick,fnHover
section_h2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-- Infobox event outside of a section --,34803.0,18400.0,6641.0
Academics,,6.0,6.0
Battle_with_schools,,7.0,5.0
Books,9.0,,
Causes,,,6.0
Common_families_of_interleukins,10.0,,
Diagnosis,5.0,,
External_links,1095.0,,
Family_and_birth,19.0,,
Ganfyd,5.0,,
