## Total Event Counts from Top 20 Hostnames

In [1]:
# Load basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data.
# `-i` runs the script in this notebook's own namespace, so the names it defines stay available to later cells
# (presumably start_date_string, end_date_string, event_exclusion_sql and pageload_exclusion_sql, which the
# cells below use without defining them -- verify against data-defaults.py).
%run -i 'data-defaults.py'

### all event types

In [4]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W pages with external links
#
# Positional placeholders, in order: extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_events = spark.sql(
    top_hosts_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_events_rdd = top_hosts_events.rdd
top_hosts_events_df = sqlContext.createDataFrame(top_hosts_events_rdd)
top_hosts_events_pandas = top_hosts_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_events_pandas


Unnamed: 0,host,count
0,en.wikipedia.org,11367746
1,en.m.wikipedia.org,4837557
2,www.imdb.com,803002
3,web.archive.org,528903
4,tools.wmflabs.org,282577
5,www.youtube.com,157707
6,books.google.com,153293
7,doi.org,108797
8,www.espncricinfo.com,101142
9,www.nytimes.com,80159


In [5]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W:PM pages with external links
# (page ids come from ryanmax.projmed_with_extlinks)
#
# Positional placeholders, in order: projmed extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_wpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}' 
            AND to_date(dt) <= '{}'
            )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
    top_hosts_wpm_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_wpm_events_rdd = top_hosts_wpm_events.rdd
top_hosts_wpm_events_df = sqlContext.createDataFrame(top_hosts_wpm_events_rdd)
top_hosts_wpm_events_pandas = top_hosts_wpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_wpm_events_pandas

Unnamed: 0,host,count
0,en.wikipedia.org,472749
1,en.m.wikipedia.org,162507
2,www.ncbi.nlm.nih.gov,22206
3,doi.org,19184
4,web.archive.org,12715
5,books.google.com,4854
6,www.drugs.com,4195
7,www.who.int,3558
8,translate.googleusercontent.com,2777
9,www.cdc.gov,2419


In [6]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to NOT W:PM pages with external links
# (pages in ryanmax.pages_with_extlinks minus those in ryanmax.projmed_with_extlinks)
# The counts are grouped by host only; there is no per-event-type breakdown here.
#
# Positional placeholders, in order: extlinks date range (start, end),
# projmed extlinks date range, event exclusion SQL, event date range,
# pageload exclusion SQL, pageload date range.
#
# NOTE(review): NOT IN matches nothing if the subquery yields a NULL page_id;
# presumably projmed_with_extlinks.page_id is never NULL -- verify.
top_hosts_notwpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
AND page_id NOT IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_notwpm_events = spark.sql(
    top_hosts_notwpm_query.format(
        start_date_string, end_date_string,
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_notwpm_events_rdd = top_hosts_notwpm_events.rdd
top_hosts_notwpm_events_df = sqlContext.createDataFrame(top_hosts_notwpm_events_rdd)
top_hosts_notwpm_events_pandas = top_hosts_notwpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_notwpm_events_pandas

Unnamed: 0,host,count
0,en.wikipedia.org,10894997
1,en.m.wikipedia.org,4675050
2,www.imdb.com,802787
3,web.archive.org,516188
4,tools.wmflabs.org,282278
5,www.youtube.com,156644
6,books.google.com,148439
7,www.espncricinfo.com,101137
8,doi.org,89613
9,www.nytimes.com,78856


### limited to extClick events

In [7]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W pages with external links
# further limited to extClick events
#
# Positional placeholders, in order: extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_events = spark.sql(
    top_hosts_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_events_rdd = top_hosts_events.rdd
top_hosts_events_df = sqlContext.createDataFrame(top_hosts_events_rdd)
top_hosts_events_pandas = top_hosts_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_events_pandas

Unnamed: 0,host,count
0,www.imdb.com,803002
1,web.archive.org,528876
2,tools.wmflabs.org,282577
3,www.youtube.com,157707
4,books.google.com,153265
5,doi.org,108797
6,www.espncricinfo.com,101142
7,www.nytimes.com,80159
8,www.ncbi.nlm.nih.gov,62421
9,www.theguardian.com,59727


In [8]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W:PM pages with external links
# (page ids come from ryanmax.projmed_with_extlinks)
# further limited to extClick events
#
# Positional placeholders, in order: projmed extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_wpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}' 
            AND to_date(dt) <= '{}'
            )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
    top_hosts_wpm_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_wpm_events_rdd = top_hosts_wpm_events.rdd
top_hosts_wpm_events_df = sqlContext.createDataFrame(top_hosts_wpm_events_rdd)
top_hosts_wpm_events_pandas = top_hosts_wpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_wpm_events_pandas

Unnamed: 0,host,count
0,www.ncbi.nlm.nih.gov,22206
1,doi.org,19184
2,web.archive.org,12715
3,books.google.com,4854
4,www.drugs.com,4195
5,www.who.int,3558
6,www.cdc.gov,2419
7,chemapps.stolaf.edu,2018
8,www.nlm.nih.gov,1605
9,apps.who.int,1533


In [9]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to NOT W:PM pages with external links
# (pages in ryanmax.pages_with_extlinks minus those in ryanmax.projmed_with_extlinks)
# further limited to extClick events
# The counts are grouped by host only; there is no per-event-type breakdown here.
#
# Positional placeholders, in order: extlinks date range (start, end),
# projmed extlinks date range, event exclusion SQL, event date range,
# pageload exclusion SQL, pageload date range.
#
# NOTE(review): NOT IN matches nothing if the subquery yields a NULL page_id;
# presumably projmed_with_extlinks.page_id is never NULL -- verify.
top_hosts_notwpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
AND page_id NOT IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_notwpm_events = spark.sql(
    top_hosts_notwpm_query.format(
        start_date_string, end_date_string,
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_notwpm_events_rdd = top_hosts_notwpm_events.rdd
top_hosts_notwpm_events_df = sqlContext.createDataFrame(top_hosts_notwpm_events_rdd)
top_hosts_notwpm_events_pandas = top_hosts_notwpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_notwpm_events_pandas

Unnamed: 0,host,count
0,www.imdb.com,802787
1,web.archive.org,516161
2,tools.wmflabs.org,282278
3,www.youtube.com,156644
4,books.google.com,148411
5,www.espncricinfo.com,101137
6,doi.org,89613
7,www.nytimes.com,78856
8,www.theguardian.com,59134
9,www.bbc.co.uk,58655


### limited to fnHover events

In [10]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W pages with external links
# further limited to fnHover events
#
# Positional placeholders, in order: extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'fnHover'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_events = spark.sql(
    top_hosts_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_events_rdd = top_hosts_events.rdd
top_hosts_events_df = sqlContext.createDataFrame(top_hosts_events_rdd)
top_hosts_events_pandas = top_hosts_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_events_pandas


Unnamed: 0,host,count
0,en.wikipedia.org,9124324
1,en.m.wikipedia.org,554234
2,translate.googleusercontent.com,14446
3,,7440
4,www.biblegateway.com,4495
5,www.translatoruser-int.com,542
6,papago.naver.net,348
7,z5h64q92x9.net,133
8,emedien3.sub.uni-hamburg.de,54
9,en.wikipedi0.org,49


In [11]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W:PM pages with external links
# (page ids come from ryanmax.projmed_with_extlinks)
# further limited to fnHover events
#
# Positional placeholders, in order: projmed extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_wpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'fnHover'
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}' 
            AND to_date(dt) <= '{}'
            )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
    top_hosts_wpm_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_wpm_events_rdd = top_hosts_wpm_events.rdd
top_hosts_wpm_events_df = sqlContext.createDataFrame(top_hosts_wpm_events_rdd)
top_hosts_wpm_events_pandas = top_hosts_wpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_wpm_events_pandas

Unnamed: 0,host,count
0,en.wikipedia.org,372114
1,en.m.wikipedia.org,23331
2,translate.googleusercontent.com,1008
3,,176
4,www.translatoruser-int.com,29
5,papago.naver.net,24
6,www.biblegateway.com,21
7,emedien3.sub.uni-hamburg.de,12
8,z5h64q92x9.net,9
9,scholar.google.com,2


In [12]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to NOT W:PM pages with external links
# (pages in ryanmax.pages_with_extlinks minus those in ryanmax.projmed_with_extlinks)
# further limited to fnHover events
# The counts are grouped by host only; there is no per-event-type breakdown here.
#
# Positional placeholders, in order: extlinks date range (start, end),
# projmed extlinks date range, event exclusion SQL, event date range,
# pageload exclusion SQL, pageload date range.
#
# NOTE(review): NOT IN matches nothing if the subquery yields a NULL page_id;
# presumably projmed_with_extlinks.page_id is never NULL -- verify.
top_hosts_notwpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'fnHover'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
AND page_id NOT IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_notwpm_events = spark.sql(
    top_hosts_notwpm_query.format(
        start_date_string, end_date_string,
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_notwpm_events_rdd = top_hosts_notwpm_events.rdd
top_hosts_notwpm_events_df = sqlContext.createDataFrame(top_hosts_notwpm_events_rdd)
top_hosts_notwpm_events_pandas = top_hosts_notwpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_notwpm_events_pandas

Unnamed: 0,host,count
0,en.wikipedia.org,8752210
1,en.m.wikipedia.org,530903
2,translate.googleusercontent.com,13438
3,,7264
4,www.biblegateway.com,4474
5,www.translatoruser-int.com,513
6,papago.naver.net,324
7,z5h64q92x9.net,124
8,en.wikipedi0.org,49
9,emedien3.sub.uni-hamburg.de,42


### limited to extClick, fnClick OR upClick events

In [14]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W pages with external links
# further limited to extClick, fnClick OR upClick events
#
# Positional placeholders, in order: extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action in ('extClick', 'upClick','fnClick')
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_events = spark.sql(
    top_hosts_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_events_rdd = top_hosts_events.rdd
top_hosts_events_df = sqlContext.createDataFrame(top_hosts_events_rdd)
top_hosts_events_pandas = top_hosts_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_events_pandas


Unnamed: 0,host,count
0,en.m.wikipedia.org,4283323
1,en.wikipedia.org,2243422
2,www.imdb.com,803002
3,web.archive.org,528885
4,tools.wmflabs.org,282577
5,www.youtube.com,157707
6,books.google.com,153272
7,doi.org,108797
8,www.espncricinfo.com,101142
9,www.nytimes.com,80159


In [15]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to W:PM pages with external links
# (page ids come from ryanmax.projmed_with_extlinks)
# further limited to extClick, fnClick OR upClick events
#
# Positional placeholders, in order: projmed extlinks date range (start, end),
# event exclusion SQL, event date range, pageload exclusion SQL,
# pageload date range.
top_hosts_wpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action in ('extClick', 'upClick','fnClick')
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{}' 
            AND to_date(dt) <= '{}'
            )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
    top_hosts_wpm_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_wpm_events_rdd = top_hosts_wpm_events.rdd
top_hosts_wpm_events_df = sqlContext.createDataFrame(top_hosts_wpm_events_rdd)
top_hosts_wpm_events_pandas = top_hosts_wpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_wpm_events_pandas

Unnamed: 0,host,count
0,en.m.wikipedia.org,139176
1,en.wikipedia.org,100635
2,www.ncbi.nlm.nih.gov,22206
3,doi.org,19184
4,web.archive.org,12715
5,books.google.com,4854
6,www.drugs.com,4195
7,www.who.int,3558
8,www.cdc.gov,2419
9,chemapps.stolaf.edu,2018


In [16]:
# Total event count for top 20 hostnames (e.g., DOI.org / ncbi...)
# limited to NOT W:PM pages with external links
# (pages in ryanmax.pages_with_extlinks minus those in ryanmax.projmed_with_extlinks)
# further limited to extClick, fnClick OR upClick events
# The counts are grouped by host only; there is no per-event-type breakdown here.
#
# Positional placeholders, in order: extlinks date range (start, end),
# projmed extlinks date range, event exclusion SQL, event date range,
# pageload exclusion SQL, pageload date range.
#
# NOTE(review): NOT IN matches nothing if the subquery yields a NULL page_id;
# presumably projmed_with_extlinks.page_id is never NULL -- verify.
top_hosts_notwpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action in ('extClick', 'upClick','fnClick')
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
AND page_id NOT IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}'
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_notwpm_events = spark.sql(
    top_hosts_notwpm_query.format(
        start_date_string, end_date_string,
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string,
        pageload_exclusion_sql, start_date_string, end_date_string,
    ))
# NOTE(review): the RDD round trip below is kept unchanged because its purpose
# is not evident from this cell -- confirm whether it is still needed.
top_hosts_notwpm_events_rdd = top_hosts_notwpm_events.rdd
top_hosts_notwpm_events_df = sqlContext.createDataFrame(top_hosts_notwpm_events_rdd)
top_hosts_notwpm_events_pandas = top_hosts_notwpm_events_df.toPandas()

# Show the frame collected above; calling toPandas() on the Spark DataFrame
# again would re-run the whole query just to display the same rows.
top_hosts_notwpm_events_pandas

Unnamed: 0,host,count
0,en.m.wikipedia.org,4144147
1,en.wikipedia.org,2142787
2,www.imdb.com,802787
3,web.archive.org,516170
4,tools.wmflabs.org,282278
5,www.youtube.com,156644
6,books.google.com,148418
7,www.espncricinfo.com,101137
8,doi.org,89613
9,www.nytimes.com,78856


### top wikipedia.org links by event type

In [19]:
# Event counts per (host, action) pair for link hosts ending in wikipedia.org,
# restricted to pages listed in ryanmax.pages_with_extlinks.
#
# NOTE(review): LIMIT 20 applies to (host, action) rows before the pivot, so an
# empty cell in the pivoted table may mean "outside the 20 largest rows" rather
# than "no events" -- confirm this is intended.
top_wiki_hosts_query = """
SELECT parse_url(link_url,'HOST') AS host, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND parse_url(link_url,'HOST') like '%wikipedia.org'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host, action
ORDER BY COUNT(*) DESC
LIMIT 20
"""

# Fill the placeholders first: extlinks date range, then the event exclusions
# and date range, then the pageload exclusions and date range.
_top_wiki_hosts_sql = top_wiki_hosts_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
    pageload_exclusion_sql,
    start_date_string,
    end_date_string,
)
top_wiki_hosts_events = spark.sql(_top_wiki_hosts_sql)

# Spark DataFrame -> RDD -> Spark DataFrame -> pandas, as in the other cells.
top_wiki_hosts_events_rdd = top_wiki_hosts_events.rdd
top_wiki_hosts_events_df = sqlContext.createDataFrame(top_wiki_hosts_events_rdd)
top_wiki_hosts_events_pandas = top_wiki_hosts_events_df.toPandas()

# One row per host, one column per action, cell value = event count.
top_wiki_hosts_events_pandas.pivot(
    index='host',
    columns='action',
    values='count',
)

action,extClick,fnClick,fnHover,upClick
host,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ar.wikipedia.org,83.0,,,
de.wikipedia.org,541.0,,,
en.m.wikipedia.org,,4253813.0,554234.0,29498.0
en.wikipedia.org,58.0,2008427.0,9124324.0,234937.0
eo.wikipedia.org,75.0,,,
es.wikipedia.org,210.0,,,
fr.wikipedia.org,514.0,,,
it.wikipedia.org,147.0,,,
ja.wikipedia.org,463.0,,,
nl.wikipedia.org,294.0,,,
