## Total Event Counts from Top 20 Hostnames

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

In [4]:
# Total event count for the top 20 link hostnames (e.g., doi.org / ncbi…),
# limited to enwiki pages with external links.
#
# Rows are kept only when they fall inside the study date window, pass the
# shared event exclusions, come from a non-bot user agent, and belong to a
# session that also has a non-bot, in-window page load on enwiki.
#
# start_date_string, end_date_string, event_exclusion_sql and
# pageload_exclusion_sql are not defined in this cell; presumably they come
# from the `%run -i 'data-defaults.py'` cell at the top of the notebook.
#
# Named placeholders are used so that each value is bound by name rather than
# by its position in a long argument list.
top_hosts_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{start}' 
        AND to_date(dt) <= '{end}'
        )
{event_exclusions}
AND to_date(event_time) >= '{start}'
AND to_date(event_time) <= '{end}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {pageload_exclusions}
    AND to_date(event_time) >= '{start}'
    AND to_date(event_time) <= '{end}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_events = spark.sql(
    top_hosts_query.format(
        start=start_date_string,
        end=end_date_string,
        event_exclusions=event_exclusion_sql,
        pageload_exclusions=pageload_exclusion_sql,
    ))

# Collect the result once and reuse it for display below; every toPandas()
# call on the Spark DataFrame would run the query again.
top_hosts_events_pandas = top_hosts_events.toPandas()

# These names are kept in case later cells reference them. Passing the schema
# that is already known means Spark does not have to infer it from the data.
top_hosts_events_rdd = top_hosts_events.rdd
top_hosts_events_df = sqlContext.createDataFrame(
    top_hosts_events_rdd, top_hosts_events.schema)

# Last expression in the cell: rendered as the cell output.
top_hosts_events_pandas


Unnamed: 0,host,count
0,en.wikipedia.org,11367746
1,en.m.wikipedia.org,4837557
2,www.imdb.com,803002
3,web.archive.org,528903
4,tools.wmflabs.org,282577
5,www.youtube.com,157707
6,books.google.com,153293
7,doi.org,108797
8,www.espncricinfo.com,101142
9,www.nytimes.com,80159


In [5]:
# Total event count for the top 20 link hostnames (e.g., doi.org / ncbi…),
# limited to W:PM pages with external links, i.e. enwiki pages listed in
# ryanmax.projmed_with_extlinks (presumably WikiProject Medicine — confirm).
#
# Rows are kept only when they fall inside the study date window, pass the
# shared event exclusions, come from a non-bot user agent, and belong to a
# session that also has a non-bot, in-window page load on enwiki.
#
# start_date_string, end_date_string, event_exclusion_sql and
# pageload_exclusion_sql are not defined in this cell; presumably they come
# from the `%run -i 'data-defaults.py'` cell at the top of the notebook.
#
# Named placeholders are used so that each value is bound by name rather than
# by its position in a long argument list.
top_hosts_wpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
            (SELECT DISTINCT page_id 
            FROM ryanmax.projmed_with_extlinks
            WHERE to_date(dt) >= '{start}' 
            AND to_date(dt) <= '{end}'
            )
{event_exclusions}
AND to_date(event_time) >= '{start}'
AND to_date(event_time) <= '{end}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {pageload_exclusions}
    AND to_date(event_time) >= '{start}'
    AND to_date(event_time) <= '{end}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_wpm_events = spark.sql(
    top_hosts_wpm_query.format(
        start=start_date_string,
        end=end_date_string,
        event_exclusions=event_exclusion_sql,
        pageload_exclusions=pageload_exclusion_sql,
    ))

# Collect the result once and reuse it for display below; every toPandas()
# call on the Spark DataFrame would run the query again.
top_hosts_wpm_events_pandas = top_hosts_wpm_events.toPandas()

# These names are kept in case later cells reference them. Passing the schema
# that is already known means Spark does not have to infer it from the data.
top_hosts_wpm_events_rdd = top_hosts_wpm_events.rdd
top_hosts_wpm_events_df = sqlContext.createDataFrame(
    top_hosts_wpm_events_rdd, top_hosts_wpm_events.schema)

# Last expression in the cell: rendered as the cell output.
top_hosts_wpm_events_pandas

Unnamed: 0,host,count
0,en.wikipedia.org,472749
1,en.m.wikipedia.org,162507
2,www.ncbi.nlm.nih.gov,22206
3,doi.org,19184
4,web.archive.org,12715
5,books.google.com,4854
6,www.drugs.com,4195
7,www.who.int,3558
8,translate.googleusercontent.com,2777
9,www.cdc.gov,2419


In [6]:
# Total event count for the top 20 link hostnames (e.g., doi.org / ncbi…),
# limited to enwiki pages with external links that are NOT W:PM pages, i.e.
# pages in ryanmax.pages_with_extlinks but not in ryanmax.projmed_with_extlinks.
# The counts are grouped by hostname only (not by event type).
#
# Rows are kept only when they fall inside the study date window, pass the
# shared event exclusions, come from a non-bot user agent, and belong to a
# session that also has a non-bot, in-window page load on enwiki.
#
# start_date_string, end_date_string, event_exclusion_sql and
# pageload_exclusion_sql are not defined in this cell; presumably they come
# from the `%run -i 'data-defaults.py'` cell at the top of the notebook.
#
# NOTE(review): `NOT IN (subquery)` matches no rows at all if the subquery
# ever returns a NULL page_id — confirm projmed_with_extlinks.page_id is
# never NULL, or switch to NOT EXISTS / an anti-join.
#
# Named placeholders are used so that each value is bound by name rather than
# by its position in a long argument list.
top_hosts_notwpm_query = """
SELECT parse_url(link_url,'HOST') AS host, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{start}'
        AND to_date(dt) <= '{end}'
        )
AND page_id NOT IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{start}'
        AND to_date(dt) <= '{end}'
        )
{event_exclusions}
AND to_date(event_time) >= '{start}'
AND to_date(event_time) <= '{end}'
AND useragent_is_bot = FALSE
AND session_id in (
    SELECT session_id
    FROM citationusagepageload
    WHERE wiki = 'enwiki'
    {pageload_exclusions}
    AND to_date(event_time) >= '{start}'
    AND to_date(event_time) <= '{end}'
    AND useragent_is_bot = FALSE
    )
GROUP BY host
ORDER BY COUNT(*) DESC
LIMIT 20
"""
top_hosts_notwpm_events = spark.sql(
    top_hosts_notwpm_query.format(
        start=start_date_string,
        end=end_date_string,
        event_exclusions=event_exclusion_sql,
        pageload_exclusions=pageload_exclusion_sql,
    ))

# Collect the result once and reuse it for display below; every toPandas()
# call on the Spark DataFrame would run the query again.
top_hosts_notwpm_events_pandas = top_hosts_notwpm_events.toPandas()

# These names are kept in case later cells reference them. Passing the schema
# that is already known means Spark does not have to infer it from the data.
top_hosts_notwpm_events_rdd = top_hosts_notwpm_events.rdd
top_hosts_notwpm_events_df = sqlContext.createDataFrame(
    top_hosts_notwpm_events_rdd, top_hosts_notwpm_events.schema)

# Last expression in the cell: rendered as the cell output.
top_hosts_notwpm_events_pandas

Unnamed: 0,host,count
0,en.wikipedia.org,10894997
1,en.m.wikipedia.org,4675050
2,www.imdb.com,802787
3,web.archive.org,516188
4,tools.wmflabs.org,282278
5,www.youtube.com,156644
6,books.google.com,148439
7,www.espncricinfo.com,101137
8,doi.org,89613
9,www.nytimes.com,78856
