# Top Level Domain Report

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

# show lots of data
pd.options.display.max_rows=10000

# convenience methods for reporting sql output from externallinks table
def merge_reports(report, dates=('20190401', '20190420')):
    """Stack the per-date snapshots of one externallinks report.

    For every date the tab-separated file ``data/<report>-<date>.txt`` is read
    and tagged with a ``dt`` column holding that date string.

    Args:
        report: report name, used as the stem of the snapshot file names.
        dates: snapshot date strings to load, in order. Defaults to the two
            snapshots this notebook has always used.

    Returns:
        A DataFrame with the rows of every snapshot stacked in ``dates`` order.
        Each snapshot keeps its own row index (indexes are not renumbered).
        An empty DataFrame is returned when ``dates`` is empty.
    """
    frames = []
    for dt in dates:
        file = 'data/' + report + '-' + dt + '.txt'
        df = pd.read_csv(file, sep="\t")
        df['dt'] = dt
        frames.append(df)
    if not frames:
        return pd.DataFrame()
    # One concat instead of DataFrame.append per file: append copied the whole
    # accumulated frame on every iteration and no longer exists in pandas 2.x.
    return pd.concat(frames)

def report_avg_per_domain(df, groupby, title):
    """Average ``num_links`` per group, largest first, and print a heading.

    Args:
        df: DataFrame holding a ``num_links`` column plus the grouping column(s).
        groupby: column name (or list of names) to group on.
        title: heading printed to stdout before the table is returned.

    Returns:
        A new DataFrame indexed by group with a single ``avg count`` column,
        sorted in descending order. ``df`` itself is not modified.
    """
    # Pick num_links before aggregating: averaging the whole frame also tries
    # to average non-numeric columns (such as a string date tag), which newer
    # pandas rejects with a TypeError instead of silently dropping them.
    averages = (
        df.groupby(groupby)['num_links']
        .mean()
        .sort_values(ascending=False)
        .rename('avg count')
    )
    print('\n\n', title)
    return averages.to_frame()


## Summary counts from the externallinks table

In [2]:
report = 'count_of_externallinks_by_top-level-domain'
display(Markdown("**"+report+"** : limited to 10000 links or more"))
merged = merge_reports(report)
# reduce to counts of 10000 or more (applied per snapshot row, before averaging)
merged = merged.loc[(merged['num_links'] >= 10000)]
# Average num_links per tld across the snapshots. num_links is selected before
# taking the mean so the string 'dt' column added by merge_reports is never
# aggregated (newer pandas raises on non-numeric columns).
df = (
    merged.groupby('tld')['num_links']
    .mean()
    .sort_values(ascending=False)
    .rename('avg count')
    .to_frame()
)
df

**count_of_externallinks_by_top-level-domain** : limited to 10000 links or more

Unnamed: 0_level_0,avg count
tld,Unnamed: 1_level_1
.com,22048766.5
.org,18603976.0
.gov,4376378.5
.uk,3090313.5
.net,1616011.0
.edu,1426978.5
.au,1097275.5
.fr,967446.0
.ca,640978.5
.de,510777.5


### count_of_externallinks_by_top-level-domain_projmed_only

In [3]:
report = 'count_of_externallinks_by_top-level-domain_projmed_only'
display(Markdown("**"+report+"**"))
merged = merge_reports(report)
# Average num_links per tld across the snapshots. num_links is selected before
# taking the mean so the string 'dt' column added by merge_reports is never
# aggregated (newer pandas raises on non-numeric columns).
df = (
    merged.groupby('tld')['num_links']
    .mean()
    .sort_values(ascending=False)
    .rename('avg count')
    .to_frame()
)
df

**count_of_externallinks_by_top-level-domain_projmed_only**

Unnamed: 0_level_0,avg count
tld,Unnamed: 1_level_1
.org,370686.5
.gov,279508.0
.com,160779.0
.uk,25528.5
.edu,25029.5
.int,17857.0
.ca,7580.5
.net,6405.5
.au,6144.0
.fr,4962.0


### count_of_externallinks_by_dot-gov-domain

In [4]:
report = 'count_of_externallinks_by_dot-gov-domain'
display(Markdown("**"+report+"**"))
merged = merge_reports(report)
# Average num_links per domain across the snapshots. num_links is selected
# before taking the mean so the string 'dt' column added by merge_reports is
# never aggregated (newer pandas raises on non-numeric columns).
df = (
    merged.groupby('domain')['num_links']
    .mean()
    .sort_values(ascending=False)
    .rename('avg count')
    .to_frame()
)
df


**count_of_externallinks_by_dot-gov-domain**

Unnamed: 0_level_0,avg count
domain,Unnamed: 1_level_1
.nih.gov,1455024.0
.nasa.gov,659323.0
.loc.gov,525681.5
.nps.gov,325395.0
.census.gov,236093.5
.usgs.gov,181857.0
.itis.gov,146904.0
.noaa.gov,66327.0
.fcc.gov,55557.0
.ars-grin.gov,37196.5


### count_of_externallinks_by_dot-gov-domain_projmed_only

In [5]:
report = 'count_of_externallinks_by_dot-gov-domain_projmed_only'
display(Markdown("**"+report+"**"))
merged = merge_reports(report)
# Average num_links per domain across the snapshots. num_links is selected
# before taking the mean so the string 'dt' column added by merge_reports is
# never aggregated (newer pandas raises on non-numeric columns).
df = (
    merged.groupby('domain')['num_links']
    .mean()
    .sort_values(ascending=False)
    .rename('avg count')
    .to_frame()
)
df

**count_of_externallinks_by_dot-gov-domain_projmed_only**

Unnamed: 0_level_0,avg count
domain,Unnamed: 1_level_1
.nih.gov,254420.5
.cdc.gov,5638.5
.fda.gov,3577.0
.loc.gov,3565.0
.cancer.gov,1273.0
medlineplus.gov,1046.5
.epa.gov,617.0
.hhs.gov,448.0
clinicaltrials.gov,428.5
.ahrq.gov,354.0


## Top Level Domain Event Data

### Event counts for top level domains by event type
- limited to W pages with external links
- not limited to sampled pageloads

In [6]:
# Event counts for top level domains by event type
# limited to W pages with external links
# not limited to sampled pageloads
#
# The query is a raw string so the backslashes in the regex reach Spark exactly
# as written; in a normal string literal "\." is an invalid escape sequence
# that Python warns about. The runtime text of the query is unchanged.
# Placeholders, in order: page-date start, page-date end, extra exclusion SQL,
# event-date start, event-date end.
w_tld_query = r"""
SELECT REGEXP_EXTRACT(parse_url(link_url,'HOST'),'(\.[^\.]+)$',1) as tld, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY tld, action
ORDER BY COUNT(*) DESC
"""
w_tld_events = spark.sql(
    w_tld_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string
    ))
# Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
# NOTE(review): the reason for the RDD round trip is not evident from this
# notebook; confirm before collapsing it to w_tld_events.toPandas().
w_tld_events_rdd = w_tld_events.rdd
w_tld_events_df = sqlContext.createDataFrame(w_tld_events_rdd)
w_tld_events_pandas = w_tld_events_df.toPandas()

In [7]:
tld_pda = w_tld_events_pandas.copy()
# Rows where no tld could be extracted: empty string or null.
parsing_errors = tld_pda.loc[(tld_pda['tld'] == '') | (tld_pda['tld'].isnull())]
count_parsing_errors = parsing_errors['count'].sum()
total = tld_pda['count'].sum()
display(Markdown("**Table I**: Count of events by top level domain. W pages w/ external links.  Limited to >= 1000 events."))
display(Markdown("Could not parse hostname/domain from {0} link_urls (first two rows) which represents {1:.2%} of all values.".format(count_parsing_errors,count_parsing_errors/total)))

# limit to counts of 1K or more -- inclusive, so the filter agrees with the
# ">= 1000" stated in the Table I caption (it was a strict "> 1000" before)
df_filtered = tld_pda.query('count >= 1000').copy()
# set precision before pivot: counts become whole-number strings
df_filtered['count'] = df_filtered['count'].map(lambda x: '{0:.0f}'.format(x))
# one row per tld, one column per action
df_filtered.pivot(index='tld', columns='action', values='count')

**Table I**: Count of events by top level domain. W pages w/ external links.  Limited to >= 1000 events.

Could not parse hostname/domain from 55105 link_urls (first two rows) which represents 0.06% of all values.

action,extClick,fnClick,fnHover,upClick
tld,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,12409,18212.0,22268.0,
,1496,,,
.122,3411,,,
.132,2203,,,
.173,2992,,,
.194,1907,,,
.27,3142,,,
.ac,1575,,,
.ad,1103,,,
.ae,23098,,,


### Event counts for top level domains by event type
- limited to WP:M pages with external links
- not limited to sampled pageloads

In [8]:
# Event counts for top level domains by event type
# limited to WP:M pages with external links
# not limited to sampled pageloads
#
# The query is a raw string so the backslashes in the regex reach Spark exactly
# as written; in a normal string literal "\." is an invalid escape sequence
# that Python warns about. The runtime text of the query is unchanged.
# Placeholders, in order: page-date start, page-date end, extra exclusion SQL,
# event-date start, event-date end.
wpm_tld_query = r"""
SELECT REGEXP_EXTRACT(parse_url(link_url,'HOST'),'(\.[^\.]+)$',1) as tld, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY tld, action
ORDER BY COUNT(*) DESC
"""
wpm_tld_events = spark.sql(
    wpm_tld_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string
    ))
# Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
# NOTE(review): the reason for the RDD round trip is not evident from this
# notebook; confirm before collapsing it to wpm_tld_events.toPandas().
wpm_tld_events_rdd = wpm_tld_events.rdd
wpm_tld_events_df = sqlContext.createDataFrame(wpm_tld_events_rdd)
wpm_tld_events_pandas = wpm_tld_events_df.toPandas()

In [9]:
tld_pda = wpm_tld_events_pandas.copy()
# Rows where no tld could be extracted: empty string or null.
parsing_errors = tld_pda.loc[(tld_pda['tld'] == '') | (tld_pda['tld'].isnull())]
count_parsing_errors = parsing_errors['count'].sum()
total = tld_pda['count'].sum()
display(Markdown("**Table II**: Count of events by top level domain. WP:M pages w/ external links.  Limited to >= 100 events."))
display(Markdown("Could not parse hostname/domain from {0} link_urls (first row) which represents {1:.2%} of all values.".format(count_parsing_errors,count_parsing_errors/total)))

# limit to counts of 100 or more -- inclusive, so the filter agrees with the
# ">= 100" stated in the Table II caption (it was a strict "> 100" before)
df_filtered = tld_pda.query('count >= 100').copy()
# set precision before pivot: counts become whole-number strings
df_filtered['count'] = df_filtered['count'].map(lambda x: '{0:.0f}'.format(x))
# one row per tld, one column per action
df_filtered.pivot(index='tld', columns='action', values='count')

**Table II**: Count of events by top level domain. WP:M pages w/ external links.  Limited to >= 100 events.

Could not parse hostname/domain from 951 link_urls (first row) which represents 0.04% of all values.

action,extClick,fnClick,fnHover,upClick
tld,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,102,273.0,510.0,
.al,132,,,
.at,219,,,
.au,5711,,,
.be,386,,,
.br,323,,,
.ca,9971,,,
.ch,980,,,
.cn,849,,,
.co,233,,,


## .gov Domain Event Data

### Event counts for .gov top level domain by event type
- limited to W pages with external links
- not limited to sampled pageloads

In [10]:
# Event counts for .gov top level domain by event type
# limited to W pages with external links
# not limited to sampled pageloads
#
# The query is a raw string so the backslashes in the regex reach Spark exactly
# as written; in a normal string literal "\." is an invalid escape sequence
# that Python warns about. The runtime text of the query is unchanged.
# Placeholders, in order: page-date start, page-date end, extra exclusion SQL,
# event-date start, event-date end.
# NOTE(review): the recorded results include hosts with no leading dot (e.g.
# "medlineplus.gov"), which a literal-dot pattern could not produce; this
# suggests the backslashes are unescaped again on the Spark SQL side -- confirm
# before tightening the regex.
w_gov_query = r"""
SELECT REGEXP_EXTRACT(parse_url(link_url,'HOST'),'(\.[^\.]+\.[^\.]+)$',1) as domain, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND REGEXP_EXTRACT(parse_url(link_url,'HOST'),'(\.[^\.]+)$',1) = '.gov'
GROUP BY domain, action
ORDER BY COUNT(*) DESC
"""
w_gov_events = spark.sql(
    w_gov_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string
    ))
# Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
# NOTE(review): the reason for the RDD round trip is not evident from this
# notebook; confirm before collapsing it to w_gov_events.toPandas().
w_gov_events_rdd = w_gov_events.rdd
w_gov_events_df = sqlContext.createDataFrame(w_gov_events_rdd)
w_gov_events_pandas = w_gov_events_df.toPandas()

In [11]:
# Table III: .gov event counts for W pages, largest count first.
display(Markdown("**Table III**: Count of events for .gov domain. W pages w/ external links. Only extClick events found."))
tld_pda = w_gov_events_pandas.copy()
# Order by count (descending), then show only the domain and count columns.
tld_pda.sort_values('count', ascending=False).loc[:, ['domain', 'count']]


**Table III**: Count of events for .gov domain. W pages w/ external links. Only extClick events found.

Unnamed: 0,domain,count
0,.nih.gov,231437
1,.nps.gov,54546
2,.nasa.gov,43591
3,.loc.gov,36051
4,.census.gov,26097
5,.usgs.gov,25518
6,.ca.gov,23075
7,.house.gov,22679
8,.usda.gov,17724
9,.congress.gov,17248


### Event counts for .gov top level domain by event type
- limited to WP:M pages with external links
- not limited to sampled pageloads

In [12]:
# Event counts for .gov top level domain by event type
# limited to WP:M pages with external links
# not limited to sampled pageloads
#
# The query is a raw string so the backslashes in the regex reach Spark exactly
# as written; in a normal string literal "\." is an invalid escape sequence
# that Python warns about. The runtime text of the query is unchanged.
# Placeholders, in order: page-date start, page-date end, extra exclusion SQL,
# event-date start, event-date end.
# NOTE(review): the recorded results include hosts with no leading dot (e.g.
# "medlineplus.gov"), which a literal-dot pattern could not produce; this
# suggests the backslashes are unescaped again on the Spark SQL side -- confirm
# before tightening the regex.
wpm_gov_query = r"""
SELECT REGEXP_EXTRACT(parse_url(link_url,'HOST'),'(\.[^\.]+\.[^\.]+)$',1) as domain, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
AND REGEXP_EXTRACT(parse_url(link_url,'HOST'),'(\.[^\.]+)$',1) = '.gov'
GROUP BY domain, action
ORDER BY COUNT(*) DESC
"""
wpm_gov_events = spark.sql(
    wpm_gov_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string
    ))
# Spark DataFrame -> RDD -> Spark DataFrame -> pandas.
# NOTE(review): the reason for the RDD round trip is not evident from this
# notebook; confirm before collapsing it to wpm_gov_events.toPandas().
wpm_gov_events_rdd = wpm_gov_events.rdd
wpm_gov_events_df = sqlContext.createDataFrame(wpm_gov_events_rdd)
wpm_gov_events_pandas = wpm_gov_events_df.toPandas()

In [13]:
# Table IV: .gov event counts for WP:M pages, largest count first.
display(Markdown("**Table IV**: Count of events for .gov domain. WP:M pages w/ external links. Only extClick events found."))
# Order by count (descending), then show only the domain and count columns.
wpm_gov_events_pandas.sort_values('count', ascending=False).loc[:, ['domain', 'count']]

**Table IV**: Count of events for .gov domain. WP:M pages w/ external links. Only extClick events found.

Unnamed: 0,domain,count
0,.nih.gov,89464
1,.cdc.gov,8072
2,.fda.gov,7039
3,.cancer.gov,2771
4,medlineplus.gov,1763
5,.hhs.gov,1143
6,.cms.gov,921
7,.epa.gov,798
8,.gpo.gov,602
9,.usda.gov,428
