## Exploring NCBI Click Events

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# `-i` runs the script in this notebook's interactive namespace rather than an empty one.
# NOTE(review): later cells use spark, sqlContext, pd, display, Markdown, event_exclusion_sql,
# start_date_string and end_date_string without defining them; presumably this script
# provides them -- confirm in data-defaults.py.
%run -i 'data-defaults.py'

In [2]:
# Let pandas render up to 500 rows before truncating displayed tables.
pd.set_option('display.max_rows', 500)

### Counts of .ncbi.nlm.nih.gov links in the externallinks table for W pages
- limited to W pages with external links
- dump_date included

In [3]:
# Counts of .ncbi.nlm.nih.gov links in the externallinks table by path
# `path` is the lower-cased first segment of the link's URL path (e.g. /pubmed).
# num_links counts distinct (el_to, el_from) pairs per dump_date and path, restricted to
# pages whose id appears in ryanmax.population_w_pages_with_extlinks.
w_ncbi_paths_query = """
SELECT dump_date, LOWER(REGEXP_EXTRACT(parse_url(el_to,'PATH'),'(/[^/]+)')) as path, COUNT(DISTINCT el_to, el_from) AS num_links 
FROM ryanmax.externallinks
WHERE 
    parse_url(LOWER(el_to),'HOST') LIKE '%.ncbi.nlm.nih.gov'
AND el_from IN 
        (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
GROUP BY dump_date, path
ORDER BY dump_date, num_links DESC
"""
# Run the query, rebuild a DataFrame from the result's RDD, then show it as a pandas table.
w_ncbi_paths = spark.sql(w_ncbi_paths_query)
w_ncbi_paths_rdd = w_ncbi_paths.rdd
w_ncbi_paths_df = sqlContext.createDataFrame(w_ncbi_paths_rdd)
w_ncbi_paths_df.toPandas()

Unnamed: 0,dump_date,path,num_links
0,20190401,/pubmed,547752
1,20190401,/pmc,183732
2,20190401,/taxonomy,163796
3,20190401,/entrez,157350
4,20190401,/sites,33208
5,20190401,/compound,15652
6,20190401,/nlmcatalog,15023
7,20190401,/protein,6199
8,20190401,/gene,3797
9,20190401,/books,1686


### Counts of .ncbi.nlm.nih.gov links in the externallinks table for WP:M pages
- limited to WP:M pages with external links
- dump_date included

In [4]:
# Counts of .ncbi.nlm.nih.gov links in the externallinks table by path for WP:M pages
# `path` is the lower-cased first segment of the link's URL path (e.g. /pubmed).
# num_links counts distinct (el_to, el_from) pairs per dump_date and path, restricted to
# pages whose id appears in ryanmax.population_wpm_pages_with_extlinks.
wpm_ncbi_paths_query = """
SELECT dump_date, LOWER(REGEXP_EXTRACT(parse_url(el_to,'PATH'),'(/[^/]+)')) as path, COUNT(DISTINCT el_to, el_from) AS num_links 
FROM ryanmax.externallinks
WHERE 
    parse_url(LOWER(el_to),'HOST') LIKE '%.ncbi.nlm.nih.gov'
AND el_from IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
GROUP BY dump_date, path
ORDER BY dump_date, num_links DESC
"""
# Run the query, rebuild a DataFrame from the result's RDD, then show it as a pandas table.
wpm_ncbi_paths = spark.sql(wpm_ncbi_paths_query)
wpm_ncbi_paths_rdd = wpm_ncbi_paths.rdd
wpm_ncbi_paths_df = sqlContext.createDataFrame(wpm_ncbi_paths_rdd)
wpm_ncbi_paths_df.toPandas()

Unnamed: 0,dump_date,path,num_links
0,20190401,/pubmed,179752
1,20190401,/pmc,50216
2,20190401,/entrez,2405
3,20190401,/compound,1790
4,20190401,/books,1377
5,20190401,/nlmcatalog,1149
6,20190401,/sites,476
7,20190401,/taxonomy,381
8,20190401,/pubmedhealth,324
9,20190401,/gene,230


### Event counts for .ncbi.nlm.nih.gov by path for W
- limited to W pages with external links

In [5]:
# Event counts for .ncbi.nlm.nih.gov links, grouped by first URL path segment and action.
# Scope: enwiki, non-bot user agents, W pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
w_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND LOWER(parse_url(link_url,'HOST')) LIKE '%.ncbi.nlm.nih.gov'
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY PATH, action
ORDER BY COUNT(*) DESC
"""
# Fill in the placeholders first, then execute and convert to pandas for the next cell.
w_ncbi_paths_events_sql = w_ncbi_paths_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
w_ncbi_paths_events = spark.sql(w_ncbi_paths_events_sql)
w_ncbi_paths_events_rdd = w_ncbi_paths_events.rdd
w_ncbi_paths_events_df = sqlContext.createDataFrame(w_ncbi_paths_events_rdd)
w_ncbi_paths_events_pandas = w_ncbi_paths_events_df.toPandas()

In [6]:
# Reshape to one row per path and one column per action, with the event count as the cell value.
w_ncbi_paths_events_pandas.pivot(index='path', columns='action', values='count')

action,extClick
path,Unnamed: 1_level_1
,407
/2017,10
/about,26
/about.html,5
/assembly,18
/bankit,18
/bioproject,34
/biosample,1
/blast,93
/blast.cgi,197


### Event counts for *.ncbi.nlm.nih.gov by path for WP:M
- limited to WP:M pages with external links

In [7]:
# Event counts for *.ncbi.nlm.nih.gov links, grouped by first URL path segment and action.
# Scope: enwiki, non-bot user agents, WP:M pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
wpm_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND LOWER(parse_url(link_url,'HOST')) LIKE '%.ncbi.nlm.nih.gov'
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY path, action
ORDER BY COUNT(*) DESC
"""
# Fill in the placeholders first, then execute and convert to pandas for the next cell.
wpm_ncbi_paths_events_sql = wpm_ncbi_paths_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
wpm_ncbi_paths_events = spark.sql(wpm_ncbi_paths_events_sql)
wpm_ncbi_paths_events_rdd = wpm_ncbi_paths_events.rdd
wpm_ncbi_paths_events_df = sqlContext.createDataFrame(wpm_ncbi_paths_events_rdd)
wpm_ncbi_paths_events_pandas = wpm_ncbi_paths_events_df.toPandas()

In [8]:
# Reshape to one row per path and one column per action, with the event count as the cell value.
wpm_ncbi_paths_events_pandas.pivot(index='path', columns='action', values='count')

action,extClick
path,Unnamed: 1_level_1
,283
/2014,1
/2015,6
/2018,6
/about,3
/assembly,1
/bioproject,4
/books,1927
/bookshelf,242
/ccds,3


### freely_accessible extClick event counts for .ncbi.nlm.nih.gov by path for W
 - limited to W pages with external links

In [9]:
# extClick counts per first URL path segment for .ncbi.nlm.nih.gov links whose
# freely_accessible flag is not FALSE.
# Scope: enwiki, non-bot user agents, W pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
free_w_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND LOWER(parse_url(link_url,'HOST')) LIKE '%.ncbi.nlm.nih.gov'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY path, action
ORDER BY COUNT(*) DESC
"""
# Fill in the placeholders, execute, and display the result as a pandas table.
free_w_ncbi_paths_sql = free_w_ncbi_paths_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
free_w_ncbi_paths = spark.sql(free_w_ncbi_paths_sql)
free_w_ncbi_paths_rdd = free_w_ncbi_paths.rdd
free_w_ncbi_paths_df = sqlContext.createDataFrame(free_w_ncbi_paths_rdd)
free_w_ncbi_paths_df.toPandas()

Unnamed: 0,path,count
0,/pmc,4555


### freely_accessible extClick event counts for .ncbi.nlm.nih.gov by path for WP:M
 - limited to WP:M pages with external links

In [10]:
# freely_accessible extClick event counts for .ncbi.nlm.nih.gov by path
# limited to WP:M pages with external links
# not limited to sampled pageloads
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
free_wpm_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND LOWER(parse_url(link_url,'HOST')) LIKE '%.ncbi.nlm.nih.gov'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY path, action
ORDER BY COUNT(*) DESC
"""
# Store the WP:M results under free_wpm_* names so they no longer overwrite the
# W-page results (free_w_ncbi_paths*) computed by the W-page cell.
free_wpm_ncbi_paths = spark.sql(
    free_wpm_ncbi_paths_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
free_wpm_ncbi_paths_rdd = free_wpm_ncbi_paths.rdd
free_wpm_ncbi_paths_df = sqlContext.createDataFrame(free_wpm_ncbi_paths_rdd)
free_wpm_ncbi_paths_df.toPandas()

Unnamed: 0,path,count
0,/pmc,2141


### freely_accessible extClick by hostname for W
 - limited to W pages with external links

In [11]:
# extClick counts per lower-cased link hostname for links whose freely_accessible
# flag is not FALSE.
# Scope: enwiki, non-bot user agents, W pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
free_w_query = """
SELECT LOWER(parse_url(link_url,'HOST')) as hostname, count(*) as free_count
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY hostname
ORDER BY free_count desc
"""
free_w_sql = free_w_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
free_w = spark.sql(free_w_sql)
free_w_rdd = free_w.rdd
free_w_df = sqlContext.createDataFrame(free_w_rdd)
pdaw = free_w_df.toPandas()
# Print the grand total across hostnames, then show the per-hostname table.
print('Total:', pdaw['free_count'].sum())
pdaw

Total: 35018


Unnamed: 0,hostname,free_count
0,arxiv.org,23380
1,www.ncbi.nlm.nih.gov,4555
2,citeseerx.ist.psu.edu,4230
3,doi.org,1644
4,ssrn.com,1001
5,tools.ietf.org,98
6,adsabs.harvard.edu,36
7,translate.googleusercontent.com,36
8,www.jstor.org,18
9,hdl.handle.net,17


### freely_accessible extClick by hostname for WP:M
- limited to WP:M pages with external links

In [12]:
# extClick counts per lower-cased link hostname for links whose freely_accessible
# flag is not FALSE.
# Scope: enwiki, non-bot user agents, WP:M pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
free_wpm_query = """
SELECT LOWER(parse_url(link_url,'HOST')) as hostname, count(*) as free_count
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY hostname
ORDER BY free_count desc
"""
free_wpm_sql = free_wpm_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
free_wpm = spark.sql(free_wpm_sql)
free_wpm_rdd = free_wpm.rdd
free_wpm_df = sqlContext.createDataFrame(free_wpm_rdd)
pdawpm = free_wpm_df.toPandas()
# Print the grand total across hostnames, then show the per-hostname table.
print('Total:', pdawpm['free_count'].sum())
pdawpm

Total: 2606


Unnamed: 0,hostname,free_count
0,www.ncbi.nlm.nih.gov,2141
1,citeseerx.ist.psu.edu,200
2,doi.org,167
3,arxiv.org,64
4,ssrn.com,22
5,translate.googleusercontent.com,9
6,hdl.handle.net,3


### freely_accessible extClick by link label
- limited to W pages with external links

In [13]:
# extClick counts per link label (link_text) for links whose freely_accessible
# flag is not FALSE.
# Scope: enwiki, non-bot user agents, W pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
free_labels_w_query = """
SELECT link_text, count(*) as free_count
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY link_text
ORDER BY free_count desc
"""
free_labels_w_sql = free_labels_w_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
free_labels_w = spark.sql(free_labels_w_sql)
free_labels_w_rdd = free_labels_w.rdd
free_labels_w_df = sqlContext.createDataFrame(free_labels_w_rdd)
# NOTE: this rebinds pdaw, which the hostname cell earlier in the notebook also assigns.
pdaw = free_labels_w_df.toPandas()

In [14]:
# Classify the freely_accessible link labels held in pdaw by their shape.
# Assumes pdaw is the per-link_text frame (columns link_text, free_count) from the
# preceding cell; the same name is also used earlier for the per-hostname table.
# `.count()` counts matching rows, i.e. distinct labels, not click events.
# Raw strings keep `\.` and `\d` as regex escapes without invalid-escape warnings;
# na=False treats a missing label as "no match" so the mask stays purely boolean.
is_doi_label = pdaw['link_text'].str.contains(r'^10\.', na=False)
doi_count = pdaw.loc[is_doi_label, 'free_count'].count()
print('Number of DOIs: {0}'.format(doi_count))
# A label made only of 5 to 10 digits is taken to be a PMID or PMCID.
is_pm_label = pdaw['link_text'].str.contains(r'^\d{5,10}$', na=False)
pm_count = pdaw.loc[is_pm_label, 'free_count'].count()
print('Number of likely PMC or PMIDs: {0}'.format(pm_count))



Number of DOIs: 2630
Number of likely PMC or PMIDs: 3912


### link labels for not freely_accessible .ncbi.nlm.nih.gov/pmc extClick events
- limited to W pages with external links

In [15]:
# link labels for not freely_accessible .ncbi.nlm.nih.gov/pmc extClick events
# limited to W pages with external links
# not limited to sampled pageloads
# The host is lower-cased before the LIKE test, as in the other event queries in this
# notebook, so links written with a mixed-case hostname are not silently dropped.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
notfree_pmc_labels_query = """
SELECT link_text, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND LOWER(parse_url(link_url,'HOST')) LIKE '%.ncbi.nlm.nih.gov'
AND LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) = '/pmc'
AND freely_accessible = FALSE
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY link_text, action
ORDER BY COUNT(*) DESC
"""
# Execute and convert to pandas; the label analysis happens in the next cell.
notfree_pmc_labels = spark.sql(
    notfree_pmc_labels_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
notfree_pmc_labels_rdd = notfree_pmc_labels.rdd
notfree_pmc_labels_df = sqlContext.createDataFrame(notfree_pmc_labels_rdd)
notfree_pmc_labels_pandas = notfree_pmc_labels_df.toPandas()

In [16]:
# Show at most 20 rows for the tables displayed from here on.
pd.options.display.max_rows = 20

# Mask of labels that are NOT a bare 5-10 digit number (i.e. not a PMID/PMCID).
# `~` is the logical NOT for boolean masks; unary `-` is arithmetic negation and NumPy
# rejects it for boolean arrays. na=False makes a missing label count as "not numeric"
# instead of putting NaN into the mask. The raw string keeps `\d` as a regex escape.
notfree_non_numeric = ~notfree_pmc_labels_pandas['link_text'].str.contains(r'^\d{5,10}$', na=False)

# Total events (sum of per-label counts) behind those non-numeric labels.
not_free_pmc = notfree_pmc_labels_pandas.loc[notfree_non_numeric, 'count'].sum()
display(Markdown("Number of PMC link events that are not freely_accessible and whose label is not a number between 5 and 10 digits long (i.e. a PMID or PMCID): {0}".format(not_free_pmc)))
# Show the non-numeric labels themselves.
notfree_pmc_labels_pandas.loc[notfree_non_numeric]

Number of PMC link events that are not freely_accessible and whose label is not a number between 5 and 10 digits long (i.e. a PMID or PMCID): 89536

Unnamed: 0,link_text,count
0,"""The Longitudinal Relationships among Injuncti...",3119
1,Website,892
2,"""Natural product agonists of peroxisome prolif...",838
3,"""International Committee on Taxonomy of Viruse...",199
4,"""Equilibrium Points in N-person Games""",191
5,"""A quantitative description of membrane curren...",93
6,"""The Essential Medicinal Chemistry of Curcumin...",88
7,"""Phylogenetic structure of the prokaryotic dom...",75
8,Official website,74
9,"""A framework for sensitivity analysis of decis...",71


### link labels for freely_accessible .ncbi.nlm.nih.gov/pmc extClick events
- limited to W pages with external links

In [17]:
# link labels for freely_accessible .ncbi.nlm.nih.gov/pmc extClick events
# limited to W pages with external links
# not limited to sampled pageloads
# The host is lower-cased before the LIKE test, as in the other event queries in this
# notebook, so links written with a mixed-case hostname are not silently dropped.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
free_pmc_labels_query = """
SELECT link_text, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND LOWER(parse_url(link_url,'HOST')) LIKE '%.ncbi.nlm.nih.gov'
AND LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) = '/pmc'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY link_text, action
ORDER BY COUNT(*) DESC
"""
# Execute and convert to pandas; the label analysis happens in the next cell.
free_pmc_labels = spark.sql(
    free_pmc_labels_query.format(
        event_exclusion_sql, start_date_string, end_date_string
    ))
free_pmc_labels_rdd = free_pmc_labels.rdd
free_pmc_labels_df = sqlContext.createDataFrame(free_pmc_labels_rdd)
free_pmc_labels_pandas = free_pmc_labels_df.toPandas()

In [18]:
# Mask of labels that are NOT a bare 5-10 digit number (i.e. not a PMID/PMCID).
# `~` is the logical NOT for boolean masks; unary `-` is arithmetic negation and NumPy
# rejects it for boolean arrays. na=False makes a missing label count as "not numeric"
# instead of putting NaN into the mask. The raw string keeps `\d` as a regex escape.
free_non_numeric = ~free_pmc_labels_pandas['link_text'].str.contains(r'^\d{5,10}$', na=False)

# Total events (sum of per-label counts) behind those non-numeric labels.
free_pmc = free_pmc_labels_pandas.loc[free_non_numeric, 'count'].sum()
display(Markdown("Number of PMC link events that are freely_accessible and whose label is not a number between 5 and 10 digits long (i.e. a PMID or PMCID): {0}".format(free_pmc)))
# Show the non-numeric labels themselves.
free_pmc_labels_pandas.loc[free_non_numeric]

Number of PMC link events that are freely_accessible and whose label is not a number between 5 and 10 digits long (i.e. a PMID or PMCID): 2

Unnamed: 0,link_text,count
903,173176 9,1
3329,PMCPMC5203823,1


### link label length of extClick events by freely_accessible
- limited to W pages with external links

In [19]:
# Summary statistics of link_text length for extClick events, one row per
# freely_accessible value: count, min, max, mean (2 decimals), median, standard
# deviation and inter-quartile range (75th minus 25th percentile).
# Scope: enwiki, non-bot user agents, W pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
all_label_length_query = """
SELECT freely_accessible,
    COUNT(*) AS count,
    MIN(length(link_text)) AS min_link_text_length, 
    MAX(length(link_text)) AS max_link_text_length, 
    CAST(AVG(length(link_text)) AS DECIMAL(10,2)) AS average_link_text_length, 
        PERCENTILE(length(link_text),0.5) AS median_link_text_length,
        STDDEV(length(link_text)) as stddev_link_text_length,
        (PERCENTILE(length(link_text),0.75) - PERCENTILE(length(link_text),0.25)) as iqr_link_text_length
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY freely_accessible
"""
# Fill in the placeholders, execute, and keep the result as pandas for display.
all_label_length_sql = all_label_length_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
all_label_length = spark.sql(all_label_length_sql)
all_label_length_rdd = all_label_length.rdd
all_label_length_df = sqlContext.createDataFrame(all_label_length_rdd)
all_label_length_pandas = all_label_length_df.toPandas()

In [20]:
# Display the link-label length summary (one row per freely_accessible value) built in the previous cell.
all_label_length_pandas

Unnamed: 0,freely_accessible,count,min_link_text_length,max_link_text_length,average_link_text_length,median_link_text_length,stddev_link_text_length,iqr_link_text_length
0,True,35018,3,49,11.41,10.0,3.838434,6.0
1,False,49792871,0,842,31.58,22.0,26.049694,27.0


### link label length of .ncbi.nlm.nih.gov/pmc extClick events by freely_accessible
- limited to W pages with external links

In [21]:
# Summary statistics of link_text length for .ncbi.nlm.nih.gov/pmc extClick events, one
# row per freely_accessible value: count, min, max, mean (2 decimals), median, standard
# deviation and inter-quartile range (75th minus 25th percentile).
# Scope: enwiki, non-bot user agents, W pages with external links, events dated between
# start_date_string and end_date_string (inclusive); not limited to sampled pageloads.
# The three {} slots take, in order: extra exclusion SQL, start date, end date.
label_length_query = """
SELECT freely_accessible,
    COUNT(*) AS count,
    MIN(length(link_text)) AS min_link_text_length, 
    MAX(length(link_text)) AS max_link_text_length, 
    CAST(AVG(length(link_text)) AS DECIMAL(10,2)) AS average_link_text_length, 
        PERCENTILE(length(link_text),0.5) AS median_link_text_length,
        STDDEV(length(link_text)) as stddev_link_text_length,
        (PERCENTILE(length(link_text),0.75) - PERCENTILE(length(link_text),0.25)) as iqr_link_text_length
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND LOWER(parse_url(link_url,'HOST')) LIKE '%.ncbi.nlm.nih.gov'
AND LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) = '/pmc'
AND page_id IN 
        (SELECT DISTINCT page_id FROM ryanmax.population_w_pages_with_extlinks)
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY freely_accessible
"""
# Fill in the placeholders, execute, and keep the result as pandas for display.
label_length_sql = label_length_query.format(
    event_exclusion_sql, start_date_string, end_date_string)
label_length = spark.sql(label_length_sql)
label_length_rdd = label_length.rdd
label_length_df = sqlContext.createDataFrame(label_length_rdd)
label_length_pandas = label_length_df.toPandas()

In [22]:
# Display the /pmc link-label length summary (one row per freely_accessible value) built in the previous cell.
label_length_pandas

Unnamed: 0,freely_accessible,count,min_link_text_length,max_link_text_length,average_link_text_length,median_link_text_length,stddev_link_text_length,iqr_link_text_length
0,True,4555,5,13,6.84,7.0,0.442621,0.0
1,False,89586,1,451,86.07,83.0,37.106081,51.0
