## Exploring NCBI Click Events

In [2]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# %run -i executes the script inside this notebook's interactive namespace.
# NOTE(review): later cells use spark, sqlContext, pd, display, Markdown, start_date_string,
# end_date_string and event_exclusion_sql without defining them in this notebook —
# presumably they are all provided by this script; confirm before running on a fresh kernel.
%run -i 'data-defaults.py'

In [2]:
# Let pandas render up to 100 rows before truncating displayed tables.
pd.set_option('display.max_rows', 100)

In [3]:
# Event counts per (first URL path segment, action) for hosts ending in .ncbi.nlm.nih.gov
# scope: enwiki events on W pages with external links, bot user agents excluded
# not limited to sampled pageloads
w_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND parse_url(link_url,'HOST') LIKE '%.ncbi.nlm.nih.gov'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY PATH, action
ORDER BY COUNT(*) DESC
"""
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
w_ncbi_paths_events = spark.sql(w_ncbi_paths_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
))
# Rebuild the DataFrame from its RDD, then collect the result into pandas.
w_ncbi_paths_events_rdd = w_ncbi_paths_events.rdd
w_ncbi_paths_events_df = sqlContext.createDataFrame(w_ncbi_paths_events_rdd)
w_ncbi_paths_events_pandas = w_ncbi_paths_events_df.toPandas()

In [4]:
# One row per path, one column per action, event counts as the cell values.
w_ncbi_paths_events_pandas.pivot(
    values='count',
    columns='action',
    index='path',
)

action,extClick
path,Unnamed: 1_level_1
,539
/2014,1
/2015,6
/2017,8
/2018,6
/about,24
/about.html,4
/assembly,14
/bankit,18
/bioproject,22


In [5]:
# Event counts per (first URL path segment, action) for hosts ending in .ncbi.nlm.nih.gov
# scope: enwiki events on WP:M pages with external links, bot user agents excluded
# not limited to sampled pageloads
wpm_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, action, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND parse_url(link_url,'HOST') LIKE '%.ncbi.nlm.nih.gov'
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY path, action
ORDER BY COUNT(*) DESC
"""
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
wpm_ncbi_paths_events = spark.sql(wpm_ncbi_paths_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
))
# Rebuild the DataFrame from its RDD, then collect the result into pandas.
wpm_ncbi_paths_events_rdd = wpm_ncbi_paths_events.rdd
wpm_ncbi_paths_events_df = sqlContext.createDataFrame(wpm_ncbi_paths_events_rdd)
wpm_ncbi_paths_events_pandas = wpm_ncbi_paths_events_df.toPandas()

In [6]:
# One row per path, one column per action, event counts as the cell values.
wpm_ncbi_paths_events_pandas.pivot(
    values='count',
    columns='action',
    index='path',
)

action,extClick
path,Unnamed: 1_level_1
,216
/2014,1
/2015,6
/2018,6
/about,3
/assembly,1
/bioproject,4
/books,1506
/bookshelf,201
/ccds,1


In [7]:
# extClick counts per first URL path segment for hosts ending in .ncbi.nlm.nih.gov,
# keeping only events whose freely_accessible flag is not FALSE
# scope: enwiki events on W pages with external links, bot user agents excluded
# not limited to sampled pageloads
# (action is pinned to 'extClick' in the WHERE clause, so also grouping on it does not split the counts)
free_w_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND parse_url(link_url,'HOST') LIKE '%.ncbi.nlm.nih.gov'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY path, action
ORDER BY COUNT(*) DESC
"""
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
free_w_ncbi_paths = spark.sql(free_w_ncbi_paths_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
))
# Rebuild the DataFrame from its RDD, then show the collected pandas table as the cell output.
free_w_ncbi_paths_rdd = free_w_ncbi_paths.rdd
free_w_ncbi_paths_df = sqlContext.createDataFrame(free_w_ncbi_paths_rdd)
free_w_ncbi_paths_df.toPandas()

Unnamed: 0,path,count
0,/pmc,4947


In [8]:
# freely_accessible extClick event counts for .ncbi.nlm.nih.gov by path
# limited to WP:M pages with external links
# not limited to sampled pageloads
# (action is pinned to 'extClick' in the WHERE clause, so also grouping on it does not split the counts)
free_wpm_ncbi_paths_query = """
SELECT LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) as path, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND parse_url(link_url,'HOST') LIKE '%.ncbi.nlm.nih.gov'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY path, action
ORDER BY COUNT(*) DESC
"""
# Results are stored under free_wpm_* names so that re-running this cell does not
# overwrite the W-page results (free_w_ncbi_paths*) produced by the pages_with_extlinks query.
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
free_wpm_ncbi_paths = spark.sql(
    free_wpm_ncbi_paths_query.format(
        start_date_string, end_date_string,
        event_exclusion_sql, start_date_string, end_date_string
    ))
free_wpm_ncbi_paths_rdd = free_wpm_ncbi_paths.rdd
free_wpm_ncbi_paths_df = sqlContext.createDataFrame(free_wpm_ncbi_paths_rdd)
free_wpm_ncbi_paths_df.toPandas()

Unnamed: 0,path,count
0,/pmc,1683


In [22]:
# extClick counts per lower-cased link hostname, keeping only events whose
# freely_accessible flag is not FALSE
# scope: enwiki events on W pages with external links, bot user agents excluded
# not limited to sampled pageloads
free_w_query = """
SELECT LOWER(parse_url(link_url,'HOST')) as hostname, count(*) as free_count
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY hostname
ORDER BY free_count desc
"""
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
free_w = spark.sql(free_w_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
))
# Rebuild the DataFrame from its RDD, then collect the result into pandas.
free_w_rdd = free_w.rdd
free_w_df = sqlContext.createDataFrame(free_w_rdd)
pdaw = free_w_df.toPandas()
# Grand total across all hostnames, followed by the per-hostname table.
print('Total:', pdaw.loc[:, 'free_count'].sum())
pdaw

Total: 28945


Unnamed: 0,hostname,free_count
0,arxiv.org,18230
1,www.ncbi.nlm.nih.gov,4947
2,citeseerx.ist.psu.edu,3382
3,doi.org,1406
4,ssrn.com,796
5,tools.ietf.org,83
6,translate.googleusercontent.com,39
7,adsabs.harvard.edu,31
8,hdl.handle.net,15
9,www.jstor.org,14


In [21]:
# extClick counts per lower-cased link hostname, keeping only events whose
# freely_accessible flag is not FALSE
# scope: enwiki events on WP:M pages with external links, bot user agents excluded
# not limited to sampled pageloads
free_wpm_query = """
SELECT LOWER(parse_url(link_url,'HOST')) as hostname, count(*) as free_count
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.projmed_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY hostname
ORDER BY free_count desc
"""
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
free_wpm = spark.sql(free_wpm_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
))
# Rebuild the DataFrame from its RDD, then collect the result into pandas.
free_wpm_rdd = free_wpm.rdd
free_wpm_df = sqlContext.createDataFrame(free_wpm_rdd)
pdawpm = free_wpm_df.toPandas()
# Grand total across all hostnames, followed by the per-hostname table.
print('Total:', pdawpm.loc[:, 'free_count'].sum())
pdawpm

Total: 2026


Unnamed: 0,hostname,free_count
0,www.ncbi.nlm.nih.gov,1683
1,citeseerx.ist.psu.edu,150
2,doi.org,121
3,arxiv.org,43
4,ssrn.com,18
5,translate.googleusercontent.com,9
6,hdl.handle.net,2


In [37]:
# extClick counts per link label (link_text), keeping only events whose
# freely_accessible flag is not FALSE
# scope: enwiki events on W pages with external links, bot user agents excluded
# not limited to sampled pageloads
free_labels_w_query = """
SELECT link_text, count(*) as free_count
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND freely_accessible != FALSE
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY link_text
ORDER BY free_count desc
"""
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
free_labels_w = spark.sql(free_labels_w_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
))
# Rebuild the DataFrame from its RDD, then collect the result into pandas.
free_labels_w_rdd = free_labels_w.rdd
free_labels_w_df = sqlContext.createDataFrame(free_labels_w_rdd)
# NOTE: pdaw is also the name of the per-hostname table built in another cell;
# after this line it holds the per-label table (link_text, free_count) instead.
pdaw = free_labels_w_df.toPandas()

In [35]:
# Expects pdaw to be the per-label table (link_text, free_count) collected from free_labels_w_df;
# the same name is used for the per-hostname table elsewhere, so run the label query cell first.
# Each row of that table is one distinct link label, so .count() below yields the number of
# distinct matching labels, not the number of click events.
# Regexes are raw strings (no invalid-escape warnings); na=False makes a missing label a non-match
# instead of putting NaN into the boolean mask.
doi_count = pdaw.loc[pdaw['link_text'].str.contains(r'^10\.', na=False), 'free_count'].count()
print('Number of DOIs: {0}'.format(doi_count))
pm_count = pdaw.loc[pdaw['link_text'].str.contains(r'^\d{5,10}$', na=False), 'free_count'].count()
print('Number of likely PMC or PMIDs: {0}'.format(pm_count))



Number of DOIs: 2377
Number of likely PMC or PMIDs: 4584


In [18]:
# extClick counts per link label for links to hosts ending in .ncbi.nlm.nih.gov whose first
# path segment is /pmc and whose freely_accessible flag is FALSE
# scope: enwiki events on W pages with external links, bot user agents excluded
# not limited to sampled pageloads
# (action is pinned to 'extClick' in the WHERE clause, so also grouping on it does not split the counts)
notfree_pmc_labels_query = """
SELECT link_text, COUNT(*) AS count 
FROM citationusage 
WHERE wiki = 'enwiki'
AND action = 'extClick'
AND parse_url(link_url,'HOST') LIKE '%.ncbi.nlm.nih.gov'
AND LOWER(REGEXP_EXTRACT(parse_url(link_url,'PATH'),'(/[^/]+)')) = '/pmc'
AND freely_accessible = FALSE
AND page_id IN 
        (SELECT DISTINCT page_id 
        FROM ryanmax.pages_with_extlinks
        WHERE to_date(dt) >= '{}' 
        AND to_date(dt) <= '{}'
        )
{}
AND to_date(event_time) >= '{}'
AND to_date(event_time) <= '{}'
AND useragent_is_bot = FALSE
GROUP BY link_text, action
ORDER BY COUNT(*) DESC
"""
# Placeholders are positional: page-list date window, shared exclusion SQL, event date window.
notfree_pmc_labels = spark.sql(notfree_pmc_labels_query.format(
    start_date_string,
    end_date_string,
    event_exclusion_sql,
    start_date_string,
    end_date_string,
))
# Rebuild the DataFrame from its RDD, then collect the result into pandas.
notfree_pmc_labels_rdd = notfree_pmc_labels.rdd
notfree_pmc_labels_df = sqlContext.createDataFrame(notfree_pmc_labels_rdd)
notfree_pmc_labels_pandas = notfree_pmc_labels_df.toPandas()

In [36]:
# Show at most 20 rows of the label table below (this changes the notebook-wide display option).
pd.options.display.max_rows=20
# Labels consisting solely of 5-10 digits are treated as PMIDs/PMCIDs. The mask is inverted, so the
# sum covers click events whose label is NOT such a number; the displayed sentence says so explicitly.
# Raw-string regex avoids invalid-escape warnings; na=False makes a missing label count as non-numeric.
id_like_label = notfree_pmc_labels_pandas['link_text'].str.contains(r'^\d{5,10}$', na=False)
not_free_pmc = notfree_pmc_labels_pandas.loc[~id_like_label, 'count'].sum()
display(Markdown("Number of PMC links that are not freely_accessible and whose label is not a number between 5 and 10 digits long (i.e. not a PMID or PMCID): {0}".format(not_free_pmc)))
notfree_pmc_labels_pandas

Number of PMC links that are not freely_accessible and whose label a number between 5 and 10 digits long (i.e. a PMID or PMCID): 105397

Unnamed: 0,link_text,count
0,"""The Longitudinal Relationships among Injuncti...",2809
1,Website,639
2,Male circumcision rates of 237 countries aroun...,293
3,"""Global, regional, and national incidence, pre...",286
4,"""Global, regional, and national incidence, pre...",215
5,"""Global, regional, and national life expectanc...",184
6,"""Global, regional, and national life expectanc...",154
7,"""Equilibrium Points in N-person Games""",148
8,"""Global, regional, and national age-sex specif...",106
9,"""Global, regional, and national age-sex specif...",87
