# Freely Accessible Link Data
- extract "freely accessible" links and link types from dump files
- populate `population_freelink_id_types` table
- report static counts
- report event counts

## Link Counts from XML Dumps

In [1]:
# Load the shared notebook defaults (per the original note: study dates, common SQL
# exclusions and parquet files for anonymized data).
# `-i` runs the script inside this notebook's namespace, so every name it defines
# stays available to the cells below.
# NOTE(review): later cells use `re`, `sc`, `sqlContext`, `spark`, `Row`,
# `days_in_study`, `event_exclusion_sql`, `start_date_string` and `end_date_string`
# without defining them in this notebook - presumably they all come from this
# script; confirm before running on a fresh kernel.
%run -i 'data-defaults.py'

In [2]:
# Wikipedia XML dump files to scan, one per extract date. The 8-digit date embedded
# in each file name is what the loading loop uses as the dump date.
WIKIPEDIA_XML_DUMPS = [
    'enwiki-20190401-pages-articles-multistream.xml.bz2',
    'enwiki-20190420-pages-articles-multistream.xml.bz2',
]

# regex derived from https://en.wikipedia.org/wiki/Module:Citation/CS1/Configuration as per Antonin Delpeuch
# Two alternatives, matched case-insensitively:
#   1. `<id> = <value>` for pmc/arxiv/citeseerx/biorxiv/rfc/ssrn, where the value
#      starts with a word character   -> identifier name captured in group 2
#   2. `<id>-access = free` for doi/hdl/bibcode/ol/jstor/osti
#                                      -> identifier name captured in group 4
# At most one space is allowed on either side of the `=`.
FREE_REGEX = re.compile(
    r'((pmc|arxiv|citeseerx|biorxiv|rfc|ssrn) ?= ?\w)|((doi|hdl|bibcode|ol|jstor|osti)-access ?= ?free)',
    re.IGNORECASE | re.DOTALL
)

def extract(entity, date):
    """Return one Row per FREE_REGEX match found in a page's wikitext.

    Parameters:
        entity: page record; this function reads `entity.revision.text._VALUE`
            (the text that is searched) and `entity.id` (stored as `page_id`).
        date: value stored unchanged in the `dt` field of every returned Row.

    Returns:
        A list of Row(dt, page_id, id_type), one per match and in match order.
        `id_type` is the lower-cased identifier name taken from regex group 2
        (the `<id> = value` form) or, failing that, group 4 (the
        `<id>-access = free` form). The list is empty when nothing matches.
    """
    hits = FREE_REGEX.finditer(entity.revision.text._VALUE)
    # Only one of the two name groups is populated for any given match.
    labels = [hit.group(2) or hit.group(4) for hit in hits]
    return [Row(dt=date, page_id=entity.id, id_type=label.lower()) for label in labels]
    
# Extract freely accessible link rows from every dump and merge them into one dataset.
freely_rdd = sc.emptyRDD()
for dump_file in WIKIPEDIA_XML_DUMPS:
    # Each <page> element of the XML dump becomes one row.
    wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(dump_file)

    # The dump date is the 8-digit run in the file name; fail with a clear message
    # instead of an AttributeError on None when a file name carries no date.
    date_match = re.search(r'.*(\d{8}).*', dump_file)
    if date_match is None:
        raise ValueError('no 8-digit date found in dump file name: {}'.format(dump_file))
    dump_date = date_match.group(1)

    # Keep namespace-0 pages that are not redirects and have non-empty text.
    articles = (
        wikipedia
        .filter("ns = '0'")
        .filter("redirect._title is null")
        .filter("revision.text._VALUE is not null")
        .filter("length(revision.text._VALUE) > 0")
    )

    # Bind the current dump date as a default argument: Spark evaluates the lambda
    # lazily, so it must not depend on the loop variable's later value.
    daily_counts = sqlContext.createDataFrame(
        articles.rdd.flatMap(lambda entity, date=dump_date: extract(entity, date)))
    freely_rdd = freely_rdd.union(daily_counts.rdd)

# One DataFrame holding the rows of all dumps, plus a pandas copy on the driver.
freely_merged = sqlContext.createDataFrame(freely_rdd)
freely = freely_merged.toPandas()


In [3]:
# Preview the merged link rows (20 rows is also the default of show()).
freely_merged.show(n=20)

+--------+---------+-------+
|      dt|  id_type|page_id|
+--------+---------+-------+
|20190401|      pmc|     25|
|20190401|      doi|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      doi|     25|
|20190401|      doi|     25|
|20190401|citeseerx|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|citeseerx|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|citeseerx|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      doi|     25|
|20190401|      pmc|     25|
+--------+---------+-------+
only showing top 20 rows



In [4]:
# write free link counts data for later use
# Register the merged DataFrame under a temporary view name so that it can be
# referenced from SQL.
freely_merged.createOrReplaceTempView("temp_free_id_types")
# Drop any previous copy first, then rebuild the table from the view; re-running
# this cell therefore replaces ryanmax.free_id_types rather than appending to it.
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.free_id_types")
sqlContext.sql("CREATE TABLE ryanmax.free_id_types AS SELECT * FROM temp_free_id_types")

DataFrame[]

### Rate of change between dump days

In [6]:
# calculate rate of change
# One row per dump date with the number of extracted links. `dt` is a yyyymmdd
# string, so ORDER BY puts the oldest dump first.
q = """
SELECT count(*) AS freely_accessible_links, dt AS extract_date
FROM 
    ryanmax.free_id_types
GROUP BY extract_date
ORDER BY extract_date
"""
freelinks = spark.sql(q)
# NOTE(review): the DataFrame -> RDD -> DataFrame round trip is kept from the
# original; its purpose is not evident from this notebook - confirm whether a
# plain freelinks.toPandas() would do.
freelinks_pd = sqlContext.createDataFrame(freelinks.rdd).toPandas()

# Relative growth from the first dump row to the second one. `.iloc` makes the
# lookup positional (independent of index labels) and float() guarantees true
# division even under Python 2 integer semantics.
link_counts = freelinks_pd['freely_accessible_links']
earlier_count = link_counts.iloc[0]
later_count = link_counts.iloc[1]
rate_of_change = float(later_count - earlier_count) / earlier_count
print('Rate of change: ', rate_of_change)
freelinks_pd


Rate of change:  0.0104899813798


Unnamed: 0,freely_accessible_links,extract_date
0,341564,20190401
1,345147,20190420


In [13]:
# write data for 2019-04-20 and report out link count
# Keep only the rows extracted from the 20190420 dump.
freelinks = spark.sql("SELECT * FROM ryanmax.free_id_types WHERE dt = '20190420'")
# NOTE(review): the DataFrame -> RDD -> DataFrame round trip is kept from the
# original; its purpose is not evident from this notebook - confirm.
freelinks_df = sqlContext.createDataFrame(freelinks.rdd)
# Use a view name of its own: re-registering "temp_free_id_types" here would
# silently repoint the view used to build ryanmax.free_id_types at this
# single-dump subset.
freelinks_df.createOrReplaceTempView("temp_population_freelink_id_types")
# Drop-and-recreate so re-running the cell replaces the table.
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.population_freelink_id_types")
sqlContext.sql("CREATE TABLE ryanmax.population_freelink_id_types AS SELECT * FROM temp_population_freelink_id_types")
# Report the number of rows (one per extracted link) now in the table.
spark.sql("select count(*) as freely_accessible_links from ryanmax.population_freelink_id_types").show()

+-----------------------+
|freely_accessible_links|
+-----------------------+
|                 345147|
+-----------------------+



### Counts of freely_accessible links

#### Count of freely_accessible links in W pages w/ ext links

In [14]:
# Count of freely_accessible links in W pages w/ ext links
# Counts the rows of ryanmax.population_freelink_id_types (one row per extracted
# link) whose page_id also appears in ryanmax.population_w_pages_with_extlinks.
w_free_query = """
SELECT count(*) AS freely_accessible_links
FROM 
    ryanmax.population_freelink_id_types
WHERE 
    page_id IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
"""

# Run the query and print its single-row result.
w_free_links = spark.sql(w_free_query)
w_free_links.show()

+-----------------------+
|freely_accessible_links|
+-----------------------+
|                 292357|
+-----------------------+



#### Count of freely_accessible links in WP:M pages w/ ext links

In [15]:
# Count of freely_accessible links in WP:M pages w/ ext links
# Counts the rows of ryanmax.population_freelink_id_types (one row per extracted
# link) whose page_id also appears in ryanmax.population_wpm_pages_with_extlinks.
pm_free_query = """
SELECT count(*) AS freely_accessible_links
FROM 
    ryanmax.population_freelink_id_types
WHERE 
    page_id IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
"""

# Run the query and print its single-row result.
pm_free_links = spark.sql(pm_free_query)
pm_free_links.show()


+-----------------------+
|freely_accessible_links|
+-----------------------+
|                  52782|
+-----------------------+



### Counts of freely_accessible links by id_type

#### Count of freely_accessible links in W pages w/ ext links by id_type

In [16]:
# Count of freely_accessible links in W pages w/ ext links by id_type
# Same page filter as the overall W count (page_id must appear in
# ryanmax.population_w_pages_with_extlinks), but grouped by id_type and sorted
# with the most frequent identifier type first.
w_free_ids_query = """
SELECT id_type, count(*) AS freely_accessible_links
FROM 
    ryanmax.population_freelink_id_types
WHERE page_id IN 
    (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
GROUP BY id_type
ORDER BY freely_accessible_links desc
"""

# Run the query; the pandas frame is the cell's displayed result.
w_free_ids = spark.sql(w_free_ids_query)
w_free_ids.toPandas()

Unnamed: 0,id_type,freely_accessible_links
0,pmc,188354
1,arxiv,53216
2,doi,24997
3,citeseerx,19440
4,ssrn,3238
5,rfc,1974
6,hdl,704
7,biorxiv,221
8,bibcode,117
9,jstor,81


#### Count of freely_accessible links in WP:M pages w/ ext links by id_type

In [17]:
# Count of freely_accessible links in WP:M pages w/ ext links by id_type
# Same page filter as the overall WP:M count (page_id must appear in
# ryanmax.population_wpm_pages_with_extlinks), but grouped by id_type and sorted
# with the most frequent identifier type first.
wpm_free_ids_query = """
SELECT id_type, count(*) AS freely_accessible_links
FROM 
    ryanmax.population_freelink_id_types
WHERE page_id IN 
    (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
GROUP BY id_type
ORDER BY freely_accessible_links desc
"""

# Run the query; the pandas frame is the cell's displayed result.
wpm_free_ids = spark.sql(wpm_free_ids_query)
wpm_free_ids.toPandas()

Unnamed: 0,id_type,freely_accessible_links
0,pmc,50830
1,citeseerx,1390
2,doi,347
3,arxiv,118
4,ssrn,67
5,biorxiv,17
6,hdl,10
7,rfc,2
8,jstor,1


## Event Counts

### Count of events for freely accessible links in WP:M pages

In [18]:
# Total count of events for freely accessible links in WP:M
# Template for the event count: non-bot enwiki extClick events on pages listed in
# ryanmax.population_wpm_pages_with_extlinks, grouped by the freely_accessible flag.
# The four `{}` placeholders are filled positionally, in this order:
#   1. divisor for the daily average, 2. extra exclusion SQL,
#   3. first event date (inclusive), 4. last event date (inclusive).
pm_freely_events_query = """
SELECT freely_accessible, action, count(*) AS total_events, count(*)/{} AS daily_average_events
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND action = 'extClick'
    AND page_id IN 
            (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
GROUP BY freely_accessible, action
"""

# Fill in the template first, then hand the finished SQL to Spark.
pm_freely_events_sql = pm_freely_events_query.format(
    days_in_study, event_exclusion_sql, start_date_string, end_date_string)
pm_freely_events = spark.sql(pm_freely_events_sql)

# Spark result -> RDD -> fresh DataFrame -> pandas; the pandas frame is displayed.
pm_freely_events_rdd = pm_freely_events.rdd
pm_freely_events_df = sqlContext.createDataFrame(pm_freely_events_rdd)
pm_freely_events_pandas = pm_freely_events_df.toPandas()
pm_freely_events_pandas

Unnamed: 0,freely_accessible,action,total_events,daily_average_events
0,False,extClick,823375,25730.46875
1,True,extClick,2606,81.4375


### Count of events for freely accessible links in W pages

In [20]:
# Total count of events for freely accessible links in W
# Template for the event count: non-bot enwiki extClick events on pages listed in
# ryanmax.population_w_pages_with_extlinks, grouped by the freely_accessible flag.
# The four `{}` placeholders are filled positionally, in this order:
#   1. divisor for the daily average, 2. extra exclusion SQL,
#   3. first event date (inclusive), 4. last event date (inclusive).
w_freely_events_query = """
SELECT freely_accessible, action, count(*) AS total_events, count(*)/{} AS daily_average_events
FROM 
    citationusage 
WHERE wiki = 'enwiki'
    AND action = 'extClick'
    AND page_id IN 
            (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
    {}
    AND to_date(event_time) >= '{}'
    AND to_date(event_time) <= '{}'
    AND useragent_is_bot = FALSE
GROUP BY freely_accessible, action
"""

# Fill in the template first, then hand the finished SQL to Spark.
w_freely_events_sql = w_freely_events_query.format(
    days_in_study, event_exclusion_sql, start_date_string, end_date_string)
w_freely_events = spark.sql(w_freely_events_sql)

# Spark result -> RDD -> fresh DataFrame -> pandas; the pandas frame is displayed.
w_freely_events_rdd = w_freely_events.rdd
w_freely_events_df = sqlContext.createDataFrame(w_freely_events_rdd)
w_freely_events_pandas = w_freely_events_df.toPandas()
w_freely_events_pandas

Unnamed: 0,freely_accessible,action,total_events,daily_average_events
0,False,extClick,49792871,1556027.0
1,True,extClick,35018,1094.312
