## Extract Freely Accessible Link Counts from XML Dumps

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# `-i` runs the script inside this notebook's namespace, so the names it defines remain available to later cells.
# NOTE(review): names used below but never defined in this notebook (re, Row, sc, sqlContext, spark,
# start_date_string, end_date_string) presumably come from this script or the kernel setup — confirm.
%run -i 'data-defaults.py'

In [2]:
# XML dump files to scan; the 8-digit date embedded in each file name is later used as the extract date (dt).
WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2',
                       'enwiki-20190420-pages-articles-multistream.xml.bz2']

# regex derived from https://en.wikipedia.org/wiki/Module:Citation/CS1/Configuration as per Antonin Delpeuch
# Two alternatives, matched case-insensitively:
#   group 2 - an identifier parameter (pmc, arxiv, citeseerx, biorxiv, rfc, ssrn) followed by "=" and a word character
#   group 4 - an "<id>-access = free" parameter for doi, hdl, bibcode, ol, jstor or osti
# At most one space is accepted on either side of "=".
# NOTE(review): re.DOTALL only changes how "." matches and the pattern contains no ".", so the flag looks redundant — confirm.
# NOTE(review): parameters padded with more than one space around "=" are not matched — confirm this is intended.
FREE_REGEX = re.compile(r'((pmc|arxiv|citeseerx|biorxiv|rfc|ssrn) ?= ?\w)|((doi|hdl|bibcode|ol|jstor|osti)-access ?= ?free)', re.IGNORECASE | re.DOTALL)

def extract(entity, date):
    """Collect one Row per freely accessible identifier found in a page.

    The page's revision text is scanned with FREE_REGEX. Every match yields a
    Row carrying the dump date, the page id and the lower-cased identifier
    type: regex group 2 for plain identifier parameters, otherwise group 4
    for ``<id>-access = free`` parameters.

    :param entity: page record exposing ``revision.text._VALUE`` and ``id``
    :param date: value stored in the ``dt`` field of every Row
    :return: list of Rows; empty when the text contains no match
    """
    wikitext = entity.revision.text._VALUE
    return [
        Row(dt=date,
            page_id=entity.id,
            # exactly one of the two alternatives captured the identifier name
            id_type=(hit.group(2) or hit.group(4)).lower())
        for hit in FREE_REGEX.finditer(wikitext)
    ]
    
# Build one row per freely accessible identifier for every dump, then merge all dumps.
freely_rdd = sc.emptyRDD()
for file in WIKIPEDIA_XML_DUMPS:
    # every <page> element of the dump becomes one record
    wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(file)
    # extract date = the 8-digit run in the dump file name
    dump_date = re.search(r'.*(\d{8}).*',file).group(1)
    # keep pages with ns = '0' that are not redirects and have non-empty revision text
    articles = wikipedia\
        .filter("ns = '0'")\
        .filter("redirect._title is null") \
        .filter("revision.text._VALUE is not null") \
        .filter("length(revision.text._VALUE) > 0")

    # dump_date is rebound on every iteration and flatMap is evaluated lazily,
    # so the date is bound as a default argument: each lambda keeps the date of
    # its own dump no matter when Spark serialises or runs the closure.
    daily_counts = sqlContext.createDataFrame(
        articles.rdd.flatMap(lambda entity, date=dump_date: extract(entity, date)))
    freely_rdd = freely_rdd.union(daily_counts.rdd)

# one DataFrame over the rows of all dumps
freely_merged = sqlContext.createDataFrame(freely_rdd)
# pandas copy of the merged rows for local inspection
freely = freely_merged.toPandas()


In [3]:
# preview the first rows of the merged data (one row per matched identifier)
freely_merged.show()

+--------+---------+-------+
|      dt|  id_type|page_id|
+--------+---------+-------+
|20190401|      pmc|     25|
|20190401|      doi|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      doi|     25|
|20190401|      doi|     25|
|20190401|citeseerx|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|citeseerx|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|citeseerx|     25|
|20190401|      pmc|     25|
|20190401|      pmc|     25|
|20190401|      doi|     25|
|20190401|      pmc|     25|
+--------+---------+-------+
only showing top 20 rows



In [4]:
# write free link counts data for later use
# Expose the merged DataFrame to SQL under a temporary view name ...
freely_merged.createOrReplaceTempView("temp_free_id_types")
# ... then drop any previous copy of the table and recreate it from the view,
# so re-running this cell replaces ryanmax.free_id_types.
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.free_id_types")
sqlContext.sql("CREATE TABLE ryanmax.free_id_types AS SELECT * FROM temp_free_id_types")

DataFrame[]

In [5]:
# Pages with more than 100 freely accessible links in a single dump.
# `freely` holds one row per matched identifier (dt, id_type, page_id) and has
# no free_links column, so the rows are first counted per dump date and page.
(freely.groupby(['dt', 'page_id'])
       .size()
       .reset_index(name='free_links')
       [['dt', 'free_links', 'page_id']]
       .query('free_links>100'))

Unnamed: 0,dt,free_links,page_id
5695,20190401,103,12024
13778,20190401,105,27680
90311,20190401,111,156964
563567,20190401,105,2146034
935711,20190401,157,4501641
1653642,20190401,137,11664498
1665664,20190401,109,11790568
1989129,20190401,103,14958673
2019712,20190401,133,15308316
3656464,20190401,155,33431450


### Counts of freely_accessible links

#### Count of freely_accessible links in W pages w/ ext links

In [5]:
# Count of freely_accessible links in W pages w/ ext links
# Per extract date, counts the rows of ryanmax.free_id_types whose page_id appears in
# ryanmax.pages_with_extlinks within the date window; the two '{}' placeholders are
# filled with start_date_string and end_date_string below.
w_free_query = """
SELECT count(*) AS freely_accessible_links, dt AS extract_date
FROM 
    ryanmax.free_id_types
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

w_free_links = spark.sql(w_free_query.format(start_date_string, end_date_string))
w_free_links.show()

+-----------------------+------------+
|freely_accessible_links|extract_date|
+-----------------------+------------+
|                 345144|    20190420|
|                 341561|    20190401|
+-----------------------+------------+



#### Count of freely_accessible links in WP:M pages w/ ext links

In [6]:
# Count of freely_accessible links in WP:M pages w/ ext links
# Per extract date, counts the rows of ryanmax.free_id_types whose page_id appears in
# ryanmax.projmed_with_extlinks within the date window; the two '{}' placeholders are
# filled with start_date_string and end_date_string below.
pm_free_query = """
SELECT count(*) AS freely_accessible_links, dt AS extract_date
FROM 
    ryanmax.free_id_types
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

pm_free_links = spark.sql(pm_free_query.format(start_date_string, end_date_string))
pm_free_links.show()


+-----------------------+------------+
|freely_accessible_links|extract_date|
+-----------------------+------------+
|                  52843|    20190420|
|                  52241|    20190401|
+-----------------------+------------+



#### Count of freely_accessible links in W pages w/ ext links that are not WP:M pages

In [7]:
# Count of freely_accessible links not in WP:M pages
# Per extract date, counts the rows of ryanmax.free_id_types whose page_id appears in
# ryanmax.pages_with_extlinks but not in ryanmax.projmed_with_extlinks, both restricted
# to the same date window. The four '{}' placeholders take the start/end dates twice,
# once per subquery.
# NOTE(review): in SQL, NOT IN matches nothing when the subquery returns a NULL;
# presumably projmed_with_extlinks.page_id is never NULL — confirm.
not_pm_free_query = """
SELECT count(*) AS freely_accessible_links, dt as extract_date
FROM 
    ryanmax.free_id_types
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
    AND page_id NOT IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

not_pm_free_links = spark.sql(
    not_pm_free_query.format(start_date_string, end_date_string, start_date_string, end_date_string))
not_pm_free_links.show()


+-----------------------+------------+
|freely_accessible_links|extract_date|
+-----------------------+------------+
|                 292301|    20190420|
|                 289320|    20190401|
+-----------------------+------------+



### Counts of freely_accessible links by id_type

#### Count of freely_accessible links in W pages w/ ext links by id_type

In [8]:
# Count of freely_accessible links in W pages w/ ext links by id_type
# Same page filter as the overall W count (page_id present in ryanmax.pages_with_extlinks
# within the date window), but grouped by identifier type as well as extract date and
# sorted by date, then by descending count.
w_free_ids_query = """
SELECT dt AS extract_date, id_type, count(*) AS freely_accessible_links
FROM 
    ryanmax.free_id_types
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY id_type, dt
ORDER BY extract_date, freely_accessible_links desc
"""

w_free_ids = spark.sql(w_free_ids_query.format(start_date_string, end_date_string))
# convert to pandas so the notebook renders the full result as a table
w_free_ids.toPandas()

Unnamed: 0,extract_date,id_type,freely_accessible_links
0,20190401,pmc,236292
1,20190401,arxiv,52651
2,20190401,doi,25308
3,20190401,citeseerx,20835
4,20190401,ssrn,3311
5,20190401,rfc,2008
6,20190401,hdl,710
7,20190401,biorxiv,236
8,20190401,bibcode,114
9,20190401,jstor,83


#### Count of freely_accessible links in WP:M pages w/ ext links by id_type

In [9]:
# Count of freely_accessible links in WP:M pages w/ ext links by id_type
# Same page filter as the overall WP:M count (page_id present in ryanmax.projmed_with_extlinks
# within the date window), but grouped by identifier type as well as extract date and
# sorted by date, then by descending count.
wpm_free_ids_query = """
SELECT dt AS extract_date, id_type, count(*) AS freely_accessible_links
FROM 
    ryanmax.free_id_types
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY id_type, dt
ORDER BY extract_date, freely_accessible_links desc
"""

wpm_free_ids = spark.sql(wpm_free_ids_query.format(start_date_string, end_date_string))
# convert to pandas so the notebook renders the full result as a table
wpm_free_ids.toPandas()

Unnamed: 0,extract_date,id_type,freely_accessible_links
0,20190401,pmc,50298
1,20190401,citeseerx,1387
2,20190401,doi,349
3,20190401,arxiv,110
4,20190401,ssrn,67
5,20190401,biorxiv,17
6,20190401,hdl,10
7,20190401,rfc,2
8,20190401,jstor,1
9,20190420,pmc,50886
