### Extract Freely Available Link Counts from XML Dumps

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# -i runs the script in this notebook's own namespace, so names it defines become notebook globals.
# NOTE(review): sc, sqlContext, spark, re, Row, start_date_string and end_date_string are used by
# later cells but never defined in this notebook -- presumably they come from this script or from
# the kernel's startup; confirm before running on a fresh kernel.
%run -i 'data-defaults.py'

In [2]:
# Wikipedia XML dumps to scan; the 8-digit run in each file name is later used as the dump date.
WIKIPEDIA_XML_DUMPS = [
    'enwiki-20190401-pages-articles-multistream.xml.bz2',
    'enwiki-20190420-pages-articles-multistream.xml.bz2',
]

# regex derived from https://en.wikipedia.org/wiki/Module:Citation/CS1/Configuration as per Antonin Delpeuch
# Two alternatives, matched case-insensitively:
#   1. one of the listed identifier parameters (pmc, arxiv, ...) assigned a value that
#      starts with a word character
#   2. a "<id>-access = free" parameter for doi, hdl, bibcode, ol, jstor or osti
# At most one space is accepted on either side of the "=".
FREE_REGEX = re.compile(
    r'((pmc|arxiv|citeseerx|biorxiv|rfc|ssrn) ?= ?\w)'
    r'|'
    r'((doi|hdl|bibcode|ol|jstor|osti)-access ?= ?free)',
    re.IGNORECASE | re.DOTALL,
)

def count_free_links(entity, date):
    """Count FREE_REGEX matches in the revision text of a single page.

    entity -- row for one page; ``entity.revision.text._VALUE`` is the text that is
              searched and ``entity.id`` is copied into ``page_id``
    date   -- value stored unchanged in the ``dt`` field of the result

    Returns a Row with fields page_id, free_links and dt.
    """
    wikitext = entity.revision.text._VALUE
    # finditer walks the same non-overlapping matches that findall would collect
    match_total = sum(1 for _ in FREE_REGEX.finditer(wikitext))
    return Row(page_id=entity.id, free_links=match_total, dt=date)

# For every dump: load the pages, keep the rows that pass the filters below, count
# free-link matches per page, and append the per-dump rows to one combined RDD.
freely_rdd = sc.emptyRDD()
for dump_file in WIKIPEDIA_XML_DUMPS:
    # The dump date is the 8-digit run embedded in the file name. Check it before the
    # (expensive) load so a badly named file fails fast with a readable message
    # instead of an AttributeError on None.
    date_match = re.search(r'.*(\d{8}).*', dump_file)
    if date_match is None:
        raise ValueError('no 8-digit dump date found in file name: {}'.format(dump_file))
    dump_date = date_match.group(1)

    wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(dump_file)
    # keep namespace '0' pages that have no redirect title and a non-empty revision text
    articles = wikipedia\
        .filter("ns = '0'")\
        .filter("redirect._title is null") \
        .filter("revision.text._VALUE is not null") \
        .filter("length(revision.text._VALUE) > 0")
    # Bind this iteration's date as a default argument: a plain closure would look up
    # dump_date when the function is finally serialised, which is only correct as long
    # as that happens before the loop variable moves on.
    daily_counts = sqlContext.createDataFrame(
        articles.rdd.map(lambda entity, date=dump_date: count_free_links(entity, date)))
    freely_rdd = freely_rdd.union(daily_counts.rdd)

# one Spark DataFrame over all dumps, plus a pandas copy collected to the driver
freely_merged = sqlContext.createDataFrame(freely_rdd)
freely = freely_merged.toPandas()


In [3]:
# preview the first rows only; the frame holds one row per article per dump
freely.head(10)

Unnamed: 0,dt,free_links,page_id
0,20190401,0,12
1,20190401,69,25
2,20190401,3,39
3,20190401,0,290
4,20190401,0,303
5,20190401,0,305
6,20190401,0,307
7,20190401,4,308
8,20190401,0,309
9,20190401,0,316


In [4]:
# write free link counts data to a table for later use
# Order matters: the temp view must exist before the CREATE ... AS SELECT reads from it,
# and the old table is dropped first so the cell can be re-run.
# NOTE(review): drop-then-create is not atomic -- if the CREATE fails the table is gone
# until this cell succeeds again.
freely_merged.createOrReplaceTempView("temp_free_count")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.free_count_w_date")
sqlContext.sql("CREATE TABLE ryanmax.free_count_w_date AS SELECT * FROM temp_free_count")

DataFrame[]

In [5]:
# spot-check: rows (page/dump pairs) with more than 100 free-link matches
freely.query('free_links>100')

Unnamed: 0,dt,free_links,page_id
5695,20190401,103,12024
13778,20190401,105,27680
90311,20190401,111,156964
563567,20190401,105,2146034
935711,20190401,157,4501641
1653642,20190401,137,11664498
1665664,20190401,109,11790568
1989129,20190401,103,14958673
2019712,20190401,133,15308316
3656464,20190401,155,33431450


In [6]:
# Sum of free-link matches per dump date, restricted to pages listed in
# ryanmax.pages_with_extlinks between start_date_string and end_date_string (inclusive).
w_free_query = """
SELECT sum(free_links) AS freely_available_links, dt AS extract_date
FROM 
    ryanmax.free_count_w_date
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

# fill in the date window first, then run the rendered statement
w_free_sql = w_free_query.format(start_date_string, end_date_string)
w_free_links = spark.sql(w_free_sql)
w_free_links.show()

+----------------------+------------+
|freely_available_links|extract_date|
+----------------------+------------+
|                345144|    20190420|
|                341561|    20190401|
+----------------------+------------+



In [7]:
# Sum of free-link matches per dump date, restricted to pages listed in
# ryanmax.projmed_with_extlinks (the WP:M pages) between start_date_string and
# end_date_string (inclusive).
pm_free_query = """
SELECT sum(free_links) AS freely_available_links, dt AS extract_date
FROM 
    ryanmax.free_count_w_date
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

# fill in the date window first, then run the rendered statement
pm_free_sql = pm_free_query.format(start_date_string, end_date_string)
pm_free_links = spark.sql(pm_free_sql)
pm_free_links.show()


+----------------------+------------+
|freely_available_links|extract_date|
+----------------------+------------+
|                 52843|    20190420|
|                 52241|    20190401|
+----------------------+------------+



In [8]:
# Count of freely_available links not in WP:M pages
# Pages must appear in ryanmax.pages_with_extlinks within the date window and must have
# no row in ryanmax.projmed_with_extlinks within the same window.
# The exclusion uses NOT EXISTS rather than NOT IN: with NOT IN, a single NULL page_id
# in the excluded table makes the predicate NULL for every row and empties the result.
# {0} is the window start and {1} the window end, so each date is passed once; extra
# positional arguments to .format() are ignored, so four-argument calls still work.
not_pm_free_query = """
SELECT sum(free_links) AS freely_available_links, dt as extract_date
FROM 
    ryanmax.free_count_w_date fc
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{0}' AND to_date(dt) <= '{1}')
    AND NOT EXISTS 
    (SELECT 1 
    FROM ryanmax.projmed_with_extlinks pm 
    WHERE pm.page_id = fc.page_id 
        AND to_date(pm.dt) >= '{0}' AND to_date(pm.dt) <= '{1}')
GROUP BY dt
"""

not_pm_free_links = spark.sql(
    not_pm_free_query.format(start_date_string, end_date_string))
not_pm_free_links.show()


+----------------------+------------+
|freely_available_links|extract_date|
+----------------------+------------+
|                292301|    20190420|
|                289320|    20190401|
+----------------------+------------+

