In [1]:
import pyspark
import re
import pyspark.sql
from pyspark.sql import *
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import os.path
from pyspark.sql.functions import desc
from datetime import timedelta, date
from pyspark import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Hive-enabled SQL context built on `sc`. `sc` is not defined in this notebook,
# so it is presumably the SparkContext injected by the PySpark kernel — TODO confirm.
# NOTE(review): `spark_hive` is not referenced again in the visible cells.
spark_hive = pyspark.sql.HiveContext(sc)


In [2]:
# Path of the Wikipedia XML dump that the load step reads.
WIKIPEDIA_XML_DUMP = 'enwiki-20190420-pages-articles-multistream.xml.bz2'

# One match == one "free link". Matching is case-insensitive and a match is
# either of the two alternatives below (adjacent literals are concatenated
# into a single pattern):
#   1. "<name> = <word char>" for pmc, arxiv, citeseerx, biorxiv, rfc, ssrn
#   2. "<name>-access = free" for doi, hdl, bibcode, ol, jstor, osti
# The spaces around "=" are optional in both alternatives.
FREE_REGEX = re.compile(
    r'((pmc|arxiv|citeseerx|biorxiv|rfc|ssrn) ?= ?\w)'
    r'|'
    r'((doi|hdl|bibcode|ol|jstor|osti)-access ?= ?free)',
    re.IGNORECASE | re.DOTALL,
)

def count_free_links(entity):
    """Count the FREE_REGEX matches in one page's revision text.

    Args:
        entity: a page record exposing ``id`` and ``revision.text._VALUE``.
            The text must be a string; a null text raises TypeError.

    Returns:
        Row with ``page_id`` (the record's ``id``) and ``free_links`` (the
        number of non-overlapping FREE_REGEX matches in the text).
    """
    wikitext = entity.revision.text._VALUE
    # Each regex match counts once, regardless of which groups it captured.
    n_matches = sum(1 for _ in FREE_REGEX.finditer(wikitext))
    return Row(page_id=entity.id, free_links=n_matches)

# Parse the dump with the spark-xml reader, producing one row per <page> element.
wikipedia = (
    sqlContext.read
    .format('com.databricks.spark.xml')
    .options(rowTag='page')
    .load(WIKIPEDIA_XML_DUMP)
)

# Keep only pages with ns '0' (presumably the article namespace — confirm)
# that are not redirects and whose revision text is present and non-empty.
articles = (
    wikipedia
    .filter("ns = '0'")
    .filter("redirect._title is null")
    .filter("revision.text._VALUE is not null")
    .filter("length(revision.text._VALUE) > 0")
)

# One (page_id, free_links) Row per remaining page, turned back into a DataFrame.
free_count = sqlContext.createDataFrame(
    articles.rdd.map(count_free_links)
)

In [3]:
# Convert the Spark DataFrame of per-page counts into a pandas DataFrame so the
# following cells can inspect it with pandas.
# NOTE(review): toPandas() materialises the whole result in the driver process;
# confirm the driver has enough memory for one row per article.
free_count_pandas = free_count.toPandas()

In [24]:
# Column totals of the per-page table. Only the `free_links` total is of
# interest (total number of free links found); the `page_id` total is merely a
# sum of identifiers and carries no meaning.
# `free_count_pandas['free_links'].sum()` would give that single figure alone.
free_count_pandas.sum(axis=0)



free_links             345147
page_id       153564000517915
dtype: int64

In [5]:
# Show the pages whose free_links count exceeds 100.
free_count_pandas.loc[free_count_pandas['free_links'] > 100]

Unnamed: 0,free_links,page_id
5695,103,12024
13779,105,27680
90307,111,156964
563509,105,2146034
935589,157,4501641
1653370,137,11664498
1665387,109,11790568
1988503,104,14958673
2019036,135,15308316
3655450,155,33431450


In [25]:
# Write free link counts data to a table for later use.
# The DataFrame is first registered as a temp view so it can be referenced from
# SQL; ryanmax.free_count is then dropped (if present) and recreated from it.
# NOTE(review): drop and create are two separate statements, so the table does
# not exist between them while this cell runs.
free_count.createOrReplaceTempView("temp_free_count")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.free_count")
sqlContext.sql("CREATE TABLE ryanmax.free_count AS SELECT * FROM temp_free_count")

DataFrame[]

In [2]:
# Reporting window used by the queries (their SQL compares with >= and <=, so
# both ends are included), plus the string form of each bound.
date_format = '%Y-%m-%d'
start_date, end_date = date(2019, 3, 29), date(2019, 4, 22)
start_date_string = start_date.strftime(date_format)
end_date_string = end_date.strftime(date_format)

In [3]:
# Count of freely_available links in W pages w/ ext links:
# sums free_links over the ryanmax.free_count rows whose page_id appears in
# ryanmax.pages_with_extlinks with to_date(dt) inside the date window.
# The two '{}' placeholders are filled, in order, with the start and end date
# strings; both bounds are inclusive (>= and <=).
# NOTE(review): "W" presumably stands for all of Wikipedia — confirm.
w_free_query = """
SELECT sum(free_links) AS freely_available_links
FROM 
    ryanmax.free_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
"""

# Substitute the window bounds into the template, run it, and print the result.
w_free_links = spark.sql(w_free_query.format(start_date_string, end_date_string))
w_free_links.show()

+----------------------+
|freely_available_links|
+----------------------+
|                345144|
+----------------------+



In [4]:
# Count of freely_available links in WP:M pages w/ ext links:
# sums free_links over the ryanmax.free_count rows whose page_id appears in
# ryanmax.projmed_with_extlinks with to_date(dt) inside the date window.
# The two '{}' placeholders are filled, in order, with the start and end date
# strings; both bounds are inclusive (>= and <=).
# NOTE(review): "WP:M" presumably refers to WikiProject Medicine — confirm.
pm_free_query = """
SELECT sum(free_links) AS freely_available_links
FROM 
    ryanmax.free_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
"""

# Substitute the window bounds into the template, run it, and print the result.
pm_free_links = spark.sql(pm_free_query.format(start_date_string, end_date_string))
pm_free_links.show()


+----------------------+
|freely_available_links|
+----------------------+
|                 52843|
+----------------------+



In [5]:
# Count of freely_available links in pages w/ ext links that are NOT WP:M pages:
# sums free_links over the ryanmax.free_count rows whose page_id appears in
# ryanmax.pages_with_extlinks but not in ryanmax.projmed_with_extlinks, with
# both lookups restricted to the same date window (bounds inclusive).
#
# The template uses indexed fields so each bound is supplied once:
#   {0} = start date string, {1} = end date string.
# str.format ignores surplus positional arguments, so a call that still passes
# the two dates twice renders exactly the same SQL.
#
# NOTE(review): `NOT IN` matches nothing if the subquery yields a NULL page_id;
# confirm ryanmax.projmed_with_extlinks.page_id is never NULL.
not_pm_free_query = """
SELECT sum(free_links) AS freely_available_links
FROM 
    ryanmax.free_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{0}' AND to_date(dt) <= '{1}')
    AND page_id NOT IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{0}' AND to_date(dt) <= '{1}')
"""

# Substitute the window bounds into the template, run it, and print the result.
not_pm_free_links = spark.sql(
    not_pm_free_query.format(start_date_string, end_date_string))
not_pm_free_links.show()


+----------------------+
|freely_available_links|
+----------------------+
|                292301|
+----------------------+

