### string lengths of Wikipedia pages

In [1]:
import pyspark
import re
import pyspark.sql
from pyspark.sql import *
import pandas as pd
import matplotlib.pyplot as plt
import hashlib
import os.path
from pyspark.sql.functions import desc
from datetime import timedelta, date
from pyspark import *

%matplotlib inline
spark_hive = pyspark.sql.HiveContext(sc)


In [2]:
WIKIPEDIA_XML_DUMP = 'enwiki-20190420-pages-articles-multistream.xml.bz2'

def page_length(entity):
    page_text = entity.revision.text._VALUE
    size = len(page_text)
    return Row(page_id=entity.id, page_length=size)

wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(WIKIPEDIA_XML_DUMP)

articles = wikipedia\
    .filter("ns = '0'")\
    .filter("redirect._title is null") \
    .filter("revision.text._VALUE is not null") \
    .filter("length(revision.text._VALUE) > 0")

page_lengths = sqlContext.createDataFrame(articles.rdd.map(page_length))

In [3]:
page_lengths.show()

+-------+-----------+
|page_id|page_length|
+-------+-----------+
|     12|     102191|
|     25|     135250|
|     39|      43966|
|    290|      25311|
|    303|     183922|
|    305|      73305|
|    307|     168733|
|    308|     138869|
|    309|      19905|
|    316|      96065|
|    324|     104397|
|    330|       5841|
|    332|       6389|
|    334|      12895|
|    336|      67605|
|    339|      90900|
|    340|       7512|
|    344|      12659|
|    358|     145728|
|    359|      32578|
+-------+-----------+
only showing top 20 rows



In [4]:
# write page lengths data to a table for later use
page_lengths.createOrReplaceTempView("temp_page_lengths")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.page_lengths")
sqlContext.sql("CREATE TABLE ryanmax.page_lengths AS SELECT * FROM temp_page_lengths")

DataFrame[]

In [5]:
# set date ranges for queries
start_date = date(2019, 3, 29)
end_date = date(2019, 4, 22)
date_format = '%Y-%m-%d'
start_date_string = start_date.strftime(date_format)
end_date_string = end_date.strftime(date_format)

In [6]:
# page_lengths of W pages w/ ext links
w_pl_query = """
SELECT CAST(AVG(page_length) AS DECIMAL(10,2)) AS average_page_length, 
    PERCENTILE(page_length,0.5) AS median_page_length,
    STDDEV(page_length) as stddev_page_length,
    (PERCENTILE(page_length,0.75) - PERCENTILE(page_length,0.25)) as iqr_page_length
FROM 
    ryanmax.page_lengths
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) = '{}' AND to_date(dt) <= '{}')
"""

w_pl_links = spark.sql(w_pl_query.format(start_date_string, end_date_string))
w_pl_links.show()

+-------------------+------------------+------------------+---------------+
|average_page_length|median_page_length|stddev_page_length|iqr_page_length|
+-------------------+------------------+------------------+---------------+
|            7718.31|            3881.0| 13695.01314943199|         5826.0|
+-------------------+------------------+------------------+---------------+



In [7]:
# page_lengths of WP:M pages w/ ext links
pm_pl_query = """
SELECT CAST(AVG(page_length) AS DECIMAL(10,2)) AS average_page_length, 
    PERCENTILE(page_length,0.5) AS median_page_length,
    STDDEV(page_length) as stddev_page_length,
    (PERCENTILE(page_length,0.75) - PERCENTILE(page_length,0.25)) as iqr_page_length
FROM 
    ryanmax.page_lengths
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
"""

pm_pl_links = spark.sql(pm_pl_query.format(start_date_string, end_date_string))
pm_pl_links.show()


+-------------------+------------------+------------------+---------------+
|average_page_length|median_page_length|stddev_page_length|iqr_page_length|
+-------------------+------------------+------------------+---------------+
|           13073.98|            6618.0|19366.759391823256|        11638.0|
+-------------------+------------------+------------------+---------------+



In [8]:
# page_lengths of not WP:M pages w/ ext links
not_pm_pl_query = """
SELECT CAST(AVG(page_length) AS DECIMAL(10,2)) AS average_page_length, 
    PERCENTILE(page_length,0.5) AS median_page_length,
    STDDEV(page_length) as stddev_page_length,
    (PERCENTILE(page_length,0.75) - PERCENTILE(page_length,0.25)) as iqr_page_length
FROM 
    ryanmax.page_lengths
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
    AND page_id NOT IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
"""

not_pm_pl_links = spark.sql(
    not_pm_pl_query.format(start_date_string, end_date_string, start_date_string, end_date_string))
not_pm_pl_links.show()


+-------------------+------------------+------------------+---------------+
|average_page_length|median_page_length|stddev_page_length|iqr_page_length|
+-------------------+------------------+------------------+---------------+
|            7675.84|            3865.0|13634.502758611616|         5788.0|
+-------------------+------------------+------------------+---------------+

