# Wikipedia Page Sizes
- Report page character counts computed from the wiki markup in the XML dumps
- Character counts are taken from the raw markup (before rendering and transclusion), but relative page sizes should serve our study well
- Populate the `page_lengths_w_date` table
- Report page sizes for W and WP:M for both dump dates

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# -i runs the script inside this notebook's namespace (rather than a fresh, empty one)
# NOTE(review): presumably this is where sc, sqlContext, spark, re and Row come from — confirm in data-defaults.py
%run -i 'data-defaults.py'

In [2]:
# enwiki multistream article dumps to process, one per dump date.
# The 8-digit date embedded in each file name is later extracted and stored as `dt`.
WIKIPEDIA_XML_DUMPS = [
    'enwiki-20190401-pages-articles-multistream.xml.bz2',
    'enwiki-20190420-pages-articles-multistream.xml.bz2',
]

def page_length(entity, date):
    """Build a Row holding the character count of one page's wiki markup.

    entity: a parsed page record; its `id` and `revision.text._VALUE` are read.
    date:   value stored in the row's `dt` field.

    Returns a Row with fields page_id, page_length and dt.
    """
    char_count = len(entity.revision.text._VALUE)
    return Row(page_id=entity.id, page_length=char_count, dt=date)

# Accumulate one row per (page, dump date) across all dumps, starting from an empty RDD.
page_lengths_rdd = sc.emptyRDD()
for dump_file in WIKIPEDIA_XML_DUMPS:  # not named `file`, which is a builtin in Python 2
    # rowTag='page' makes spark-xml produce one row per <page> element of the dump
    wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(dump_file)
    # the 8-digit run in the file name is used as the dump date (kept as a string)
    date_match = re.search(r'.*(\d{8}).*', dump_file)
    if date_match is None:
        # fail with a clear message instead of an AttributeError on None.group
        raise ValueError('no 8-digit dump date found in file name: %s' % dump_file)
    dump_date = date_match.group(1)
    # keep namespace-0 pages that are not redirects and have non-empty text
    articles = wikipedia\
        .filter("ns = '0'")\
        .filter("redirect._title is null") \
        .filter("revision.text._VALUE is not null") \
        .filter("length(revision.text._VALUE) > 0")
    # Bind dump_date as a default argument so the lambda keeps this iteration's
    # value even if it is evaluated after the loop variable has moved on.
    daily_page_lengths = sqlContext.createDataFrame(
        articles.rdd.map(lambda entity, date=dump_date: page_length(entity, date)))
    page_lengths_rdd = page_lengths_rdd.union(daily_page_lengths.rdd)

# one DataFrame covering every dump, plus a pandas copy on the driver for display
page_lengths_merged = sqlContext.createDataFrame(page_lengths_rdd)
page_lengths = page_lengths_merged.toPandas()


In [3]:
# display the merged page lengths (the pandas DataFrame built above from both dumps)
page_lengths

Unnamed: 0,dt,page_id,page_length
0,20190401,12,99963
1,20190401,25,135120
2,20190401,39,44303
3,20190401,290,25311
4,20190401,303,183922
5,20190401,305,72923
6,20190401,307,167030
7,20190401,308,138843
8,20190401,309,19905
9,20190401,316,95575


In [4]:
# Persist the merged page lengths as a table for later use.
# The DataFrame is exposed to SQL through a temp view; the target table is then
# dropped if it exists and rebuilt from that view.
drop_stmt = "DROP TABLE IF EXISTS ryanmax.page_lengths_w_date"
create_stmt = "CREATE TABLE ryanmax.page_lengths_w_date AS SELECT * FROM temp_page_lengths"

page_lengths_merged.createOrReplaceTempView("temp_page_lengths")
sqlContext.sql(drop_stmt)
sqlContext.sql(create_stmt)

DataFrame[]

## Number of Wikipedia pages
- limited to namespace 0
- excludes redirect pages
- excludes pages without content

In [5]:
# page count of all pages in Wikipedia namespace 0, excluding redirect pages and pages with no content
# One row per dump date. GROUP BY alone returns groups in arbitrary order, so
# ORDER BY keeps the row order stable between runs (newest dump first).
count_query = """
SELECT dt, count(*) as pages
FROM
    ryanmax.page_lengths_w_date
GROUP BY dt
ORDER BY dt DESC
"""
counts = spark.sql(count_query)
counts.show()

+--------+-------+
|      dt|  pages|
+--------+-------+
|20190420|5847824|
|20190401|5839083|
+--------+-------+



## Sizes of W pages with external links

In [6]:
# page_lengths of W pages w/ ext links
# Per dump date: mean (2 decimals), median, sample standard deviation (Spark's STDDEV)
# and interquartile range of page_length, restricted to page ids found in
# ryanmax.population_w_pages_with_extlinks.
# GROUP BY alone returns groups in arbitrary order, so ORDER BY keeps the row
# order stable between runs (newest dump first).
w_pl_query = """
SELECT dt, CAST(AVG(page_length) AS DECIMAL(10,2)) AS average_page_length,
    PERCENTILE(page_length,0.5) AS median_page_length,
    STDDEV(page_length) as stddev_page_length,
    (PERCENTILE(page_length,0.75) - PERCENTILE(page_length,0.25)) as iqr_page_length
FROM
    ryanmax.page_lengths_w_date
WHERE page_id IN
    (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
GROUP BY dt
ORDER BY dt DESC
"""

w_pl_links = spark.sql(w_pl_query)
w_pl_links.show()

+--------+-------------------+------------------+------------------+---------------+
|      dt|average_page_length|median_page_length|stddev_page_length|iqr_page_length|
+--------+-------------------+------------------+------------------+---------------+
|20190420|            7676.41|            3865.0|13632.444562862427|         5789.0|
|20190401|            7662.35|            3860.0|13599.185396839484|         5779.0|
+--------+-------------------+------------------+------------------+---------------+



## Sizes of WP:M pages with external links

In [7]:
# page_lengths of WP:M pages w/ ext links
# Per dump date: mean (2 decimals), median, sample standard deviation (Spark's STDDEV)
# and interquartile range of page_length, restricted to page ids found in
# ryanmax.population_wpm_pages_with_extlinks.
# GROUP BY alone returns groups in arbitrary order, so ORDER BY keeps the row
# order stable between runs (newest dump first).
pm_pl_query = """
SELECT dt, CAST(AVG(page_length) AS DECIMAL(10,2)) AS average_page_length,
    PERCENTILE(page_length,0.5) AS median_page_length,
    STDDEV(page_length) as stddev_page_length,
    (PERCENTILE(page_length,0.75) - PERCENTILE(page_length,0.25)) as iqr_page_length
FROM
    ryanmax.page_lengths_w_date
WHERE page_id IN
    (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
GROUP BY dt
ORDER BY dt DESC
"""

pm_pl_links = spark.sql(pm_pl_query)
pm_pl_links.show()


+--------+-------------------+------------------+------------------+---------------+
|      dt|average_page_length|median_page_length|stddev_page_length|iqr_page_length|
+--------+-------------------+------------------+------------------+---------------+
|20190420|           13084.95|            6628.0|19378.442855890582|        11640.0|
|20190401|           13057.77|            6607.0|19351.962904945245|        11611.0|
+--------+-------------------+------------------+------------------+---------------+

