### Extract Infobox Counts from XML Dumps

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
%run -i 'data-defaults.py'

In [2]:
WIKIPEDIA_XML_DUMPS = ['enwiki-20190401-pages-articles-multistream.xml.bz2',
                       'enwiki-20190420-pages-articles-multistream.xml.bz2']

INFOBOX_REGEX = re.compile(r'\{ *infobox ', re.IGNORECASE)

def count_infobox(entity, date):
    page_text = entity.revision.text._VALUE
    count = len(INFOBOX_REGEX.findall(page_text))
    return Row(page_id=entity.id, infobox_count=count, dt=date)

infobox_rdd = sc.emptyRDD()
for file in WIKIPEDIA_XML_DUMPS:
    wikipedia = sqlContext.read.format('com.databricks.spark.xml').options(rowTag='page').load(file)
    dump_date = re.search(r'.*(\d{8}).*',file).group(1)
    articles = wikipedia\
        .filter("ns = '0'")\
        .filter("redirect._title is null") \
        .filter("revision.text._VALUE is not null") \
        .filter("length(revision.text._VALUE) > 0")
    daily_counts = sqlContext.createDataFrame(articles.rdd.map(lambda entity: count_infobox(entity, dump_date)))
    infobox_rdd = infobox_rdd.union(daily_counts.rdd)

infobox_merged = sqlContext.createDataFrame(infobox_rdd)
infobox = infobox_merged.toPandas()


In [3]:
infobox

Unnamed: 0,dt,infobox_count,page_id
0,20190401,0,12
1,20190401,1,25
2,20190401,0,39
3,20190401,1,290
4,20190401,2,303
5,20190401,0,305
6,20190401,2,307
7,20190401,1,308
8,20190401,0,309
9,20190401,1,316


In [4]:
# write infobox counts data to a table for later use
infobox_merged.createOrReplaceTempView("temp_infobox_count")
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.infobox_count")
sqlContext.sql("CREATE TABLE ryanmax.infobox_count AS SELECT * FROM temp_infobox_count")

DataFrame[]

In [5]:
infobox.query('infobox_count>25')

Unnamed: 0,dt,infobox_count,page_id
5011,20190401,49,10577
6537,20190401,58,13696
49147,20190401,34,102962
49165,20190401,27,102985
49565,20190401,29,103782
162390,20190401,52,373334
164886,20190401,34,381506
169633,20190401,28,397295
276936,20190401,78,782670
285251,20190401,32,841648


#### Count of pages with infoboxes in W pages w/ ext links

In [6]:
# Count of pages with infoboxes in W pages w/ ext links
w_infobox_query = """
SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date
FROM 
    ryanmax.infobox_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

w_infobox = spark.sql(w_infobox_query.format(start_date_string, end_date_string))
w_infobox.show()

+---------------+------------+
|pages_w_infobox|extract_date|
+---------------+------------+
|        5243258|    20190420|
|        5235364|    20190401|
+---------------+------------+



#### Count of pages with infoboxes in WP:M pages w/ ext links

In [7]:
# Count of pages with infoboxes in WP:M pages w/ ext links
pm_infobox_query = """
SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date
FROM 
    ryanmax.infobox_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

pm_infobox = spark.sql(pm_infobox_query.format(start_date_string, end_date_string))
pm_infobox.show()


+---------------+------------+
|pages_w_infobox|extract_date|
+---------------+------------+
|          32653|    20190420|
|          32643|    20190401|
+---------------+------------+



#### Count of pages with infoboxes not in WP:M pages

In [8]:
# Count of pages with infoboxes not in WP:M pages
not_pm_infobox_query = """
SELECT COUNT(DISTINCT page_id) AS pages_w_infobox, dt AS extract_date
FROM 
    ryanmax.infobox_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
    AND page_id NOT IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

not_pm_infobox = spark.sql(
                        not_pm_infobox_query.format(
                            start_date_string, end_date_string, 
                            start_date_string, end_date_string))
not_pm_infobox.show()


+---------------+------------+
|pages_w_infobox|extract_date|
+---------------+------------+
|        5210605|    20190420|
|        5202721|    20190401|
+---------------+------------+



#### Number of infoboxes per page in W pages w/ ext links

In [10]:
# Number of infoboxes per page in W pages w/ ext links
w_infobox_cnts_query = """
SELECT MIN(infobox_count) as min,
    MAX(infobox_count) as max, 
    CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,
    PERCENTILE(infobox_count,0.5) AS median_infobox_count,
    STDDEV(infobox_count) as stddev_infobox_count,
    (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count,
    dt AS extract_date
FROM 
    ryanmax.infobox_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.pages_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

w_infobox_cnts = spark.sql(w_infobox_cnts_query.format(start_date_string, end_date_string))
w_infobox_cnts.show()

+---+---+---------------------+--------------------+--------------------+-----------------+------------+
|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|extract_date|
+---+---+---------------------+--------------------+--------------------+-----------------+------------+
|  0|346|                 0.67|                 1.0|  0.8557295792074723|              1.0|    20190420|
|  0|344|                 0.67|                 1.0|  0.8593801716672427|              1.0|    20190401|
+---+---+---------------------+--------------------+--------------------+-----------------+------------+



#### Number of infoboxes per page in WP:M pages w/ ext links

In [11]:
# Number of infoboxes per page in WP:M pages w/ ext links
pm_infobox_cnts_query = """
SELECT MIN(infobox_count) as min,
    MAX(infobox_count) as max, 
    CAST(AVG(infobox_count) AS DECIMAL(10,2)) AS average_infobox_count,
    PERCENTILE(infobox_count,0.5) AS median_infobox_count,
    STDDEV(infobox_count) as stddev_infobox_count,
    (PERCENTILE(infobox_count,0.75) - PERCENTILE(infobox_count,0.25)) as iqr_infobox_count,
    dt AS extract_date
FROM 
    ryanmax.infobox_count
WHERE page_id IN 
    (SELECT DISTINCT page_id 
    FROM ryanmax.projmed_with_extlinks 
    WHERE to_date(dt) >= '{}' AND to_date(dt) <= '{}')
GROUP BY dt
"""

pm_infobox_cnts = spark.sql(pm_infobox_cnts_query.format(start_date_string, end_date_string))
pm_infobox_cnts.show()

+---+---+---------------------+--------------------+--------------------+-----------------+------------+
|min|max|average_infobox_count|median_infobox_count|stddev_infobox_count|iqr_infobox_count|extract_date|
+---+---+---------------------+--------------------+--------------------+-----------------+------------+
|  0| 11|                 0.49|                 0.0|  0.5235294632520193|              1.0|    20190420|
|  0| 11|                 0.49|                 0.0|  0.5234552218588296|              1.0|    20190401|
+---+---+---------------------+--------------------+--------------------+-----------------+------------+

