# Wikipedia Pages
- create page subsets, including:
 - `population_wpm_pages_with_extlinks`: WP:M page subset, limited to pages with external links
 - `population_w_pages_with_extlinks`: W page subset, limited to pages with external links, excludes WP:M pages
 - `pages_wpm`: WP:M page subset, NOT limited to pages with external links
 - `projmed_categories`: categories or classes of WP:M pages
- report counts of W and WP:M pages, with and without external links

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# `-i` runs the script inside this notebook's interactive namespace, so whatever it defines
# becomes a notebook global for the cells below.
# NOTE(review): `spark` and `sqlContext` are used by later cells but never created in this
# notebook; presumably they come from this script or the kernel's startup -- confirm.
%run -i 'data-defaults.py'

### Load Pages Data from SQL Output
- see [data](../data/) directory for input

In [2]:
# store SQL output in HDFS
!cat ~/wiki-citation-usage/data/wiki_proj_med_pages-2019*.txt > /tmp/pages_wpm
!hdfs dfs -put /tmp/pages_wpm pages_wpm

!cat ~/wiki-citation-usage/data/wiki_proj_med_pages-with-extlinks-2019*.txt > /tmp/pages_wpm_with_extlinks
!hdfs dfs -put /tmp/pages_wpm_with_extlinks pages_wpm_with_extlinks

!zcat ~/wiki-citation-usage/data/pages-with-extlinks-2019*.txt.gz > /tmp/pages_w_with_extlinks
!hdfs dfs -put /tmp/pages_w_with_extlinks pages_w_with_extlinks

!cat ~/wiki-citation-usage/data/wiki_proj_med_categorylinks-2019*.txt > /tmp/projmed_categories
!hdfs dfs -put /tmp/projmed_categories projmed_categories


In [3]:
# create Hive tables from HDFS
# Template for a tab-delimited text table in the `ryanmax` database; the two
# placeholders are the table name and its parenthesised column list.
# (The string is not raw, so Python turns \t and \n into literal tab/newline
# characters before the statement reaches Spark.)
create = """
CREATE TABLE IF NOT EXISTS ryanmax.{} {}
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
"""

# Template that loads an HDFS path into a table, replacing its current contents.
load = """
LOAD DATA INPATH '{}' OVERWRITE INTO TABLE ryanmax.{}
"""

tables = ['pages_wpm','pages_wpm_with_extlinks', 'pages_w_with_extlinks', 'projmed_categories']

# Column layout shared by most extracts ...
default_fields = '(page_id bigint, revision_id bigint, page_title String, dt String)'
# ... and the tables whose extracts have a different layout.
fields_by_table = {
    # no page_title column
    'pages_w_with_extlinks': '(page_id bigint, revision_id bigint, dt String)',
    # extra category column
    'projmed_categories': '(page_id bigint, revision_id bigint, page_title String, category String, dt String)',
}

for table in tables:
    fields = fields_by_table.get(table, default_fields)

    sqlContext.sql(create.format(table, fields))
    # the HDFS path is relative and has the same name as the table
    sqlContext.sql(load.format(table, table))

### Rate of change in WP:M pages with external links between 2019-04-01 and 2019-04-20

In [4]:
# Distinct WP:M pages with external links in each of the two extracts, oldest first.
q = """
SELECT COUNT(DISTINCT page_id) AS num_pages, to_date(dt) as extract_date
FROM ryanmax.pages_wpm_with_extlinks
WHERE (to_date(dt) = '2019-04-01' OR to_date(dt) = '2019-04-20')
GROUP BY extract_date
ORDER BY extract_date
"""
wpm_pages_with_extlinks = spark.sql(q)
wpm_pages_with_extlinks_pd = sqlContext.createDataFrame(wpm_pages_with_extlinks.rdd).toPandas()

# Relative growth from the first row to the second row of the result.
num_pages = wpm_pages_with_extlinks_pd['num_pages']
earlier, later = num_pages[0], num_pages[1]
print('Rate of change: ', (later - earlier)/earlier)

# last expression of the cell: display the per-date counts
wpm_pages_with_extlinks_pd


Rate of change:  0.00215126463628


Unnamed: 0,num_pages,extract_date
0,32539,2019-04-01
1,32609,2019-04-20


### Rate of change in W pages with external links between 2019-04-01 and 2019-04-20

In [5]:
# Distinct W pages with external links in each of the two extracts, oldest first.
q = """
SELECT COUNT(DISTINCT page_id) AS num_pages, to_date(dt) as extract_date
FROM ryanmax.pages_w_with_extlinks
WHERE (to_date(dt) = '2019-04-01' OR to_date(dt) = '2019-04-20')
GROUP BY extract_date
ORDER BY extract_date
"""
w_pages_with_extlinks = spark.sql(q)
w_pages_with_extlinks_pd = sqlContext.createDataFrame(w_pages_with_extlinks.rdd).toPandas()

# Relative growth from the first row to the second row of the result.
num_pages = w_pages_with_extlinks_pd['num_pages']
earlier, later = num_pages[0], num_pages[1]
print('Rate of change: ', (later - earlier)/earlier)

# last expression of the cell: display the per-date counts
w_pages_with_extlinks_pd


Rate of change:  0.0022570761574


Unnamed: 0,num_pages,extract_date
0,5231547,2019-04-01
1,5243355,2019-04-20


## Define WP:M and W subsets using a single day of data: 2019-04-20
- using a single day since the rates of change between the two extracts are minimal (about 0.2% for both WP:M and W)
- these two tables `population_w_pages_with_extlinks` and `population_wpm_pages_with_extlinks` will be persisted for use in the rest of the study

### WP:M pages with external links

In [6]:
# Distinct page ids of WP:M pages with external links, taken from the 2019-04-20 extract only.
q = """
SELECT DISTINCT page_id 
FROM ryanmax.pages_wpm_with_extlinks
WHERE to_date(dt) = '2019-04-20'
ORDER BY page_id
"""
wpm_pages_with_extlinks = spark.sql(q)
# last expression of the cell: number of rows (one per distinct page id)
wpm_pages_with_extlinks.count()


32609

In [7]:
# write data and report out distinct page count
# Expose the DataFrame to SQL under a temporary view name ...
wpm_pages_with_extlinks.createOrReplaceTempView("temp_wpm_pages_with_extlinks")
# ... then rebuild the persisted table from scratch (drop + create-as-select) so the cell can be re-run.
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.population_wpm_pages_with_extlinks")
sqlContext.sql("CREATE TABLE ryanmax.population_wpm_pages_with_extlinks AS SELECT * FROM temp_wpm_pages_with_extlinks")
# read the count back from the persisted table rather than from the DataFrame
spark.sql("select count(*) as WPM_pages from ryanmax.population_wpm_pages_with_extlinks").show()

+---------+
|WPM_pages|
+---------+
|    32609|
+---------+



### W pages with external links

In [8]:
# Distinct page ids of W pages with external links in the 2019-04-20 extract, with every
# page id in ryanmax.population_wpm_pages_with_extlinks removed (MINUS), so the W and WP:M
# populations do not overlap. That table must already exist when this cell runs.
q = """
SELECT DISTINCT page_id FROM ryanmax.pages_w_with_extlinks WHERE to_date(dt) = '2019-04-20'
MINUS
SELECT DISTINCT page_id FROM ryanmax.population_wpm_pages_with_extlinks
ORDER BY page_id
"""
w_pages_with_extlinks = spark.sql(q)
# last expression of the cell: number of rows (one per distinct page id)
w_pages_with_extlinks.count()


5210746

In [9]:
# write data and report out distinct page count
# Expose the DataFrame to SQL under a temporary view name ...
w_pages_with_extlinks.createOrReplaceTempView("temp_w_pages_with_extlinks")
# ... then rebuild the persisted table from scratch (drop + create-as-select) so the cell can be re-run.
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.population_w_pages_with_extlinks")
sqlContext.sql("CREATE TABLE ryanmax.population_w_pages_with_extlinks AS SELECT * FROM temp_w_pages_with_extlinks")
# read the count back from the persisted table rather than from the DataFrame
spark.sql("select count(*) as W_pages from ryanmax.population_w_pages_with_extlinks").show()

+-------+
|W_pages|
+-------+
|5210746|
+-------+



### Count of W pages
- limited to namespace 0
- excludes redirect pages
- excludes pages without content
- includes pages with and without external links

In [10]:
# Distinct page count per extract date (dt) across all rows of ryanmax.page_lengths_w_date.
# NOTE(review): that table is not created in this notebook and the query applies no filters;
# presumably the namespace/redirect/empty-page exclusions listed above were applied when the
# table was built -- confirm.
q = """
SELECT dt, count(distinct page_id) as pages
FROM 
    ryanmax.page_lengths_w_date
GROUP BY dt
ORDER BY dt
"""
spark.sql(q).show()

+--------+-------+
|      dt|  pages|
+--------+-------+
|20190401|5839083|
|20190420|5847824|
+--------+-------+



### Count of WP:M pages
- limited to namespace 0
- excludes redirect pages
- excludes pages without content
- includes pages with and without external links

In [11]:
# Distinct page count per extract date (dt) in ryanmax.page_lengths_w_date, restricted to
# page ids that appear in ryanmax.pages_wpm. The subquery has no dt filter, so a page id
# qualifies if it appears in pages_wpm for any extract date.
# NOTE(review): page_lengths_w_date is not created in this notebook; presumably the
# exclusions listed above were applied when it was built -- confirm.
q = """
SELECT dt, count(distinct page_id) as pages
FROM 
    ryanmax.page_lengths_w_date
WHERE page_id IN (SELECT page_id FROM ryanmax.pages_wpm)
GROUP BY dt
ORDER BY dt
"""
spark.sql(q).show()

+--------+-----+
|      dt|pages|
+--------+-----+
|20190401|34325|
|20190420|34324|
+--------+-----+

