# Create WP:M and W subsets of pages with external links
- W subset excludes WP:M pages

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# -i runs the script in this notebook's namespace, so names it defines stay available to later cells.
# NOTE(review): later cells use `spark` and `sqlContext` without defining them; presumably
# data-defaults.py provides them — confirm.
%run -i 'data-defaults.py'

### WP:M: rate of change in page count between the 2019-04-01 and 2019-04-20 extracts

In [2]:
# Count the distinct pages in ryanmax.projmed_with_extlinks on each of the two
# extract dates, ordered so that the earlier date is the first row.
q = """
SELECT COUNT(DISTINCT page_id) AS num_pages, to_date(dt) as extract_date
FROM ryanmax.projmed_with_extlinks
WHERE (to_date(dt) = '2019-04-01' OR to_date(dt) = '2019-04-20')
GROUP BY extract_date
ORDER BY extract_date
"""
wpm_pages_with_extlinks = spark.sql(q)
wpm_pages_with_extlinks_pd = sqlContext.createDataFrame(wpm_pages_with_extlinks.rdd).toPandas()

# The rate below needs exactly one row per extract date; fail with a clear
# message rather than an opaque KeyError if either date returned no rows.
assert len(wpm_pages_with_extlinks_pd) == 2, \
    'expected one row for each of 2019-04-01 and 2019-04-20'

# Positional access: row 0 is the earlier extract, row 1 the later one
# (guaranteed by ORDER BY extract_date), independent of the pandas index labels.
wpm_num_pages = wpm_pages_with_extlinks_pd['num_pages']
wpm_pages_before = wpm_num_pages.iloc[0]
wpm_pages_after = wpm_num_pages.iloc[1]
print('Rate of change: ',(wpm_pages_after - wpm_pages_before)/wpm_pages_before)
wpm_pages_with_extlinks_pd


Rate of change:  0.00215126463628


Unnamed: 0,num_pages,extract_date
0,32539,2019-04-01
1,32609,2019-04-20


### W: rate of change in page count between the 2019-04-01 and 2019-04-20 extracts

In [3]:
# Count the distinct pages in ryanmax.pages_with_extlinks on each of the two
# extract dates, ordered so that the earlier date is the first row.
q = """
SELECT COUNT(DISTINCT page_id) AS num_pages, to_date(dt) as extract_date
FROM ryanmax.pages_with_extlinks
WHERE (to_date(dt) = '2019-04-01' OR to_date(dt) = '2019-04-20')
GROUP BY extract_date
ORDER BY extract_date
"""
w_pages_with_extlinks = spark.sql(q)
w_pages_with_extlinks_pd = sqlContext.createDataFrame(w_pages_with_extlinks.rdd).toPandas()

# The rate below needs exactly one row per extract date; fail with a clear
# message rather than an opaque KeyError if either date returned no rows.
assert len(w_pages_with_extlinks_pd) == 2, \
    'expected one row for each of 2019-04-01 and 2019-04-20'

# Positional access: row 0 is the earlier extract, row 1 the later one
# (guaranteed by ORDER BY extract_date), independent of the pandas index labels.
w_num_pages = w_pages_with_extlinks_pd['num_pages']
w_pages_before = w_num_pages.iloc[0]
w_pages_after = w_num_pages.iloc[1]
print('Rate of change: ',(w_pages_after - w_pages_before)/w_pages_before)
w_pages_with_extlinks_pd


Rate of change:  0.0022570761574


Unnamed: 0,num_pages,extract_date
0,5231547,2019-04-01
1,5243355,2019-04-20


## Define WP:M and W subsets using a single day of data: 2019-04-20
- A single day is used because the page counts changed by only about 0.2% between 2019-04-01 and 2019-04-20 (see the rates of change above)

### WP:M pages with external links

In [5]:
# WP:M population: the distinct page_ids present in ryanmax.projmed_with_extlinks
# on the 2019-04-20 extract.
# Note: this rebinds wpm_pages_with_extlinks, which the rate-of-change cell above
# used for the per-date counts; from here on it holds one row per page_id.
q = """
SELECT DISTINCT page_id 
FROM ryanmax.projmed_with_extlinks
WHERE to_date(dt) = '2019-04-20'
ORDER BY page_id
"""
wpm_pages_with_extlinks = spark.sql(q)
# count() runs the query; the displayed value is the number of distinct pages.
wpm_pages_with_extlinks.count()


32609

In [6]:
# Persist the WP:M page_ids as ryanmax.population_wpm_pages_with_extlinks,
# then read the table back to report how many rows were written.
# Rows are distinct page_ids because the DataFrame comes from a SELECT DISTINCT.
wpm_pages_with_extlinks.createOrReplaceTempView("temp_wpm_pages_with_extlinks")

# Drop-and-recreate so that re-running this cell replaces the table.
for ddl_statement in (
    "DROP TABLE IF EXISTS ryanmax.population_wpm_pages_with_extlinks",
    "CREATE TABLE ryanmax.population_wpm_pages_with_extlinks AS SELECT * FROM temp_wpm_pages_with_extlinks",
):
    sqlContext.sql(ddl_statement)

wpm_table_row_count = spark.sql("select count(*) as WPM_pages from ryanmax.population_wpm_pages_with_extlinks")
wpm_table_row_count.show()

+---------+
|WPM_pages|
+---------+
|    32609|
+---------+



### W pages with external links

In [7]:
# W population: the distinct page_ids present in ryanmax.pages_with_extlinks on the
# 2019-04-20 extract, MINUS every page_id in the WP:M population table, so the
# two populations do not overlap.
# Depends on ryanmax.population_wpm_pages_with_extlinks, which is (re)created by the
# WP:M write cell above; run that cell first.
# Note: this rebinds w_pages_with_extlinks, which the rate-of-change cell above
# used for the per-date counts; from here on it holds one row per page_id.
q = """
SELECT DISTINCT page_id FROM ryanmax.pages_with_extlinks WHERE to_date(dt) = '2019-04-20'
MINUS
SELECT DISTINCT page_id FROM ryanmax.population_wpm_pages_with_extlinks
ORDER BY page_id
"""
w_pages_with_extlinks = spark.sql(q)
# count() runs the query; the displayed value is the number of distinct pages.
w_pages_with_extlinks.count()


5210746

In [8]:
# Persist the W page_ids as ryanmax.population_w_pages_with_extlinks,
# then read the table back to report how many rows were written.
# Rows are distinct page_ids because the DataFrame comes from SELECT DISTINCT ... MINUS.
w_pages_with_extlinks.createOrReplaceTempView("temp_w_pages_with_extlinks")

# Drop-and-recreate so that re-running this cell replaces the table.
for ddl_statement in (
    "DROP TABLE IF EXISTS ryanmax.population_w_pages_with_extlinks",
    "CREATE TABLE ryanmax.population_w_pages_with_extlinks AS SELECT * FROM temp_w_pages_with_extlinks",
):
    sqlContext.sql(ddl_statement)

w_table_row_count = spark.sql("select count(*) as W_pages from ryanmax.population_w_pages_with_extlinks")
w_table_row_count.show()

+-------+
|W_pages|
+-------+
|5210746|
+-------+

