# Load externallinks Data
- data source: SQL dumps of externallinks table from 20190401 and 20190420
- data filtered down to el_id, el_from, el_to columns
- calculate rate of change between two dates
- populate `population_externallinks` table
- report static counts of external links for W and WP:M pages

In [1]:
# basic defaults, including study dates, common SQL exclusions and parquet files for anonymized data
# -i executes the script inside this notebook's namespace, so names it defines are visible to later cells
# NOTE(review): `spark`, `sqlContext` and `sc` are used below but never created in this notebook --
# presumably they come from the kernel or from this script; confirm before running on a fresh kernel
%run -i 'data-defaults.py'

### Load dump files to HDFS

In [2]:
# the two externallinks dump dates being compared, earlier one first
dates = ["20190401", "20190420"]

In [3]:
for d in dates:
    !hdfs dfs -put ~/dumps/ext"$d" externallinks_from_dump_"$d"

### Create schema and load dump files into tables

In [6]:

# DDL for one dump table: tab-delimited text holding the three retained columns,
# with the table property that declares the first line of each file a header to skip
create_table_sql = """
CREATE TABLE IF NOT EXISTS ryanmax.externallinks_from_dump_{} ( el_id bigint, el_from bigint, el_to String)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
TBLPROPERTIES("skip.header.line.count"="1")
"""

# loads the uploaded HDFS file into the matching table, overwriting its current contents
load_dump_sql = """
LOAD DATA INPATH 'externallinks_from_dump_{}' OVERWRITE INTO TABLE ryanmax.externallinks_from_dump_{}
"""

for dump_date in dates:
    # the table must exist before the load, so the order of the two statements matters
    statements = (
        create_table_sql.format(dump_date),
        load_dump_sql.format(dump_date, dump_date),
    )
    for statement in statements:
        sqlContext.sql(statement)
    

In [7]:
# data check: number of distinct (el_from, el_to) pairs in each dump table
count_links_sql = "SELECT '{}' as dump_date, count(distinct el_from, el_to) as num_links FROM ryanmax.externallinks_from_dump_{}"
for dump_date in dates:
    dump_link_counts = spark.sql(count_links_sql.format(dump_date, dump_date))
    dump_link_counts.show()
    

+---------+---------+
|dump_date|num_links|
+---------+---------+
| 20190401|111849403|
+---------+---------+

+---------+---------+
|dump_date|num_links|
+---------+---------+
| 20190420|112392445|
+---------+---------+



In [8]:
# write data to a single table with a date column
# Tag every dump with its date and union the per-date DataFrames directly.
# Staying in the DataFrame API keeps the tables' existing column types and avoids
# pulling every row into Python just to have its schema inferred again.
# union() keeps duplicate rows and matches columns by position; each query below
# selects the same columns in the same order.
dump_frames = [
    spark.sql("SELECT '{}' as dump_date, * FROM ryanmax.externallinks_from_dump_{}".format(d, d))
    for d in dates
]
links_merged = dump_frames[0]
for dump_frame in dump_frames[1:]:
    links_merged = links_merged.union(dump_frame)

# write links data to a table for later use
links_merged.createOrReplaceTempView("temp_externallinks")
# drop first so that re-running the cell rebuilds the table instead of failing on CREATE
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.externallinks")
sqlContext.sql("CREATE TABLE ryanmax.externallinks AS SELECT * FROM temp_externallinks")

DataFrame[]

In [9]:
# show rate of change between dump dates
q = """
SELECT dump_date, count(distinct el_from, el_to) as num_links 
FROM ryanmax.externallinks 
GROUP BY dump_date
ORDER BY dump_date
"""
extlinks = spark.sql(q)
# the grouped result has one row per dump date, so it is converted to pandas directly
extlinks_pd = extlinks.toPandas()

# relative change in distinct links from the first dump date to the second;
# rows are ordered by dump_date, so position 0 is the earlier dump
first_count = extlinks_pd['num_links'].iloc[0]
second_count = extlinks_pd['num_links'].iloc[1]
print('Rate of change: ',(second_count - first_count)/first_count)
extlinks_pd

Rate of change:  0.00485511755481


Unnamed: 0,dump_date,num_links
0,20190401,111849403
1,20190420,112392445


### Create population_externallinks table to be used for the rest of the study

In [11]:
# write 20190420 data to a single "population" table
# the date is kept as a literal so this cell does not depend on the `dates` list
links = spark.sql("SELECT * FROM ryanmax.externallinks_from_dump_20190420")

# write links data to a table for later use
# the query result is registered as-is, so the dump table's column types carry over
# without sending the rows through Python; `links_df` is kept as a name for that frame
links_df = links
links_df.createOrReplaceTempView("temp_population_externallinks")
# drop first so that re-running the cell rebuilds the table instead of failing on CREATE
sqlContext.sql("DROP TABLE IF EXISTS ryanmax.population_externallinks")
sqlContext.sql("CREATE TABLE ryanmax.population_externallinks AS SELECT * FROM temp_population_externallinks")

# report number of links for sanity
spark.sql("SELECT count(distinct el_from, el_to) as num_links FROM ryanmax.population_externallinks").show()


+---------+
|num_links|
+---------+
|112392445|
+---------+



### Count of external links for W
- count distinct (`el_from`, `el_to`) pairs, so a link target that appears on several pages is counted once per page

In [2]:
# distinct (el_from, el_to) pairs whose el_from is a page_id in population_w_pages_with_extlinks
w_links_sql = """
SELECT count(distinct el_from, el_to) as w_num_externallinks 
FROM ryanmax.population_externallinks
WHERE el_from IN (SELECT page_id FROM ryanmax.population_w_pages_with_extlinks)
"""
w_link_counts = spark.sql(w_links_sql)
w_link_counts.toPandas()

Unnamed: 0,w_num_externallinks
0,60851396


### Count of external links for WP:M
- count distinct (`el_from`, `el_to`) pairs, so a link target that appears on several pages is counted once per page

In [3]:
# distinct (el_from, el_to) pairs whose el_from is a page_id in population_wpm_pages_with_extlinks
wpm_links_sql = """
SELECT count(distinct el_from, el_to) as wpm_num_externallinks 
FROM ryanmax.population_externallinks
WHERE el_from IN (SELECT page_id FROM ryanmax.population_wpm_pages_with_extlinks)
"""
wpm_link_counts = spark.sql(wpm_links_sql)
wpm_link_counts.toPandas()

Unnamed: 0,wpm_num_externallinks
0,945645
