## Load externallinks Data
- data source: SQL dumps of externallinks table
- data filtered down to el_id, el_from, el_to columns
- dump dates (YYYYMMDD): 20190401 and 20190420

In [1]:
# Run the shared setup script, which provides basic defaults, including study dates,
# common SQL exclusions and parquet files for anonymized data.
# `-i` runs the script inside this notebook's namespace rather than a fresh one,
# so it can read and define notebook-level names.
%run -i 'data-defaults.py'

### load dump files to HDFS

In [2]:
# Dump snapshot dates (YYYYMMDD) that the cells below iterate over.
dates = [
    '20190401',
    '20190420',
]

In [3]:
for d in dates:
    !hdfs dfs -put ~/dumps/ext$d externallinks_from_dump_$d

### schema creation and data load

In [6]:

# Hive DDL template: one tab-delimited text table per dump date.
# The single `{}` placeholder receives the dump date.
# NOTE(review): skip.header.line.count assumes each dump file starts with a header row,
# and whether Spark applies it on read may depend on the Spark version — confirm that
# the header line is not being counted as a data row.
create = """
CREATE TABLE IF NOT EXISTS ryanmax.externallinks_from_dump_{} ( el_id bigint, el_from bigint, el_to String)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE
TBLPROPERTIES("skip.header.line.count"="1")
"""

# Load template: both `{}` placeholders receive the dump date (source path, target table).
# LOAD DATA INPATH moves the staged HDFS file into the table's storage location, and
# OVERWRITE replaces any rows already in the table. Because the staged file is consumed,
# the upload step must be run again before this cell can be re-run.
load = """
LOAD DATA INPATH 'externallinks_from_dump_{}' OVERWRITE INTO TABLE ryanmax.externallinks_from_dump_{}
"""

# Use the `spark` session (as the data-check cell does) instead of the legacy sqlContext,
# so the notebook goes through a single SQL entry point.
for d in dates:
    spark.sql(create.format(d))
    spark.sql(load.format(d, d))

In [7]:
# Sanity check: show the row count of each loaded table, one result table per dump date.
for d in dates:
    count_query = "SELECT '{}' as dump_date, count(*) as num_links FROM ryanmax.externallinks_from_dump_{}".format(d,d)
    spark.sql(count_query).show()


+---------+---------+
|dump_date|num_links|
+---------+---------+
| 20190401|128425048|
+---------+---------+

+---------+---------+
|dump_date|num_links|
+---------+---------+
| 20190420|129078534|
+---------+---------+

