# Load tables from `S3`

In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, monotonically_increasing_id
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, dayofweek
from pyspark.sql.types import *

In [2]:
# Load AWS credentials from the local `dl.cfg` file and export them as
# environment variables for this process.
# NOTE(review): presumably these are picked up by Spark / the S3A connector
# when reading from `s3a://` paths - confirm against the cluster setup.
config = configparser.ConfigParser()
# Use a context manager so the file handle is closed as soon as parsing is
# done instead of being left open until garbage collection.
with open('dl.cfg') as cfg_file:
    config.read_file(cfg_file)

os.environ["AWS_ACCESS_KEY_ID"] = config['AWS']['AWS_ACCESS_KEY_ID']
os.environ["AWS_SECRET_ACCESS_KEY"] = config['AWS']['AWS_SECRET_ACCESS_KEY']

In [3]:
%%time
spark = SparkSession.builder\
                     .config("spark.jars.packages","org.apache.hadoop:hadoop-aws:2.7.0")\
                     .getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

CPU times: user 80.2 ms, sys: 15.4 ms, total: 95.6 ms
Wall time: 16.6 s


## Load `songplays` table

In [12]:
# Root S3 prefix (s3a:// scheme) that the load cells below join table
# directory names onto.
output_data = "s3a://salas-bucket/temp-save/"

In [17]:
%%time
sp_df = spark.read.format("parquet").option("basePath", os.path.join(output_data, "songplays/")).load(os.path.join(output_data, "songplays/*/*/"))

CPU times: user 3.14 ms, sys: 0 ns, total: 3.14 ms
Wall time: 3.83 s


In [18]:
# Preview the songplays DataFrame; it renders as a table because
# `spark.sql.repl.eagerEval.enabled` was turned on for the session.
sp_df

songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent,year,month
0,2018-11-15 16:19:...,97,paid,SOBLFFE12AF72AA5BA,ARJNIUY12298900C91,605,Lansing-East Lans...,"""Mozilla/5.0 (X11...",2018,11


## Load `songs` table

In [20]:
%%time
so_df = spark.read.format("parquet").option("basePath", os.path.join(output_data, "songs/")).load(os.path.join(output_data, "songs/*/*/"))

CPU times: user 2.86 ms, sys: 4.42 ms, total: 7.29 ms
Wall time: 24.1 s


In [21]:
# Preview the songs DataFrame; it renders as a table because
# `spark.sql.repl.eagerEval.enabled` was turned on for the session.
so_df

song_id,title,duration,year,artist_id
SOKTJDS12AF72A25E5,Drown In My Own T...,192.522,0,ARA23XO1187B9AF18F
SOEKAZG12AB018837E,I'll Slap Your Fa...,129.85424,2001,ARSVTNL1187B992A91
SOAFBCP12A8C13CC7D,King Of Scurf (20...,301.40036,1972,ARTC1LV1187B9A4858
SORRNOC12AB017F52B,The Last Beat Of ...,337.81506,2004,ARSZ7L31187FB4E610
SOQPWCR12A6D4FB2A3,A Poor Recipe For...,118.07302,2005,AR73AIO1187B9AD57B
SODZYPO12A8C13A91E,Burn My Body (Alb...,177.99791,0,AR1C2IX1187B99BF74
SOBRKGM12A8C139EF6,Welcome to the Pl...,821.05424,1985,ARXQBR11187B98A2CC
SOERIDA12A6D4F8506,I Want You (Album...,192.28689,2006,ARBZIN01187FB362CC
SOAPERH12A58A787DC,The One And Only ...,230.42567,0,ARZ5H0P1187B98A1DD
SOSMJFC12A8C13DE0C,Is That All There...,343.87546,0,AR1KTV21187B9ACD72


## Load `artists` table

In [29]:
%%time
ar_df = spark.read.format("parquet").option("basePath", os.path.join(output_data, "artists/")).load(os.path.join(output_data, "artists/*"))

CPU times: user 0 ns, sys: 3.74 ms, total: 3.74 ms
Wall time: 5.27 s


In [30]:
# Preview the artists DataFrame; it renders as a table because
# `spark.sql.repl.eagerEval.enabled` was turned on for the session.
ar_df

artist_id,artist_name,artist_location,artist_latitude,artist_longitude
ARTC1LV1187B9A4858,The Bonzo Dog Band,Goldsmith's Colle...,51.4536,-0.01802
ARA23XO1187B9AF18F,The Smithereens,"Carteret, New Jersey",40.57885,-74.21956
AR73AIO1187B9AD57B,Western Addiction,"San Francisco, CA",37.77916,-122.42005
ARSVTNL1187B992A91,Jonathan King,"London, England",51.50632,-0.12714
AR10USD1187B99F3F1,Tweeterfriendly M...,"Burlington, Ontar...",,
ARZ5H0P1187B98A1DD,Snoop Dogg,"Long Beach, CA",33.76672,-118.1924
ARMJAGH1187FB546F3,The Box Tops,"Memphis, TN",35.14968,-90.04892
ARXQBR11187B98A2CC,Frankie Goes To H...,"Liverpool, England",,
ARCLYBR1187FB53913,Neal Schon,"San Mateo, CA",37.54703,-122.31483
ARY589G1187B9A9F4E,Talkdemonic,"Portland, OR",45.51179,-122.67563


## Load `users` table

In [23]:
%%time
us_df = spark.read.format("parquet").option("basePath", os.path.join(output_data, "users/")).load(os.path.join(output_data, "users/*"))

CPU times: user 4.43 ms, sys: 0 ns, total: 4.43 ms
Wall time: 12.4 s


In [24]:
# Preview the users DataFrame; it renders as a table because
# `spark.sql.repl.eagerEval.enabled` was turned on for the session.
us_df

user_id,first_name,last_name,gender,level
88,Mohammad,Rodriguez,M,free
75,Joseph,Gutierrez,M,free
69,Anabelle,Simpson,F,free
29,Jacqueline,Lynch,F,free
68,Jordan,Rodriguez,F,free
2,Jizelle,Benjamin,F,free
61,Samuel,Gonzalez,M,free
14,Theodore,Harris,M,free
40,Tucker,Garrison,M,free
52,Theodore,Smith,M,free


## Load `time_table` table

In [31]:
# Read the `time_table` parquet files from S3. The glob selects files two
# directory levels below `time_table/`; `basePath` is set to the table root.
# NOTE(review): presumably the two levels are year=/month= partition
# directories (both columns appear in the output) - confirm against the writer.
tm_df = (
    spark.read
    .format("parquet")
    .option("basePath", os.path.join(output_data, "time_table/"))
    .load(os.path.join(output_data, "time_table/*/*/"))
)

In [32]:
# Preview the time-table DataFrame; it renders as a table because
# `spark.sql.repl.eagerEval.enabled` was turned on for the session.
tm_df

ts,start_time,hour,day,week,weekday,year,month
1542299400796,2018-11-15 16:30:...,16,15,46,5,2018,11
1542302304796,2018-11-15 17:18:...,17,15,46,5,2018,11
1542313156796,2018-11-15 20:19:...,20,15,46,5,2018,11
1542318492796,2018-11-15 21:48:...,21,15,46,5,2018,11
1542828783796,2018-11-21 19:33:...,19,21,47,4,2018,11
1542834185796,2018-11-21 21:03:...,21,21,47,4,2018,11
1542842644796,2018-11-21 23:24:...,23,21,47,4,2018,11
1542175935796,2018-11-14 06:12:...,6,14,46,4,2018,11
1542193206796,2018-11-14 11:00:...,11,14,46,4,2018,11
1542215827796,2018-11-14 17:17:...,17,14,46,4,2018,11
