**Instructions**

1. Run the ETL with `python etl.py`
2. Run this Jupyter notebook to check the Parquet files

In [5]:
from utils import create_spark_session, read_config
from IPython.display import display

# Path of the configuration file that is passed to `read_config`.
config_file = 'dl.cfg'
# Name of the config section whose `output_path` this notebook inspects.
src = 'local'

In [3]:
# Create the Spark session used to read the tables, with S3 input switched off.
# NOTE(review): presumably this should agree with the 'local' source selected in
# `src` — confirm against `utils.create_spark_session`.
spark = create_spark_session(is_s3_input=False)

[2020-05-17 23:56:50,922] INFO - Running `create_spark_session`...
[2020-05-17 23:56:53,725] INFO - `create_spark_session` finished!


In [6]:
# Load the settings from `config_file`.
# NOTE(review): presumably a configparser-style mapping of section -> key -> value;
# confirm in `utils.read_config`.
config = read_config(config_file)
# Location prefix for the selected source. The table-reading code appends
# '<table>.parquet/' directly to this value, without inserting a separator.
output_path = config[src]['output_path']

# First few rows, schema, and row counts for each table

In [10]:
# (table name, primary-key column) for every table to inspect, in display order.
tables = (
    ('artists', 'artist_id'),
    ('songs', 'song_id'),
    ('time', 'start_time'),
    ('users', 'user_id'),
    ('songplays', 'songplay_id'),
)
separator = 80 * '-'
row_summary = 'Number of rows: {:,d} (all), {:,d} (unique values in primary key)'

for table_name, key_column in tables:
    print(separator)
    print(table_name)

    # Each table is read from `<output_path><table>.parquet/`.
    table_df = spark.read.parquet(f'{output_path:s}{table_name:s}.parquet/')

    # Total rows vs. distinct key values; the two numbers match when the key
    # column holds no duplicates.
    total_rows = table_df.count()
    unique_keys = table_df.select(key_column).distinct().count()
    print(row_summary.format(total_rows, unique_keys))

    # Preview up to ten rows as a pandas frame, then print the Spark schema.
    preview = table_df.limit(10).toPandas()
    display(preview)
    table_df.printSchema()

--------------------------------------------------------------------------------
artists
Number of rows: 69 (all), 69 (unique values in primary key)


Unnamed: 0,artist_id,name,location,latitude,longitude
0,ARMAC4T1187FB3FA4C,The Dillinger Escape Plan,"Morris Plains, NJ",40.82624,-74.47995
1,ARNF6401187FB57032,Sophie B. Hawkins,"New York, NY [Manhattan]",40.79086,-73.96644
2,AROUOZZ1187B9ABE51,Willie Bobo,"New York, NY [Spanish Harlem]",40.79195,-73.94512
3,ARI2JSK1187FB496EF,Nick Ingman;Gavyn Wright,"London, England",51.50632,-0.12714
4,AREBBGV1187FB523D2,Mike Jones (Featuring CJ_ Mello & Lil' Bran),"Houston, TX",,
5,ARD842G1187B997376,Blue Rodeo,"Toronto, Ontario, Canada",43.64856,-79.38533
6,AR9AWNF1187B9AB0B4,Kenny G featuring Daryl Hall,"Seattle, Washington USA",,
7,ARIG6O41187B988BDD,Richard Souther,United States,37.16793,-95.84502
8,ARGSJW91187B9B1D6B,JennyAnyKind,North Carolina,35.21962,-80.01955
9,AR3JMC51187B9AE49D,Backstreet Boys,"Orlando, FL",28.53823,-81.37739


root
 |-- artist_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)

--------------------------------------------------------------------------------
songs
Number of rows: 71 (all), 71 (unique values in primary key)


Unnamed: 0,song_id,title,duration,year,artist_id
0,SOAOIBZ12AB01815BE,I Hold Your Hand In Mine [Live At Royal Albert...,43.36281,2000,ARPBNLO1187FB3D52F
1,SONYPOM12A8C13B2D7,I Think My Wife Is Running Around On Me (Taco ...,186.48771,2005,ARDNS031187B9924F0
2,SODREIN12A58A7F2E5,A Whiter Shade Of Pale (Live @ Fillmore West),326.00771,0,ARLTWXK1187FB5A3F8
3,SOYMRWW12A6D4FAB14,The Moon And I (Ordinary Day Album Version),267.7024,0,ARKFYS91187B98E58F
4,SOWQTQZ12A58A7B63E,Streets On Fire (Explicit Album Version),279.97995,0,ARPFHN61187FB575F6
5,SOUDSGM12AC9618304,Insatiable (Instrumental Version),266.39628,0,ARNTLGG11E2835DDB9
6,SOPEGZN12AB0181B3D,Get Your Head Stuck On Your Neck,45.66159,0,AREDL271187FB40F44
7,SOOLYAZ12A6701F4A6,Laws Patrolling (Album Version),173.66159,0,AREBBGV1187FB523D2
8,SOBBUGU12A8C13E95D,Setting Fire to Sleeping Giants,207.77751,2004,ARMAC4T1187FB3FA4C
9,SOBAYLL12A8C138AF9,Sono andati? Fingevo di dormire,511.16363,0,ARDR4AC1187FB371A1


root
 |-- song_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- artist_id: string (nullable = true)

--------------------------------------------------------------------------------
time
Number of rows: 6,813 (all), 6,813 (unique values in primary key)


Unnamed: 0,start_time,hour,day,week,weekday,year,month
0,2018-11-15 08:52:20,8,15,46,5,2018,11
1,2018-11-15 13:52:55,13,15,46,5,2018,11
2,2018-11-21 07:56:03,7,21,47,4,2018,11
3,2018-11-21 09:34:42,9,21,47,4,2018,11
4,2018-11-21 17:08:21,17,21,47,4,2018,11
5,2018-11-21 20:11:16,20,21,47,4,2018,11
6,2018-11-14 02:51:03,2,14,46,4,2018,11
7,2018-11-28 03:13:38,3,28,48,4,2018,11
8,2018-11-28 06:18:57,6,28,48,4,2018,11
9,2018-11-28 14:57:00,14,28,48,4,2018,11


root
 |-- start_time: timestamp (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- week: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

--------------------------------------------------------------------------------
users
Number of rows: 96 (all), 96 (unique values in primary key)


Unnamed: 0,user_id,first_name,last_name,gender,level
0,88,Mohammad,Rodriguez,M,free
1,4,Alivia,Terrell,F,free
2,55,Martin,Johnson,M,free
3,59,Lily,Cooper,F,free
4,69,Anabelle,Simpson,F,free
5,53,Celeste,Williams,F,free
6,75,Joseph,Gutierrez,M,free
7,29,Jacqueline,Lynch,F,free
8,32,Lily,Burns,F,free
9,60,Devin,Larson,M,free


root
 |-- user_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- level: string (nullable = true)

--------------------------------------------------------------------------------
songplays
Number of rows: 1 (all), 1 (unique values in primary key)


Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent,year,month
0,0,2018-11-21 19:56:47,15,paid,SOZCTXZ12AB0182364,AR5KOSW1187FB35FF4,818,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",2018,11


root
 |-- songplay_id: long (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- user_id: string (nullable = true)
 |-- level: string (nullable = true)
 |-- song_id: string (nullable = true)
 |-- artist_id: string (nullable = true)
 |-- session_id: long (nullable = true)
 |-- location: string (nullable = true)
 |-- user_agent: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

