# Test Sparkify's S3 to Redshift ETL Notebook

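This notebook is a sanity check for the S3-to-Redshift ETL process. We connect to the Redshift cluster and run several queries against the staging and final tables to confirm that the data was loaded correctly. Note that the last section drops all tables, so run it only once you are done inspecting the results.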

### Import Libraries

In [1]:
from configparser import ConfigParser
import psycopg2
import psycopg2.extras
import pandas as pd

from utils import *

### Connect to Database

In [2]:
# Load the cluster settings from dwh.cfg and connect to Redshift.
config = ConfigParser()
# Use a context manager so the file handle is closed once the config is parsed.
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# NOTE: the values are consumed positionally, so the [CLUSTER] section must
# list host, dbname, user, password and port in exactly that order.
cluster_values = list(config['CLUSTER'].values())
dsn_template = "host={} dbname={} user={} password={} port={}"
conn_string = dsn_template.format(*cluster_values)
conn = psycopg2.connect(conn_string)

# Echo the connection target for debugging, but never the password: cell output
# is saved inside the notebook and travels wherever the notebook is shared.
redacted_values = list(cluster_values)
redacted_values[3] = '********'
print(dsn_template.format(*redacted_values))

host=redshiftdwh.c0fxig4ucntn.us-west-2.redshift.amazonaws.com dbname=dev user=awsuser password=******** port=5439


### Test SQL Queries

In [3]:
# Use a DictCursor so every fetched row can be addressed by column name as well
# as by position; that keeps the column names available when the results are
# turned into a pandas DataFrame. Approach taken from:
# https://stackoverflow.com/questions/35604186/convert-psycopg2-dictrow-query-to-pandas-dataframe
dict_cursor_factory = psycopg2.extras.DictCursor
cur = conn.cursor(cursor_factory=dict_cursor_factory)

In [4]:
# Sanity check on the staged events: count the rows for each page type.
page_counts_sql = """
    SELECT page, count(*)
    FROM staging_events
    GROUP BY page
"""

execute_query(query=page_counts_sql, cur=cur)

Unnamed: 0,page,count
0,Login,92
1,Home,806
2,NextSong,6820
3,Downgrade,60
4,Settings,56
5,Save Settings,10
6,Logout,90
7,Help,47
8,Error,9
9,Upgrade,21


In [5]:
# Peek at five rows of staging_songs. There is no ORDER BY, so which five rows
# come back is arbitrary.
staging_songs_sample_sql = """
    SELECT * FROM staging_songs LIMIT 5;
"""

execute_query(query=staging_songs_sample_sql, cur=cur)

Unnamed: 0,num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
0,1,ARCLYBR1187FB53913,37.54703,-122.31483,"San Mateo, CA",Neal Schon,SOOVHYF12A8C134892,I'll Be Waiting,304.56118,1989
1,1,ARCZBX11187B9AD089,,,,Suicide Commando,SOVROYO12AB0186D09,Blood In Face,287.29424,2000
2,1,AR9E1QW1187B999A34,,,,Mikel Erentxun,SOSBVKG12A8C1409B8,En Que Mujer,210.33751,2000
3,1,ARZJDBC1187FB52056,27.94017,-82.32547,"Brandon, Florida",Nasty Savage,SORUZFR12AB01866C1,Hypnotic Trance,280.37179,1987
4,1,ARPQ4Z01187FB3A736,29.42449,-98.49462,"San Antonio, TX",Butthole Surfers,SOJXVAH12A8C139700,"Barking Dogs (From ""Piouhgd"")",450.84689,0


In [6]:
# Peek at five rows of the songplays table. There is no ORDER BY, so which five
# rows come back is arbitrary.
songplays_sample_sql = """
    SELECT * FROM songplays LIMIT 5
"""

execute_query(query=songplays_sample_sql, cur=cur)

Unnamed: 0,songplay_id,timestamp,song_id,artist_id,user_id,session_id,level,location,user_agent
0,18,1541106673796,SOEIQUY12AF72A086A,ARHUC691187B9AD27F,8,139,free,"Phoenix-Mesa-Scottsdale, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK..."
1,184,1541179905796,SOHTKMO12AB01843B0,AR5EYTL1187B98EDA0,10,182,free,"Washington-Arlington-Alexandria, DC-VA-MD-WV","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
2,171,1541267941796,SOHTKMO12AB01843B0,AR5EYTL1187B98EDA0,15,199,paid,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5..."
3,353,1541273619796,SOYTFSY12A6D4FD84E,ARRFSMX1187FB39B03,95,152,paid,"Winston-Salem, NC","""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like..."
4,90,1541348754796,SOARUPP12AB01842E0,ARD46C811C8A414F3F,69,235,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."


In [7]:
# Top five artists by number of song plays, most played first.
# songplays is joined to artists on artist_id to pick up the artist name.
top_artists_sql = """
SELECT
    a.artist_name,
    a.artist_id,
    count(*) AS count
FROM songplays sp
JOIN artists a 
ON sp.artist_id = a.artist_id
GROUP BY a.artist_id, a.artist_name
ORDER BY count DESC
LIMIT 5
"""

execute_query(query=top_artists_sql, cur=cur)

Unnamed: 0,artist_name,artist_id,count
0,Dwight Yoakam,AR5E44Z1187B9A1D74,37
1,Kid Cudi / Kanye West / Common,ARD46C811C8A414F3F,10
2,Ron Carter,AR37SX11187FB3E164,9
3,Lonnie Gordon,AR5EYTL1187B98EDA0,9
4,B.o.B,ARKQQZA12086C116FC,8


In [8]:
# Top five users by average session duration, longest first.
# The inner query measures each (session_id, user_id) pair as
# max(timestamp) - min(timestamp); the outer query averages those per user.
# NOTE(review): timestamp looks like epoch milliseconds, so durations are
# presumably in milliseconds -- confirm against the table definition.
mean_session_duration_sql = """
SELECT
    avg(sessions.duration) AS mean_duration,
    user_id
FROM (
    SELECT
        max(timestamp) - min(timestamp) AS duration,
        session_id,
        user_id
    FROM songplays
    GROUP BY session_id, user_id
) sessions
GROUP BY user_id
ORDER BY mean_duration DESC
LIMIT 5
"""

execute_query(query=mean_session_duration_sql, cur=cur)

Unnamed: 0,mean_duration,user_id
0,9810000,30
1,8204400,24
2,5919000,72
3,5491454,97
4,5082000,15


### Drop All Tables and Disconnect from Redshift

In [9]:
# WARNING: destructive. This cell removes the tables the checks above queried,
# so the ETL has to be run again before this notebook can be re-executed.
# The helpers come from utils and run in this order: drop_tables first, then
# delete_staging_tables.
for teardown_step in (drop_tables, delete_staging_tables):
    teardown_step(cur, conn)

In [10]:
# Release the cursor explicitly before disconnecting, and make sure the
# connection is closed even if closing the cursor raises.
# NOTE(review): closing a psycopg2 connection discards uncommitted work; this
# assumes the helpers in the previous cell already committed -- confirm.
try:
    cur.close()
finally:
    conn.close()