### Test Queries for ETL Pipeline - Sparkify Database

This notebook connects to the Sparkify database created on the Redshift cluster and runs some simple queries against the tables that were built.

In [1]:
import configparser

In [2]:
# Parse the connection settings from dwh.cfg (resolved relative to the
# notebook's working directory). The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

In [None]:
# Pull the connection settings out of the parsed dwh.cfg.
# HOST is read from the [CLUSTER] section; everything else comes from [DWH].
DWH_HOST               = config.get("CLUSTER","HOST")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

# Echo the non-secret settings as a sanity check.
# The password is deliberately NOT printed: cell output is saved with the
# notebook and would expose the credential to anyone who can read the file.
print(DWH_HOST)
print(DWH_DB_USER)
print(DWH_PORT)
print(DWH_DB)

In [None]:
# connect to cluster - Sparkify Database
import os
from urllib.parse import quote_plus

# Build the database URL consumed by the %sql magic in a later cell.
# The password is percent-encoded so that characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters; passwords without special characters
# are left unchanged by the encoding.
conn_string = "postgresql://{}:{}@{}:{}/{}".format(
    DWH_DB_USER, quote_plus(DWH_DB_PASSWORD), DWH_HOST, DWH_PORT, DWH_DB
)

# Show the target with the password masked -- the real conn_string must never
# be printed, because cell output is stored inside the notebook file.
print("postgresql://{}:***@{}:{}/{}".format(DWH_DB_USER, DWH_HOST, DWH_PORT, DWH_DB))

In [6]:
# Load the `sql` IPython extension; the %sql / %%sql magics in the cells below rely on it.
%load_ext sql

Connect to the Database

In [7]:
# Hand the value of conn_string (built in an earlier cell) to the %sql magic to open the connection.
%sql $conn_string 

-----------------
#### Selection 1 - Simple queries that check against each of the 5 analytics tables

In [8]:
%%sql
SELECT
    COUNT(*) AS records_
FROM
    songplays;

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
1 rows affected.


records_
7154


In [9]:
%%sql
SELECT
    *
FROM
    songplays
LIMIT 3;

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
3 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
387,2018-11-02 01:34:17,83,free,,,82,"Lubbock, TX","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
306,2018-11-02 09:26:49,15,paid,,,172,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
411,2018-11-02 10:02:20,15,paid,,,172,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""


In [10]:
%%sql
SELECT
    *
FROM
    users
LIMIT 3;

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
3 rows affected.


user_id,first_name,last_name,gender,level
24,Layla,Griffin,F,paid
26,Ryan,Smith,M,free
28,Brantley,West,M,free


In [11]:
%%sql
SELECT
    *
FROM
    songs
LIMIT 3;

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
3 rows affected.


song_id,title,artist_id,year,duration
SOAACTC12AB0186A20,Christmas Is Coming Soon,ARXWFZ21187FB43A0B,2008,180.76689
SOAADJH12AB018BD30,Black Light (Album Version),AR3FKJ61187B990357,1975,385.90649
SOAAFUV12AB018831D,Where Do The Children Play? (LP Version),AR5ZGC11187FB417A3,0,216.05832


In [12]:
%%sql
SELECT
    COUNT(*) AS artists_records
FROM
    artists;

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
1 rows affected.


artists_records
9670


In [13]:
%%sql
SELECT
    *
FROM
    time
LIMIT 3;

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
3 rows affected.


start_time,hour,day,week,month,year,weekday
2018-11-01 20:57:10,20,1,44,11,2018,4
2018-11-01 21:01:46,21,1,44,11,2018,4
2018-11-01 21:02:12,21,1,44,11,2018,4


-----------------
#### Selection 2 - Queries that involve joins across the analytics tables

*Where the value of user_id is 8, identify what songs have been listened to, who the artists are, and what weekday the songplay was on*

In [14]:
%%sql
SELECT
    u.user_id,
    u.first_name,
    u.last_name,
    u.level,
    sp.songplay_id,
    sp.song_id,
    sp.artist_id,
    sp.start_time,
    ar.name AS Artist_Name,
    t.weekday
FROM users AS u
LEFT JOIN songplays AS sp
    ON u.user_id = sp.user_id
LEFT JOIN artists AS ar
    ON sp.artist_id = ar.artist_id
LEFT JOIN time AS t
    ON sp.start_time = t.start_time
WHERE u.user_id = '8'
  AND sp.song_id IS NOT NULL

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
3 rows affected.


user_id,first_name,last_name,level,songplay_id,song_id,artist_id,start_time,artist_name,weekday
8,Kaylee,Summers,free,647,SOEIQUY12AF72A086A,ARHUC691187B9AD27F,2018-11-01 21:11:13,The Mars Volta,4
8,Kaylee,Summers,free,5421,SOWEUOO12A6D4F6D0C,ARQUMH41187B9AF699,2018-11-27 04:25:00,Linkin Park,2
8,Kaylee,Summers,free,4271,SOWTZNU12AB017EADB,AR6NYHH1187B9BA128,2018-11-07 01:42:43,Yeah Yeah Yeahs,3


*Identify how many times the artist "Linkin Park" was played during Q4 2018*

In [15]:
%%sql
SELECT
    ar.name AS Artist,
    COUNT(sp.songplay_id) AS number_of_plays_Q42018
FROM artists AS ar
LEFT JOIN songplays AS sp
    ON ar.artist_id = sp.artist_id
INNER JOIN time AS t
    ON sp.start_time = t.start_time
WHERE ar.name = 'Linkin Park'
  AND t.year = '2018'
  AND t.month IN ('10', '11', '12')
GROUP BY ar.name

 * postgresql://sparkifyadmin:***@sparkify1.cms6ffzqbc3y.us-west-2.redshift.amazonaws.com:5439/sparkify
1 rows affected.


artist,number_of_plays_q42018
Linkin Park,4


End of Tests