In [1]:
import configparser


# Load the AWS credentials and the database connection settings from dwh.cfg.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did parse, so fail here with a clear message instead of hitting
# a NoSectionError on the first lookup below.
if not config.read('dwh.cfg'):
    raise FileNotFoundError("Could not read configuration file 'dwh.cfg'")

# IAM_USER: access key pair used for the S3 calls
KEY = config.get('IAM_USER', 'KEY')
SECRET = config.get('IAM_USER', 'SECRET')

# DATABASE: pieces of the connection URL (all values are read as strings)
host = config.get('DATABASE', 'DB_ENDPOINT')
dbname = config.get('DATABASE', 'DB_NAME')
user = config.get('DATABASE', 'DB_USERNAME')
password = config.get('DATABASE', 'DB_PASSWORD')
port = config.get('DATABASE', 'DB_PORT')

# Have a look through the S3 buckets

In [2]:
import boto3
import json

In [3]:
# Region and credentials (read from dwh.cfg) for the S3 resource.
s3_settings = {
    'region_name': 'us-west-2',
    'aws_access_key_id': KEY,
    'aws_secret_access_key': SECRET,
}
s3 = boto3.resource('s3', **s3_settings)

# Handle to the bucket that the following cells inspect.
bucket = s3.Bucket('udacity-dend')

## Log data

In [4]:
# Count the log files, the events they contain and how many of those events
# are NextSong events, for every object stored under the log_data prefix.
log_file_count = 0
count = 0
next_song_count = 0

for obj in bucket.objects.filter(Prefix='log_data'):
    # The listing also returns the log_data folder placeholder. Skip it
    # explicitly rather than compensating with -1 offsets, so the totals stay
    # right even when no placeholder is listed.
    # NOTE(review): assumes the placeholder key ends with '/' — confirm.
    if obj.key.endswith('/'):
        continue

    file_content = obj.get()['Body'].read().decode('utf-8')
    log_file_count += 1
    # One event per non-blank line, so a trailing newline at the end of a file
    # does not add a phantom event.
    count += sum(1 for line in file_content.split('\n') if line.strip())
    next_song_count += file_content.count('"page":"NextSong"')

print('The number of log files is: {}.'.format(log_file_count))
print('The number of events is: {}.'.format(count))
print('The number of NextSong events is: {}.'.format(next_song_count))

The number of log files is: 30.
The number of events is: 8056.
The number of NextSong events is: 6820.


## Log JSONPath

In [5]:
# Print the decoded contents of every object stored under the log_json_path
# prefix.
json_path_objects = bucket.objects.filter(Prefix='log_json_path')
for i, obj in enumerate(json_path_objects):
    body = obj.get()['Body']
    file_content = body.read().decode('utf-8')
    print(file_content)

{
    "jsonpaths": [
        "$['artist']",
        "$['auth']",
        "$['firstName']",
        "$['gender']",
        "$['itemInSession']",
        "$['lastName']",
        "$['length']",
        "$['level']",
        "$['location']",
        "$['method']",
        "$['page']",
        "$['registration']",
        "$['sessionId']",
        "$['song']",
        "$['status']",
        "$['ts']",
        "$['userAgent']",
        "$['userId']"
    ]
}


## Song data

<font color='red'>***Note***: The following cell takes a very long time to run, so its code is left commented out.</font>

In [6]:
# Disabled on purpose: iterating over every object under the song_data prefix
# takes a very long time. Uncomment the lines below to run it.
# for i, obj in enumerate(bucket.objects.filter(Prefix='song_data')):
#     pass

# Note: i is the last enumerate index, i.e. one less than the number of
# objects listed.
# print('The number of songs is: {}.'.format(i))

# Test queries on the database

## Setup SQL connection

In [7]:
from sql_queries import datamart_schema_search_path

In [8]:
# Load the `sql` IPython extension that provides the %sql magic used in the
# following cells.
%load_ext sql

In [9]:
conn_string = 'postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, dbname)
%sql $conn_string

'Connected: dwhuser@dwhdb'

In [10]:
# Run the statement imported from sql_queries; judging by its name it sets the
# schema search path for this connection — TODO confirm in sql_queries.py.
%sql $datamart_schema_search_path

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
Done.


[]

## Start testing

In [11]:
# Peek at five songplays rows. There is no ORDER BY, so which rows come back
# is not guaranteed.
%sql SELECT * FROM songplays LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
17,2018-11-04 09:19:03,44,paid,SOCSXKQ12A6D4F95A0,ARRE7IQ1187FB4CF13,196,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
193,2018-11-05 02:09:47,44,paid,SOUNZHU12A8AE47481,AR37SX11187FB3E164,237,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
201,2018-11-05 14:24:56,44,paid,SOTETAR12AF72A5FF7,AROR8OB1187FB50D6A,269,"Waterloo-Cedar Falls, IA",Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0
249,2018-11-23 18:11:01,86,free,SOQDMXT12A6D4F8255,ART5MUE1187B98C961,869,"La Crosse-Onalaska, WI-MN","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
281,2018-11-26 00:52:05,33,free,SONQEAO12A6D4F8CB3,AR7S2271187FB38B1F,827,"Eugene, OR","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""


In [12]:
%sql SELECT * FROM songplays where song_id is NOT NULL and artist_id is NOT NULL LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
57,2018-11-21 01:05:29,97,paid,SORTFPF12A81C2171A,ARY5UO61187FB5271F,671,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
121,2018-11-05 17:54:12,25,paid,SOKOGIP12AB0182FCD,AROS1ML1187FB4CF35,231,"Marinette, WI-MI","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
129,2018-11-24 17:14:29,29,paid,SOHKKXU12A67ADA08F,ARMRCET1187FB52049,898,"Atlanta-Sandy Springs-Roswell, GA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
137,2018-11-13 17:47:05,29,paid,SOJWFXM12A3F1EBE8B,AR049S81187B9AE8A5,486,"Atlanta-Sandy Springs-Roswell, GA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""
153,2018-11-13 14:24:57,29,paid,SOQHFWA12AAA8C6662,ARWAQTR1187FB38810,486,"Atlanta-Sandy Springs-Roswell, GA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2"""


In [13]:
# Total number of rows in songplays.
%sql SELECT COUNT(*) FROM songplays;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
319


In [14]:
# Peek at five users rows (unordered sample).
%sql SELECT * FROM users LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


user_id,first_name,last_name,gender,level
39,Walter,Frye,M,free
60,Devin,Larson,M,free
33,Bronson,Harris,M,free
85,Kinsley,Young,F,paid
101,Jayden,Fox,M,free


In [15]:
%sql SELECT count(*) FROM users;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
96


In [16]:
# Peek at five songs rows (unordered sample).
%sql SELECT * FROM songs LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


song_id,title,artist_id,year,duration
SOQPWCR12A6D4FB2A3,A Poor Recipe For Civic Cohesion,AR73AIO1187B9AD57B,2005,118.07302
SOQYORG12AC3DF81B4,Rainbow Yoshi,ARP7AEC1187B98B4B2,2008,148.61016
SOIHDHD12AB017C6A3,The Best Summer Ever,AR5AMEV1187FB563FA,2006,283.81995
SOKTLGM12A6D4FA4FD,All About Money (Featuring Skip),ARTVUER1187FB3905B,2002,280.78975
SOLLSWW12A8C1433F5,Me & My Sister,ARYIWF11187FB53F2D,2004,213.05424


In [17]:
%sql SELECT count(*) FROM songs;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
14896


In [18]:
# Peek at five artists rows (unordered sample).
%sql SELECT * FROM artists LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


artist_id,name,location,latitude,longitude
ARC6UC81187B989062,Hem,NY - New York City,40.71455,-74.00712
ARBDGN21187FB4C201,Weatherbox,"San Diego, CA",,
ARDNW1B1187FB4ABBB,Chéco Feliciano And Joe King,,,
ARXP7MQ1187FB3AC5F,Dave Hollister,LOWELL,,
ARIAFCM1187B9A104F,Tenth Avenue North,"West Palm Beach, Florida",26.71438,-80.05269


In [19]:
%sql SELECT count(*) FROM artists;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
10025


In [20]:
# Peek at five time rows (unordered sample).
%sql SELECT * FROM time LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


start_time,hour,day,week,month,year,weekday
2018-11-17 16:42:29,16,17,46,11,2018,6
2018-11-05 11:08:56,11,5,45,11,2018,1
2018-11-26 13:47:35,13,26,48,11,2018,1
2018-11-17 15:41:21,15,17,46,11,2018,6
2018-11-24 13:55:51,13,24,47,11,2018,6


In [21]:
%sql SELECT count(*) FROM time;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
319


In [22]:
# The five locations with the most songplays, busiest first.
%sql SELECT location, COUNT(*) FROM songplays GROUP BY location ORDER BY COUNT(*) DESC LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


location,count
"San Francisco-Oakland-Hayward, CA",41
"Portland-South Portland, ME",31
"Lansing-East Lansing, MI",28
"Waterloo-Cedar Falls, IA",20
"Tampa-St. Petersburg-Clearwater, FL",18


In [23]:
%sql SELECT user_id, first_name, last_name, gender, level, COUNT(*) FROM songplays NATURAL JOIN users GROUP BY user_id, first_name, last_name, gender, level ORDER BY COUNT(*) DESC LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


user_id,first_name,last_name,gender,level,count
49,Chloe,Cuevas,F,paid,39
80,Tegan,Levine,F,paid,31
97,Kate,Harrell,F,paid,28
44,Aleena,Kirby,F,paid,20
73,Jacob,Klein,M,paid,18
