In [1]:
import configparser


config = configparser.ConfigParser()
config.read('dwh.cfg')

# IAM_USER
KEY = config.get('IAM_USER', 'KEY')
SECRET = config.get('IAM_USER', 'SECRET')

# DATABASE
host = config.get('DATABASE', 'DB_ENDPOINT')
dbname = config.get('DATABASE', 'DB_NAME')
user = config.get('DATABASE', 'DB_USERNAME')
password = config.get('DATABASE', 'DB_PASSWORD')
port = config.get('DATABASE', 'DB_PORT')

# Have a look through the S3 buckets

In [2]:
import boto3
import json

In [3]:
s3 = boto3.resource('s3',
                    region_name='us-west-2',
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET)

bucket = s3.Bucket('udacity-dend')

## Log data

In [4]:
# Minus 1 because we will also count the folder log_data
count = -1
next_song_count = 0

for i, obj in enumerate(bucket.objects.filter(Prefix='log_data')):
    file_content = obj.get()['Body'].read().decode('utf-8')
    count += file_content.count('\n') + 1
    next_song_count += file_content.count('"page":"NextSong"')

# Note: We don't need to add 1 as it also counts for the folder log_data
print('The number of log files is: {}.'.format(i))
print('The number of events is: {}.'.format(count))
print('The number of NextSong events is: {}.'.format(next_song_count))

The number of log files is: 30.
The number of events is: 8056.
The number of NextSong events is: 6820.


## Log JSONPath

In [5]:
for i, obj in enumerate(bucket.objects.filter(Prefix='log_json_path')):
    file_content = obj.get()['Body'].read().decode('utf-8')
    print(file_content)

{
    "jsonpaths": [
        "$['artist']",
        "$['auth']",
        "$['firstName']",
        "$['gender']",
        "$['itemInSession']",
        "$['lastName']",
        "$['length']",
        "$['level']",
        "$['location']",
        "$['method']",
        "$['page']",
        "$['registration']",
        "$['sessionId']",
        "$['song']",
        "$['status']",
        "$['ts']",
        "$['userAgent']",
        "$['userId']"
    ]
}


## Song data

<font color='red'>***Note***: The following cell takes a very long time to run.

In [6]:
# for i, obj in enumerate(bucket.objects.filter(Prefix='song_data')):
#     pass

# print('The number of songs is: {}.'.format(i))

# Test queries on the database

In [7]:
%load_ext sql

In [8]:
conn_string = 'postgresql://{}:{}@{}:{}/{}'.format(user, password, host, port, dbname)
%sql $conn_string

'Connected: dwhuser@dwhdb'

In [9]:
%sql SELECT * FROM songplays LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
220,2018-11-27 18:52:58,28,free,SOEKSGJ12A67AE227E,ARQUMH41187B9AF699,270,"Portland-Vancouver-Hillsboro, OR-WA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
268,2018-11-28 18:48:55,24,paid,SOZQSGL12AF72A9145,AR050VJ1187B9B13A7,984,"Lake Havasu City-Kingman, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
8,2018-11-29 00:40:14,24,paid,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,984,"Lake Havasu City-Kingman, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
16,2018-11-21 12:10:49,15,paid,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,764,"Chicago-Naperville-Elgin, IL-IN-WI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""
96,2018-11-07 13:54:52,23,free,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,177,"Raleigh, NC","""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_1 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D201 Safari/9537.53"""


In [10]:
%sql SELECT * FROM songplays where song_id is NOT NULL and artist_id is NOT NULL LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
52,2018-11-01 21:11:13,8,free,SOEIQUY12AF72A086A,ARHUC691187B9AD27F,139,"Phoenix-Mesa-Scottsdale, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"""
140,2018-11-29 08:23:09,98,free,SOYJPKO12A6D4FDCEA,ARC3KQN1187FB54018,865,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
236,2018-11-06 08:49:19,12,free,SOBJDDA12A6BD53159,ARCS4GZ1187FB469EB,300,"New York-Newark-Jersey City, NY-NJ-PA",Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20100101 Firefox/31.0
392,2018-11-12 21:22:01,12,free,SOARUPP12AB01842E0,ARD46C811C8A414F3F,371,"New York-Newark-Jersey City, NY-NJ-PA",Mozilla/5.0 (Windows NT 6.1; rv:31.0) Gecko/20100101 Firefox/31.0
39,2018-11-12 11:45:25,26,free,SOBONKR12A58A7A7E0,AR5E44Z1187B9A1D74,491,"San Jose-Sunnyvale-Santa Clara, CA","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36"""


In [36]:
%sql SELECT COUNT(*) FROM songplays;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
319


In [43]:
%%sql
SELECT COUNT(*) FROM
(SELECT DISTINCT se.*
FROM staging_events AS se JOIN staging_songs AS ss
ON se.artist = ss.artist_name AND se.song = ss.title AND se.length = ss.duration
WHERE se.page = 'NextSong');

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
319


In [46]:
%sql SELECT * FROM staging_songs LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
1,ARGS47D1187FB40225,,,"London, England",Peter And Gordon,SODIWID12A6D4F803A,Lucille (1999 Digital Remaster) (Stereo),127.97342,0
1,ARF2EHS1187B994F4E,36.16778,-86.77836,"Nashville, Tennessee",Kings Of Leon,SOLGHDZ12AB0183B11,Wicker Chair,187.92444,2003
1,ARZPYZ11187FB4938A,,,,MC Esoteric,SOJYWLL12AB018A7AD,Waxing on Decepticon,119.53587,0
1,ARYO9BU1187B9ADA88,,,,Porcupine Tree,SOJEVZW12A8C133988,Waiting,416.80934,1997
1,ARE4SDM1187FB4D7E4,45.51228,-73.55439,"Montreal, Quebec, Canada",David Wilcox,SOKNGDE12AB017CA4D,Step Into Your Skin,139.72853,0


In [12]:
%sql SELECT * FROM users LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


user_id,first_name,last_name,gender,level
45,Dominick,Norris,M,free
13,Ava,Robinson,F,free
41,Brayden,Clark,M,free
80,Tegan,Levine,F,paid
88,Mohammad,Rodriguez,M,paid


In [13]:
%sql SELECT count(*) FROM users;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
96


In [14]:
%sql SELECT * FROM songs LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


song_id,title,artist_id,year,duration
SOLGHDZ12AB0183B11,Wicker Chair,ARF2EHS1187B994F4E,2003,187.92444
SOYWOHR12A6D4FC2B2,The big fight,ARK885Q1187FB5398E,2004,321.82812
SOGRZFG12A8C135B63,The Crazies,ARPCE321187B9B94C9,1990,174.05342
SOWCEQL12AF729E70C,Close To You,ARWPYQI1187FB4D55A,2005,184.55465
SOBAZOE12A81C2098C,Stick With Me,ARRJB8U1187FB3B787,2007,226.53342


In [15]:
%sql SELECT count(*) FROM songs;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
14896


In [16]:
%sql SELECT * FROM artists LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


artist_id,name,location,latitude,longitude
ARE4SDM1187FB4D7E4,David Wilcox,"Montreal, Quebec, Canada",45.51228,-73.55439
AR6ZBSM1187B990030,Refused,Sweden,62.19845,17.55142
ARB3TCS1187B99A0D2,Ornette Coleman,"Fort Worth, TX",32.74863,-97.32925
AREEAVN1187FB5BC58,The Feeling,"London, England",,
ARU6Y2R1187B9AD7B0,Apurimac,,,


In [17]:
%sql SELECT count(*) FROM artists;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
10025


In [18]:
%sql SELECT * FROM time LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


start_time,hour,day,week,month,year,weekday
2018-11-04 07:31:31,7,4,44,11,2018,0
2018-11-29 16:05:40,16,29,48,11,2018,4
2018-11-15 14:53:43,14,15,46,11,2018,4
2018-11-13 18:26:40,18,13,46,11,2018,2
2018-11-15 19:06:33,19,15,46,11,2018,4


In [19]:
%sql SELECT count(*) FROM time;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
1 rows affected.


count
319


In [20]:
%sql SELECT location, COUNT(*) FROM songplays GROUP BY location ORDER BY COUNT(*) DESC LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


location,count
"San Francisco-Oakland-Hayward, CA",41
"Portland-South Portland, ME",31
"Lansing-East Lansing, MI",28
"Waterloo-Cedar Falls, IA",20
"Tampa-St. Petersburg-Clearwater, FL",18


In [21]:
%sql SELECT user_id, first_name, last_name, gender, level, COUNT(*) FROM songplays NATURAL JOIN users GROUP BY user_id, first_name, last_name, gender, level ORDER BY COUNT(*) DESC LIMIT 5;

 * postgresql://dwhuser:***@dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com:5439/dwhdb
5 rows affected.


user_id,first_name,last_name,gender,level,count
49,Chloe,Cuevas,F,paid,39
80,Tegan,Levine,F,paid,31
97,Kate,Harrell,F,paid,28
44,Aleena,Kirby,F,paid,20
73,Jacob,Klein,M,paid,18
