In [1]:
from pprint import pprint
import json

import pandas as pd

from aws_client.aws_client import AWS
from aws_client import utils

In [2]:
# Load the `sql` IPython extension; the %sql / %%sql magics used in later cells rely on it.
%load_ext sql

In [3]:
# Parse the warehouse settings from the project config file. Later cells look up
# REGION, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT and DWH_DB in `configs`.
configs = utils.parse_configs('../config/dwh.cfg')
# Credentials container; its 'KEY' and 'SECRET' entries are handed to the AWS client.
# NOTE(review): where get_secrets() loads these from is not visible in this notebook.
secrets = utils.get_secrets()

In [4]:
# Build the project's AWS client from the loaded credentials and settings.
# The full `configs` object is passed along in addition to the region.
aws = AWS(
    aws_access_key_id=secrets.get('KEY'),
    aws_secret_access_key=secrets.get('SECRET'),
    region=configs.get('REGION'),
    config_params=configs,
)

In [5]:
# Ask the AWS client for the data-warehouse endpoint; it becomes the host part of
# the connection string assembled in the next cell.
redshift_url = aws.get_dwh_endpoint()

In [6]:
conn_string = "postgresql://{}:{}@{}:{}/{}".format(configs["DWH_DB_USER"],
                                                   configs["DWH_DB_PASSWORD"],
                                                   redshift_url, 
                                                   configs["DWH_PORT"],
                                                   configs["DWH_DB"])

%sql $conn_string

'Connected: dwhuser@dwh'

In [7]:
%%sql songplays <<
-- Fetch every row of the songplays table into the local result variable `songplays`.
SELECT *
FROM songplays;

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
9957 rows affected.
Returning data to local variable songplays


In [33]:
%%sql artists <<
-- Fetch every row of the artists table into the local result variable `artists`.
SELECT *
FROM artists;

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
14896 rows affected.
Returning data to local variable artists


In [11]:
# Convert the `songplays` query result into a DataFrame for the analysis cells.
df_songplays = songplays.DataFrame()

In [34]:
# Convert the `artists` query result into a DataFrame; it is used to look up artist names.
df_artists = artists.DataFrame()

In [14]:
# Preview the first rows of the songplays data to check its columns and values.
df_songplays.head()

Unnamed: 0,songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
0,0,2018-11-26 07:00:07,49,paid,SOLJCCO12A6701F987,ARR6LWJ1187FB44C8B,930,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20...
1,13,2018-11-21 06:25:55,97,paid,SOFAMRP12AF72A069E,ARMI4NV1187B99D55D,797,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5..."
2,21,2018-11-09 00:42:13,42,paid,SOFAMRP12AF72A069E,ARMI4NV1187B99D55D,275,"New York-Newark-Jersey City, NY-NJ-PA","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebK..."
3,30,2018-11-14 07:57:38,80,paid,SOFWVCZ12A8C1462BF,ARNSMZT1187B98E003,548,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."
4,41,2018-11-05 00:33:12,69,free,SOBBGQK12AB0183F1E,AR4E4121187FB51F4E,256,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4..."


In [32]:
# Number of songplays for each value of `level`.
(
    df_songplays
    .groupby('level')[['songplay_id']]
    .count()
)

Unnamed: 0_level_0,songplay_id
level,Unnamed: 1_level_1
free,1756
paid,8201


In [28]:
# Ten most frequent user agents, ranked by number of songplays.
# NOTE(review): some user_agent values appear to carry literal surrounding double
# quotes, so quoted and unquoted variants are counted as distinct groups — confirm
# whether this should be cleaned upstream.
(
    df_songplays
    .groupby('user_agent')[['songplay_id']]
    .count()
    .sort_values(by='songplay_id', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,songplay_id
user_agent,Unnamed: 1_level_1
"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",1453
Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0,1037
"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2""",995
"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36""",855
"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/36.0.1985.125 Chrome/36.0.1985.125 Safari/537.36""",839
"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",691
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0,637
"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",607
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0,444
"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.77.4 (KHTML, like Gecko) Version/7.0.5 Safari/537.77.4""",426


In [79]:
# Ten most-played artists: count songplays per artist_id, keep the top ten, then
# attach the artist name from df_artists.
# NOTE(review): df_artists presumably holds more than one name for some artist_id
# values, so the left merge can return more than one row per artist — confirm
# whether names should be de-duplicated before merging.
(df_songplays[['songplay_id', 'artist_id']]
    .groupby('artist_id')
    .count()
    # After the groupby, artist_id is the index and the counts live in the
    # songplay_id column, so that is the column that has to be renamed.
    .rename(columns={'songplay_id': 'num_of_songplays'})
    .sort_values('num_of_songplays', ascending=False)
    .head(10)
    .merge(df_artists[['artist_id', 'name']],
           on='artist_id',
           how='left')
    .drop('artist_id', axis=1)
    .drop_duplicates()
    # Renumber the rows 0..n-1 without keeping the old index as a column.
    .reset_index(drop=True)
)

Unnamed: 0,songplay_id,name
0,245,Muse
1,240,Radiohead
2,232,Coldplay
3,220,Kings Of Leon
4,150,Alliance Ethnik
5,126,Foo Fighters
6,108,The Black Keys
7,105,Beastie Boys
8,105,The Beastie Boys
9,105,Jack Johnson
