In [17]:
#import necessary libraries
import pandas as pd
import boto3
import json

# Connect to Redshift cluster
## Load DWH Params from a file

In [3]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")

DWH_IAM_ROLE_NAME      = config.get("DWH", "DWH_IAM_ROLE_NAME")

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

pd.DataFrame({"Param":
                  ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
              "Value":
                  [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,dwhCluster
4,DWH_DB,dev
5,DWH_DB_USER,dwhadmin
6,DWH_DB_PASSWORD,Pika1324_
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,dwhRole


## Create clients for IAM, EC2, S3 and Redshift

In [4]:
args = {
    "region_name": "us-west-2",
    "aws_access_key_id": KEY,
    "aws_secret_access_key": SECRET
}

ec2 = boto3.resource('ec2', **args)
s3 = boto3.resource('s3', **args)
iam = boto3.client('iam', **args)
redshift = boto3.client('redshift', **args)

## Connect to the cluster

In [5]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhadmin
4,DBName,dev
5,Endpoint,"{'Address': 'dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-0e131f9e92b1f31c5
7,NumberOfNodes,4


In [6]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::385815011402:role/dwhRole


In [7]:
%load_ext sql

In [8]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT,DWH_DB)
print(conn_string)
%sql $conn_string

postgresql://dwhadmin:Pika1324_@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev


'Connected: dwhadmin@dev'

In [9]:
nstaging_events = %sql SELECT count(*) FROM staging_events;
nstaging_songs = %sql SELECT count(*) FROM staging_songs;
nsongplays = %sql SELECT count(*) FROM songplays;
nusers = %sql SELECT count(*) FROM users;
nsongs = %sql SELECT count(*) FROM songs;
nartists = %sql SELECT count(*) FROM artists;
ntimes = %sql SELECT count(*) FROM times;

print("nstaging_events\t\t", nstaging_events[0][0])
print("nstaging_songs\t\t", nstaging_songs[0][0])
print("nsong_plays\t\t", nsongplays[0][0])
print("nusers\t\t", nusers[0][0])
print("nsongs\t\t", nsongs[0][0])
print("nartists\t\t", nartists[0][0])
print("ntimes\t\t", ntimes[0][0])

 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
1 rows affected.
nstaging_events		 8056
nstaging_songs		 14896
nsong_plays		 319
nusers		 104
nsongs		 14896
nartists		 10025
ntimes		 319


In [10]:
%sql SELECT * FROM times limit 10;

 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


start_time,hour,day,week,month,year,weekday
2018-11-05 11:14:00.796000,11,5,45,11,2018,1
2018-11-05 17:54:12.796000,17,5,45,11,2018,1
2018-11-05 17:54:32.796000,17,5,45,11,2018,1
2018-11-05 18:00:37.796000,18,5,45,11,2018,1
2018-11-06 07:36:46.796000,7,6,45,11,2018,2
2018-11-07 15:16:17.796000,15,7,45,11,2018,3
2018-11-08 07:41:08.796000,7,8,45,11,2018,4
2018-11-09 14:50:53.796000,14,9,45,11,2018,5
2018-11-10 20:36:50.796000,20,10,45,11,2018,6
2018-11-11 18:53:36.796000,18,11,45,11,2018,0


In [11]:
%sql SELECT * FROM users limit 10;

 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


user_id,first_name,last_name,gender,level
12,Austin,Rosales,M,free
14,Theodore,Harris,M,free
19,Zachary,Thomas,M,free
35,Molly,Taylor,F,free
36,Matthew,Jones,M,paid
37,Jordan,Hicks,F,free
39,Walter,Frye,M,free
41,Brayden,Clark,M,free
43,Jahiem,Miles,M,free
60,Devin,Larson,M,free


In [12]:
%sql SELECT * FROM songs limit 10;

 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


song_id,title,artist_id,year,duration
SOAAETA12A6D4FC626,Shine,ARQXK0B1187B9ACC97,2007,448.23465
SOAAFHQ12A6D4F836E,Ridin' Rims (Explicit Album Version),AR3CQ2D1187B9B1953,2006,322.84689
SOAAXAZ12A6701CC77,Min Häst har Blivit Sjuk,AR45GZC1187FB48E44,1975,282.14812
SOABBVH12AF72A5B57,My Love I Love,ARUIM291187FB3911A,2007,132.41424
SOABFQI12A58A7D162,Another World (Album Version),AR9B5JX1187FB55D84,1977,494.10567
SOABHYQ12A8C13D8FF,Inner Frame,ARAYDD71187FB5B775,0,247.43138
SOABTKM12A8AE4721E,Nothing's Clear (Album Version),ARQATWU1187B98EE26,0,202.13506
SOABWAP12A8C13F82A,Take Time,AR5LMPY1187FB573FE,1978,258.89914
SOACBGF12AC9097E79,O'Malley & Delacey,ARQFV881187FB3C24C,0,220.02893
SOACXKV12AB0189328,Skyscraper,ARGECAJ122BCFCD64C,2009,199.49669


In [14]:
%sql SELECT * FROM songplays order by songplay_id limit 10;

 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


songplay_id,start_time,user_id,level,song_id,artist_id,session_id,location,user_agent
1,2018-11-08 14:27:08.796000,81,free,SOQDMXT12A6D4F8255,ART5MUE1187B98C961,317,"Las Cruces, NM","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
2,2018-11-19 08:32:12.796000,24,paid,SOSMTXQ12A6D4F721D,ARS927Z1187B9ACA29,672,"Lake Havasu City-Kingman, AZ","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"""
3,2018-11-23 15:21:17.796000,30,paid,SOQDMXT12A6D4F8255,ART5MUE1187B98C961,691,"San Jose-Sunnyvale-Santa Clara, CA",Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0
4,2018-11-13 17:28:33.796000,97,paid,SOIBHYW12AB0188F49,ARWNARC122BCFCAFEB,537,"Lansing-East Lansing, MI","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36"""
5,2018-11-19 06:30:08.796000,80,paid,SOYDHXP12AB01849D4,AR73S4G1187B9A03C2,666,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
6,2018-11-29 20:21:41.796000,49,paid,SOABIXP12A8C135F75,AR15DJQ1187FB5910C,1041,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
7,2018-11-23 12:08:33.796000,80,paid,SOBBHVN12A6702162D,ARFSZGT1187B9B1E44,848,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
8,2018-11-26 07:08:28.796000,49,paid,SOYQYTX12AB0186FFA,ARWVF341187B9B55D8,930,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0
9,2018-11-04 06:51:12.796000,25,paid,SORKKTY12A8C132F3E,ARIH5GU1187FB4C958,128,"Marinette, WI-MI","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""
10,2018-11-15 19:01:55.796000,42,paid,SOIOESO12A6D4F621D,ARVLXWP1187FB5B94A,404,"New York-Newark-Jersey City, NY-NJ-PA","""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36"""


In [15]:
%sql SELECT * FROM artists limit 10;

 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
10 rows affected.


artist_id,name,location,latitude,longitude
AR00FVC1187FB5BE3E,Panda,"Monterrey, NL, México",25.67084,-100.30953
AR00MQ31187B9ACD8F,Chris Carrier,,,
AR00TGQ1187B994F29,Paula Toller,,,
AR02LMQ1187B992AC9,The Classic Crime,"Seattle, WA",47.60356,-122.32944
AR040JZ1187FB399A9,Sparks,"Los Angeles, CA",34.05349,-118.24532
AR040M31187B98CA41,The Bug Featuring Spaceape,,,
AR049S81187B9AE8A5,The Human League,"Sheffield, Yorkshire, England",53.38311,-1.46454
AR04KY61187FB44E3A,Fidel Nadal,,,
AR04PRW1187FB4D60D,The Bens,,,
AR04S8J1187FB48358,Clifford Brown,"Wilmington, DE",39.74023,-75.55084


In [16]:
%%sql
delete from staging_events;
delete from staging_songs;
delete from songplays;
delete from users;
delete from songs;
delete from artists;
delete from time;

 * postgresql://dwhadmin:***@dwhcluster.c1uipwqamq1l.us-west-2.redshift.amazonaws.com:5439/dev
8056 rows affected.
14896 rows affected.
319 rows affected.
104 rows affected.
14896 rows affected.
10025 rows affected.
(psycopg2.ProgrammingError) relation "time" does not exist
 [SQL: 'delete from time;']
