In [20]:
from pprint import pprint
import json

import pandas as pd

from aws_client.aws_client import AWS
from aws_client import utils

In [23]:
def get_dwh_endpoint(redshift_cluster_props):
    dwh_endpoint = redshift_cluster_props['Endpoint']['Address']
    return dwh_endpoint


def get_dwh_role_arn(redshift_cluster_props):
    dwh_role_arn = redshift_cluster_props['IamRoles'][0]['IamRoleArn']
    return dwh_role_arn

In [2]:
%load_ext sql

In [3]:
configs = utils.parse_configs('../config/dwh.cfg')
secrets = utils.get_secrets()

## Create AWS Infrastructure

In [4]:
aws = AWS(aws_access_key_id=secrets.get('KEY'),
          aws_secret_access_key=secrets.get('SECRET'),
          region=configs.get('REGION'),
          config_params=configs)

In [5]:
aws.create_iam_role()

In [6]:
read_s3_role_arn = aws.get_iam_role_arn()

In [8]:
aws.create_redshift_cluster(read_s3_role_arn)

In [78]:
redshift_cluster_props = aws.get_redshift_cluster_props()
aws.print_redshift_props(redshift_cluster_props)

                 Key  \
0  ClusterIdentifier   
1           NodeType   
2      ClusterStatus   
3     MasterUsername   
4             DBName   
5           Endpoint   
6              VpcId   
7      NumberOfNodes   

                                                                                   Value  
0                                                                             dwhcluster  
1                                                                              dc2.large  
2                                                                              available  
3                                                                                dwhuser  
4                                                                                    dwh  
5  {'Address': 'dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com', 'Port': 5439}  
6                                                                           vpc-2d456f55  
7                                                      

## Connect to Redshift and run queries

In [79]:
redshift_url = utils.get_dwh_endpoint(redshift_cluster_props)

AttributeError: module 'aws_client.utils' has no attribute 'get_dwh_endpoint'

In [73]:
conn_string = "postgresql://{}:{}@{}:{}/{}".format(configs["DWH_DB_USER"],
                                                   configs["DWH_DB_PASSWORD"],
                                                   redshift_url, 
                                                   configs["DWH_PORT"],
                                                   configs["DWH_DB"])

%sql $conn_string

'Connected: dwhuser@dwh'

In [72]:
%sql DROP TABLE IF EXISTS staging_log_data;

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [74]:
%%sql
CREATE TABLE IF NOT EXISTS "staging_log_data" (
    "artist" TEXT,
    "auth" VARCHAR,
    "firstName" TEXT,
    "gender" TEXT,
    "itemInSession" INTEGER,
    "length" DECIMAL,
    "level" VARCHAR,
    "location" TEXT,
    "method" VARCHAR,
    "page" TEXT,
    "registration" VARCHAR,
    "sessionId" INTEGER,
    "song" TEXT,
    "status" SMALLINT,
    "ts" INTEGER,
    "userAgent" VARCHAR,
    "userId" INTEGER
);

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [81]:
query = f"""
COPY staging_log_data
FROM 's3://udacity-dend/log-data/'
CREDENTIALS 'aws_iam_role={read_s3_role_arn}'
REGION 'us-west-2'
JSON 'auto ignorecase';
"""

%sql $query

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh


InternalError: (psycopg2.errors.InternalError_) Load into table 'staging_log_data' failed.  Check 'stl_load_errors' system table for details.

[SQL: COPY staging_log_data
FROM 's3://udacity-dend/log-data/'
CREDENTIALS 'aws_iam_role=arn:aws:iam::787511476638:role/dwhRole'
REGION 'us-west-2'
JSON 'auto ignorecase';]
(Background on this error at: http://sqlalche.me/e/14/2j85)

In [68]:
%sql SELECT *, size, tbl_rows FROM SVV_TABLE_INFO

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
0 rows affected.


database,schema,table_id,table,encoded,diststyle,sortkey1,max_varchar,sortkey1_enc,sortkey_num,size,pct_used,empty,unsorted,stats_off,tbl_rows,skew_sortkey1,skew_rows,estimated_visible_rows,risk_event,vacuum_sort_benefit,size_1,tbl_rows_1


In [39]:
%sql SELECT COUNT(*) FROM staging_log_data;

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
8056


In [63]:
%sql select count(all userId) from staging_log_data;

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
1743


In [64]:
%%sql

select *
from staging_log_data
limit 20;

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
20 rows affected.


artist,auth,firstname,gender,iteminsession,length,level,location,method,page,registration,sessionid,song,status,timestamp,useragent,userid
N.E.R.D. FEATURING MALICE,Logged In,Jayden,M,0,288.0,free,"New Orleans-Metairie, LA",PUT,NextSong,1541033612796,184,Am I High (Feat. Malice),200,,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",101
,Logged In,Stefany,F,0,,free,"Lubbock, TX",GET,Home,1540708070796,82,,200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",83
Death Cab for Cutie,Logged In,Stefany,F,1,216.0,free,"Lubbock, TX",PUT,NextSong,1540708070796,82,A Lack Of Color (Album Version),200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",83
Tracy Gang Pussy,Logged In,Stefany,F,2,221.0,free,"Lubbock, TX",PUT,NextSong,1540708070796,82,I Have A Wish,200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",83
Skillet,Logged In,Kevin,M,0,178.0,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540006905796,153,Monster (Album Version),200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",66
Dance Gavin Dance,Logged In,Marina,F,0,218.0,free,"Salinas, CA",PUT,NextSong,1541064343796,47,Uneasy Hearts Weigh The Most,200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",48
,Logged In,Aiden,M,0,,free,"La Crosse-Onalaska, WI-MN",GET,Home,1540829025796,170,,200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36""",86
Dalto,Logged In,Aiden,M,1,190.0,free,"La Crosse-Onalaska, WI-MN",PUT,NextSong,1540829025796,170,Falta Te Dizer,200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36""",86
Kanye West,Logged In,Makinley,F,0,278.0,free,"Chicago-Naperville-Elgin, IL-IN-WI",PUT,NextSong,1541091973796,118,Family Business,200,,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",17
Jason Mraz & Colbie Caillat,Logged In,Kevin,M,0,189.0,free,"Harrisburg-Carlisle, PA",PUT,NextSong,1540006905796,187,Lucky (Album Version),200,,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36""",66


In [56]:
%%sql

select *
from staging_log_data
where gender is not null
limit 20;

 * postgresql://dwhuser:***@dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com:5439/dwh
20 rows affected.


artist,auth,firstname,gender,iteminsession,length,level,location,method,page,registration,sessionid,song,status,timestamp,useragent,userid
A Fine Frenzy,Logged In,,F,,267,free,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",PUT,NextSong,1541044398796,,Almost Lover (Album Version),200,,,
Nirvana,Logged In,,F,,214,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Serve The Servants,200,,,
Television,Logged In,,F,,238,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,See No Evil (Remastered LP Version),200,,,
JOHN COLTRANE,Logged In,,F,,346,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Blues To Bechet (LP Version),200,,,
NOFX,Logged In,,F,,80,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,It's My Job To Keep Punk Rock Elite,200,,,
The Backyardigans,Logged In,,F,,158,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Into The Thick Of It!,200,,,
Bruce Springsteen,Logged In,,F,,202,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Radio Nowhere,200,,,
Maroon 5,Logged In,,F,,173,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Harder To Breathe,200,,,
Two Door Cinema Club,Logged In,,F,,189,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,What You Know,200,,,
Five Finger Death Punch,Logged In,,F,,262,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,,Meet the Monster,200,,,


In [None]:
%%sql


CREATE TABLE IF NOT EXISTS  users (
	user_id INTEGER PRIMARY KEY NOT NULL SORTKEY,
	first_name VARCHAR(50) NOT NULL,
	last_name VARCHAR(50) NOT NULL,
	gender VARCHAR(20) NOT NULL,
	level VARCHAR(20) NOT NULL
);

CREATE TABLE IF NOT EXISTS time (
	start_time TIMESTAMP PRIMARY KEY NOT NULL,
	hour SMALLINT NOT NULL,
	day SMALLINT NOT NULL,
	week SMALLINT NOT NULL,
	month SMALLINT NOT NULL,
	year SMALLINT NOT NULL,
	weekday SMALLINT NOT NULL
);

## Delete Cluster and IAM role

In [26]:
aws.redshift.delete_cluster(
    ClusterIdentifier=aws.configs['DWH_CLUSTER_IDENTIFIER'],
    SkipFinalClusterSnapshot=True)

{'Cluster': {'ClusterIdentifier': 'dwhcluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'dwhuser',
  'DBName': 'dwh',
  'Endpoint': {'Address': 'dwhcluster.c6jsnvqemczs.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2021, 6, 7, 21, 56, 26, 214000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-4ad38f74',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-2d456f55',
  'AvailabilityZone': 'us-west-2b',
  'PreferredMaintenanceWindow': 'sun:06:30-sun:07:00',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible'

In [27]:
aws.iam.detach_role_policy(
    RoleName=aws.configs['DWH_IAM_ROLE_NAME'],
    PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")

{'ResponseMetadata': {'RequestId': 'b356d859-2e0a-48c6-935c-0c967138884b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b356d859-2e0a-48c6-935c-0c967138884b',
   'content-type': 'text/xml',
   'content-length': '212',
   'date': 'Mon, 07 Jun 2021 22:32:09 GMT'},
  'RetryAttempts': 0}}

In [28]:
aws.iam.delete_role(RoleName=aws.configs['DWH_IAM_ROLE_NAME'])

{'ResponseMetadata': {'RequestId': 'c7cd8624-3b82-466f-98f8-4fde19decd44',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c7cd8624-3b82-466f-98f8-4fde19decd44',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Mon, 07 Jun 2021 22:32:11 GMT'},
  'RetryAttempts': 0}}