# Exercise 2: Creating Redshift Cluster using the AWS python SDK 
## An example of Infrastructure-as-code

In [2]:
import pandas as pd
import boto3
import json

# Setup AWS Profile

The AWS Profile is set in `~/.aws/credentials` as follows:

```
[profileName]
aws_access_key_id = BOOMSHAKALAKA
aws_secret_access_key = SOMECOMPLICATEDSTRING
```

- Create a new IAM user in your AWS account
- Give it `AdministratorAccess`, From `Attach existing policies directly` Tab
- Take note of the access key and secret 
- Edit the file `iac_dwh.cfg` in the same folder as this notebook and fill
<font color='red'>
<BR>
[AWS]<BR>
PROFILE=YOUR_AWS_PROFILE_HERE<BR>
<font/>



# Load DWH Params from a file

In [23]:
import configparser
config = configparser.ConfigParser()
config.read_file(open('iac_dwh.cfg'))

AWS_PROFILE = config.get('AWS', 'PROFILE')

IAC_CLUSTER_TYPE = config.get('IAC', 'CLUSTER_TYPE')
IAC_NUM_NODES = config.get('IAC', 'NUM_NODES')
IAC_NODE_TYPE = config.get('IAC', 'NODE_TYPE')
IAC_CLUSTER_IDENTIFIER = config.get('IAC', 'CLUSTER_IDENTIFIER')
IAC_IAM_ROLE_NAME = config.get('IAC', 'IAM_ROLE_NAME')
IAC_DB = config.get('IAC', 'DB')
IAC_DB_USER = config.get('IAC', 'DB_USER')
IAC_DB_PASSWORD = config.get('IAC', 'DB_PASSWORD')
IAC_PORT = config.get('IAC', 'PORT')

(IAC_DB_USER, IAC_DB_PASSWORD, IAC_DB)

pd.DataFrame({"Param":
                  ["IAC_CLUSTER_TYPE", "IAC_NUM_NODES", "IAC_NODE_TYPE", "IAC_CLUSTER_IDENTIFIER", "IAC_DB", "IAC_DB_USER", "IAC_DB_PASSWORD", "IAC_PORT", "IAC_IAM_ROLE_NAME"],
              "Value":
                  [IAC_CLUSTER_TYPE, IAC_NUM_NODES, IAC_NODE_TYPE, IAC_CLUSTER_IDENTIFIER, IAC_DB, IAC_DB_USER, IAC_DB_PASSWORD, IAC_PORT, IAC_IAM_ROLE_NAME]
             })

Unnamed: 0,Param,Value
0,IAC_CLUSTER_TYPE,multi-node
1,IAC_NUM_NODES,4
2,IAC_NODE_TYPE,dc2.large
3,IAC_CLUSTER_IDENTIFIER,sparkifyCluster
4,IAC_DB,sparkifydb
5,IAC_DB_USER,sparkifyuser
6,IAC_DB_PASSWORD,ssm://redshift-dend-password
7,IAC_PORT,5439
8,IAC_IAM_ROLE_NAME,sparkifyReadAccess


## Create clients for EC2, S3, IAM, and Redshift

Setup Session for AWS, `boto3` loads `AWS_PROFILE` from config.

In [5]:
boto3.setup_default_session(profile_name=AWS_PROFILE)

## Create clients for EC2, S3, IAM, and Redshift

In [6]:
import boto3

ec2 = boto3.resource('ec2',
                       region_name="us-west-2"
                    )

s3 = boto3.resource('s3',
                       region_name="us-west-2"
                   )

iam = boto3.client('iam',
                   region_name='us-west-2'
                  )

redshift = boto3.client('redshift',
                       region_name="us-west-2",
                       )

## Check out the sample data sources on S3

In [7]:
sampleDbBucket =  s3.Bucket("awssampledbuswest2")

# TODO: Iterate over bucket objects starting with "ssbgz" and print
for obj in sampleDbBucket.objects.filter(Prefix="ssbgz"):
    print(obj)

s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/customer0002_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/dwdate.tbl.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0000_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0001_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0002_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0003_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0004_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0005_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0006_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/lineorder0007_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='s

## IAM Role
- Create an IAM Role that makes Redshift able to access S3 bucket (ReadOnly)

In [8]:
# Create the IAM role
try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path='/',
        RoleName=IAC_IAM_ROLE_NAME,
        Description='Allow redshift cluster to call S3',
        AssumeRolePolicyDocument=json.dumps(
            {'Statement': [{'Action': 'sts:AssumeRole',
               'Effect': 'Allow',
               'Principal': {'Service': 'redshift.amazonaws.com'}}],
             'Version': '2012-10-17'})
    )
    
except Exception as e:
    print(e)

1.1 Creating a new IAM Role


In [9]:
# Attach Policy
print('1.2 Attaching Policy')
iam.attach_role_policy(RoleName=IAC_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )['ResponseMetadata']['HTTPStatusCode']

1.2 Attaching Policy


200

In [24]:
# Get and print the IAM role ARN
print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName=IAC_IAM_ROLE_NAME)['Role']['Arn']
print(roleArn)

1.3 Get the IAM role ARN
arn:aws:iam::358485744732:role/sparkifyReadAccess


# Master Password for Cluster

In [82]:
import os

def get_password_from_source(password):
    if password.startswith('ssm://'):
        parameter_name = password.split('ssm://')[-1]
        ssm = boto3.client('ssm', region_name="us-west-2")
        response = ssm.get_parameter(
            Name=parameter_name,
            WithDecryption=True
        )
        return response['Parameter']['Value']
    elif password.startswith('file://'):
        password_file = password.split('file://')[-1]
        f = open(os.path.abspath(password_file), 'r')
        p = f.read()
        f.close()
        return p
    else:
        return password
        
DB_PASSWORD = get_password_from_source(IAC_DB_PASSWORD)

## Creating Redshift Cluster

- Create a RedShift Cluster
- For complete arguments to `create_cluster`, see [docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster)

In [32]:
try:
    response = redshift.create_cluster(        
        ClusterType=IAC_CLUSTER_TYPE,
        NodeType=IAC_NODE_TYPE,
        NumberOfNodes=int(IAC_NUM_NODES),
        DBName=IAC_DB,
        ClusterIdentifier=IAC_CLUSTER_IDENTIFIER,
        MasterUsername=IAC_DB_USER,
        MasterUserPassword=DB_PASSWORD,
        IamRoles=[roleArn],
        PubliclyAccessible=True
    )
except Exception as e:
    print(e)

## 2.1 *Describe* the cluster to see its status
- run this block several times until the cluster status becomes `Available`

In [35]:
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

myClusterProps = redshift.describe_clusters(ClusterIdentifier=IAC_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

Unnamed: 0,Key,Value
0,ClusterIdentifier,sparkifycluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,sparkifyuser
4,DBName,sparkifydb
5,Endpoint,"{'Address': 'sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-6044a805
7,NumberOfNodes,4


<h2> Fetching cluster's <font color='red'> endpoint and role ARN </font> </h2>

<font color='red'>DO NOT RUN THIS unless the cluster status becomes "Available" </font>

In [42]:
IAC_ENDPOINT = myClusterProps['Endpoint']['Address']
IAC_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("IAC_ENDPOINT :: ", IAC_ENDPOINT)
print("IAC_ROLE_ARN :: ", IAC_ROLE_ARN)

IAC_ENDPOINT ::  sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com
IAC_ROLE_ARN ::  arn:aws:iam::358485744732:role/sparkifyReadAccess


## Test Cluster connection

In [43]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [46]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(IAC_DB_USER, DB_PASSWORD, IAC_ENDPOINT, IAC_PORT,IAC_DB)
%sql $conn_string

'Connected: sparkifyuser@sparkifydb'

# Analytics

In [71]:
import psycopg2

def run_queries(cur, conn, queries):
    for query in queries:
        try:
            cur.execute(query)
            yield cur.fetchone()
            conn.commit()
        except Exception as e:
            print('Error in: ', query)
            print(e)
    
conn = psycopg2.connect("host={} dbname={} user={} password={} port={}".format(
    IAC_ENDPOINT,
    IAC_DB,
    IAC_DB_USER,
    DB_PASSWORD,
    IAC_PORT
))
cur = conn.cursor()

In [77]:
# Get Counts
titles = [
    'staging_events',
    'staging_songs',
    'artists',
    'songs',
    'time',
    'users',
    'songplays'
]

def table_name_to_count_query(name):
    return 'SELECT COUNT(*) AS total FROM {}'.format(name)

for title, result in zip(titles, run_queries(cur, conn, map(table_name_to_count_query, titles))):
    print('Table: {} - Count: {}'.format(title, result[0]))

Table: staging_events - Count: 8056
Table: staging_songs - Count: 14896
Table: artists - Count: 10025
Table: songs - Count: 14896
Table: time - Count: 333
Table: users - Count: 104
Table: songplays - Count: 333


In [60]:
%sql SELECT title, name FROM (songplays JOIN songs ON songplays.song_id=songs.song_id) JOIN artists ON artists.artist_id=songplays.artist_id LIMIT 5;

 * postgresql://sparkifyuser:***@sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com:5439/sparkifydb
5 rows affected.


title,name
On A Deeper Level,Blufeld
Lady,Stryper
Up Up & Away,Kid Cudi
Up Up & Away,Kid Cudi
Up Up & Away,Kid Cudi


In [78]:
%sql SELECT COUNT(*) FROM (songplays JOIN songs ON songplays.song_id=songs.song_id) JOIN artists ON artists.artist_id=songplays.artist_id;

 * postgresql://sparkifyuser:***@sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com:5439/sparkifydb
1 rows affected.


count
384


In [61]:
%sql SELECT * FROM users LIMIT 5;

 * postgresql://sparkifyuser:***@sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com:5439/sparkifydb
5 rows affected.


user_id,first_name,last_name,gender,level
7,Adelyn,Jordan,F,free
17,Makinley,Jones,F,free
20,Aiden,Ramirez,M,paid
22,Sean,Wilson,F,free
34,Evelin,Ayala,F,free


In [62]:
%sql SELECT * FROM songs LIMIT 5;

 * postgresql://sparkifyuser:***@sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com:5439/sparkifydb
5 rows affected.


song_id,title,artist_id,year,duration
SOAACFC12A8C140567,Supernatural Pt. II,ARNHTE41187B99289A,0,343.09179
SOAAFUV12AB018831D,Where Do The Children Play? (LP Version),AR5ZGC11187FB417A3,0,216.05832
SOAAKLA12A58A7A3CC,Snow Day (LP Version),ARGWNT41187FB463F1,0,211.90485
SOAAMWQ12A8C144DF1,Happy Nation,AR2IKF71187FB4D0C2,1992,255.08526
SOAAVYM12A8C13C43C,Barn's On Fire,ARGUSVR1187B9AD15E,2008,103.49669


In [63]:
%sql SELECT * FROM artists LIMIT 5;

 * postgresql://sparkifyuser:***@sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com:5439/sparkifydb
5 rows affected.


artist_id,name,location,latitude,longitude
AR00TGQ1187B994F29,Paula Toller,,,
AR02YGA1187B9B8AC4,Bersuit Vergarabat,Buenos Aires,-35.0,-58.0
AR049S81187B9AE8A5,The Human League,"Sheffield, Yorkshire, England",53.0,-1.0
AR04PRW1187FB4D60D,The Bens,,,
AR04S8J1187FB48358,Clifford Brown / Max Roach Quintet,"Wilmington, DE",40.0,-76.0


In [64]:
%sql SELECT * FROM time LIMIT 5;

 * postgresql://sparkifyuser:***@sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com:5439/sparkifydb
5 rows affected.


start_time,hour,day,week,month,year,weekday
2018-11-03 19:33:39.796000,19,3,44,11,2018,6
2018-11-03 21:14:28.796000,21,3,44,11,2018,6
2018-11-05 10:41:02.796000,10,5,45,11,2018,1
2018-11-05 11:17:25.796000,11,5,45,11,2018,1
2018-11-05 15:31:19.796000,15,5,45,11,2018,1


In [80]:
%sql SELECT DISTINCT month, year FROM time;

 * postgresql://sparkifyuser:***@sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com:5439/sparkifydb
1 rows affected.


month,year
11,2018


# Deleting Infrastructure

<b><font color='red'>DO NOT RUN THIS UNLESS YOU ARE SURE <br/> 
    We will be using these resources in the next exercises</span></b>

In [83]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=IAC_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

{'Cluster': {'ClusterIdentifier': 'sparkifycluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'ClusterAvailabilityStatus': 'Modifying',
  'MasterUsername': 'sparkifyuser',
  'DBName': 'sparkifydb',
  'Endpoint': {'Address': 'sparkifycluster.chgw8krq7qfw.us-west-2.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2020, 2, 16, 9, 18, 30, 539000, tzinfo=tzutc()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ManualSnapshotRetentionPeriod': -1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-c1569fa4',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-6044a805',
  'AvailabilityZone': 'us-west-2d',
  'PreferredMaintenanceWindow': 'fri:07:00-fri:07:30',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,


- run this block several times until the cluster really deleted

In [85]:
myClusterProps = redshift.describe_clusters(ClusterIdentifier=IAC_CLUSTER_IDENTIFIER)['Clusters'][0]
prettyRedshiftProps(myClusterProps)

ClusterNotFoundFault: An error occurred (ClusterNotFound) when calling the DescribeClusters operation: Cluster sparkifycluster not found.

In [86]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
iam.detach_role_policy(RoleName=IAC_IAM_ROLE_NAME, PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess")
iam.delete_role(RoleName=IAC_IAM_ROLE_NAME)
#### CAREFUL!!

{'ResponseMetadata': {'RequestId': '4018a79b-113f-45f1-aa9a-44bec6f7b76f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4018a79b-113f-45f1-aa9a-44bec6f7b76f',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Sun, 16 Feb 2020 11:09:14 GMT'},
  'RetryAttempts': 0}}