In [1]:
import pandas as pd
import boto3
import json
import configparser

# Load DWH params from a file

In [2]:
# Parse the data-warehouse settings from dwh.cfg. The file is opened in a
# `with` block so the handle is closed as soon as parsing is done, instead of
# being left open for the lifetime of the kernel.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

# IAM_USER: access key pair used to authenticate the boto3 handles.
KEY = config.get('IAM_USER', 'KEY')
SECRET = config.get('IAM_USER', 'SECRET')

# CLUSTER: identifier and sizing of the Redshift cluster.
CLUSTER_ID = config.get('CLUSTER', 'CLUSTER_ID')
CLUSTER_TYPE = config.get('CLUSTER', 'CLUSTER_TYPE')
NUM_NODES = config.getint('CLUSTER', 'NUM_NODES')
NODE_TYPE = config.get('CLUSTER', 'NODE_TYPE')

# DATABASE: database name, master credentials and port.
DB_NAME = config.get('DATABASE', 'DB_NAME')
DB_USERNAME = config.get('DATABASE', 'DB_USERNAME')
DB_PASSWORD = config.get('DATABASE', 'DB_PASSWORD')
DB_PORT = config.getint('DATABASE', 'DB_PORT')

# IAM_ROLE: name of the role created in section 1.
IAM_ROLE_NAME = config.get('IAM_ROLE', 'IAM_ROLE_NAME')

# Create clients for IAM, EC2 and Redshift

In [3]:
# Region and credentials shared by every AWS handle created below.
aws_connection_args = {
    'region_name': 'us-west-2',
    'aws_access_key_id': KEY,
    'aws_secret_access_key': SECRET,
}

# EC2 is created as a resource object; IAM and Redshift are created as clients.
ec2 = boto3.resource('ec2', **aws_connection_args)
iam = boto3.client('iam', **aws_connection_args)
redshift = boto3.client('redshift', **aws_connection_args)

# 1. IAM role

In this section, I create an IAM role that allows the Redshift cluster to read data from S3 buckets.

## 1.1. Create a new IAM role

In [4]:
# Trust policy: allows the Redshift service to assume the role (sts:AssumeRole).
redshift_trust_policy = {
    'Statement': [
        {
            'Action': 'sts:AssumeRole',
            'Effect': 'Allow',
            'Principal': {'Service': 'redshift.amazonaws.com'},
        },
    ],
    'Version': '2012-10-17',
}

# Create the role; any error is printed rather than raised.
try:
    print('Creating a new IAM role...')
    assume_role_policy_doc = json.dumps(redshift_trust_policy)
    iam.create_role(
        Path='/',
        RoleName=IAM_ROLE_NAME,
        Description='Allows Redshift clusters to call AWS services on your behalf.',
        AssumeRolePolicyDocument=assume_role_policy_doc,
    )
except Exception as err:
    print(err)

Creating a new IAM role...


## 1.2. Attach Policy to the IAM Role

In [5]:
# ARN of the policy that is attached to the role.
s3_read_only_policy_arn = 'arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess'

# Attach the policy and show the raw API response; errors are printed.
try:
    print('Attaching Policy...')
    response = iam.attach_role_policy(
        RoleName=IAM_ROLE_NAME,
        PolicyArn=s3_read_only_policy_arn,
    )
    print(response)
except Exception as err:
    print(err)

Attaching Policy...
{'ResponseMetadata': {'RequestId': '6e4cd246-554a-47b3-b319-a963e36b87c0', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '6e4cd246-554a-47b3-b319-a963e36b87c0', 'content-type': 'text/xml', 'content-length': '212', 'date': 'Sat, 26 Mar 2022 06:16:12 GMT'}, 'RetryAttempts': 0}}


## 1.3. Get and save the IAM Role's ARN

In [6]:
# Look up the role's ARN and store it in dwh.cfg; errors are printed.
try:
    print('Getting the IAM role\'s ARN...')
    role_description = iam.get_role(RoleName=IAM_ROLE_NAME)
    IAM_ROLE_ARN = role_description['Role']['Arn']

    # Rewrite dwh.cfg with the ARN added under the IAM_ROLE section.
    config.set('IAM_ROLE', 'IAM_ROLE_ARN', IAM_ROLE_ARN)
    with open('dwh.cfg', 'w') as cfg_out:
        config.write(cfg_out)

    print(IAM_ROLE_ARN)
except Exception as err:
    print(err)

Getting the IAM role's ARN...
arn:aws:iam::911990516410:role/dwhrole


# 2. Redshift cluster

## 2.1. Create a new Redshift cluster

In [7]:
# Launch the Redshift cluster with the settings loaded from dwh.cfg.
# Any failure is printed instead of raised so the notebook keeps running.
try:
    response = redshift.create_cluster(
        # Resource information
        ClusterType=CLUSTER_TYPE,
        NodeType=NODE_TYPE,
        NumberOfNodes=NUM_NODES,

        # Identifiers & Credentials
        DBName=DB_NAME,
        ClusterIdentifier=CLUSTER_ID,
        MasterUsername=DB_USERNAME,
        MasterUserPassword=DB_PASSWORD,

        # Listen on the port configured in dwh.cfg, so the cluster matches the
        # DB_PORT value used for the inbound rule in section 3.
        Port=DB_PORT,

        # IAM Roles (for S3 access)
        IamRoles=[IAM_ROLE_ARN])
    print(response)
except Exception as e:
    print(e)

{'Cluster': {'ClusterIdentifier': 'dwhcluster', 'NodeType': 'dc2.large', 'ClusterStatus': 'creating', 'MasterUsername': 'dwhuser', 'DBName': 'dwhdb', 'AutomatedSnapshotRetentionPeriod': 1, 'ClusterSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-012531206da2fd1e5', 'Status': 'active'}], 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0', 'ParameterApplyStatus': 'in-sync'}], 'ClusterSubnetGroupName': 'default', 'VpcId': 'vpc-07d3120729cd38b09', 'PreferredMaintenanceWindow': 'fri:08:00-fri:08:30', 'PendingModifiedValues': {'MasterUserPassword': '****'}, 'ClusterVersion': '1.0', 'AllowVersionUpgrade': True, 'NumberOfNodes': 4, 'PubliclyAccessible': True, 'Encrypted': False, 'Tags': [], 'EnhancedVpcRouting': False, 'IamRoles': [{'IamRoleArn': 'arn:aws:iam::911990516410:role/dwhrole', 'ApplyStatus': 'adding'}], 'MaintenanceTrackName': 'current'}, 'ResponseMetadata': {'RequestId': '5370d46a-8367-4af5-b481-32bc64ab2eee', 'HTTPStatusCode': 200, 'HTTPH

## 2.2. Describe the cluster

<font color='red'>***Note***: Run the following cell multiple times until the cluster status becomes `available`.</font>

In [13]:
def get_prop_df(props):
    """Summarise selected cluster properties as a two-column DataFrame.

    Args:
        props: Mapping of property name to value, e.g. one entry of the
            'Clusters' list returned by ``describe_clusters``.

    Returns:
        A DataFrame with columns ``Key`` and ``Value`` holding only the
        properties of interest, in the order they appear in ``props``.
    """
    # None disables column truncation so long values (such as the endpoint)
    # are shown in full. The former value -1 was deprecated in pandas 1.0 and
    # is no longer accepted by pandas 2.
    pd.set_option('display.max_colwidth', None)
    keys = {'ClusterIdentifier', 'NodeType', 'ClusterStatus', 'MasterUsername',
            'DBName', 'Endpoint', 'NumberOfNodes', 'VpcId'}
    rows = [(name, value) for name, value in props.items() if name in keys]
    return pd.DataFrame(data=rows, columns=['Key', 'Value'])


# Fetch the current description of the cluster; errors are printed.
try:
    describe_response = redshift.describe_clusters(ClusterIdentifier=CLUSTER_ID)
    cluster_props = describe_response['Clusters'][0]
except Exception as err:
    print(err)


# Last expression of the cell: rendered as a table by the notebook.
get_prop_df(cluster_props)

Unnamed: 0,Key,Value
0,ClusterIdentifier,dwhcluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,dwhuser
4,DBName,dwhdb
5,Endpoint,"{'Address': 'dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-07d3120729cd38b09
7,NumberOfNodes,4


## 2.3. Get and save the cluster's endpoint

<font color='red'>***Note***: DO NOT run the following cell until the cluster status has become `available`.</font>

In [14]:
# Pull the host address out of the cluster's Endpoint entry.
endpoint_info = cluster_props['Endpoint']
DB_ENDPOINT = endpoint_info['Address']

# Rewrite dwh.cfg with the endpoint added under the DATABASE section.
config.set('DATABASE', 'DB_ENDPOINT', DB_ENDPOINT)
with open('dwh.cfg', 'w') as cfg_out:
    config.write(cfg_out)

print('DB_ENDPOINT :: ', DB_ENDPOINT)

DB_ENDPOINT ::  dwhcluster.cqg4kbulextn.us-west-2.redshift.amazonaws.com


# 3. Open an inbound TCP port to access the cluster's endpoint

<font color='red'>***Note***: Run this only if the security group does not already allow inbound TCP traffic on the cluster port.</font>

In [15]:
# Adds an inbound TCP rule for DB_PORT to the first security group of the
# cluster's VPC. Left commented out because the rule already exists (see the
# InvalidPermission.Duplicate output below).
# WARNING: CidrIp='0.0.0.0/0' accepts connections from any IPv4 address;
# narrow it to a known address range before enabling this cell.
# try:
#     vpc = ec2.Vpc(id=cluster_props['VpcId'])
#     default_sg = list(vpc.security_groups.all())[0]
#     print(default_sg)
#     default_sg.authorize_ingress(GroupName=default_sg.group_name,
#                                  CidrIp='0.0.0.0/0',
#                                  IpProtocol='TCP',
#                                  FromPort=DB_PORT,
#                                  ToPort=DB_PORT)
# except Exception as e:
#     print(e)

ec2.SecurityGroup(id='sg-012531206da2fd1e5')
An error occurred (InvalidPermission.Duplicate) when calling the AuthorizeSecurityGroupIngress operation: the specified rule "peer: 0.0.0.0/0, TCP, from port: 5439, to port: 5439, ALLOW" already exists


# 4. Clean up resources

## 4.1. Clean up Redshift cluster

In [None]:
# # Uncomment & run to delete the created cluster
# try:
#     redshift.delete_cluster(ClusterIdentifier=CLUSTER_ID, SkipFinalClusterSnapshot=True)
# except Exception as e:
#     print(e)

In [None]:
# # Check cluster status
# def get_prop_df(props):
#     pd.set_option('display.max_colwidth', -1)
#     keys = ['ClusterIdentifier', 'NodeType', 'ClusterStatus', 'MasterUsername',
#             'DBName', 'Endpoint', 'NumberOfNodes', 'VpcId']
#     x = [(k, v) for k,v in props.items() if k in keys]
#     return pd.DataFrame(data=x, columns=['Key', 'Value'])


# try:
#     cluster_props = redshift.describe_clusters(ClusterIdentifier=CLUSTER_ID)['Clusters'][0]
#     get_prop_df(cluster_props)
# except Exception as e:
#     print(e)

## 4.2. Clean up IAM role

In [None]:
# # Uncomment & run to delete the created IAM role
# try:
#     iam.detach_role_policy(RoleName=IAM_ROLE_NAME, PolicyArn='arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess')
#     iam.delete_role(RoleName=IAM_ROLE_NAME)
# except Exception as e:
#     print(e)