# AWSCONNECT
Datawarehousing configuration:
- Connect to AWS-SDK
- Create client (EC2, S3, IAM, and Redshift)
- Execute steps:
    1. Setup IAM Role
    2. Start the Redshift Cluster  
    3. Open an incoming TCP port to access the cluster endpoint (Security Group Inbound Traffic Setup)
    4. Make sure you can connect to the cluster
    5. Update the Config File with DWH_ENDPOINT, HOST, and ARN
    6. Clean up resources: Destroy Redshift Cluster

### Connect to AWS-SDK

In [31]:
from time import time
import configparser
import matplotlib.pyplot as plt
import pandas as pd

In [41]:
# CONFIG
config = configparser.ConfigParser()
config.read('dwh.cfg')

KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

ARN                    = ''

if config.get('IAM_ROLE', 'ARN'):
    ARN                = config.get('IAM_ROLE', 'ARN')

DWH_CLUSTER_TYPE       = config.get("DWH","DWH_CLUSTER_TYPE")
DWH_NUM_NODES          = config.get("DWH","DWH_NUM_NODES")
DWH_NODE_TYPE          = config.get("DWH","DWH_NODE_TYPE")

DWH_CLUSTER_IDENTIFIER = config.get("DWH","DWH_CLUSTER_IDENTIFIER")
DWH_DB                 = config.get("DWH","DWH_DB")
DWH_DB_USER            = config.get("DWH","DWH_DB_USER")
DWH_DB_PASSWORD        = config.get("DWH","DWH_DB_PASSWORD")
DWH_PORT               = config.get("DWH","DWH_PORT")
DWH_IAM_ROLE_NAME      = config.get("DWH","DWH_IAM_ROLE_NAME")   

(DWH_DB_USER, DWH_DB_PASSWORD, DWH_DB)

def show_config(config):
    return {"Param":
                      ["DWH_CLUSTER_TYPE", "DWH_NUM_NODES", "DWH_NODE_TYPE", "DWH_CLUSTER_IDENTIFIER", "DWH_DB", "DWH_DB_USER", "DWH_DB_PASSWORD", "DWH_PORT", "DWH_IAM_ROLE_NAME"],
                  "Value":
                      [DWH_CLUSTER_TYPE, DWH_NUM_NODES, DWH_NODE_TYPE, DWH_CLUSTER_IDENTIFIER, DWH_DB, DWH_DB_USER, DWH_DB_PASSWORD, DWH_PORT, DWH_IAM_ROLE_NAME]
                 }

In [42]:
pd.DataFrame(show_config(config))

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,sparkifyCluster
4,DWH_DB,sparkifydwh
5,DWH_DB_USER,sparkifyadmin
6,DWH_DB_PASSWORD,2w#EOA8$7o*&
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,sparkifydwhrole


#### Create client (EC2, S3, IAM, and Redshift)

In [44]:
import boto3
ec2 = boto3.resource('ec2',
                       region_name="us-east-1",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                    )

s3 = boto3.resource('s3',
                    region_name="us-east-1",
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET)

iam = boto3.client('iam',
                    region_name="us-east-1",
                   aws_access_key_id=KEY,
                   aws_secret_access_key=SECRET)

redshift = boto3.client('redshift',
                       region_name="us-east-1",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET)

#### Step 1: Setup IAM Role

In [45]:
import json

try:
    print('1.1 Creating a new IAM Role')
    dwhRole = iam.create_role(
        Path = '/',
        RoleName = DWH_IAM_ROLE_NAME,
        Description = "Allows Redshift clusters to call AWS services on your behalf.",
        AssumeRolePolicyDocument = json.dumps(
            {
                'Statement': [{
                    'Action': 'sts:AssumeRole',
                    'Effect': 'Allow',
                    'Principal': {
                        'Service': 'redshift.amazonaws.com'
                    }
                }],
                'Version': '2012-10-17'
            }
        )
    )
except Exception as e:
    print(e)
    
print('1.2 Attaching Policy')

iam.attach_role_policy(RoleName=DWH_IAM_ROLE_NAME,
                       PolicyArn="arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
                      )["ResponseMetadata"]["HTTPStatusCode"]

print('1.3 Get the IAM role ARN')
roleArn = iam.get_role(RoleName = DWH_IAM_ROLE_NAME)['Role']['Arn']

print(roleArn)

1.1 Creating a new IAM Role
An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name sparkifydwhrole already exists.
1.2 Attaching Policy
1.3 Get the IAM role ARN
arn:aws:iam::497861409537:role/sparkifydwhrole


#### Step 2: Start the Redshift Cluster

In [46]:
try:
    response = redshift.create_cluster(
        #HW
        ClusterType = DWH_CLUSTER_TYPE,
        NodeType = DWH_NODE_TYPE,
        NumberOfNodes = int(DWH_NUM_NODES),
        
        # Identifiers & Credentials
        DBName = DWH_DB,
        ClusterIdentifier = DWH_CLUSTER_IDENTIFIER,
        MasterUsername = DWH_DB_USER,
        MasterUserPassword = DWH_DB_PASSWORD,
        
        # Roles (for s3 access)
        IamRoles = [roleArn]
    )
    print(response)
except Exception as e:
    print(e)

{'Cluster': {'ClusterIdentifier': 'sparkifycluster', 'NodeType': 'dc2.large', 'ClusterStatus': 'creating', 'MasterUsername': 'sparkifyadmin', 'DBName': 'sparkifydwh', 'AutomatedSnapshotRetentionPeriod': 1, 'ClusterSecurityGroups': [], 'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-347d1676', 'Status': 'active'}], 'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0', 'ParameterApplyStatus': 'in-sync'}], 'ClusterSubnetGroupName': 'default', 'VpcId': 'vpc-5545f42f', 'PreferredMaintenanceWindow': 'thu:04:30-thu:05:00', 'PendingModifiedValues': {'MasterUserPassword': '****'}, 'ClusterVersion': '1.0', 'AllowVersionUpgrade': True, 'NumberOfNodes': 4, 'PubliclyAccessible': True, 'Encrypted': False, 'Tags': [], 'EnhancedVpcRouting': False, 'IamRoles': [{'IamRoleArn': 'arn:aws:iam::497861409537:role/sparkifydwhrole', 'ApplyStatus': 'adding'}], 'MaintenanceTrackName': 'current'}, 'ResponseMetadata': {'RequestId': '904e1ec7-9129-4cb3-b8da-db64d033f504', 'HTTPStatusCode': 200,

#### 2.1 Describe the cluster to see its status

In [47]:
import time
wait=30
def prettyRedshiftProps(props):
    pd.set_option('display.max_colwidth', -1)
    keysToShow = ["ClusterIdentifier", "NodeType", "ClusterStatus", "MasterUsername", "DBName", "Endpoint", "NumberOfNodes", 'VpcId']
    x = [(k, v) for k,v in props.items() if k in keysToShow]
    return pd.DataFrame(data=x, columns=["Key", "Value"])

# myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
# prettyRedshiftProps(myClusterProps)

status = 'creating'
while status == 'creating':
    myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
    redstatus = prettyRedshiftProps(myClusterProps)
    if redstatus.loc[2,'Value'] == 'creating':
        print('creating clusters...')
        time.sleep(wait)
    else:
        status = redstatus.loc[2,'Value']
        print(f'Cluster is {status}')
        
prettyRedshiftProps(myClusterProps)

creating clusters...
creating clusters...
creating clusters...
creating clusters...
creating clusters...
creating clusters...
Cluster is available


Unnamed: 0,Key,Value
0,ClusterIdentifier,sparkifycluster
1,NodeType,dc2.large
2,ClusterStatus,available
3,MasterUsername,sparkifyadmin
4,DBName,sparkifydwh
5,Endpoint,"{'Address': 'sparkifycluster.csyjfczfmp7g.us-east-1.redshift.amazonaws.com', 'Port': 5439}"
6,VpcId,vpc-5545f42f
7,NumberOfNodes,4


#### Visualize Cluster ENDPOINT and ROLE ARN

In [48]:
DWH_ENDPOINT = myClusterProps['Endpoint']['Address']
DWH_ROLE_ARN = myClusterProps['IamRoles'][0]['IamRoleArn']
print("DWH_ENDPOINT :: ", DWH_ENDPOINT)
print("DWH_ROLE_ARN :: ", DWH_ROLE_ARN)

DWH_ENDPOINT ::  sparkifycluster.csyjfczfmp7g.us-east-1.redshift.amazonaws.com
DWH_ROLE_ARN ::  arn:aws:iam::497861409537:role/sparkifydwhrole


#### Step 3: Open an incoming  TCP port to access the cluster endpoint

In [49]:
try:
    vpc = ec2.Vpc(id=myClusterProps['VpcId'])
    defaultSg = list(vpc.security_groups.all())[0]
    print('TCP Connection Open')
    defaultSg.authorize_ingress(
        GroupName='default',
        CidrIp='0.0.0.0/0',
        IpProtocol='TCP',
        FromPort=int(DWH_PORT),
        ToPort=int(DWH_PORT)
    )
except Exception as e:
    print(e)

TCP Connection Open
An error occurred (UnauthorizedOperation) when calling the AuthorizeSecurityGroupIngress operation: You are not authorized to perform this operation. Encoded authorization failure message: FSTn5XxQqTlfF1BN_9YYN7aP2PyEWVXMs6JMi5lH4HBFe1bJ86PzHX9WhSFt-z7ysdt0A-rrw2Nbu08twuv76KTYlk6NVkCJRQ564tQZIHqq-X4Nrq0Ksbk60jbJ374l_tj1qm24vA6Lv4xnnzkpXwR8TAhUCfER8EVbiSNFuMr4Ew3fslEpuLWoNCPx-KGII06fP3F4Ro_TXuH1WgtZSIkq8cEAkgTuiTarTMlIQ-kc4iEnHIoy0Fbik9I1FspNNUxTTPY7OWMJ5GrBzWFpZBf8YxB6mkIF6XT0IX9Ql5zGxW0sacxyiKT5YBCZuAOmI8ySO5KFUCgr54Z0EXzczElHI05SEWVVFFE-FfVHhk77VMqaoGdiyttC-pUjQ3gknXMHUseHxX7LHIlke7O62l9fUb-K1JHSUH2EF3LtLHwZDSHHbI7xZxjCGmVdbXDrRlD2Kg43AzfAZ-_bR3YQK_NhXOQ0yuBiZzTlhvTdolzbmBYN5JOv4vpIg3a2BIFQDvuzE2I4mkHNUsPOlfJHbXvGYAwZqHmwmfgv6VQKGiGjy06jXvbAvE1r2BNLBczfOqUzmtVTV1lBzTuMnxNaew4


#### Step 4: Make sure you can connect to the cluster

In [50]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [51]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DWH_DB_USER, DWH_DB_PASSWORD, DWH_ENDPOINT, DWH_PORT, DWH_DB)
%sql $conn_string

'Connected: sparkifyadmin@sparkifydwh'

#### Step 5: Update the Config File with DWH_ENDPOINT, HOST, and ARN

In [54]:
config.read_file(open('dwh.cfg'))

In [55]:
config.set('DWH', 'DWH_ENDPOINT', DWH_ENDPOINT)
config.set('IAM_ROLE', 'ARN', DWH_ROLE_ARN)
config.set('CLUSTER', 'HOST', DWH_ENDPOINT)

In [56]:
with open('dwh.cfg', 'w') as configfile:
    config.write(configfile)

### Step 6: Clean up resources: Destroy Redshift Cluster

In [57]:
#### CAREFUL!!
#-- Uncomment & run to delete the created resources
redshift.delete_cluster( ClusterIdentifier=DWH_CLUSTER_IDENTIFIER,  SkipFinalClusterSnapshot=True)
#### CAREFUL!!

{'Cluster': {'ClusterIdentifier': 'sparkifycluster',
  'NodeType': 'dc2.large',
  'ClusterStatus': 'deleting',
  'MasterUsername': 'sparkifyadmin',
  'DBName': 'sparkifydwh',
  'Endpoint': {'Address': 'sparkifycluster.csyjfczfmp7g.us-east-1.redshift.amazonaws.com',
   'Port': 5439},
  'ClusterCreateTime': datetime.datetime(2021, 6, 7, 5, 34, 28, 917000, tzinfo=tzlocal()),
  'AutomatedSnapshotRetentionPeriod': 1,
  'ClusterSecurityGroups': [],
  'VpcSecurityGroups': [{'VpcSecurityGroupId': 'sg-347d1676',
    'Status': 'active'}],
  'ClusterParameterGroups': [{'ParameterGroupName': 'default.redshift-1.0',
    'ParameterApplyStatus': 'in-sync'}],
  'ClusterSubnetGroupName': 'default',
  'VpcId': 'vpc-5545f42f',
  'AvailabilityZone': 'us-east-1b',
  'PreferredMaintenanceWindow': 'thu:04:30-thu:05:00',
  'PendingModifiedValues': {},
  'ClusterVersion': '1.0',
  'AllowVersionUpgrade': True,
  'NumberOfNodes': 4,
  'PubliclyAccessible': True,
  'Encrypted': False,
  'Tags': [],
  'EnhancedVpc

#### Step #6 - subitem: Describe the cluster to see its status

In [58]:
import time
wait=30
status = 'deleting'
while status == 'deleting':
    try:
        myClusterProps = redshift.describe_clusters(ClusterIdentifier=DWH_CLUSTER_IDENTIFIER)['Clusters'][0]
        redstatus = prettyRedshiftProps(myClusterProps)
        if redstatus.loc[2,'Value'] == status:
            time.sleep(wait)
        else:
            status = redstatus.loc[2,'Value']
    except:
        status = 'deleted'
    finally:
        print(f'Cluster is {status}')
        
prettyRedshiftProps(myClusterProps)

Cluster is deleting
Cluster is deleting
Cluster is deleting
Cluster is deleted


Unnamed: 0,Key,Value
0,ClusterIdentifier,sparkifycluster
1,NodeType,dc2.large
2,ClusterStatus,deleting
3,MasterUsername,sparkifyadmin
4,DBName,sparkifydwh
5,Endpoint,{'Port': 5439}
6,VpcId,vpc-5545f42f
7,NumberOfNodes,4


# Last step: Clean sensitive/private configurations

In [59]:
config.set('DWH', 'DWH_ENDPOINT', '')
config.set('IAM_ROLE', 'ARN', '')
config.set('CLUSTER', 'HOST', '')

In [29]:
with open('dwh.cfg', 'w') as configfile:
    config.write(configfile)

In [30]:
config = configparser.ConfigParser()
config.read('dwh.cfg')

pd.DataFrame(show_config(config))

Unnamed: 0,Param,Value
0,DWH_CLUSTER_TYPE,multi-node
1,DWH_NUM_NODES,4
2,DWH_NODE_TYPE,dc2.large
3,DWH_CLUSTER_IDENTIFIER,sparkifyCluster
4,DWH_DB,sparkifydwh
5,DWH_DB_USER,sparkifyadmin
6,DWH_DB_PASSWORD,2w#EOA8$7o*&
7,DWH_PORT,5439
8,DWH_IAM_ROLE_NAME,sparkifydwhrole
