# AWS Resources deployment using Infrastructure-as-Code (on a Jupyter Notebook)

### Libraries

In [88]:
!python3 -m pip install --upgrade pip
# !python3 -m pip install boto3
# !python3 -m pip install requests
# !python3 -m pip install tqdm
# !python3 -m pip install pandas
# !python3 -m pip install s3fs
# !python3 -m pip install ipywidgets
# !python3 -m pip install -q -U paramiko
# !python3 -m pip install -q -U scp
# !pip install -q -U ipython-sql
# !pip install -q -U psycopg2-binary
# !python3 -m pip install -q -U Pygments

Defaulting to user installation because normal site-packages is not writeable


In [89]:
import IPython
import boto3
import time
import datetime
import os
import json
import requests
import pandas as pd
import paramiko
import scp
from zipfile import ZipFile
from urllib.request import urlopen
from tqdm.notebook import tqdm
import psycopg2
from botocore.config import Config

#### Some basic settings

In [90]:
#core
my_bucket_name = 'dantohe-my-experimental-iac-02'
# the covid-19 data lake is located in us-east-2
my_region = 'us-east-2'
stem = 'my-experimental'

#ec2
my_InstanceProfileName = f'{stem}-InstanceProfileName-iac-01'
ec2_pem_name      = f'{stem}-kp-june-2021-01'
my_role_name = f'{stem}-ec2-role-01'
my_security_group_name = f'{stem}-Airflow-security-group-01'
# https://aws.amazon.com/ec2/spot/pricing/
# t3.2xlarge
instance_size = 't3.xlarge'
instance_max_price = '0.12'

#redshift
my_redshift_role_name = f'{stem}-Redshift-role-01'
redshift_port = 5439
redshift_user = 'redshift'
# Only printable ASCII characters except for '/', '@', '"', ' ', '\', ''' may be used.
redshift_MasterUserPassword = 'kljhdfsKLJDD12345'
redshift_db=f'{stem}-capstone-db'
redshift_ClusterIdentifier=f'{stem}-redshift-cluster'
#https://aws.amazon.com/redshift/pricing/
redshift_NodeType='dc2.large'
# https://docs.aws.amazon.com/redshift/latest/mgmt/working-with-clusters.html
redshift_NumberOfNodes=1
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster
redshift_ClusterType='single-node'

redshift_destination_external_schema_name = 'spectrum_schema'
redshift_destination_glue_database_name = 'covid-19'
redshift_destination_table_name = 'alleninstitute_metadata'


#### Configuring the clients

In [91]:
my_config = Config(
    region_name = my_region,
    signature_version = 'v4',
    retries = {
        'max_attempts': 10,
        'mode': 'standard'
    }
)

# client = boto3.client('kinesis', config=my_config)
ec2 = boto3.client('ec2', config=my_config)
iam = boto3.client('iam', config=my_config)
redshift = boto3.client('redshift', config=my_config)

#### Create an EC2 key-pair    
If the key already exists then don't do anything.    

In [92]:
key_exists = False
response = ec2.describe_key_pairs()['KeyPairs']
for key in response:
    if key['KeyName'] == ec2_pem_name:
        key_exists = True
    found_instance = ec2.describe_instances(
        Filters=[
            {
                'Name': 'key-name',
                'Values': [key['KeyName']]
            }
        ]
    )['Reservations']

if key_exists:
    print('key already exists')
else:
    ec2_pem_path = f'./{ec2_pem_name}.pem'
    if os.path.isfile(ec2_pem_path):
        os.remove(ec2_pem_path)
    ec2_keypair = ec2.create_key_pair(KeyName=ec2_pem_name)
    with open(ec2_pem_path, 'w+') as ec2_pem_file:
        ec2_pem_file.write(str(ec2_keypair['KeyMaterial']))
    !chmod 400 {ec2_pem_path}
    print(f'{ec2_pem_name} has been created sucessfully and the pem is available at\n{ec2_pem_path}')
    

my-experimental-kp-june-2021-01 has been created sucessfully and the pem is available at
./my-experimental-kp-june-2021-01.pem


## EC2 Resources    
IAM - refrences: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/iam.html#role   


### Create a role and attach the s3 access policies

#### A utility that checks if a given role already exists    
If already in place returns the role object, otherwise returns a None.    

In [93]:
def does_role_already_exist(role_name):
    roles = iam.list_roles()
    role_list = roles['Roles']
    requested_role= None

    for role in role_list:
        if role['RoleName'] == role_name:
            requested_role = role
            return requested_role
    return requested_role

### Create role for ec2 and attach s3 access policies

In [94]:
roles = iam.list_roles()
role_list = roles['Roles']
ec2_role= None

for key in role_list:
    if key['RoleName'] == my_role_name:
        ec2_role = key

if ec2_role is not None:
    print(f'Role {my_role_name} already exists')
else:
    ec2_role = iam.create_role(
        Path='/',
        RoleName=my_role_name,
        Description='',
        MaxSessionDuration=3600,
        AssumeRolePolicyDocument="""{
      "Version": "2012-10-17",
      "Statement": [
        {
          "Effect": "Allow",
          "Principal": { "Service": "ec2.amazonaws.com"},
          "Action": "sts:AssumeRole"
        }
      ]
    }""".replace('<dw_bucket>', my_bucket_name))['Role']
    ### Also atach the S3 policy to the role
    for ec2_policy in  [
        'arn:aws:iam::aws:policy/AmazonS3FullAccess']:
        assert iam.attach_role_policy(
            RoleName=ec2_role['RoleName'],
            PolicyArn=ec2_policy)['ResponseMetadata']['HTTPStatusCode'] == 200
    
    print(f'{my_role_name} has been createed - the S3 policies have been attached')
#     ec2_role['Arn']

my-experimental-ec2-role-01 has been createed - the S3 policies have been attached


### Creates the instance profile AND adds the role to instance profile

In [95]:
instance_profiles = iam.list_instance_profiles()
instance_profiles_list = instance_profiles['InstanceProfiles']
ec2_instance_profile = None

# existing_instance_profile_names =[]

for key in instance_profiles_list:
    if key['InstanceProfileName'] == my_InstanceProfileName:
        ec2_instance_profile =key
#     existing_instance_profile_names.append(key['InstanceProfileName'])

# if my_InstanceProfileName in existing_instance_profile_names:
#     print(f'{my_InstanceProfileName} already exists')
if ec2_instance_profile is not None:
    print(f'{my_InstanceProfileName} already exists')
else:
    #creates the instaance profile
    ec2_instance_profile = iam.create_instance_profile(InstanceProfileName=my_InstanceProfileName)['InstanceProfile']
    iam.get_waiter('instance_profile_exists').wait(InstanceProfileName=my_InstanceProfileName)

    #adds the role to the instance profile
    assert iam.add_role_to_instance_profile(InstanceProfileName=ec2_instance_profile['InstanceProfileName'], RoleName=ec2_role['RoleName'])['ResponseMetadata']['HTTPStatusCode'] == 200
    print(f'{my_InstanceProfileName} has been created')

my-experimental-InstanceProfileName-iac-01 has been created


### Createting a security group

In [96]:
security_groups = ec2.describe_security_groups()
existing_security_groups = security_groups['SecurityGroups']

ec2_sg = None

for key in existing_security_groups:
    if key['GroupName'] == my_security_group_name:
      ec2_sg=key  
    
if ec2_sg is not None:
    print(f'The security group {my_security_group_name} already exists')
else:
    ec2_sg = ec2.create_security_group(
        Description='Allows 22 trafic',
        GroupName=my_security_group_name)
    ec2.authorize_security_group_ingress(CidrIp='0.0.0.0/0', FromPort=22, ToPort=22, GroupId=ec2_sg['GroupId'], IpProtocol='TCP')
    ec2.authorize_security_group_ingress(CidrIp='0.0.0.0/0', FromPort=8080, ToPort=8080, GroupId=ec2_sg['GroupId'], IpProtocol='TCP')
    ec2.authorize_security_group_ingress(CidrIp='0.0.0.0/0', FromPort=5555, ToPort=5555, GroupId=ec2_sg['GroupId'], IpProtocol='TCP')
    ec2.authorize_security_group_ingress(CidrIp='0.0.0.0/0', FromPort=3306, ToPort=3306, GroupId=ec2_sg['GroupId'], IpProtocol='TCP')
    print(f'The security group {my_security_group_name} has been created')
    print(f"SG ID: {ec2_sg['GroupId']}")

The security group my-experimental-Airflow-security-group-01 has been created
SG ID: sg-0978612379bde2037


### Requesting spot instance(s)

In [97]:
# !aws configure set region 'us-east-2'
my_session = boto3.session.Session()
my_region = my_session.region_name
ags_west = boto3.client('autoscaling', region_name=my_region)
print(f"We are in Region: {my_region}")

We are in Region: us-east-2


In [98]:
# time.sleep(30) #wait instance profile...
#Amazon Linux AMI - it has some issues and complications with installing mysql and airflow
# ec2_ami_id = 'ami-0aeeebd8d2ab47354'
#defaulting to ubuntu
# us-east-1 'ami-09e67e426f25ce0d7'
# us-west-2 'ami-03d5c68bab01f3496'
# us-east-2 'ami-00399ec92321828f5'
ec2_ami_id = 'ami-00399ec92321828f5'
ec2_spot = ec2.request_spot_instances(
    AvailabilityZoneGroup=my_region,
    InstanceCount=1,
    LaunchSpecification={
        'SecurityGroupIds': [ec2_sg['GroupId']],
        'EbsOptimized': False,
        'KeyName': ec2_pem_name,
        'ImageId': ec2_ami_id,
        'InstanceType': instance_size,
        'IamInstanceProfile': {
            'Arn': ec2_instance_profile['Arn']
        },
        "BlockDeviceMappings": [
            {
                "DeviceName": "/dev/sda1",
                "Ebs": {
                        "DeleteOnTermination": True,
                        "VolumeSize": 30,
                        "Encrypted": False,
                        "VolumeType": "gp2"
                }
            }
        ],
    },
    SpotPrice=instance_max_price,
    Type='one-time',
    InstanceInterruptionBehavior='terminate'
)
ec2_spot_id = ec2_spot['SpotInstanceRequests'][0]['SpotInstanceRequestId']
ec2.get_waiter('spot_instance_request_fulfilled').wait(SpotInstanceRequestIds=[ec2_spot_id])
print(f'Spot instance request: {ec2_spot_id}')

Spot instance request: sir-hsn6a88p


### Gets the instance ID

In [99]:
ec2_vm_id = ec2.describe_spot_instance_requests(SpotInstanceRequestIds=[ ec2_spot_id ]) \
    ['SpotInstanceRequests'] \
    [0] \
    ['InstanceId']
ec2.get_waiter('instance_status_ok').wait(InstanceIds=[ ec2_vm_id ])
print(f'InstanceIds: {ec2_vm_id}')

InstanceIds: i-00ba1532615d19b80


### Allocating a public IP address

In [100]:
ec2_ip = ec2.allocate_address(Domain='vpc')
print(f"PublicIp: {ec2_ip['PublicIp']}\nAllocationId: {ec2_ip['AllocationId']}")

PublicIp: 3.140.156.246
AllocationId: eipalloc-055ed1ae0ef5d2202


### Associates the IP address with the instance

In [101]:
ec2_vm_ip = ec2.associate_address(
     InstanceId = ec2_vm_id,
     AllocationId = ec2_ip["AllocationId"])
print(f"IP AssociationId: {ec2_vm_ip['AssociationId']}")

IP AssociationId: eipassoc-02260c8692a1870ae


## SSH

### SSH utilities

In [102]:
def get_ssh(ip, pem_path):
    print(f"ssh -i {pem_path} ubuntu@{ip}")
    ssh = paramiko.SSHClient()
    ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    ssh.connect(hostname=ip, username='ubuntu', pkey=paramiko.RSAKey.from_private_key_file(pem_path))
    return ssh

def run_via_ssh(
        ip,
        pem_path,
        commands,
        display_output=False):
    
    ssh = get_ssh(ip, pem_path)
    try:
        for command in tqdm(commands):
            stdin, stdout, stderr = ssh.exec_command(command)
            exit_status = stdout.channel.recv_exit_status()
            if exit_status == 0:
                print(('command executed successfuly:::', command))
                if display_output:
                    output_buffer = stdout.read().decode('utf-8')
                    if output_buffer:
                        print(f">>> {output_buffer}")
            else:
                error_buffer = stderr.read().decode('utf-8')
                print(('!!!failed', command))
                print(f"!!! {error_buffer}")
    finally:
        ssh.close()

# 
def print_python_code(code):
    from pygments import highlight
    from pygments.lexers import PythonLexer
    from pygments.formatters import HtmlFormatter
    import IPython
    formatter = HtmlFormatter()
    return IPython.display.HTML('<style type="text/css">{}</style>{}'.format(
        formatter.get_style_defs('.highlight'),
        highlight(code, PythonLexer(), formatter)))
   
    
def upload_dag_file(ip, pem_path, file_name, family_dir='airflow', display_file=True):
    code = None
    file_path = f'{file_name}'
    file_dir = os.path.dirname(file_path)
    if display_file:
        with open(file_path) as f:
            code = f.read()
        if not code:
            return None
    
    ssh = get_ssh(ip, pem_path)
    try:
        remote_file_dir = f'~/{family_dir}/{file_dir}'
        print(f"scp -i my-experimental-kp-june-2021-01.pem '{file_path}' 'ubuntu@{ip}:{remote_file_dir}'")
        scp_client = scp.SCPClient(ssh.get_transport())
        scp_client.put(files=[file_path], remote_path=remote_file_dir, preserve_times=True)
    finally:
        ssh.close()
    
    if display_file:
        return print_python_code(code)
    else:
        return file_path

### Instaling Airflow and libraries
This can be achieved using a requirements file but for better visibility it will be kept in this format for now.

In [103]:
run_via_ssh(
    ip=ec2_ip['PublicIp'],
    pem_path=ec2_pem_path,
    commands=[
        'sudo apt-get -y update',
#         'sudo apt-get install -y libmysqlclient-dev mysql-server',
#         f"sudo mysql -e \"SET GLOBAL explicit_defaults_for_timestamp = 1;\"",
#         f"sudo mysql -e \"DROP DATABASE IF EXISTS airflow;\"",       
#         f"sudo mysql -e \"CREATE DATABASE airflow CHARACTER SET UTF8mb3 COLLATE utf8_general_ci;\"",
#         f"sudo mysql -e \"CREATE USER 'airflow'@'localhost' IDENTIFIED BY 'airflow';\"",
#         f"sudo mysql -e \"GRANT ALL PRIVILEGES ON airflow.* TO 'airflow'@'localhost';\"",
#         f"sudo apt install -y redis-server",
        'sudo apt-get install -y python3 python3-pip python3-setuptools',
        'sudo pip3 install -U pip',
        'sudo pip3 install -U apache-airflow',
#         'sudo pip3 install -U apache-airflow[mysql]',
#         'sudo pip3 install -U apache-airflow[celery]',
        'sudo pip3 install -U boto3==1.15.0',
        'sudo pip3 install -U jsonpath_ng==1.5.3',
        'sudo pip3 install -U pandas==1.4.0',
        'sudo pip3 install -U redshift_connector==2.0.888',
        'sudo pip3 install -U sqlalchemy_redshift==0.8.6',
        'sudo pip3 install -U watchtower==2.0.1',
        'sudo pip3 install -U apache-airflow-providers-amazon',
        'sudo pip3 install -U tensorflow',
        'sudo pip3 install -U pandas',
        'sudo pip3 install -U scikit-learn',
        'sudo pip3 install -U numpy',
        'sudo pip3 install -U psycopg2-binary',
        'sudo pip3 install -U requests',
        'sudo pip3 install -U boto3',
        'sudo pip3 install -U matplotlib',
        'sudo pip3 install -U reportlab',
        'sudo pip3 install -U flower',
        'sudo pip3 install -U proj',
        'sudo pip3 install -U apache-airflow[postgres]',
        'sudo pip3 install -U apache-airflow[amazon]',
        'sudo pip3 install -U tqdm',
        'sudo pip3 install -U langdetect',
        'sudo pip3 install -U https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz',
        'sudo pip3 install -U https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz',
        'sudo pip3 install -U https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz',
        'sudo pip3 install -U redis'
    ])

ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246


  0%|          | 0/30 [00:00<?, ?it/s]

('command executed successfuly:::', 'sudo apt-get -y update')
('command executed successfuly:::', 'sudo apt-get install -y python3 python3-pip python3-setuptools')
('command executed successfuly:::', 'sudo pip3 install -U pip')
('command executed successfuly:::', 'sudo pip3 install -U apache-airflow')
('command executed successfuly:::', 'sudo pip3 install -U boto3==1.15.0')
('command executed successfuly:::', 'sudo pip3 install -U jsonpath_ng==1.5.3')
('command executed successfuly:::', 'sudo pip3 install -U pandas==1.4.0')
('command executed successfuly:::', 'sudo pip3 install -U redshift_connector==2.0.888')
('command executed successfuly:::', 'sudo pip3 install -U sqlalchemy_redshift==0.8.6')
('command executed successfuly:::', 'sudo pip3 install -U watchtower==2.0.1')
('command executed successfuly:::', 'sudo pip3 install -U apache-airflow-providers-amazon')
('command executed successfuly:::', 'sudo pip3 install -U tensorflow')
('command executed successfuly:::', 'sudo pip3 install

### Configure Airflow

In [104]:
run_via_ssh(ip=ec2_ip['PublicIp'],
    pem_path=ec2_pem_path,
    commands=[
        'airflow db init',
        'sudo apt-get install -y crudini',
        "crudini --set ~/airflow/airflow.cfg core load_examples False",
        "crudini --set ~/airflow/airflow.cfg core load_default_connections True",
#         "crudini --set ~/airflow/airflow.cfg core sql_alchemy_conn 'mysql://airflow:airflow@localhost/airflow'",
#         "crudini --set ~/airflow/airflow.cfg core executor CeleryExecutor",
#         "crudini --set ~/airflow/airflow.cfg core sql_alchemy_schema airflow",
        "crudini --set ~/airflow/airflow.cfg scheduler min_file_process_interval 10",
        "crudini --set ~/airflow/airflow.cfg scheduler dag_dir_list_interval 3",
#         "crudini --set ~/airflow/airflow.cfg celery result_backend 'redis://127.0.0.1:6379/0'",
#         "crudini --set ~/airflow/airflow.cfg celery broker_url 'db+mysql://airflow:airflow@localhost/airflow'",
        'airflow db init',
    ])

ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246


  0%|          | 0/7 [00:00<?, ?it/s]

('command executed successfuly:::', 'airflow db init')
('command executed successfuly:::', 'sudo apt-get install -y crudini')
('command executed successfuly:::', 'crudini --set ~/airflow/airflow.cfg core load_examples False')
('command executed successfuly:::', 'crudini --set ~/airflow/airflow.cfg core load_default_connections True')
('command executed successfuly:::', 'crudini --set ~/airflow/airflow.cfg scheduler min_file_process_interval 10')
('command executed successfuly:::', 'crudini --set ~/airflow/airflow.cfg scheduler dag_dir_list_interval 3')
('command executed successfuly:::', 'airflow db init')


### Create Airflow dag directory

In [105]:
run_via_ssh(
    ip=ec2_ip['PublicIp'],
    pem_path=ec2_pem_path,
    commands=[
        'mkdir -p ~/airflow/dags',
        f"mkdir -p ~/pipe/ml/data",
        f"mkdir -p ~/pipe/ml/model",
        f"mkdir -p ~/pipe/ml/working",
        f"mkdir -p ~/pipe/ml/images"
    ])

ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246


  0%|          | 0/5 [00:00<?, ?it/s]

('command executed successfuly:::', 'mkdir -p ~/airflow/dags')
('command executed successfuly:::', 'mkdir -p ~/pipe/ml/data')
('command executed successfuly:::', 'mkdir -p ~/pipe/ml/model')
('command executed successfuly:::', 'mkdir -p ~/pipe/ml/working')
('command executed successfuly:::', 'mkdir -p ~/pipe/ml/images')


### Start Airflow

In [106]:
run_via_ssh(
    ip=ec2_ip['PublicIp'],
    pem_path=ec2_pem_path,
    commands=[
        'airflow users  create --role Admin --username admin --email admin --firstname admin --lastname admin --password admin',
        'airflow scheduler -D',
#         'airflow celery worker -D',
#         'airflow celery flower -D',
        'airflow webserver -p 8080 -D'
    ])

ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246


  0%|          | 0/3 [00:00<?, ?it/s]

('command executed successfuly:::', 'airflow users  create --role Admin --username admin --email admin --firstname admin --lastname admin --password admin')
('command executed successfuly:::', 'airflow scheduler -D')
('command executed successfuly:::', 'airflow webserver -p 8080 -D')


### Accesing the environment 

In [107]:
print(f"chmod 600 {ec2_pem_name}.pem")
print(f"SSH      : ssh -i {ec2_pem_name} ubuntu@{ec2_ip['PublicIp']}")
print(f"WebServer: http://{ec2_ip['PublicIp']}:8080")
print(f"Login admin admin")
# print(f"Flower   : http://{ec2_ip['PublicIp']}:5555")

SSH      : ssh -i my-experimental-kp-june-2021-01 ubuntu@3.140.156.246
WebServer: http://3.140.156.246:8080
Login admin admin


## Refshift Setup

### Creates a role for Redshift and attach the needed policies.   

In [108]:
redshift_role = does_role_already_exist(my_redshift_role_name)

if redshift_role is None:
    redshift_role = iam.create_role(
        Path='/',
        RoleName=my_redshift_role_name,
        Description='role used for for capstone project',
        MaxSessionDuration=3600,
        AssumeRolePolicyDocument="""{
      "Version": "2012-10-17",
      "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "redshift.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
      ]
    }""")['Role']
#     attaching the policies
    for redshift_policy in  [
        'arn:aws:iam::aws:policy/AmazonS3FullAccess',
        'arn:aws:iam::aws:policy/AWSGlueConsoleFullAccess']:
        assert iam.attach_role_policy(
            RoleName=redshift_role['RoleName'],
            PolicyArn=redshift_policy)['ResponseMetadata']['HTTPStatusCode'] == 200
    print(f'Redshift role {my_redshift_role_name} has been created. The policies were also attached. ')
else:
    print(f'Redshift role {my_redshift_role_name} already exists ')
print(f"The Redshift role has been ceated with the ARN: {redshift_role['Arn']}")
redshift_role_arn = redshift_role['Arn']

Redshift role my-experimental-Redshift-role-01 has been created. The policies were also attached. 
The Redshift role has been ceated with the ARN: arn:aws:iam::986106953013:role/my-experimental-Redshift-role-01


### Create a security group for Redshift

In [110]:
redshift_sg = ec2.create_security_group(
    Description='Allows 5432 trafic',
    GroupName='Redshift')
ec2.authorize_security_group_ingress(CidrIp='0.0.0.0/0', FromPort=5439, ToPort=5439, GroupId=redshift_sg['GroupId'], IpProtocol='TCP')
print(f"Refshift security group {redshift_sg['GroupId']} created successfuly")

Refshift security group sg-0c3c96fd64b473074 created successfuly


### Allocate a public IP for Redshift

In [109]:
redshift_ip = ec2.allocate_address(Domain='vpc')
# [ redshift_ip['PublicIp'], redshift_ip['AllocationId'] ]
print(f"Redshift PublicIp: {redshift_ip['PublicIp']} for AllocationId: {redshift_ip['AllocationId']}")

Redshift PublicIp: 52.14.254.238 for AllocationId: eipalloc-0e76bf6be5fd01eea


### Create a Redshift cluster

In [111]:
# redshift_host = redshift_ip['PublicIp']
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster
redshift_cluster = redshift.create_cluster(
    DBName=redshift_db,
    ClusterIdentifier=redshift_ClusterIdentifier,
    NodeType=redshift_NodeType,
    ClusterType=redshift_ClusterType,
#     AvailabilityZone=my_region,
#     change the next one (uncomment) if the redshift_ClusterType is multinode
#     see https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.create_cluster
#     NumberOfNodes=redshift_NumberOfNodes,
    MasterUsername=redshift_user,
    MasterUserPassword=redshift_MasterUserPassword,
    VpcSecurityGroupIds=[ redshift_sg['GroupId'] ],
    IamRoles=[ redshift_role['Arn'] ],
    ElasticIp=redshift_ip['PublicIp'],
    PubliclyAccessible=True,
    Encrypted=False)['Cluster']
redshift.get_waiter('cluster_available').wait(ClusterIdentifier=redshift_cluster['ClusterIdentifier'])
print(f"Redshift cluster {redshift_cluster['ClusterIdentifier']} has been created")

Redshift cluster my-experimental-redshift-cluster has been created


### Connecting to the cluster

In [112]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [113]:
redshift_url = f"postgresql://{redshift_user}:{redshift_MasterUserPassword}@{redshift_ip['PublicIp']}:5439/{redshift_db}"
print(f"user: {redshift_user}\npassword: {redshift_MasterUserPassword}\nserver: {redshift_ip['PublicIp']}:5439/{redshift_db}")    
print(f"Redshift connection string = '{redshift_url}'")
%sql $redshift_url

user: redshift
password: kljhdfsKLJDD12345
server: 52.14.254.238:5439/my-experimental-capstone-db
Redshift connection string = 'postgresql://redshift:kljhdfsKLJDD12345@52.14.254.238:5439/my-experimental-capstone-db'


### Using the AWS covid-19 Data Lake      
Abouy the covid-19 data lake : https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/     
The url to the CloudFormation template that will create the data lake within the account :https://us-east-2.console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/create/review?templateURL=https://covid19-lake.s3.us-east-2.amazonaws.com/cfn/CovidLakeStack.template.json&stackName=CovidLakeStack     
Additional resources: https://aws.amazon.com/blogs/big-data/exploring-the-public-aws-covid-19-data-lake/     


### Airflow additional setup based on Redshift configuration parameters 

#### Airflow variables

In [115]:
dt = datetime.datetime.today()
date_partition  = f"{dt.year}/{dt.month}/{dt.day}"
run_via_ssh(
    ip=ec2_ip['PublicIp'],
    pem_path=ec2_pem_path,
    commands=[
        f"airflow variables set 'model_training_data_path' '/home/ubuntu/pipe/ml/data/training.zip'",
        f"airflow variables set 'raw_data_file' '/home/ubuntu/pipe/ml/data/raw_data_file.csv'",
        f"airflow variables set 'raw_data_file_empty_columns_eliminated' '/home/ubuntu/pipe/ml/data/raw_data_file_empty_columns_eliminated.csv'",
        f"airflow variables set 'eliminated_papers_older_than_01_01_2020' '/home/ubuntu/pipe/ml/data/eliminated_papers_older_than_01_01_2020.csv'",
        f"airflow variables set 'eliminated_non_english_languages' '/home/ubuntu/pipe/ml/data/eliminated_non_english_languages.csv'",
        f"airflow variables set 'spacy_preprocessed' '/home/ubuntu/pipe/ml/data/spacy_preprocessed.csv'",
        f"airflow variables set 'intermediate_preprocessed_s3_key' 'data/preprocessed/{date_partition}/intermediate_preprocessed.csv'",
        f"airflow variables set 'spacy_preprocessed_s3_key' 'data/preprocessed/{date_partition}/spacy_preprocessed.csv'",
        f"airflow variables set 'model_output_path' '/home/ubuntu/pipe/ml/model'",
        f"airflow variables set 'unload_raw_data_to_s3_key' 'data/raw/{date_partition}'",
        f"airflow variables set 'unload_raw_data_to_s3_filename' 'alleninstitute_metadata_000'",
        f"airflow variables set 'model_max_length' '30'",
        f"airflow variables set 'model_vocab_size' '10000'",
        f"airflow variables set 'model_emb_dims' '64'",
        f"airflow variables set 'model_lstm_units' '128'",
        f"airflow variables set 'model_training_batch_size' '50'",
        f"airflow variables set 'model_training_epochs' '5'",
        f"airflow variables set 's3_redshift_iam_role' '{redshift_role['Arn']}'",
        f"airflow variables set 's3_redshift_region' '{my_region}'",
        f"airflow variables set 's3_staging_bucket' '{my_bucket_name}'",
        f"airflow variables set 'redshift_db' '{redshift_db}'",
        f"airflow variables set 'redshift_destination_external_schema_name' '{redshift_destination_external_schema_name}'",
        f"airflow variables set 'redshift_destination_glue_database_name' '{redshift_destination_glue_database_name}'",
        f"airflow variables set 'redshift_destination_table_name' '{redshift_destination_table_name}'",
        f"airflow variables set 'redshift_role_arn' '{redshift_role_arn}'",
        f"airflow variables set 'my_region' '{my_region}'",
        f"airflow variables set 'path_images_dir' '/home/ubuntu/pipe/ml/images'",
        f"airflow variables set 'path_working_dir' '/home/ubuntu/pipe/ml/working'",
        f"airflow variables set 's3_report_bucket' '{my_bucket_name}'",
        f"airflow variables set 's3_report_folder' 'reports'"
    ])

ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246


  0%|          | 0/30 [00:00<?, ?it/s]

('command executed successfuly:::', "airflow variables set 'model_training_data_path' '/home/ubuntu/pipe/ml/data/training.zip'")
('command executed successfuly:::', "airflow variables set 'raw_data_file' '/home/ubuntu/pipe/ml/data/raw_data_file.csv'")
('command executed successfuly:::', "airflow variables set 'raw_data_file_empty_columns_eliminated' '/home/ubuntu/pipe/ml/data/raw_data_file_empty_columns_eliminated.csv'")
('command executed successfuly:::', "airflow variables set 'eliminated_papers_older_than_01_01_2020' '/home/ubuntu/pipe/ml/data/eliminated_papers_older_than_01_01_2020.csv'")
('command executed successfuly:::', "airflow variables set 'eliminated_non_english_languages' '/home/ubuntu/pipe/ml/data/eliminated_non_english_languages.csv'")
('command executed successfuly:::', "airflow variables set 'spacy_preprocessed' '/home/ubuntu/pipe/ml/data/spacy_preprocessed.csv'")
('command executed successfuly:::', "airflow variables set 'intermediate_preprocessed_s3_key' 'data/prepro

#### Airflow Redshift connection

In [116]:
redshift_url

'postgresql://redshift:kljhdfsKLJDD12345@52.14.254.238:5439/my-experimental-capstone-db'

In [127]:
redshift_endpoint = redshift_ip["PublicIp"]

run_via_ssh(
    ip=ec2_ip['PublicIp'],
    pem_path=ec2_pem_path,
    commands=[
        f""" airflow connections add \
        'redshift_db' --conn-uri '{redshift_url}'""",
    ])

ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246


  0%|          | 0/1 [00:00<?, ?it/s]

('!!!failed', " airflow connections add         'redshift_db' --conn-uri 'postgresql://redshift:kljhdfsKLJDD12345@52.14.254.238:5439/my-experimental-capstone-db'")
!!! A connection with `conn_id`=redshift_db already exists.



#### Loading the dags into Airflow

In [129]:
for filename in tqdm(os.listdir('dags/')):
    if filename.endswith(".py"):
        upload_dag_file(
            ip=ec2_ip['PublicIp'],
            pem_path=ec2_pem_path,
            file_name=f'dags/{filename}',
            display_file=False)

  0%|          | 0/10 [00:00<?, ?it/s]

ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246
scp -i my-experimental-kp-june-2021-01.pem 'dags/spacy_utils.py' 'ubuntu@3.140.156.246:~/airflow/dags'
ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246
scp -i my-experimental-kp-june-2021-01.pem 'dags/language_utils.py' 'ubuntu@3.140.156.246:~/airflow/dags'
ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246
scp -i my-experimental-kp-june-2021-01.pem 'dags/transfer_utils.py' 'ubuntu@3.140.156.246:~/airflow/dags'
ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246
scp -i my-experimental-kp-june-2021-01.pem 'dags/cord19_ml_dag.py' 'ubuntu@3.140.156.246:~/airflow/dags'
ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246
scp -i my-experimental-kp-june-2021-01.pem 'dags/cleanup_utils.py' 'ubuntu@3.140.156.246:~/airflow/dags'
ssh -i ./my-experimental-kp-june-2021-01.pem ubuntu@3.140.156.246
scp -i my-experimental-kp-june-2021-01.pem 'dags/.py' 'ubuntu@3.140.156.246:~/a

## Clean Up

### Delete Redshift cluster

In [119]:
redshift.delete_cluster(ClusterIdentifier=redshift_cluster['ClusterIdentifier'], SkipFinalClusterSnapshot=True)
redshift.get_waiter('cluster_deleted').wait(ClusterIdentifier=redshift_cluster['ClusterIdentifier'])
print('Redshift cluster deleted.')

Redshift cluster deleted.


### Release Redshift public IP

In [120]:
ec2.release_address(AllocationId=redshift_ip['AllocationId'])

{'ResponseMetadata': {'RequestId': '4e59467c-38fc-4bf8-8231-3c92832af6cf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4e59467c-38fc-4bf8-8231-3c92832af6cf',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '229',
   'date': 'Tue, 15 Feb 2022 02:45:31 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

### Delete Redshift security group

In [121]:
ec2.delete_security_group(GroupId=redshift_sg['GroupId'])

{'ResponseMetadata': {'RequestId': 'fb858ed6-968c-4557-b8f5-717d68dc2d2c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'fb858ed6-968c-4557-b8f5-717d68dc2d2c',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '239',
   'date': 'Tue, 15 Feb 2022 02:45:41 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

### Delete Redshift role

In [122]:
for attached_policy in iam.list_attached_role_policies(RoleName=redshift_role['RoleName'])['AttachedPolicies']:
        iam.detach_role_policy(RoleName=redshift_role['RoleName'], PolicyArn=attached_policy['PolicyArn'])
for policy_name in iam.list_role_policies(RoleName=redshift_role['RoleName'])['PolicyNames']:
    iam.delete_role_policy(RoleName=redshift_role['RoleName'], PolicyName=policy_name)
iam.delete_role(RoleName=redshift_role['RoleName'])

{'ResponseMetadata': {'RequestId': '04271081-4224-4693-8d9b-fa30def3e73a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '04271081-4224-4693-8d9b-fa30def3e73a',
   'content-type': 'text/xml',
   'content-length': '200',
   'date': 'Tue, 15 Feb 2022 02:45:46 GMT'},
  'RetryAttempts': 0}}

### Cancel the spot instance request

In [130]:
ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=[ ec2_spot_id ])
# ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=[ 'sir-rk8sj4bj' ])

{'CancelledSpotInstanceRequests': [{'SpotInstanceRequestId': 'sir-hsn6a88p',
   'State': 'cancelled'}],
 'ResponseMetadata': {'RequestId': '56f8d530-bfe8-405e-9142-f10d8f692529',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '56f8d530-bfe8-405e-9142-f10d8f692529',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '426',
   'date': 'Tue, 15 Feb 2022 03:25:20 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

### Terminate the spot instance

In [131]:
ec2.terminate_instances(InstanceIds=[ ec2_vm_id ])
# ec2.terminate_instances(InstanceIds=[ 'i-080cf3bee321c8357' ])

{'TerminatingInstances': [{'CurrentState': {'Code': 32,
    'Name': 'shutting-down'},
   'InstanceId': 'i-00ba1532615d19b80',
   'PreviousState': {'Code': 16, 'Name': 'running'}}],
 'ResponseMetadata': {'RequestId': 'f28d9375-1990-46f5-bb29-981848742a27',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'f28d9375-1990-46f5-bb29-981848742a27',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'accept-encoding',
   'content-type': 'text/xml;charset=UTF-8',
   'transfer-encoding': 'chunked',
   'date': 'Tue, 15 Feb 2022 03:25:23 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

### Wait for the instance to terminate and release the IP address 

In [None]:
ec2.get_waiter('instance_terminated').wait(InstanceIds=[ ec2_vm_id ])
ec2.release_address(AllocationId=ec2_ip['AllocationId'])

#if stuck needs to be removed manualy from the console ( VPC -> Elastic IPs)
# ec2.get_waiter('instance_terminated').wait(InstanceIds=['i-080cf3bee321c8357'])
# ec2.release_address(AllocationId='eipassoc-026af42e9e3b0fb02')



### Delete security group

In [None]:
ec2.delete_security_group(GroupId=ec2_sg['GroupId'])

# ec2.delete_security_group(GroupId='sg-0a1592d4d67e19433')

### Delete the key-pair

In [None]:
ec2.delete_key_pair(KeyName=ec2_pem_name)

### Detach and delete the role policies

In [None]:
for attached_policy in iam.list_attached_role_policies(RoleName=ec2_role['RoleName'])['AttachedPolicies']:
    iam.detach_role_policy(RoleName=ec2_role['RoleName'], PolicyArn=attached_policy['PolicyArn'])
for policy_name in iam.list_role_policies(RoleName=ec2_role['RoleName'])['PolicyNames']:
    iam.delete_role_policy(RoleName=ec2_role['RoleName'], PolicyName=policy_name)

### Remove role from instance profile

In [None]:
iam.remove_role_from_instance_profile(InstanceProfileName=ec2_instance_profile['InstanceProfileName'], RoleName=ec2_role['RoleName'])

### Delete instance profile

In [None]:
iam.delete_instance_profile(InstanceProfileName=ec2_instance_profile['InstanceProfileName'])

### Delete the role

In [None]:
iam.delete_role(RoleName=ec2_role['RoleName'])

## ALL DONE

## Refrences    
https://www.cloudwalker.io/2019/09/30/airflow-scale-out-with-redis-and-celery/   
https://aws.amazon.com/blogs/big-data/a-public-data-lake-for-analysis-of-covid-19-data/     
https://us-east-2.console.aws.amazon.com/cloudformation/home?region=us-east-2#/stacks/create/review?templateURL=https://covid19-lake.s3.us-east-2.amazonaws.com/cfn/CovidLakeStack.template.json&stackName=CovidLakeStack
https://aws.amazon.com/blogs/big-data/exploring-the-public-aws-covid-19-data-lake/   
https://medium.com/@hudsonmendes/data-pipeline-for-data-science-part-1-problem-solution-fit-3b092880efa3     


