# Amazon Redshift - Load TSV Data Into Redshift

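This notebook creates an Amazon Redshift cluster and loads the gzip-compressed Amazon reviews TSV files from S3 into a Redshift table using the `COPY` command.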

In [None]:
import boto3
import sagemaker

# Key prefix of the TSV review files inside the bucket.
tsv_prefix = 'amazon-reviews-pds/tsv'

# Region configured for the default boto3 session.
session = boto3.session.Session()
region_name = session.region_name

# The SageMaker session supplies the default S3 bucket used by this notebook.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Full S3 URI of the TSV data: s3://<bucket>/<prefix>
s3_destination_path_tsv = 's3://{bucket}/{prefix}'.format(bucket=bucket, prefix=tsv_prefix)


## Setup Amazon Redshift

To create an Amazon Redshift cluster, follow these steps:


### Collect Configuration Parameters (VPC ID, Security Group ID etc.)

In [None]:
%%bash

#### Get VPC ID
# The queries below pick the FIRST VPC, subnet and security group returned (index [0]).
# To target a specific VPC instead, a filter like the following can be added to describe-vpcs:
#   --filters "Name=tag:Name,Values=eksctl-${AWS_CLUSTER_NAME}-cluster/VPC"
# Make sure this VPC is the same one this notebook is running within.
# Make sure this VPC has the following 2 properties enabled:
#     DNS resolution = Enabled
#     DNS hostnames = Enabled
# This allows private, internal access to Redshift from this SageMaker notebook using the fully qualified endpoint name.

export vpc_id=$(aws ec2 describe-vpcs  --query "Vpcs[0].VpcId" --output text)
export sub_id=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=${vpc_id}" --query "Subnets[0].SubnetId" --output text)

#### Get Security Group ID
# Unused option fragments kept for reference (NOTE(review): they look like leftovers
# from a different security-group command -- confirm before relying on them):
#   --group-name eks-fsx-security-group
#   --description "FSx for Lustre Security Group"
#   --vpc-id ${vpc_id}
#   --query "Groups[0].GroupId"
export sec_id=$(aws ec2 describe-security-groups --filters "Name=vpc-id,Values=${vpc_id}" --query "SecurityGroups[0].GroupId"  --output text)
# Print the security group ID so it can be copied into SECURITY_GROUP_ID in the next cell.
echo $sec_id

In [None]:
# TODO: This security group might need to have port 5439 open (the port the Redshift cluster is created with)
# Paste the security group ID printed by the cell above in place of the placeholder.
SECURITY_GROUP_ID='xxxxxx'

### Define Redshift Parameters

In [None]:
# --- Cluster identity ---
CLUSTER_IDENTIFIER = "dsoaws"
DB_NAME = "dsoaws"

# --- Cluster sizing ---
CLUSTER_TYPE = "multi-node"
# Kept as a string; it is converted with int() where the cluster is created.
NUMBER_NODES = "2"
# Only some node types support the Redshift Query Editor, see
# https://docs.aws.amazon.com/redshift/latest/mgmt/query-editor.html
NODE_TYPE = "dc2.large"

# --- Master credentials (replace the password placeholder before running) ---
MASTER_USER_NAME = "dsoaws"
MASTER_USER_PW = "<password>"

# TODO: Must create a new IAM Role with at least S3 access to the data bucket
# that is loaded into Redshift, and put its identifier here.
IAM_ROLE = "<IAM_ROLE>"


### Create Redshift Cluster

In [None]:
# Client for the Amazon Redshift management API.
redshift = boto3.client('redshift')

# Request parameters for the new cluster, taken from the configuration cells above.
cluster_params = {
    'DBName': DB_NAME,
    'ClusterIdentifier': CLUSTER_IDENTIFIER,
    'ClusterType': CLUSTER_TYPE,
    'NodeType': NODE_TYPE,
    # NUMBER_NODES is stored as a string; the API call is given an int.
    'NumberOfNodes': int(NUMBER_NODES),
    'MasterUsername': MASTER_USER_NAME,
    'MasterUserPassword': MASTER_USER_PW,
    'IamRoles': [IAM_ROLE],
    'VpcSecurityGroupIds': [SECURITY_GROUP_ID],
    'Port': 5439,
    # The cluster is not exposed publicly.
    'PubliclyAccessible': False,
}

response = redshift.create_cluster(**cluster_params)

print(response)


## Setup Redshift Connection Via SQLAlchemy
https://pypi.org/project/SQLAlchemy/

In [None]:
# Install the pinned SQLAlchemy release; -q keeps pip's output quiet.
!pip install -q SQLAlchemy==1.3.13

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd


In [None]:
# Look up the cluster endpoint programmatically instead of hard-coding it.
# The endpoint is only reported once the cluster is available, so wait for that
# state first (this polls the Redshift API and can take several minutes).
redshift_client = boto3.client('redshift')
redshift_client.get_waiter('cluster_available').wait(ClusterIdentifier=CLUSTER_IDENTIFIER)

cluster_endpoint = redshift_client.describe_clusters(
    ClusterIdentifier=CLUSTER_IDENTIFIER
)['Clusters'][0]['Endpoint']

redshift_endpoint = cluster_endpoint['Address']
# Kept as a string, as before, because it is interpolated into the connection URL.
redshift_port = str(cluster_endpoint['Port'])

# Schema and table that receive the TSV data.
SCHEMA = 'public'
table_name_tsv = 'amazon_reviews_tsv'


In [None]:
from urllib.parse import quote_plus

# Connect to Redshift database engine.
# The user name and password are percent-encoded so that special characters
# in the credentials (for example '@', ':' or '/') cannot corrupt the URL.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote_plus(MASTER_USER_NAME),
    quote_plus(MASTER_USER_PW),
    redshift_endpoint,
    redshift_port,
    DB_NAME,
))


In [None]:
# Session factory bound to the Redshift engine.
# Note: this rebinds the name `session`, which earlier held the boto3 session.
session = sessionmaker(bind=engine)

# Open a session and point its search path at the target schema.
s = session()
SetPath = "SET search_path TO %s" % SCHEMA
s.execute(SetPath)

### Create TSV Table In Redshift

In [None]:
# Open a new session and drop any existing copy of the target table.
# The drop is executed here but not committed in this cell.
s = session()
statement = "DROP TABLE IF EXISTS {}".format(table_name_tsv)
s.execute(statement)

In [None]:
# DDL template for the table that receives the raw TSV rows; the single
# placeholder is filled with the table name. Most columns are varchar, the
# rating/vote counters are int, and review_date is stored as a 10-character string.
create_table_template = """CREATE TABLE {}( 
         marketplace varchar(2),
         customer_id varchar(8),
         review_id varchar(14),
         product_id varchar(10),
         product_parent varchar(9),
         product_title varchar(400),
         product_category varchar(24),
         star_rating int,
         helpful_votes int,
         total_votes int,
         vine varchar(1),
         verified_purchase varchar(1),
         review_headline varchar(128),
         review_body varchar(65535),
         review_date varchar(10)
)"""

statement = create_table_template.format(table_name_tsv)


In [None]:
# Run the statement built in the previous cell on the current session `s`,
# then commit the session's transaction.
s.execute(statement)
s.commit()

## Load TSV Data From S3 Into Redshift

Note: The `COPY` statement below takes approximately 30 minutes to run.

In [None]:
# COPY template: placeholders are (1) target table, (2) S3 source path and
# (3) the IAM role Redshift uses for the load. The input is gzip-compressed,
# tab-delimited, has one header line to skip, and over-long values are truncated.
copy_template = """COPY {} (marketplace, customer_id, review_id, product_id, 
                product_parent, product_title, product_category, star_rating, helpful_votes, total_votes, 
                vine, verified_purchase, review_headline, review_body, review_date) 
            FROM '{}/'
            IAM_ROLE '{}'
            IGNOREHEADER 1 DELIMITER '\\t' 
            GZIP TRUNCATECOLUMNS;"""

statement = copy_template.format(table_name_tsv, s3_destination_path_tsv, IAM_ROLE)
# Show the final SQL before it is executed.
print(statement)

In [None]:
# Run the COPY in its own session and commit on success.
s = session()
try:
    s.execute(statement)
    s.commit()
except Exception:
    # A failed load would otherwise leave the transaction open on this session;
    # roll it back, then surface the original error.
    s.rollback()
    raise

### Run a sample query

In [None]:
# Number of star ratings per product category, largest category first.
category_counts_query = """SELECT product_category,
                            COUNT(star_rating) AS count_star_rating
                        FROM {}
                        GROUP BY product_category
                        ORDER BY count_star_rating DESC""".format(table_name_tsv)

# Execute the query through the SQLAlchemy engine and load the result into a DataFrame.
df = pd.read_sql_query(category_counts_query, engine)

In [None]:
# Display the first five rows of the query result.
df.head(n=5)