# Amazon Redshift - Load TSV Data Into Redshift

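This notebook creates a schema and table in an Amazon Redshift cluster, loads the gzipped Amazon reviews TSV files from S3 into that table with the Redshift `COPY` command, and then runs a sample query against the loaded data.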

<img src="img/redshift_load_tsv.png" width="55%" align="left">

In [None]:
import boto3
import sagemaker

# Redshift API client (used further down to look up the cluster)
redshift = boto3.client('redshift')

# SageMaker session and the default S3 bucket it provides
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix of the TSV files and the full S3 path to them in the default bucket
tsv_prefix = 'amazon-reviews-pds/tsv'
s3_destination_path_tsv = f's3://{bucket}/{tsv_prefix}'


## Setup Redshift Connection Via SQLAlchemy
https://pypi.org/project/SQLAlchemy/

In [None]:
!pip install -q SQLAlchemy==1.3.13

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd


In [None]:
# Redshift configuration parameters

# Cluster and database to connect to
redshift_cluster_identifier = 'dsoaws'
database_name = 'dsoaws'
redshift_port = '5439'

# Database credentials
master_user_name = 'dsoaws'
master_user_pw = 'Password9'

# Target schema and table for the TSV data
schema = 'redshift'
table_name_tsv = 'amazon_reviews_tsv'

# Placeholders: start out empty until the cluster has been looked up
redshift_endpoint_address = iam_role = ''


In [None]:
# Set Redshift endpoint address & IAM Role
#
# Finds the cluster named `redshift_cluster_identifier` among the clusters
# returned by describe_clusters() and records its endpoint address and the ARN
# of its first attached IAM role. Both stay '' when nothing usable is found.
clusters = redshift.describe_clusters()

endpoint_address = ''
iam_role = ''

for clr in clusters['Clusters']:
    if clr['ClusterIdentifier'] == redshift_cluster_identifier:
        # Use .get so a matching cluster entry without an endpoint or without
        # any attached role falls through to the "not found" message below
        # instead of raising KeyError/IndexError.
        endpoint_address = clr.get('Endpoint', {}).get('Address', '')
        attached_roles = clr.get('IamRoles', [])
        if attached_roles:
            iam_role = attached_roles[0]['IamRoleArn']
        break

# Report once, after the search, rather than once per non-matching cluster.
if not endpoint_address or not iam_role:
    print("Cluster and/or IAM Role not found.")

# The engine connection string is built from `redshift_endpoint_address`,
# so the discovered address must be stored under that name as well.
redshift_endpoint_address = endpoint_address

print(endpoint_address)
print(iam_role)

In [None]:
# Create the SQLAlchemy engine for the Redshift database, using a
# postgresql:// URL assembled from the user, password, host, port and database
engine = create_engine(
    f'postgresql://{master_user_name}:{master_user_pw}'
    f'@{redshift_endpoint_address}:{redshift_port}/{database_name}'
)


In [None]:
# Build a session factory bound to the engine and open a first session
session = sessionmaker(bind=engine)
s = session()
# set_path = "SET search_path TO %s" % schema
# s.execute(set_path)

### Create TSV Table In Redshift

In [None]:
# Create the target schema if it does not exist yet, in a fresh session
s = session()
statement = f'CREATE SCHEMA IF NOT EXISTS {schema}'
s.execute(statement)
s.commit()

In [None]:
# Drop any existing copy of the TSV table, in a fresh session.
# There is no commit here: this session `s` is the one the later
# CREATE TABLE cell executes on and commits.
s = session()
statement = f'DROP TABLE IF EXISTS {schema}.{table_name_tsv}'
s.execute(statement)


In [None]:
# CREATE TABLE statement for the TSV reviews table: every column is a
# varchar except the three integer columns star_rating, helpful_votes
# and total_votes.
statement = (
    "CREATE TABLE {}.{}( \n"
    "         marketplace varchar(2),\n"
    "         customer_id varchar(8),\n"
    "         review_id varchar(14),\n"
    "         product_id varchar(10),\n"
    "         product_parent varchar(9),\n"
    "         product_title varchar(400),\n"
    "         product_category varchar(24),\n"
    "         star_rating int,\n"
    "         helpful_votes int,\n"
    "         total_votes int,\n"
    "         vine varchar(1),\n"
    "         verified_purchase varchar(1),\n"
    "         review_headline varchar(128),\n"
    "         review_body varchar(65535),\n"
    "         review_date varchar(10)\n"
    ")"
).format(schema, table_name_tsv)


In [None]:
# Run the CREATE TABLE statement on the current session `s` and commit the transaction
s.execute(statement)
s.commit()

## Load TSV Data From S3 Into Redshift

Note: Executing the `COPY` statement below takes approximately 30 minutes.

In [None]:
# COPY statement that loads the gzipped, tab-delimited files under the S3
# path into the table, skipping one header line per file and truncating
# values that exceed their column width. The IAM role ARN is passed to
# Redshift in the IAM_ROLE clause.
statement = (
    "COPY {}.{} (marketplace, customer_id, review_id, product_id, \n"
    "                product_parent, product_title, product_category, star_rating, helpful_votes, total_votes, \n"
    "                vine, verified_purchase, review_headline, review_body, review_date) \n"
    "            FROM '{}/'\n"
    "            IAM_ROLE '{}'\n"
    "            IGNOREHEADER 1 DELIMITER '\\t' \n"
    "            GZIP TRUNCATECOLUMNS;"
).format(schema, table_name_tsv, s3_destination_path_tsv, iam_role)

# Show the final statement before it is executed
print(statement)

In [None]:
# Open a new session, run the COPY statement built above, and commit the load
s = session()
s.execute(statement)
s.commit()

### Run a sample query

In [None]:
# Count reviews per product category, most-reviewed category first, and load
# the result into a DataFrame.
# The table was created inside `schema`, and the SET search_path lines in this
# notebook are commented out, so the table name has to be schema-qualified
# here, just as in the CREATE TABLE and COPY statements.
df = pd.read_sql_query("""SELECT product_category,
                            COUNT(star_rating) AS count_star_rating
                        FROM {}.{}
                        GROUP BY product_category
                        ORDER BY count_star_rating DESC""".format(schema, table_name_tsv), engine)

In [None]:
# Display the first five rows of the query result
df.head(5)