# Amazon Redshift - Unload Parquet Data To S3

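This notebook connects to an Amazon Redshift cluster through SQLAlchemy and uses the `UNLOAD` command to export the `amazon_reviews_tsv` table to the SageMaker default S3 bucket as Parquet files, partitioned by `product_category`.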

In [None]:
import boto3
import sagemaker

# Resolve the AWS region from a boto3 session.
# NOTE: the name `session` is rebound to a SQLAlchemy session factory later in this notebook.
session = boto3.session.Session()
region_name = session.region_name

# SageMaker session and the default S3 bucket it provides
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix under which the unloaded Parquet files are written
parquet_prefix_unload = 'amazon-reviews-pds/parquet-from-redshift'

# Full S3 URI of the unload destination (bucket + prefix)
s3_destination_path_parquet_unload = f's3://{bucket}/{parquet_prefix_unload}'


### Define Redshift Parameters

In [None]:
# --- Redshift connection settings: replace every <...> placeholder before running ---

# Database name and master user credentials
DB_NAME = "dsoaws"
MASTER_USER_NAME = "dsoaws"
MASTER_USER_PW = "<password>"

# TODO: Create a new IAM role that has at least S3 access to your data bucket;
# this role is passed to Redshift in the UNLOAD statement.
IAM_ROLE = "<IAM_ROLE>"

# TODO: Get the endpoint name programmatically
redshift_endpoint = "<endpoint-name>"
redshift_port = "5439"

# Schema used for the search path, and the table the UNLOAD selects from
SCHEMA = "public"
table_name_tsv = "amazon_reviews_tsv"

## Set Up Redshift Connection Via SQLAlchemy
https://pypi.org/project/SQLAlchemy/

In [None]:
!pip install -q SQLAlchemy==1.3.13

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd


In [None]:
# Connect to Redshift database engine.
# The password is percent-encoded before it is placed in the URL so that characters
# such as '@', ':', '/' or '%' are not mistaken for URL delimiters; SQLAlchemy
# decodes it again when it parses the URL.
from urllib.parse import quote

engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        MASTER_USER_NAME,
        quote(MASTER_USER_PW, safe=''),
        redshift_endpoint,
        redshift_port,
        DB_NAME,
    )
)


In [None]:
# Create a session factory bound to the Redshift engine and open a session from it
session = sessionmaker(bind=engine)
s = session()

# Set the schema search path on that session
SetPath = "SET search_path TO {}".format(SCHEMA)
s.execute(SetPath)

## Unload Parquet Data From Redshift To S3

In [None]:
# UNLOAD template: selects the review columns from the source table and writes them
# to S3 as Parquet, partitioned by product_category.
unload_template = """UNLOAD ('SELECT marketplace, customer_id, review_id, product_id, product_parent, 
                        product_title, product_category, star_rating, helpful_votes, total_votes, 
                        vine, verified_purchase, review_headline, review_body, review_date FROM {table}')
                TO '{destination}/'
                IAM_ROLE '{iam_role}'
                PARQUET PARALLEL ON 
                PARTITION BY (product_category)"""

# Fill in the source table, S3 destination and IAM role
statement = unload_template.format(
    table=table_name_tsv,
    destination=s3_destination_path_parquet_unload,
    iam_role=IAM_ROLE,
)

# Show the final SQL before it is executed
print(statement)

### Note: This query takes approximately 20 minutes to run and might not show up as finished in the notebook. If in doubt, check the query status directly in the Redshift section of the AWS console.

In [None]:
# Run the UNLOAD on the session `s` that was opened when the search path was set,
# so the statement executes on the same connection. Opening a second session here
# would abandon the first one (and its connection) and run the UNLOAD without the
# configured search path.
try:
    s.execute(statement)
    s.commit()
except Exception:
    # Roll back so the session stays usable if this cell is re-run after a failure
    s.rollback()
    raise

### List S3 directory

In [None]:
!aws s3 ls $s3_destination_path_parquet_unload/