# Amazon Redshift - Unload Parquet Data To S3

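This notebook exports the Amazon Customer Reviews table from Amazon Redshift to S3 as Parquet files, partitioned by `product_category`, by running a Redshift `UNLOAD` statement over a SQLAlchemy connection.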
<img src="img/c3-11.png" width="90%" align="left">

In [None]:
import os
from urllib.parse import quote

import boto3
import sagemaker

# Create a boto3 client for the Amazon Redshift service
redshift = boto3.client("redshift")

# Open a SageMaker session and look up its default S3 bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# S3 key prefix that will receive the Parquet files unloaded from Redshift
parquet_prefix_unload = "amazon-reviews-pds/parquet-from-redshift"

# Full S3 URI of the unload destination (bucket + prefix)
s3_destination_path_parquet_unload = f"s3://{bucket}/{parquet_prefix_unload}"


### Define Redshift Parameters

In [None]:
# Redshift configuration parameters
redshift_cluster_identifier = 'dsoaws'
database_name = 'dsoaws'

master_user_name = 'dsoaws'
# Supply the real password through the REDSHIFT_MASTER_USER_PW environment
# variable so that it is never saved inside the notebook. The '<password>'
# placeholder is only the fallback used when that variable is not set.
master_user_pw = os.environ.get('REDSHIFT_MASTER_USER_PW', '<password>')

# Kept as a string: it is only ever interpolated into the connection URL
redshift_port = '5439'

# Schema and table that the UNLOAD query reads from
schema = 'redshift'
table_name_tsv = 'amazon_reviews_tsv'

In [None]:
# Describe the cluster, then read its endpoint address and the ARN of the
# first IAM role attached to it
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
redshift_endpoint_address = response["Clusters"][0]["Endpoint"]["Address"]
iam_role = response["Clusters"][0]["IamRoles"][0]["IamRoleArn"]

# Show both values, one per line
print(redshift_endpoint_address, iam_role, sep="\n")

## Setup Redshift Connection Via SQLAlchemy
https://pypi.org/project/SQLAlchemy/

In [None]:
!pip install -q SQLAlchemy==1.3.13

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd


In [None]:
# Connect to Redshift database engine.
# The user name and password are percent-encoded (safe='' also encodes '/')
# so that reserved URL characters in them, such as '@', ':' or '/', cannot
# corrupt the connection URL; SQLAlchemy decodes them again when parsing it.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        quote(master_user_name, safe=''),
        quote(master_user_pw, safe=''),
        redshift_endpoint_address,
        redshift_port,
        database_name,
    )
)


In [None]:
# Build a session factory bound to the Redshift engine and open a session.
# The search_path is left untouched here; the UNLOAD query in this notebook
# schema-qualifies its table name instead.
session = sessionmaker(bind=engine)
s = session()

## Unload Parquet Data From Redshift To S3

In [None]:
# Compose the UNLOAD statement: select the review columns from the source
# table and write them to the S3 destination as Parquet, in parallel,
# partitioned by product_category, using the cluster's IAM role.
statement = f"""UNLOAD ('SELECT marketplace, customer_id, review_id, product_id, product_parent, 
                        product_title, product_category, star_rating, helpful_votes, total_votes, 
                        vine, verified_purchase, review_headline, review_body, review_date FROM {schema}.{table_name_tsv}')
                TO '{s3_destination_path_parquet_unload}/'
                IAM_ROLE '{iam_role}'
                PARQUET PARALLEL ON 
                PARTITION BY (product_category)"""

# Echo the final SQL so it can be inspected before it is executed
print(statement)

### Note: This query takes approximately 20 minutes to run and may not show up as finished in the notebook. Check the query status directly in the Amazon Redshift console.

In [None]:
# Run the UNLOAD statement in a fresh session.
# On failure the transaction is rolled back and the error re-raised; in all
# cases the session is closed so its database connection is released instead
# of being leaked for the lifetime of the kernel.
s = session()
try:
    s.execute(statement)
    s.commit()
except Exception:
    s.rollback()
    raise
finally:
    s.close()

### List S3 directory

In [None]:
!aws s3 ls $s3_destination_path_parquet_unload/