# Amazon Redshift - Load TSV Data Into Redshift

TODO: Describe scenario

<img src="img/c3-10.png" width="90%" align="left">

In [123]:
import boto3
import sagemaker

# Resolve the AWS region from the default boto3 session
session = boto3.session.Session()
region_name = session.region_name

# AWS clients: Redshift (used below to look up the cluster) and Secrets Manager
# (used below to read the database login)
redshift = boto3.client('redshift')
secretsmanager = boto3.client('secretsmanager')

# SageMaker session and the default S3 bucket it reports
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Key prefix of the TSV review files inside the bucket
tsv_prefix = 'amazon-reviews-pds/tsv'

# Full S3 URI of the TSV data: s3://<bucket>/<prefix>
s3_path_tsv = 's3://%s/%s' % (bucket, tsv_prefix)


### Setup Redshift Connection Via SQLAlchemy
https://pypi.org/project/SQLAlchemy/

In [None]:
!pip install -q SQLAlchemy==1.3.13

In [125]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd


#### Get Redshift credentials

In [126]:
import json

# Fetch the stored Redshift login from Secrets Manager and decode its JSON payload
secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')
cred = json.loads(secret['SecretString'])

# The payload is indexed as a list: element 0 carries the username, element 1 the password
master_user_name, master_user_pw = cred[0]['username'], cred[1]['password']

#### Redshift configuration parameters

In [127]:
# Identifier of the Redshift cluster (passed to describe_clusters below)
redshift_cluster_identifier = 'dsoaws'

# Database names: the Redshift database and the Athena data-catalog database
database_name_redshift = 'dsoaws'
database_name_athena = 'dsoaws'

# Port for the connection URL; a string because it is only formatted into that URL
redshift_port = '5439'

# Schema names: the local Redshift schema and the external schema backed by Athena
schema_redshift = 'redshift'
schema_athena = 'athena'

# Name of the table holding the TSV reviews data (queried through the Athena schema below)
table_name_tsv = 'amazon_reviews_tsv'

#### Get Redshift endpoint address & IAM Role

In [None]:
# Look up the cluster by identifier; only the first entry of the response is used
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
cluster = response['Clusters'][0]

# Endpoint host for the connection URL and the ARN of the first IAM role on the cluster
redshift_endpoint_address = cluster['Endpoint']['Address']
iam_role = cluster['IamRoles'][0]['IamRoleArn']

print(redshift_endpoint_address, iam_role, sep='\n')

#### Connect to Redshift database engine

In [129]:
# Build the SQLAlchemy engine for the Redshift database using the PostgreSQL dialect.
# URL layout: postgresql://<user>:<password>@<host>:<port>/<database>
# NOTE(review): credentials are interpolated verbatim; a password containing URL-reserved
# characters (e.g. '@' or '/') would need escaping first — confirm against the stored secret.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        master_user_name,
        master_user_pw,
        redshift_endpoint_address,
        redshift_port,
        # Previously `database_name`, which is not defined in any cell above (NameError);
        # the Redshift database name is the one configured for this connection.
        database_name_redshift,
    )
)


#### Configure Session

In [157]:
# Session factory bound to the Redshift engine; `s` is the working session used by later cells
session = sessionmaker(bind=engine)
s = session()

## Redshift Spectrum
Amazon Redshift Spectrum queries data directly in S3, using the same SQL syntax as Amazon Redshift. You can also run queries that span both the frequently accessed data stored locally in Amazon Redshift and your full datasets stored cost-effectively in S3.

To use Redshift Spectrum, your cluster needs authorization to access the data catalog in Amazon Athena and your data files in Amazon S3. You provide that authorization by referencing an AWS Identity and Access Management (IAM) role that is attached to your cluster.

To use this capability from your Amazon SageMaker notebook:

* Register your Athena database `dsoaws` with Redshift Spectrum
* Query Your Data in Amazon S3

### Register Athena Database `dsoaws` with Redshift Spectrum to access the data directly in S3 

In [131]:
# SQL that registers the Athena data-catalog database as an external schema in Redshift,
# authorised through the IAM role attached to the cluster
statement = """
CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema} FROM DATA CATALOG 
    DATABASE '{database}' 
    IAM_ROLE '{role}'
    REGION '{region}'
    CREATE EXTERNAL DATABASE IF NOT EXISTS
""".format(
    schema=schema_athena,
    database=database_name_athena,
    role=iam_role,
    region=region_name,
)

print(statement)


CREATE EXTERNAL SCHEMA IF NOT EXISTS athena FROM DATA CATALOG 
    DATABASE 'dsoaws' 
    IAM_ROLE 'arn:aws:iam::806570384721:role/DSOAWS_Redshift'
    REGION 'us-east-1'
    CREATE EXTERNAL DATABASE IF NOT EXISTS



In [61]:
# Run the CREATE EXTERNAL SCHEMA statement on the working session, then commit it
s.execute(statement)
s.commit()

## Congratulations, we now see our previously created Athena tables in Redshift. 

### Run a sample query

In [132]:
# Number of star ratings per product category, read through the external (Athena) schema,
# largest categories first
statement = """
SELECT product_category, COUNT(star_rating) AS count_star_rating
    FROM {schema}.{table}
    GROUP BY product_category
    ORDER BY count_star_rating DESC
""".format(schema=schema_athena, table=table_name_tsv)

print(statement)


SELECT product_category, COUNT(star_rating) AS count_star_rating
    FROM athena.amazon_reviews_tsv
    GROUP BY product_category
    ORDER BY count_star_rating DESC



In [63]:
# Execute the query through the engine and show the first five result rows
df = pd.read_sql_query(sql=statement, con=engine)
df.head(5)

Unnamed: 0,product_category,count_star_rating
0,Books,19531329
1,Digital_Ebook_Purchase,17622415
2,Wireless,9002021
3,PC,6908554
4,Home,6221559


## Create local Redshift tables with Customer Reviews data of each year for the last 5 years

### Create `redshift` schema

In [133]:
# Create the local schema (skipped by the database if it already exists)
statement = "CREATE SCHEMA IF NOT EXISTS {schema}".format(schema=schema_redshift)

# Open a new working session from the factory, run the DDL and commit it
s = session()
s.execute(statement)
s.commit()

### Create Redshift tables for every year

In [134]:
def create_redshift_table_tsv(session, table_name_prefix, start_year, end_year):
    """Create one reviews table per year in the `redshift` schema.

    For every year from `start_year` to `end_year` (both inclusive) a
    `CREATE TABLE IF NOT EXISTS redshift.<table_name_prefix>_<year>` statement is
    executed on `session`. All statements are committed together after the loop,
    and "Done." is printed at the end.

    Args:
        session: object exposing `execute(statement)` and `commit()`.
        table_name_prefix: table name prefix; the year is appended after an underscore.
        start_year: first year (int) to create a table for.
        end_year: last year (int) to create a table for.
    """
    # DDL shared by all yearly tables; only the table name varies
    ddl_template = """
        CREATE TABLE IF NOT EXISTS redshift.{}( 
             marketplace varchar(2),
             customer_id varchar(8),
             review_id varchar(14),
             product_id varchar(10) DISTKEY,
             product_parent varchar(9),
             product_title varchar(400),
             product_category varchar(24),
             star_rating int,
             helpful_votes int,
             total_votes int,
             vine varchar(1),
             verified_purchase varchar(1),
             review_headline varchar(128),
             review_body varchar(65535),
             review_date varchar(10),
             year int)  SORTKEY (product_category)
        """

    for year in range(start_year, end_year + 1):
        yearly_table = '_'.join((table_name_prefix, str(year)))
        session.execute(ddl_template.format(yearly_table))

    # Single commit covering every CREATE TABLE issued above
    session.commit()

    print("Done.")


In [135]:
# Create the yearly tables amazon_reviews_tsv_2011 ... amazon_reviews_tsv_2015 (end year inclusive)
create_redshift_table_tsv(s, 'amazon_reviews_tsv', 2011, 2015)

Done.


### Insert Data from Athena table into local Redshift table

In [158]:
def insert_into_redshift_table_tsv(session, table_name_prefix, start_year, end_year):
    """Fill each yearly `redshift` table with that year's rows from the Athena table.

    For every year from `start_year` to `end_year` (both inclusive) an
    `INSERT INTO redshift.<table_name_prefix>_<year> SELECT ...` statement reading
    from `athena.amazon_reviews_tsv` is executed on `session`. The `year` value is
    derived from `review_date`, and each statement keeps only rows whose `year`
    equals the loop year. The session is committed after every single table, and
    "Done." is printed at the end.

    Args:
        session: object exposing `execute(statement)` and `commit()`.
        table_name_prefix: table name prefix; the year is appended after an underscore.
        start_year: first year (int) to load.
        end_year: last year (int) to load.
    """
    # INSERT ... SELECT shared by all years; target table and year filter vary
    insert_template = """
            INSERT 
            INTO
                redshift.{table}
                SELECT
                    marketplace,
                    customer_id,
                    review_id,
                    product_id,
                    product_parent,
                    product_title,
                    product_category,
                    star_rating,
                    helpful_votes,
                    total_votes,
                    vine,
                    verified_purchase,
                    review_headline,
                    review_body,
                    review_date,
                    CAST(DATE_PART_YEAR(TO_DATE(review_date, 'YYYY-MM-DD')) AS INTEGER) AS year             
                FROM
                    athena.amazon_reviews_tsv             
                WHERE
                    year = {year}
            """

    for year in range(start_year, end_year + 1):
        target_table = '_'.join((table_name_prefix, str(year)))
        session.execute(insert_template.format(table=target_table, year=year))
        # Commit per table so each year's load is persisted before the next one starts
        session.commit()

    print("Done.")

#### Note: Each `INSERT INTO` takes approx. 10 minutes per table, so please be patient and keep the number of tables to a minimum.

In [159]:
# Load the yearly tables for 2011 through 2015 (end year inclusive), one INSERT per table
insert_into_redshift_table_tsv(s, 'amazon_reviews_tsv', 2011, 2015)

Done.


### Run a sample query using `UNION ALL` across 2 tables (i.e., 2 years)

In [1]:
# Count star ratings per product category and year in the 2014 table and in the 2015 table,
# stack both result sets with UNION ALL, and order the rows by count (descending), then year
statement = """
SELECT product_category, COUNT(star_rating) AS count_star_rating, year
FROM redshift.amazon_reviews_tsv_2014
GROUP BY redshift.amazon_reviews_tsv_2014.product_category, year
UNION ALL
SELECT product_category, COUNT(star_rating) AS count_star_rating, year
FROM redshift.amazon_reviews_tsv_2015
GROUP BY redshift.amazon_reviews_tsv_2015.product_category, year
ORDER BY count_star_rating DESC, year ASC
"""

print(statement)


SELECT product_category, COUNT(star_rating) AS count_star_rating, year
FROM redshift.amazon_reviews_tsv_2014
GROUP BY redshift.amazon_reviews_tsv_2014.product_category, year
UNION ALL
SELECT product_category, COUNT(star_rating) AS count_star_rating, year
FROM redshift.amazon_reviews_tsv_2015
GROUP BY redshift.amazon_reviews_tsv_2015.product_category, year
ORDER BY count_star_rating DESC, year ASC



In [172]:
# Execute the UNION ALL query through the engine and show up to the first 20 result rows
df = pd.read_sql_query(sql=statement, con=engine)
df.head(20)

Unnamed: 0,product_category,count_star_rating,year
0,Digital_Ebook_Purchase,6615914,2014
1,Digital_Ebook_Purchase,4533519,2015
2,Books,3472631,2014
3,Wireless,2998518,2015
4,Wireless,2830482,2014
5,Books,2808751,2015
6,Apparel,2369754,2015
7,Home,2172297,2015
8,Apparel,2122455,2014
9,Home,1999452,2014


# This completes this notebook. High Five!