# Explore Redshift

In [None]:
# Install the pinned SQLAlchemy release (1.3.13) used by this notebook;
# -q keeps pip's output quiet.
!pip install -q SQLAlchemy==1.3.13

In [None]:
import pandas as pd

# Redshift connection settings shared by the cells below.
redshift_schema = "redshift"

# The cluster identifier, host and database all use the same name.
redshift_cluster_identifier = redshift_host = redshift_database = "dsoaws"

# Kept as a string: it is only ever interpolated into the connection URL.
redshift_port = "5439"

# Table queried throughout the notebook.
table_name = "amazon_reviews_tsv"


## Load the Redshift Secrets from Secrets Manager

In [None]:
import json
import boto3

# Fetch the Redshift login from AWS Secrets Manager.
secretsmanager = boto3.client('secretsmanager')
secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')

# The secret string is JSON and is indexed as a list below: element 0 holds
# the username entry and element 1 holds the password entry.
secret_payload = secret['SecretString']
cred = json.loads(secret_payload)

redshift_username = cred[0]['username']
redshift_pw = cred[1]['password']

In [None]:
# Look up the cluster's endpoint address through the Redshift API.
redshift = boto3.client('redshift')
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)

# Only the first cluster in the response is used.
first_cluster = response['Clusters'][0]
redshift_endpoint_address = first_cluster['Endpoint']['Address']

print(redshift_endpoint_address)

## Create the Redshift Query Engine 

In [None]:
from urllib.parse import quote

from sqlalchemy import create_engine

# Build the SQLAlchemy engine used for every query in this notebook.
# The credentials are percent-encoded first so that characters such as
# '@', ':', '/' or '%' in the password cannot be mistaken for URL
# delimiters; purely alphanumeric credentials are left unchanged.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        quote(str(redshift_username), safe=''),
        quote(str(redshift_pw), safe=''),
        redshift_endpoint_address,
        redshift_port,
        redshift_database,
    )
)

### Comparing Redshift's Fast APPROXIMATE COUNT with an Exact COUNT

In [None]:
%%time
df = pd.read_sql_query("""SELECT approximate count(distinct customer_id)
                                FROM {}.{}
                                GROUP BY product_category""".format(redshift_schema, table_name), engine)

In [None]:
%%time
df = pd.read_sql_query("""SELECT count(distinct customer_id)
                                FROM {}.{}
                                GROUP BY product_category""".format(redshift_schema, table_name), engine)

### Visualizing the Number of Ratings per Product Category

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Request 'retina' (high-resolution) figures from the inline backend.
%config InlineBackend.figure_format='retina'


In [None]:
# Count the star ratings per product category, largest category first.
ratings_sql = """SELECT product_category,
                COUNT(star_rating) AS count_star_rating
                FROM {}.{}
                GROUP BY product_category
                ORDER BY count_star_rating DESC""".format(redshift_schema, table_name)
df = pd.read_sql_query(ratings_sql, engine)

In [None]:
# Preview the first five rows of the query result.
df.head(n=5)

In [None]:
# Draw a horizontal bar chart of the rating counts per product category.
# Reads the `product_category` and `count_star_rating` columns of `df`.

# Set size and style to use
plt.figure(figsize=(10,10))

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so fall back to the new name when the
# old one is not available.
style_name = 'seaborn-whitegrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-v0_8-whitegrid'
plt.style.use(style_name)

# Create Seaborn barplot
barplot = sns.barplot(y='product_category', x='count_star_rating', data=df, saturation=1)

# Set title
plt.title("Number of Ratings per Product Category (Redshift)")

# Label the x-axis ticks from 100K up to 20 million ratings
plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])
plt.xlim(0, 20000000)
plt.xlabel("Number of Ratings")
plt.ylabel("Product Category")

plt.tight_layout()

# Export plot if needed
# plt.savefig('ratings_per_category.png', dpi=300)

# Show the barplot. pyplot.show() only takes the keyword argument `block`;
# passing the Axes positionally raises a TypeError on current Matplotlib.
plt.show()

## Query Athena Data via Redshift Spectrum

In [None]:
# Schema name used for the Athena query below.
athena_schema = "athena"


In [None]:
# Build the ratings-per-category query against the Athena schema.
# `table_name` (from the Settings cell) is the only table identifier defined
# in this notebook; referencing an undefined name here raises a NameError.
statement = """SELECT product_category, COUNT(star_rating) AS count_star_rating
                FROM {}.{}
                GROUP BY product_category
                ORDER BY count_star_rating DESC""".format(athena_schema, table_name)

# Print the final SQL so it can be inspected before it is executed.
print(statement)

In [None]:
# Run the statement through the existing engine and preview the first rows.
df = pd.read_sql_query(sql=statement, con=engine)
df.head(n=5)

In [None]:
# Draw a horizontal bar chart of the rating counts per product category.
# Reads the `product_category` and `count_star_rating` columns of `df`.

# Set size and style to use
plt.figure(figsize=(10,10))

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so fall back to the new name when the
# old one is not available.
style_name = 'seaborn-whitegrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-v0_8-whitegrid'
plt.style.use(style_name)

# Create Seaborn barplot
barplot = sns.barplot(y='product_category', x='count_star_rating', data=df, saturation=1)

# Set title
plt.title("Number of Ratings per Product Category (Athena via Redshift Spectrum)")

# Label the x-axis ticks from 100K up to 20 million ratings
plt.xticks([100000, 1000000, 5000000, 10000000, 15000000, 20000000], ['100K', '1m', '5m', '10m','15m','20m'])
plt.xlim(0, 20000000)
plt.xlabel("Number of Ratings")
plt.ylabel("Product Category")

plt.tight_layout()

# Export plot if needed
# plt.savefig('ratings_per_category.png', dpi=300)

# Show the barplot. pyplot.show() only takes the keyword argument `block`;
# passing the Axes positionally raises a TypeError on current Matplotlib.
plt.show()