# Explore Data In Redshift - Part 2

In [2]:
# Install SQL Alchemy
!pip install -q SQLAlchemy==1.3.13

### Set Redshift Connection Parameters

In [3]:
# Connection settings read by the cells below.
redshift_schema = 'redshift'  # schema prefix used in the table references of the queries
redshift_cluster_identifier = 'dsoaws'  # passed to describe_clusters to look up the endpoint address
redshift_host = 'dsoaws'  # NOTE(review): not referenced by any cell visible here — confirm it is still needed
redshift_database = 'dsoaws'  # database name placed at the end of the connection URL
redshift_port = '5439'  # kept as a string; it is only interpolated into the connection URL
redshift_table_2015 = 'amazon_reviews_tsv_2015'
redshift_table_2014 = 'amazon_reviews_tsv_2014'

### Load the Redshift Secrets from Secrets Manager

In [4]:
import json
import boto3

# Fetch the Redshift login from AWS Secrets Manager rather than hardcoding it.
secretsmanager = boto3.client('secretsmanager')

secret = secretsmanager.get_secret_value(SecretId='dsoaws_redshift_login')
# SecretString is parsed as JSON; the lookups below index the result as a
# sequence whose element 0 carries 'username' and element 1 carries 'password'.
cred = json.loads(secret['SecretString'])

redshift_username = cred[0]['username']
redshift_pw = cred[1]['password']

In [5]:
# Look up the cluster by its identifier and take the endpoint address of the
# first (index 0) cluster in the response.
redshift = boto3.client('redshift')
response = redshift.describe_clusters(ClusterIdentifier=redshift_cluster_identifier)
redshift_endpoint_address = response['Clusters'][0]['Endpoint']['Address']

# Echo the resolved address so the target of the connection is visible.
print(redshift_endpoint_address)

dsoaws.cypuzlfd0zvn.us-east-1.redshift.amazonaws.com


### Create the Redshift Query Engine 

In [6]:
from urllib.parse import quote

from sqlalchemy import create_engine

# Build the connection URL as postgresql://user:password@host:port/database.
# The username and password are percent-encoded (safe='' also encodes '/')
# so that reserved URL characters such as '@', ':', '/' or '%' in the
# credentials cannot be mistaken for URL delimiters when the URL is parsed.
engine = create_engine(
    'postgresql://{}:{}@{}:{}/{}'.format(
        quote(redshift_username, safe=''),
        quote(redshift_pw, safe=''),
        redshift_endpoint_address,
        redshift_port,
        redshift_database,
    )
)


  """)


### Prepare For Visualizations

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'


## 1. Which product categories have had the most reviews in the last 2 years?

In [8]:
# Count reviews per (year, product_category) in the 2015 table and in the
# 2014 table, stack the two result sets with UNION ALL, and sort the combined
# rows by review count (ties broken by the more recent year first).
# COUNT(star_rating) counts only rows whose star_rating is not NULL.
# Named format fields are used so each schema/table is supplied exactly once.
statement = """
SELECT
    year,
    product_category,
    COUNT(star_rating) AS count_star_rating
FROM
    {schema}.{table_2015}
GROUP BY
    {schema}.{table_2015}.product_category,
    year
UNION ALL
SELECT
    year,
    product_category,
    COUNT(star_rating) AS count_star_rating
FROM
    {schema}.{table_2014}
GROUP BY
    {schema}.{table_2014}.product_category,
    year
ORDER BY
    count_star_rating DESC,
    year DESC
""".format(
    schema=redshift_schema,
    table_2015=redshift_table_2015,
    table_2014=redshift_table_2014,
)

# Show the rendered SQL so the exact query sent to Redshift can be inspected.
print(statement)


SELECT
    year,
    product_category,
    COUNT(star_rating) AS count_star_rating  
FROM
    redshift.amazon_reviews_tsv_2015 
GROUP BY
    redshift.amazon_reviews_tsv_2015.product_category,
    year 
UNION ALL  
SELECT
    year,
    product_category,
    COUNT(star_rating) AS count_star_rating  
FROM
    redshift.amazon_reviews_tsv_2014
GROUP BY
    redshift.amazon_reviews_tsv_2014.product_category,
    year 
ORDER BY
    count_star_rating DESC,
    year DESC



In [9]:
# Run the query through the SQLAlchemy engine and load the result set into a DataFrame.
df = pd.read_sql_query(statement, engine)

In [10]:
df.head()

Unnamed: 0,year,product_category,count_star_rating
0,2014,Digital_Ebook_Purchase,13231828
1,2015,Digital_Ebook_Purchase,9067038
2,2014,Books,6945262
3,2015,Wireless,5997036
4,2014,Wireless,5660964


In [None]:
# TODO: Visualization

## 2. Which products have the most helpful reviews in the last year?
How long are those reviews?

In [12]:
# Select the ten rows with the highest helpful_votes from the 2015 table,
# returning the product title, vote count, star rating, the length of the
# review body, and its first 100 characters as a preview.
# NOTE(review): there is no GROUP BY, so the ranking is per row (presumably
# per review) and the same product may appear more than once — confirm this
# matches the intent of the question.
statement = """
SELECT product_title,
       helpful_votes,
       star_rating,
       LENGTH(review_body) AS review_body_length,
       SUBSTR(review_body, 1, 100) AS review_body_substr
FROM {}.{}
ORDER BY helpful_votes DESC LIMIT 10
""".format(redshift_schema, redshift_table_2015)

# Show the rendered SQL so the exact query sent to Redshift can be inspected.
print(statement)


SELECT product_title,
       helpful_votes,
       star_rating,
       LENGTH(review_body) AS review_body_length,
       SUBSTR(review_body, 1, 100) AS review_body_substr
FROM redshift.amazon_reviews_tsv_2015
ORDER BY helpful_votes DESC LIMIT 10



In [None]:
# Run the query through the SQLAlchemy engine and load the result set into a DataFrame.
df = pd.read_sql_query(statement, engine)

In [None]:
df

In [None]:
# TODO: Visualization

## 3. 

In [None]:
# TODO: write the query for question 3. The template below is still
# whitespace only and has no `{}` fields, so the format arguments are
# currently unused and there is no SQL to execute yet.
statement = """

""".format(redshift_schema, redshift_table_2015)

print(statement)

In [None]:
df = pd.read_sql_query(statement, engine)

In [None]:
df

In [None]:
# TODO: Visualization

## 4. 

In [None]:
# TODO: write the query for question 4. The template below is still
# whitespace only and has no `{}` fields, so the format arguments are
# currently unused and there is no SQL to execute yet.
statement = """

""".format(redshift_schema, redshift_table_2015)

print(statement)

In [None]:
df = pd.read_sql_query(statement, engine)

In [None]:
df

In [None]:
# TODO: Visualization

## 5. 

In [None]:
# TODO: write the query for question 5. The template below is still
# whitespace only and has no `{}` fields, so the format arguments are
# currently unused and there is no SQL to execute yet.
statement = """

""".format(redshift_schema, redshift_table_2015)

print(statement)

In [None]:
df = pd.read_sql_query(statement, engine)

In [None]:
df

In [None]:
# TODO: Visualization