Follow these:
* https://medium.com/datactw/imdb-dataset-visualization-data-analytics-using-pandas-97b5c6f03c6d

In [None]:
%%bash
# Upgrade pip itself, then quietly (-q) install the pinned library versions
# this notebook uses: pandas, numpy, matplotlib, seaborn and PyAthena.
pip install -q --upgrade pip
pip install -q pandas==0.23.0
pip install -q numpy==1.14.3
pip install -q matplotlib==3.0.3
pip install -q seaborn==0.8.1
pip install -q PyAthena==1.8.0

In [None]:
# Imports & Settings

import boto3
import sagemaker

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Get the AWS region from the default boto3 session; `region_name` is passed
# to every PyAthena `connect(...)` call in the query cells.
session = boto3.session.Session()
region_name = session.region_name

# Get SageMaker session & default S3 bucket; `bucket` is used to build the
# Athena staging directory (`s3_staging_dir`).
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Set Athena database & table; these are substituted into the `FROM {}.{}`
# clause of each SQL statement via `.format(database_name, table_name)`.
database_name = 'dsoaws'
table_name = 'amazon_reviews_parquet'

In [None]:
# PyAthena imports
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
from pyathena.util import as_pandas

7. TODO:  What is the correlation between the number of reviews and the average rating for each product?  (Answer:  0.029)
```
SELECT product_id, COUNT(*) AS count_reviews, AVG(star_rating) AS avg_star_rating
FROM {}.{} 
TABLESAMPLE BERNOULLI(10)
GROUP BY product_id
LIMIT 1000
```
8. TODO:  What is the correlation between the number of helpful votes and the rating for each review?  (Answer:  -0.00926)
```
SELECT helpful_votes, star_rating
FROM {}.{}
TABLESAMPLE BERNOULLI(10)
LIMIT 1000
```
9. TODO:  What is the correlation between the length of a review and the rating for each review?  (Answer:  -0.1662)
```
SELECT product_id, LENGTH(review_body) as length_review_body, star_rating
FROM {}.{}
TABLESAMPLE BERNOULLI(10)
LIMIT 1000
```

In [None]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries.
# It lives under the SageMaker default bucket and is handed to each
# `connect(..., s3_staging_dir=s3_staging_dir)` call.
s3_staging_dir = 's3://{0}/athena/staging'.format(bucket)

In [None]:
# Apply the white-grid seaborn theme together with this notebook's rc
# overrides in one call.
#
# Why not `sns.set_style = '...'`: that assignment rebinds the attribute to a
# string (it never calls the function) and leaves `sns.set_style` unusable for
# any later cell. A bare `sns.set(rc=...)` would also reset the style to
# seaborn's default, so the style is passed explicitly here. The `rc` entries
# are applied after the style and therefore take precedence over it.
sns.set(style='whitegrid',
        rc={"font.style":"normal",
            "axes.facecolor":"white",
            'grid.color': '.8',
            'grid.linestyle': '-',
            "figure.facecolor":"white",
            "figure.titlesize":20,
            "text.color":"black",
            "xtick.color":"black",
            "ytick.color":"black",
            "axes.labelcolor":"black",
            "axes.grid":True,
            'axes.labelsize':10,
            'figure.figsize':(10.0, 10.0),
            'xtick.labelsize':10,
            'font.size':10,
            'ytick.labelsize':10})

In [None]:
# Helper code to display values on bars

def show_values_barplot(axs, space):
    """Label every bar with its width, rounded to two decimals.

    Parameters
    ----------
    axs : axes-like object or numpy.ndarray of such objects
        Each object must expose a ``patches`` iterable (items providing
        ``get_x``, ``get_y``, ``get_width`` and ``get_height``) and a
        ``text(x, y, s, ha=...)`` method.
    space : value accepted by ``float``
        Horizontal offset added to the bar's right edge (x + width) to
        position the label.

    Returns
    -------
    None
        The axes are annotated in place.
    """
    def _label_bars(axis):
        for bar in axis.patches:
            bar_width = bar.get_width()
            # Label sits just past the right edge of the bar, at y + height.
            label_x = bar.get_x() + bar_width + float(space)
            label_y = bar.get_y() + bar.get_height()
            axis.text(label_x, label_y, round(float(bar_width), 2), ha="left")

    # A grid of subplots arrives as an ndarray; anything else is one axes.
    targets = axs.flat if isinstance(axs, np.ndarray) else (axs,)
    for axis in targets:
        _label_bars(axis)

# At the product level, find the correlation between `count_reviews` and `avg_star_rating`.

In [None]:
# SQL statement: per-product review count and average star rating, computed
# over a 10% Bernoulli sample of the table and capped at 1000 result rows.
# The `{}.{}` placeholder is filled with `database_name` and `table_name`.
statement = """
SELECT product_id, COUNT(*) AS count_reviews, AVG(star_rating) AS avg_star_rating
FROM {}.{} 
TABLESAMPLE BERNOULLI(10)
GROUP BY product_id
LIMIT 1000
""".format(database_name, table_name)

print(statement)

In [None]:
# Open a PyAthena connection for the configured region and staging directory,
# then run the current `statement`.
cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()
cursor.execute(statement)

# Load query results into Pandas DataFrame and show results
df = as_pandas(cursor)
df

In [None]:
# Correlate only the two aggregate columns. `product_id` is an identifier
# returned by the query, not a measurement, so it is excluded explicitly
# rather than relying on pandas to drop non-numeric columns on its own
# (newer pandas versions raise instead of dropping them).
df[['count_reviews', 'avg_star_rating']].corr()

# At the review level, find the correlation between `helpful_votes` and `star_rating`.

In [None]:
# SQL statement: one row per review with its helpful-vote count and star
# rating, drawn from a 10% Bernoulli sample and capped at 1000 rows.
# The `{}.{}` placeholder is filled with `database_name` and `table_name`.
statement = """
SELECT helpful_votes, star_rating
FROM {}.{}
TABLESAMPLE BERNOULLI(10)
LIMIT 1000
""".format(database_name, table_name)

print(statement)

In [None]:
# Open a PyAthena connection for the configured region and staging directory,
# then run the current `statement`.
cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()
cursor.execute(statement)

# Load query results into Pandas DataFrame and show results
df = as_pandas(cursor)
df

In [None]:
# Pairwise correlation matrix of the query's columns
# (`helpful_votes` and `star_rating`).
df.corr()

# At the review level, find the correlation between `length(review_body)` and `star_rating`.


In [None]:
# SQL statement: one row per review with its product id, the length of the
# review body and the star rating, drawn from a 10% Bernoulli sample and
# capped at 1000 rows.
# The `{}.{}` placeholder is filled with `database_name` and `table_name`.
statement = """
SELECT product_id, LENGTH(review_body) as length_review_body, star_rating
FROM {}.{}
TABLESAMPLE BERNOULLI(10)
LIMIT 1000
""".format(database_name, table_name)

print(statement)

In [None]:
# Open a PyAthena connection for the configured region and staging directory,
# then run the current `statement`.
cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()
cursor.execute(statement)

# Load query results into Pandas DataFrame and show results
df = as_pandas(cursor)
df

In [None]:
# Correlate only review length against star rating. `product_id` is an
# identifier returned by the query, not a measurement, so it is excluded
# explicitly rather than relying on pandas to drop non-numeric columns on its
# own (newer pandas versions raise instead of dropping them).
df[['length_review_body', 'star_rating']].corr()

# TODO:  Histogram of LENGTH(review_body)

In [None]:
# SQL statement: review-body length for each review in a 10% Bernoulli sample,
# capped at 10000 rows; this feeds the histogram plotted further down.
# The `{}.{}` placeholder is filled with `database_name` and `table_name`.
statement = """
SELECT LENGTH(review_body) as review_body_length
FROM {}.{}
TABLESAMPLE BERNOULLI(10)
LIMIT 10000
""".format(database_name, table_name)

print(statement)

In [None]:
# Open a PyAthena connection for the configured region and staging directory,
# then run the current `statement`.
cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()
cursor.execute(statement)

# Load query results into Pandas DataFrame and show the first 10 rows
df = as_pandas(cursor)
df.head(10)

In [None]:
import matplotlib  # retained so the `matplotlib` name stays bound for any later cell

# Histogram of review-body lengths.
#   x-axis: review_body_length, split into 10 equal bins over [0, 200]
#   y-axis: the number of reviews falling in each bin
# Lengths outside the 0-200 range are ignored by `hist` and do not appear.
# All other `hist` options are left at their defaults.
fig, ax = plt.subplots()
ax.hist(df['review_body_length'], bins=10, range=[0, 200])
ax.set(xlabel='review_body_length',
       ylabel='number of reviews',
       title='Distribution of review_body_length (0-200)')
plt.show()

# TODO:  Possibly use Presto's HISTOGRAM() function - or an equivalent (i.e. WIDTH_BUCKET, etc.)

In [None]:
# SQL statement: build a {length: count} histogram of review-body lengths.
#
# The sampling and the 10000-row cap live in a subquery so that they bound the
# rows fed INTO the aggregate. With LIMIT on the outer query it would apply to
# the single aggregated result row and have no effect on the sample size.
# The `{}.{}` placeholder is filled with `database_name` and `table_name`.
statement = """
SELECT HISTOGRAM(review_body_length) AS review_body_length_histo
FROM (
    SELECT LENGTH(review_body) AS review_body_length
    FROM {}.{}
    TABLESAMPLE BERNOULLI(10)
    LIMIT 10000
)
""".format(database_name, table_name)

print(statement)

In [None]:
# Open a PyAthena connection for the configured region and staging directory,
# then run the current `statement`.
cursor = connect(region_name=region_name, s3_staging_dir=s3_staging_dir).cursor()
cursor.execute(statement)

# Load query results into Pandas DataFrame (not displayed here; the single
# result cell is parsed in the next cell)
df = as_pandas(cursor)

In [None]:
import ast

# The first cell of the result holds the histogram as text; presumably in the
# form "{12=3, 40=1}" (verify against actual Athena output). Swapping '=' for
# ':' turns it into a Python dict literal. `ast.literal_eval` parses literals
# only, so text coming back from the query can never execute code the way
# `eval` could.
histo_dict = ast.literal_eval(df.values[0][0].replace('=', ':'))

In [None]:
# `histo_dict` maps each review length to a scalar count. With the default
# orient='columns', from_dict raises "If using all scalar values, you must
# pass an index", so the keys are used as the index instead: one row per
# review length, a single `count` column, sorted by length.
df_histo = (
    pd.DataFrame.from_dict(histo_dict, orient='index', columns=['count'])
    .rename_axis('review_body_length')
    .sort_index()
)