Reference material followed in this notebook:
* https://medium.com/datactw/imdb-dataset-visualization-data-analytics-using-pandas-97b5c6f03c6d

In [1]:
%%bash
# Upgrade pip itself first, then install the exact library versions this
# notebook pins. -q keeps pip's output quiet.
pip install -q --upgrade pip
# Data handling and numerics
pip install -q pandas==0.23.0
pip install -q numpy==1.14.3
# Plotting
pip install -q matplotlib==3.0.3
pip install -q seaborn==0.8.1
# PyAthena provides the connect()/cursor API used below to query Athena
pip install -q PyAthena==1.8.0

In [2]:
# Imports & Settings

import boto3
import sagemaker

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Athena database and table that every query in this notebook reads from
database_name = "dsoaws"
table_name = "amazon_reviews_parquet"

# Region of the default boto3 session
session = boto3.session.Session()
region_name = session.region_name

# SageMaker session and its default S3 bucket
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

In [3]:
# PyAthena imports
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
from pyathena.util import as_pandas

In [4]:
# Athena needs an S3 location for temporary query output; keep it under the
# "athena/staging" prefix of the default bucket.
s3_staging_dir = 's3://{0}/athena/staging'.format(
    bucket,
)

In [5]:
# Apply seaborn's "whitegrid" theme and layer the explicit rc overrides on top.
# The style has to be passed to a seaborn call: assigning a string to
# `sns.set_style` would only replace that function with a string (breaking any
# later `sns.set_style(...)` call) and would not change any styling.
sns.set(style='whitegrid',
        rc={"font.style":"normal",
            "axes.facecolor":"white",
            'grid.color': '.8',
            'grid.linestyle': '-',
            "figure.facecolor":"white",
            "figure.titlesize":20,
            "text.color":"black",
            "xtick.color":"black",
            "ytick.color":"black",
            "axes.labelcolor":"black",
            "axes.grid":True,
            'axes.labelsize':10,
            'figure.figsize':(10.0, 10.0),
            'xtick.labelsize':10,
            'font.size':10,
            'ytick.labelsize':10})

In [6]:
# Helper code to display values on bars

def show_values_barplot(axs, space):
    """Write each patch's width next to it as a text label.

    Args:
        axs: A single axes-like object exposing ``patches`` and ``text``, or a
            NumPy array of such objects (every element is annotated).
        space: Horizontal gap, converted with ``float()``, added to the right
            edge of each patch (``x + width``) to get the label's x position.

    Each label is the patch width rounded to 2 decimals, left-aligned at
    ``(x + width + space, y + height)``. Nothing is returned.
    """
    # Normalise the input to a flat list so one loop handles both cases.
    if isinstance(axs, np.ndarray):
        targets = list(axs.flat)
    else:
        targets = [axs]

    for axis in targets:
        for bar in axis.patches:
            label_x = bar.get_x() + bar.get_width() + float(space)
            label_y = bar.get_y() + bar.get_height()
            label = round(float(bar.get_width()), 2)
            axis.text(label_x, label_y, label, ha="left")

# At the product level, find the correlation between `count_reviews` and `avg_star_rating`.

In [45]:
# Product-level query: for each product_id, count the reviews and average
# their star_rating. TABLESAMPLE BERNOULLI(10) restricts the scan to a sample
# of the table's rows (nominally 10% under Presto/Athena semantics -- confirm
# against the Athena docs), so count_reviews is a count within the sample, not
# the product's full review count.
# NOTE(review): LIMIT 1000 is applied without an ORDER BY, so which product
# groups come back is presumably arbitrary and may differ between runs.
statement = """
SELECT product_id, COUNT(*) AS count_reviews, AVG(star_rating) AS avg_star_rating
FROM {}.{} 
TABLESAMPLE BERNOULLI(10)
GROUP BY product_id
LIMIT 1000
""".format(database_name, table_name)

# Echo the fully-formatted SQL so the database/table substitution is visible
print(statement)


SELECT product_id, COUNT(*) AS count_reviews, AVG(star_rating) AS avg_star_rating
FROM dsoaws.amazon_reviews_parquet 
TABLESAMPLE BERNOULLI(10)
GROUP BY product_id
LIMIT 1000



In [46]:
# Open an Athena cursor (results are staged in s3_staging_dir) and run the
# statement built in the previous cell.
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

# Materialise the result set as a pandas DataFrame; the bare name on the last
# line renders it as the cell output.
df = as_pandas(cursor)
df

Unnamed: 0,product_id,count_reviews,avg_star_rating
0,B000G1ODEC,1,5.000000
1,0780022025,2,5.000000
2,6301730038,1,4.000000
3,6302550424,1,5.000000
4,B00005QATI,1,5.000000
5,B000059HCN,4,3.750000
6,B00005QATY,8,4.625000
7,B000056BRB,1,2.000000
8,B00004REDL,1,5.000000
9,630192990X,2,5.000000


In [47]:
# Correlate only the two numeric columns. `product_id` is a string column:
# old pandas silently drops it from corr(), but newer pandas raises on it, so
# select the columns explicitly instead of relying on that behaviour.
df[['count_reviews', 'avg_star_rating']].corr()

Unnamed: 0,count_reviews,avg_star_rating
count_reviews,1.0,0.029016
avg_star_rating,0.029016,1.0


# At the review level, find the correlation between `helpful_votes` and `star_rating`.

In [48]:
# SQL statement
statement = """
SELECT helpful_votes, star_rating
FROM {}.{}
TABLESAMPLE BERNOULLI(10)
LIMIT 1000
""".format(database_name, table_name)

print(statement)


SELECT helpful_votes, star_rating
FROM dsoaws.amazon_reviews_parquet
TABLESAMPLE BERNOULLI(10)
LIMIT 1000



In [49]:
# Open an Athena cursor (results are staged in s3_staging_dir) and run the
# statement built in the previous cell.
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

# Materialise the result set as a pandas DataFrame; the bare name on the last
# line renders it as the cell output.
df = as_pandas(cursor)
df

Unnamed: 0,helpful_votes,star_rating
0,56,5
1,35,3
2,1,5
3,2,1
4,2,1
5,5,5
6,0,1
7,2,1
8,1,5
9,0,5


In [50]:
# Pairwise correlation matrix of helpful_votes and star_rating
# (Pearson is pandas' default method; spelled out here for the reader).
df.corr(method='pearson')

Unnamed: 0,helpful_votes,star_rating
helpful_votes,1.0,-0.009259
star_rating,-0.009259,1.0


# At the review level, find the correlation between `length(review_body)` and `star_rating`.


In [52]:
# SQL statement
statement = """
SELECT product_id, LENGTH(review_body) as length_review_body, star_rating
FROM {}.{}
TABLESAMPLE BERNOULLI(10)
LIMIT 1000
""".format(database_name, table_name)

print(statement)


SELECT product_id, LENGTH(review_body) as length_review_body, star_rating
FROM dsoaws.amazon_reviews_parquet
TABLESAMPLE BERNOULLI(10)
LIMIT 1000


In [53]:
# Open an Athena cursor (results are staged in s3_staging_dir) and run the
# statement built in the previous cell.
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

# Materialise the result set as a pandas DataFrame; the bare name on the last
# line renders it as the cell output.
df = as_pandas(cursor)
df

Unnamed: 0,product_id,length_review_body,star_rating
0,B00943M2F0,37,1
1,B005IFC47E,443,3
2,B00ARSOX1W,16,4
3,B00R280UQM,125,5
4,B00FYQ1NJ2,396,3
5,B00J2BMN4Y,163,5
6,B00KKXCJQU,108,5
7,B00AO6I0M0,727,4
8,B00WSBF0VI,18,5
9,B00MNAG4MI,308,4


In [54]:
# Correlate only the two numeric columns. `product_id` is a string column:
# old pandas silently drops it from corr(), but newer pandas raises on it, so
# select the columns explicitly instead of relying on that behaviour.
df[['length_review_body', 'star_rating']].corr()

Unnamed: 0,length_review_body,star_rating
length_review_body,1.0,-0.1662
star_rating,-0.1662,1.0
