# Amazon Athena - Convert TSV Data To Parquet

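This notebook converts the `amazon_reviews_tsv` Athena table to Parquet using an Athena `CREATE TABLE AS SELECT` (CTAS) query. The Parquet files are written to the SageMaker default S3 bucket and partitioned by `product_category`.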

<img src="img/c3-08.png" width="90%" align="left">

In [None]:
import boto3
import sagemaker

# Resolve the AWS region from a default boto3 session
session = boto3.session.Session()
region_name = session.region_name

# The SageMaker session provides the default S3 bucket used for output
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Athena database and table names (TSV source table, Parquet target table)
database_name = 'dsoaws'
table_name_tsv = 'amazon_reviews_tsv'
table_name_parquet = 'amazon_reviews_parquet'

# S3 key prefix and full destination URI for the Parquet files
parquet_prefix = 'amazon-reviews-pds/parquet'
s3_destination_path_parquet = 's3://{bucket}/{prefix}'.format(
    bucket=bucket,
    prefix=parquet_prefix,
)


### Install PyAthena

In [None]:
# Install PyAthena
!pip install -q PyAthena==1.8.0

In [None]:
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
from pyathena.util import as_pandas

### Create Parquet Files from TSV Table
Partitioned by product category.

In [None]:
# Temporary S3 location passed to the Athena connection as its staging
# directory for queries
s3_staging_dir = 's3://{bucket}/staging/athena'.format(bucket=bucket)

In [None]:
# CTAS statement: copy the TSV table into a Parquet table stored at the
# destination S3 path, partitioned by product_category (the partition column
# is the last column of the SELECT). A `year` column is derived from
# review_date, and review_date itself is rewritten as days since 1970-01-01.
statement = """CREATE TABLE IF NOT EXISTS {database}.{parquet_table}
WITH (format = 'PARQUET', external_location = '{location}', partitioned_by = ARRAY['product_category']) AS
SELECT marketplace,
         customer_id,
         review_id,
         product_id,
         product_parent,
         product_title,
         star_rating,
         helpful_votes,
         total_votes,
         vine,
         verified_purchase,
         review_headline,
         review_body,
         CAST(YEAR(DATE(review_date)) AS INTEGER) AS year,
         DATE_DIFF('day', DATE('1970-01-01'), DATE(review_date)) AS review_date,
         product_category
FROM {database}.{tsv_table}""".format(
    database=database_name,
    parquet_table=table_name_parquet,
    location=s3_destination_path_parquet,
    tsv_table=table_name_tsv,
)

# Show the fully rendered SQL before it is executed
print(statement)


### Execute statement using connection cursor
This takes a few minutes.  Please be patient.

In [None]:
# Open an Athena connection for this region / staging directory and run the
# statement built above through a fresh cursor
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

### Load partitions by running `MSCK REPAIR TABLE`

In [None]:
# Statement that loads the partitions of the Parquet table
statement = 'MSCK REPAIR TABLE {database}.{table}'.format(
    database=database_name,
    table=table_name_parquet,
)

In [None]:
# Run the current statement on a new Athena connection cursor
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

### Run a sample query

In [None]:
# Sample query: up to 500 rows from a single product_category partition
statement = """SELECT * FROM {database}.{table}
    WHERE product_category = 'Digital_Video_Download' LIMIT 500""".format(
    database=database_name,
    table=table_name_parquet,
)

# Show the rendered SQL
print(statement)

In [None]:
# Run the statement on a new Athena connection cursor; the cursor is kept in
# `cursor` so its results can be read afterwards
cursor = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region_name,
).cursor()
cursor.execute(statement)

In [None]:
# Convert the cursor's query results into a pandas DataFrame
df = as_pandas(cursor)

# Preview the first five rows
df.head(n=5)