# Invoke a Model from Amazon Athena

### Install PyAthena

In [None]:
!pip install -q PyAthena==1.8.0

In [None]:
from pyathena import connect
from pyathena.pandas_cursor import PandasCursor
from pyathena.util import as_pandas

In [None]:
import boto3
import sagemaker
import pandas as pd

# Create a boto3 session and read the region it is configured for
session = boto3.session.Session()
region_name = session.region_name

# Get the SageMaker execution role, the SageMaker session, and that
# session's default S3 bucket (the role is looked up once; a second,
# identical lookup was redundant)
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

In [None]:
# S3 key prefix for the TSV data
tsv_prefix = "amazon-reviews-pds/tsv"

# Athena database, the source (TSV) table, and the table derived from it
database_name = "dsoaws"
table_name_tsv = "amazon_reviews_tsv"
table_name = "product_reviews"

In [None]:
# Temporary S3 location that Athena uses as its staging directory for queries
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [None]:
# CTAS template: creates the target table (if it does not exist yet) from the
# review_id and review_body columns of the source table.
ctas_template = """
CREATE TABLE IF NOT EXISTS {}.{} AS 
SELECT review_id, review_body 
FROM {}.{}
"""

# Fill the placeholders positionally: target table first, then source table
ctas_target = (database_name, table_name)
ctas_source = (database_name, table_name_tsv)
statement = ctas_template.format(*ctas_target, *ctas_source)

print(statement)

In [None]:
# Open an Athena connection, take a cursor from it, and run the statement
athena_connection = connect(s3_staging_dir=s3_staging_dir, region_name=region_name)
cursor = athena_connection.cursor()
cursor.execute(statement)

In [None]:
# Query up to 10 rows from <database_name>.<table_name> on the existing cursor
preview_template = 'SELECT * FROM {}.{} LIMIT 10'
statement = preview_template.format(database_name, table_name)
cursor.execute(statement)

In [None]:
# Convert the cursor's query results to pandas via PyAthena's as_pandas helper;
# the bare name on the last line makes the notebook display the result
df_show = as_pandas(cursor)
df_show

## SageMaker Model Endpoint

In [None]:
# Name of the SageMaker endpoint referenced by the Athena inference query
ep_name = "reviews-endpoint"

In [None]:
# Athena query that declares an inline function backed by the SageMaker
# endpoint and applies it to review_body for up to 10 rows.
# The endpoint name is a string literal inside the WITH clause, so it must be
# wrapped in single quotes (an unquoted name such as reviews-endpoint is not
# valid SQL). The argument and return types use VARCHAR, the string type
# Athena accepts in queries.
statement = """
USING FUNCTION predict_star_rating(review_body VARCHAR) 
    RETURNS VARCHAR TYPE
    SAGEMAKER_INVOKE_ENDPOINT WITH (sagemaker_endpoint = '{}')

SELECT review_body, predict_star_rating(review_body) AS star_rating FROM {}.{} LIMIT 10
""".format(ep_name, database_name, table_name)

In [None]:
# Open a fresh Athena connection and cursor, then run the current statement
inference_connection = connect(s3_staging_dir=s3_staging_dir, region_name=region_name)
cursor = inference_connection.cursor()
cursor.execute(statement)

# TODO: Use Athena data to train model

In [None]:
from sagemaker import RandomCutForest

# S3 key prefix used below for both the training data location and the output path
prefix = 'athena-ml/anomalydetection'
execution_role = sagemaker.get_execution_role()
# NOTE(review): this rebinds `session`, which earlier in the file held the boto3
# session; the new value is not used anywhere else in this cell.
session = sagemaker.Session()

# Configure the Random Cut Forest estimator: one ml.c4.8xlarge training instance,
# with data_location and output_path both under s3://<bucket>/<prefix>/
rcf = RandomCutForest(role=execution_role,
                      train_instance_count=1,
                      train_instance_type='ml.c4.8xlarge',
                      data_location='s3://{}/{}/'.format(bucket, prefix),
                      output_path='s3://{}/{}/output'.format(bucket, prefix),
                      num_samples_per_tree=512,
                      num_trees=50)

# Train on a single-column array built from `results.number`.
# NOTE(review): `results` is not defined anywhere in the visible source, so this
# line raises NameError as written. Presumably it should be a DataFrame of Athena
# query results with a numeric `number` column -- confirm and define it first.
rcf.fit(rcf.record_set(results.number.values.reshape(-1,1)))

print('Training job name: {}'.format(rcf.latest_training_job.job_name))

# Deploy the trained model to an endpoint backed by one ml.c4.8xlarge instance
rcf_inference = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.c4.8xlarge',
)

print('\nEndpoint name (used by Athena): {}'.format(rcf_inference.endpoint))