# AWS Glue DataBrew

# Prerequisites

Add the following trust relationship to your IAM role:

```
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "databrew.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
```

Then attach an access policy for Glue DataBrew to the role.

# Imports and Settings

In [None]:
import boto3
import json
import sagemaker
import pandas as pd
from botocore.exceptions import ClientError

# Resolve the AWS account and region this notebook is running against.
session = boto3.session.Session()
sts = session.client(service_name='sts')
caller_identity = sts.get_caller_identity()
account_id = caller_identity.get('Account')
print(account_id)

region = session.region_name
print(region)

# SageMaker supplies the default S3 bucket and the execution role that are
# reused below for the DataBrew project and job.
sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
print(bucket)

role = sagemaker.get_execution_role()
print(role)

# Client used for every DataBrew API call in the rest of the notebook.
db = boto3.client('databrew')

# Create Dataset

In [None]:
import time

# Whole seconds since the epoch; appended to resource names so that each run
# of the notebook creates uniquely named DataBrew resources.
now_seconds = time.time()
timestamp = int(now_seconds)

In [None]:
# Name of the DataBrew dataset created by this run.
dataset_name = 'reviews-dataset-{}'.format(timestamp)

# S3 bucket and key prefix that hold the input data.
input_bucket, key = 'amazon-reviews-pds', 'parquet/'

In [None]:
# S3 location DataBrew should read the dataset from.
s3_input_definition = {'Bucket': input_bucket, 'Key': key}

# Register the dataset with DataBrew.
response = db.create_dataset(
    Name=dataset_name,
    Input={'S3InputDefinition': s3_input_definition},
)

# default=str stringifies any value json cannot serialize natively.
response_text = json.dumps(response, indent=4, sort_keys=True, default=str)
print(response_text)

In [None]:
# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML

# Render a clickable link to the dataset preview in the DataBrew console.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/databrew/home?region={}#dataset-details?dataset={}&tab=preview">Dataset</a></b>'.format(region, dataset_name)))


In [None]:
# Fetch the dataset's metadata; the next cell reads its ARN from `response`.
response = db.describe_dataset(Name=dataset_name)

response_text = json.dumps(response, indent=4, sort_keys=True, default=str)
print(response_text)

## Get Dataset Resource ARN

In [None]:
# Pull the ARN out of the describe_dataset response from the previous cell.
dataset_arn = response['ResourceArn']

# One value per line: dataset name first, then its ARN.
print(dataset_name, dataset_arn, sep='\n')

# Create Recipe

In [None]:
# Name of the DataBrew recipe created by this run (suffixed with the run timestamp).
recipe_name = 'reviews-dataset-recipe-{}'.format(timestamp)

# View Recipe File

In [None]:
# Shell escape: print the recipe file with the `pygmentize` command-line tool.
!pygmentize ./amazon-reviews-dataset-recipe.json

# Load Recipe File

In [None]:
# Read the recipe definition from disk. Decode explicitly as UTF-8 (the
# standard encoding for JSON) instead of relying on the platform default.
with open('./amazon-reviews-dataset-recipe.json', 'r', encoding='utf-8') as recipe_file:
    file_object = recipe_file.read()

# Parse the JSON text into the recipe steps passed to create_recipe below.
recipe_steps = json.loads(file_object)

# Echo the parsed steps for review.
print(json.dumps(recipe_steps, indent=4, sort_keys=True, default=str))

# Create Recipe From File

In [None]:
# Create the recipe in DataBrew from the steps loaded from the JSON file.
recipe_description = 'Amazon Customer Reviews Recipe'
response = db.create_recipe(
    Name=recipe_name,
    Description=recipe_description,
    Steps=recipe_steps,
)

In [None]:
# Pretty-print the create_recipe response.
response_text = json.dumps(response, indent=4, sort_keys=True, default=str)
print(response_text)

## Create Project

In [None]:
# Name of the DataBrew project created by this run (suffixed with the run timestamp).
project_name = str.format('reviews-dataset-project-{}', timestamp)

In [None]:
# Sample the project works on: the first 500 rows of the dataset.
project_sample = {'Size': 500, 'Type': 'FIRST_N'}

# Create the project that ties the dataset and the recipe together.
response = db.create_project(
    Name=project_name,
    DatasetName=dataset_name,
    RecipeName=recipe_name,
    RoleArn=role,
    Sample=project_sample,
)

In [None]:
# Pretty-print the create_project response.
response_text = json.dumps(response, indent=4, sort_keys=True, default=str)
print(response_text)

In [None]:
# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML

# Render a clickable link to the project workspace in the DataBrew console.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/databrew/home?region={}#project-workspace?project={}&view=grid">Project</a></b>'.format(region, project_name)))


# Create Recipe Job

In [None]:
# Name of the recipe job created by this run (suffixed with the run timestamp).
job_name = 'reviews-dataset-recipe-job-{}'.format(timestamp)

# The job writes its results under the `databrew/` prefix of the default
# SageMaker bucket resolved in the settings cell.
output_bucket, output_key = bucket, 'databrew/'

# TODO: Add the DataBrew Trust Relationship to the IAM Role

In [None]:
# Single CSV output, written to the output bucket/prefix chosen above and
# overwriting any previous result.
csv_output = {
    'Format': 'CSV',
    'PartitionColumns': [],
    'Location': {'Bucket': output_bucket, 'Key': output_key},
    'Overwrite': True,
}

# The job is bound to the project through ProjectName, so the alternative
# DatasetName / RecipeReference arguments are intentionally not passed.
response = db.create_recipe_job(
    Name=job_name,
    ProjectName=project_name,
    RoleArn=role,
    Outputs=[csv_output],
    LogSubscription='ENABLE',
    MaxCapacity=10,
    MaxRetries=0,
    Timeout=2880,  # NOTE(review): presumably minutes per the DataBrew API; confirm
)

response_text = json.dumps(response, indent=4, sort_keys=True, default=str)
print(response_text)

In [None]:
# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML

# Render a clickable link to the recipe job's run history in the DataBrew console.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/databrew/home?region={}#job-details?job={}&tab=history">Recipe Job</a></b>'.format(region, job_name)))


# Start Job Run

In [None]:
# Kick off a run of the recipe job; the next cell reads the run id from `response`.
response = db.start_job_run(Name=job_name)

response_text = json.dumps(response, indent=4, sort_keys=True, default=str)
print(response_text)

# Get Job Run ID

In [None]:
# Identifier of the run started by the start_job_run call in the previous cell.
run_id_field = 'RunId'
job_run_id = response[run_id_field]
print(job_run_id)

# List Job Run

In [None]:
# List the runs recorded for this job; the next cell reads the state of the first entry.
response = db.list_job_runs(Name=job_name)

response_text = json.dumps(response, indent=4, sort_keys=True, default=str)
print(response_text)

In [None]:
# State of the first run returned by list_job_runs.
first_run = response['JobRuns'][0]
status = first_run['State']
print(status)

# _Wait for the job run to complete. The job runs for about 30 minutes._

In [None]:
%%time

import time

response = db.list_job_runs(Name=job_name)

while response['JobRuns'][0]['State'] == 'RUNNING':
    response = db.list_job_runs(Name=job_name)
    status = response['JobRuns'][0]['State']
    print('Job Run State: {}'.format(status))
    time.sleep(15)

# Review S3 Bucket With CSV File

In [None]:
# Import from the public IPython.display module: importing `display` from
# IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display, HTML

# Render a clickable link to the job's output prefix in the S3 console.
display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/s3/buckets/{}?region={}&prefix={}">S3 Bucket</a></b>'.format(output_bucket, region, output_key)))

# Show the CSV Files

In [None]:
# File name of the CSV part to download: the job name followed by `_part00000.csv`.
part_file_template = '{}_part00000.csv'
part_file = part_file_template.format(job_name)
print(part_file)

In [None]:
# "<bucket>/<prefix>" portion of the S3 URI used by the download cell below.
s3_location_template = '{}/{}'
s3_output_bucket = s3_location_template.format(output_bucket, output_key)
print(s3_output_bucket)

In [None]:
# Shell escape: copy the object named by `part_file` from the job's S3 output
# location (`s3_output_bucket`) into the current directory.
!aws s3 cp s3://$s3_output_bucket$part_file ./

In [None]:
import csv

# Load the file that the `aws s3 cp` cell copied into the current directory.
# Its name is derived from the timestamped job name (`part_file`), so a
# hard-coded file name would not match the downloaded file.
df_reviews = pd.read_csv('./{}'.format(part_file))

In [None]:
# Preview the first five rows of the transformed reviews.
df_reviews.head(5)

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}