# Amazon Glue DataBrew

# Prerequisites

Add The Following Trust Relationship To Your IAM Role

```
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "databrew.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
```

and Access Policy For Glue DataBrew.

# Imports and Settings

In [1]:
import boto3
import json
import sagemaker
import pandas as pd
from botocore.exceptions import ClientError

session = boto3.session.Session()
sts = session.client(service_name='sts')
account_id = sts.get_caller_identity().get('Account') 
print(account_id)

region = session.region_name
print(region)

sagemaker_session = sagemaker.Session()

bucket = sagemaker_session.default_bucket()
print(bucket)

role = sagemaker.get_execution_role()
print(role)

db = boto3.client('databrew')

231218423789
us-east-1
sagemaker-us-east-1-231218423789
arn:aws:iam::231218423789:role/TeamRole


# Create Dataset

In [4]:
import time
timestamp = int(time.time())

In [5]:
dataset_name = 'reviews-dataset-{}'.format(timestamp)
input_bucket='amazon-reviews-pds'
key='parquet/'

In [6]:
response = db.create_dataset(
    Name=dataset_name,
    Input={
        'S3InputDefinition': {
            'Bucket': input_bucket,
            'Key': key
        },
    },
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "reviews-dataset-1610754714",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "37",
            "content-type": "application/json",
            "date": "Fri, 15 Jan 2021 23:51:54 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "ZNuYKHknoAMFxsA=",
            "x-amz-cf-id": "xtehPGxwnR94PJm0Ki0OoqVnuYfWu6CmXSF1vYOgCo9VjpF5ZLoKFw==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "f3f71469-de19-4648-b686-49d5964f34e9",
            "x-amzn-trace-id": "Root=1-60022a9a-0c1998157756f3c45e3d986e",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "f3f71469-de19-4648-b686-49d5964f34e9",
        "RetryAttempts": 0
    }
}


In [7]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/databrew/home?region={}#dataset-details?dataset={}&tab=preview">Dataset</a></b>'.format(region, dataset_name)))


In [8]:
response = db.describe_dataset(
    Name=dataset_name
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "CreateDate": "2021-01-15 23:51:54.657000+00:00",
    "CreatedBy": "arn:aws:sts::231218423789:assumed-role/TeamRole/SageMaker",
    "Input": {
        "S3InputDefinition": {
            "Bucket": "amazon-reviews-pds",
            "Key": "parquet/"
        }
    },
    "Name": "reviews-dataset-1610754714",
    "ResourceArn": "arn:aws:databrew:us-east-1:231218423789:dataset/reviews-dataset-1610754714",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "526",
            "content-type": "application/json",
            "date": "Fri, 15 Jan 2021 23:51:54 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "ZNuYMFtfoAMFhRw=",
            "x-amz-cf-id": "0etzAXUzU97jWopkMuFoW51Z5ZUo1VG1KLic19SsM4w5_dSWXSGEFQ==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "6207d1d4-bedc-4d0a-afd4-1c600e6bf566",
            "x-amzn-trace

## Get Dataset Resource ARN

In [9]:
dataset_arn = response['ResourceArn']
print(dataset_name)
print(dataset_arn)

reviews-dataset-1610754714
arn:aws:databrew:us-east-1:231218423789:dataset/reviews-dataset-1610754714


# Create Recipe

In [10]:
recipe_name='reviews-dataset-recipe-{}'.format(timestamp)

# View Recipe File

In [12]:
!pygmentize ./amazon-reviews-dataset-recipe.json

[
  {
    [94m"Action"[39;49;00m: {
      [94m"Operation"[39;49;00m: [33m"DELETE"[39;49;00m,
      [94m"Parameters"[39;49;00m: {
        [94m"sourceColumns"[39;49;00m: [33m"[\"marketplace\",\"customer_id\",\"review_id\",\"product_id\",\"product_parent\",\"product_title\",\"helpful_votes\",\"total_votes\",\"vine\",\"verified_purchase\",\"review_headline\",\"review_date\",\"year\"]"[39;49;00m
      }
    }
  }
]


# Load Recipe File

In [13]:
# Read file
with open('./amazon-reviews-dataset-recipe.json', 'r') as file:
    file_object=file.read()

# Parse file
recipe_steps = json.loads(file_object)

print(json.dumps(recipe_steps, indent=4, sort_keys=True, default=str))

[
    {
        "Action": {
            "Operation": "DELETE",
            "Parameters": {
                "sourceColumns": "[\"marketplace\",\"customer_id\",\"review_id\",\"product_id\",\"product_parent\",\"product_title\",\"helpful_votes\",\"total_votes\",\"vine\",\"verified_purchase\",\"review_headline\",\"review_date\",\"year\"]"
            }
        }
    }
]


# Create Recipe From File

In [14]:
response = db.create_recipe(
    Description='Amazon Customer Reviews Recipe',
    Name=recipe_name,
    Steps=recipe_steps
)

In [15]:
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "reviews-dataset-recipe-1610754714",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "44",
            "content-type": "application/json",
            "date": "Fri, 15 Jan 2021 23:52:01 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "ZNuZQHDCIAMFgSw=",
            "x-amz-cf-id": "lEPSLWUPgIV8IEnRvoQefpkJrGfXk1RlIXrSSiqZV1C3Mzj6jwx2ew==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "7c01248e-e790-4afd-9a43-82f81053384d",
            "x-amzn-trace-id": "Root=1-60022aa1-4b8469312f58f825058b0f09",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "7c01248e-e790-4afd-9a43-82f81053384d",
        "RetryAttempts": 0
    }
}


## Create Project

In [16]:
project_name = 'reviews-dataset-project-{}'.format(timestamp)

In [18]:
response = db.create_project(
    DatasetName=dataset_name,
    Name=project_name,
    RecipeName=recipe_name,
    Sample={
        'Size': 500,
        'Type': 'FIRST_N'
    },
    RoleArn=role
)

In [19]:
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "reviews-dataset-project-1610754714",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "45",
            "content-type": "application/json",
            "date": "Fri, 15 Jan 2021 23:52:18 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "ZNubzGrGIAMFvbQ=",
            "x-amz-cf-id": "ohszVYJmnea5zunVopzCxGsBECoe--oSl9AVts7H0Dfgxj7viAkzKg==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "7d89a2fc-681b-47b0-b544-b0a481ac7998",
            "x-amzn-trace-id": "Root=1-60022ab1-7e8c68ba6b53ee6c6009e93b",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "7d89a2fc-681b-47b0-b544-b0a481ac7998",
        "RetryAttempts": 0
    }
}


In [20]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/databrew/home?region={}#project-workspace?project={}&view=grid">Project</a></b>'.format(region, project_name)))


# Create Recipe Job

In [22]:
job_name = 'reviews-dataset-recipe-job-{}'.format(timestamp)
output_bucket = bucket
output_key = 'databrew/'

# TODO: Add Data Brew Trust Relation to IAM Role

In [23]:
response = db.create_recipe_job(
#    DatasetName=dataset_name,
    Name=job_name,
    LogSubscription='ENABLE',
    MaxCapacity=10,
    MaxRetries=0,
    Outputs=[
        {
            'Format': 'CSV',
            'PartitionColumns': [],
            'Location': {
                'Bucket': output_bucket,
                'Key': output_key
            },
            'Overwrite': True
        },
    ],
    ProjectName=project_name,
#     RecipeReference={
#         'Name': recipe_name
#     },
    RoleArn=role,
    Timeout=2880
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "reviews-dataset-recipe-job-1610754714",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "48",
            "content-type": "application/json",
            "date": "Fri, 15 Jan 2021 23:52:24 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "ZNucvGHjoAMFkqA=",
            "x-amz-cf-id": "yeKVRvl5myfGp4V4rFjdFxuZy3R25BloW8UuvYLH39nIadfePxpcsA==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "d5af0969-a285-4e83-9da8-1fc40cf6e246",
            "x-amzn-trace-id": "Root=1-60022ab7-71a2f620576e278d697cf720",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "d5af0969-a285-4e83-9da8-1fc40cf6e246",
        "RetryAttempts": 0
    }
}


In [24]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/databrew/home?region={}#job-details?job={}&tab=history">Recipe Job</a></b>'.format(region, job_name)))


# Start Job Run

In [25]:
response = db.start_job_run(
    Name=job_name
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "79",
            "content-type": "application/json",
            "date": "Fri, 15 Jan 2021 23:52:25 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "ZNuc2FUkoAMFwHg=",
            "x-amz-cf-id": "82yg5fH4R_3qXvVqenRKhTQpUprq7cvlzU09yz6zdcooX5tsI9hsEw==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "a9da9a47-8e21-4c8d-b411-ff6fa97fb570",
            "x-amzn-trace-id": "Root=1-60022ab8-34825f8f7f077fc465b67b99",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "a9da9a47-8e21-4c8d-b411-ff6fa97fb570",
        "RetryAttempts": 0
    },
    "RunId": "db_5ae96799210402b4091914ee6a02232b4ef63c180bb8e57f1403cf7bf06fa8a3"
}


# Get Job Run ID

In [26]:
job_run_id = response['RunId']
print(job_run_id)

db_5ae96799210402b4091914ee6a02232b4ef63c180bb8e57f1403cf7bf06fa8a3


# List Job Run

In [27]:
response = db.list_job_runs(
    Name=job_name
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "JobRuns": [
        {
            "Attempt": 0,
            "DatasetName": "reviews-dataset-1610754714",
            "ExecutionTime": 0,
            "JobName": "reviews-dataset-recipe-job-1610754714",
            "LogGroupName": "/aws-glue-databrew/jobs-reviews-dataset-recipe-job-1610754714",
            "LogSubscription": "ENABLE",
            "Outputs": [
                {
                    "Format": "CSV",
                    "Location": {
                        "Bucket": "sagemaker-us-east-1-231218423789",
                        "Key": "databrew/"
                    },
                    "Overwrite": true,
                    "PartitionColumns": []
                }
            ],
            "RecipeReference": {
                "Name": "reviews-dataset-recipe-1610754714",
                "RecipeVersion": "LATEST_WORKING"
            },
            "RunId": "db_5ae96799210402b4091914ee6a02232b4ef63c180bb8e57f1403cf7bf06fa8a3",
            "StartedBy": "arn:aws:sts::231

In [28]:
status = response['JobRuns'][0]['State']
print(status)

RUNNING


# _Wait For The Job Run To Complete. The Job Runs For About 30min._

In [29]:
%%time

import time

response = db.list_job_runs(Name=job_name)

while response['JobRuns'][0]['State'] == 'RUNNING':
    response = db.list_job_runs(Name=job_name)
    status = response['JobRuns'][0]['State']
    print('Job Run State: {}'.format(status))
    time.sleep(15)

Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run Sta

# Review S3 Bucket With CSV File

In [30]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/s3/buckets/{}?region={}&prefix={}">S3 Bucket</a></b>'.format(output_bucket, region, output_key)))

# Show the CSV Files

In [31]:
part_file='{}_part00000.csv'.format(job_name)
print(part_file)

reviews-dataset-recipe-job-1610754714_part00000.csv


In [32]:
s3_output_bucket='{}/{}'.format(output_bucket, output_key)
print(s3_output_bucket)

sagemaker-us-east-1-231218423789/databrew/


In [33]:
!aws s3 cp s3://$s3_output_bucket$part_file ./

download: s3://sagemaker-us-east-1-231218423789/databrew/reviews-dataset-recipe-job-1610754714_part00000.csv to ./reviews-dataset-recipe-job-1610754714_part00000.csv


In [34]:
import csv

df_reviews = pd.read_csv('./amazon-customer-reviews-dataset-recipe-job_part00000.csv')

FileNotFoundError: [Errno 2] No such file or directory: './amazon-customer-reviews-dataset-recipe-job_part00000.csv'

In [None]:
df_reviews.head()

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}