# AWS Glue DataBrew

In [1]:
# Use the %pip magic (not !pip) so the install targets the running kernel's environment.
%pip install -q --upgrade pip

In [3]:
# Versions are pinned so the notebook runs against a known AWS CLI / SDK combination.
# %pip (not !pip) installs into the running kernel's environment.
%pip install -q awscli==1.18.203 boto3==1.16.43 botocore==1.19.43

In [None]:
# # Restart the kernel to pick up pip installed libraries
# from IPython.core.display import HTML
# HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# Prerequisites

Add The Following Trust Relationship To Your IAM Role

```
    {
      "Sid": "",
      "Effect": "Allow",
      "Principal": {
        "Service": "databrew.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
```

Also attach an access policy for AWS Glue DataBrew to the same IAM role.

# Imports and Settings

In [5]:
import csv
import json
import time

import boto3
import pandas as pd
import sagemaker
from botocore.exceptions import ClientError
from IPython.display import HTML, display

# Resolve the AWS account and region behind the current credentials.
session = boto3.session.Session()
sts = session.client(service_name='sts')
account_id = sts.get_caller_identity().get('Account')
region = session.region_name

# SageMaker-provided defaults: the session's default S3 bucket and the execution role.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# Client for the AWS Glue DataBrew API used by the cells in this notebook.
db = boto3.client('databrew')

# Echo the resolved settings, one per line.
print(account_id, region, bucket, role, sep='\n')

231218423789
us-east-1
sagemaker-us-east-1-231218423789
arn:aws:iam::231218423789:role/TeamRole


# Describe Project

In [None]:
# Optional example: inspect an existing DataBrew project.
# NOTE(review): 'dlai' is not created by the visible cells of this notebook; replace it
# with the name of a project that exists in your account before uncommenting.
# response = db.describe_project(
#     Name='dlai'
# )

# print(json.dumps(response, indent=4, sort_keys=True, default=str))

# Describe Dataset

In [None]:
# Optional example: inspect an existing DataBrew dataset.
# NOTE(review): 'amazon-reviews-parquet' is not created by the visible cells of this
# notebook; replace it with the name of a dataset that exists before uncommenting.
# response = db.describe_dataset(
#     Name='amazon-reviews-parquet'
# )

# print(json.dumps(response, indent=4, sort_keys=True, default=str))

# Create Dataset

In [6]:
# Name of the DataBrew dataset to create, plus the S3 bucket and key it reads from.
dataset_name = 'amazon-customer-reviews-dataset'
input_bucket = 'amazon-reviews-pds'
key = 'parquet/'

In [7]:
# Register the S3 location as a DataBrew dataset, then pretty-print the API response.
response = db.create_dataset(
    Name=dataset_name,
    Input={'S3InputDefinition': {'Bucket': input_bucket, 'Key': key}},
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "amazon-customer-reviews-dataset",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "42",
            "content-type": "application/json",
            "date": "Sat, 26 Dec 2020 18:17:39 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "YLCqlESZoAMF3AA=",
            "x-amz-cf-id": "vqynRgE6DDruwNJhF8quSJ1aS007v99yZdr8PEAQOk_afA4lksflSw==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "c28f8b7b-c8b0-43f9-8362-9005f20929ff",
            "x-amzn-trace-id": "Root=1-5fe77e43-2b070829300471d72c31c065",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "c28f8b7b-c8b0-43f9-8362-9005f20929ff",
        "RetryAttempts": 0
    }
}


In [8]:
# Render a link to the dataset preview in the DataBrew console.
# `display` and `HTML` come from the public IPython.display module (see the imports cell);
# target="_blank" opens the console in a new browser tab.
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/databrew/home?region={}#dataset-details?dataset={}&tab=preview">Dataset</a></b>'.format(region, dataset_name)))


In [9]:
# Fetch the metadata of the dataset created above and pretty-print it.
response = db.describe_dataset(Name=dataset_name)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "CreateDate": "2020-12-26 18:17:39.866000+00:00",
    "CreatedBy": "arn:aws:sts::231218423789:assumed-role/TeamRole/SageMaker",
    "Input": {
        "S3InputDefinition": {
            "Bucket": "amazon-reviews-pds",
            "Key": "parquet/"
        }
    },
    "Name": "amazon-customer-reviews-dataset",
    "ResourceArn": "arn:aws:databrew:us-east-1:231218423789:dataset/amazon-customer-reviews-dataset",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "536",
            "content-type": "application/json",
            "date": "Sat, 26 Dec 2020 18:19:04 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "YLC32HrRoAMF9Cg=",
            "x-amz-cf-id": "NCM5PH9NBMmhSkTKk-aRWZzncFUugf3L96pENZ0hA6AfjqL7QLskNQ==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "3ce19654-5e30-4256-8a1f-da7ad076717b",
            "x-

## Get Dataset Resource ARN

In [10]:
# Keep the dataset's ARN from the describe_dataset response and echo name + ARN.
dataset_arn = response['ResourceArn']
print(dataset_name, dataset_arn, sep='\n')

amazon-customer-reviews-dataset
arn:aws:databrew:us-east-1:231218423789:dataset/amazon-customer-reviews-dataset


# Describe Recipe

In [18]:
# Name under which the recipe is registered in DataBrew.
recipe_name = 'amazon-customer-reviews-dataset-recipe'

In [None]:
# Optional example: inspect an existing DataBrew recipe.
# NOTE(review): 'dlai-recipe' is not created by the visible cells of this notebook;
# replace it with the name of a recipe that exists before uncommenting.
# response = db.describe_recipe(
#     Name='dlai-recipe'
# )

# print(json.dumps(response, indent=4, sort_keys=True, default=str))

# View Recipe File

In [19]:
# Show the local recipe file with syntax highlighting (shell command; needs `pygmentize` on PATH).
!pygmentize ./amazon-reviews-dataset-recipe.json

[
  {
    [94m"Action"[39;49;00m: {
      [94m"Operation"[39;49;00m: [33m"DELETE"[39;49;00m,
      [94m"Parameters"[39;49;00m: {
        [94m"sourceColumns"[39;49;00m: [33m"[\"marketplace\",\"customer_id\",\"review_id\",\"product_id\",\"product_parent\",\"product_title\",\"helpful_votes\",\"total_votes\",\"vine\",\"verified_purchase\",\"review_headline\",\"review_date\",\"year\"]"[39;49;00m
      }
    }
  }
]


# Load Recipe File

In [16]:
# Read and parse the recipe steps from the local JSON file.
# The encoding is set explicitly so parsing does not depend on the platform's default
# text encoding.
with open('./amazon-reviews-dataset-recipe.json', 'r', encoding='utf-8') as recipe_file:
    recipe_steps = json.load(recipe_file)

# Echo the parsed steps so the reader can confirm what will be sent to DataBrew.
print(json.dumps(recipe_steps, indent=4, sort_keys=True, default=str))

[
    {
        "Action": {
            "Operation": "DELETE",
            "Parameters": {
                "sourceColumns": "[\"marketplace\",\"customer_id\",\"review_id\",\"product_id\",\"product_parent\",\"product_title\",\"helpful_votes\",\"total_votes\",\"vine\",\"verified_purchase\",\"review_headline\",\"review_date\",\"year\"]"
            }
        }
    }
]


# Create Recipe From File

In [20]:
# Register the steps loaded from the recipe file as a DataBrew recipe.
response = db.create_recipe(
    Name=recipe_name,
    Description='Amazon Customer Reviews Recipe',
    Steps=recipe_steps,
)

In [21]:
# Pretty-print the create_recipe response; default=str stringifies values json cannot serialize.
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "amazon-customer-reviews-dataset-recipe",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "49",
            "content-type": "application/json",
            "date": "Sat, 26 Dec 2020 18:21:38 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "YLDP9G-ZoAMFxtQ=",
            "x-amz-cf-id": "KcMHpEMFEed5hkJ2Z7so0xEFZd4FmUp5Yj1r_IPdrYo4hZsZX4uxLA==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "236e8e6b-0a83-4c54-b197-eff900255b99",
            "x-amzn-trace-id": "Root=1-5fe77f32-119028a20e13aceb4ff930f0",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "236e8e6b-0a83-4c54-b197-eff900255b99",
        "RetryAttempts": 0
    }
}


## Create Project

In [22]:
# Name of the DataBrew project that ties the dataset and the recipe together.
project_name = 'amazon-customer-reviews-dataset-project'

In [24]:
# Create a project that pairs the dataset with the recipe, using a FIRST_N sample of
# 500 rows.
# RoleArn uses the notebook's execution role (`role`) instead of a hard-coded,
# account-specific ARN so the notebook is portable across accounts. That role must
# trust databrew.amazonaws.com, as described under "Prerequisites".
response = db.create_project(
    DatasetName=dataset_name,
    Name=project_name,
    RecipeName=recipe_name,
    Sample={
        'Size': 500,
        'Type': 'FIRST_N'
    },
    RoleArn=role
)

In [25]:
# Pretty-print the create_project response; default=str stringifies values json cannot serialize.
print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "amazon-customer-reviews-dataset-project",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "50",
            "content-type": "application/json",
            "date": "Sat, 26 Dec 2020 18:22:12 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "YLDVHFsNIAMFTxg=",
            "x-amz-cf-id": "XwbQHbziEpMCuZvpjyfO5AK3CiHnW6l6chUXw1eGvgUPT5ISGNwHZA==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "5b63138f-dcfe-4a77-9c04-af26f9284959",
            "x-amzn-trace-id": "Root=1-5fe77f53-6c59f6981d72b40710388d2e",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "5b63138f-dcfe-4a77-9c04-af26f9284959",
        "RetryAttempts": 0
    }
}


In [26]:
# Render a link to the project workspace in the DataBrew console.
# `display` and `HTML` come from the public IPython.display module (see the imports cell);
# target="_blank" opens the console in a new browser tab.
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/databrew/home?region={}#project-workspace?project={}&view=grid">Project</a></b>'.format(region, project_name)))


# Create Recipe Job

In [None]:
# Optional example: inspect an existing DataBrew job.
# NOTE(review): 'test-job' is not created by the visible cells of this notebook;
# replace it with the name of a job that exists before uncommenting.
# response = db.describe_job(
#     Name='test-job'
# )

# print(json.dumps(response, indent=4, sort_keys=True, default=str))

In [27]:
# Recipe job name and the S3 destination (bucket + key prefix) for its output.
# The output goes to the SageMaker default bucket resolved in the settings cell.
job_name = 'amazon-customer-reviews-dataset-recipe-job'
output_key = 'databrew/'
output_bucket = bucket

In [30]:
# Create a recipe job for the project and write its result as CSV to S3.
# The job is bound to the project via ProjectName; passing DatasetName together with
# RecipeReference={'Name': recipe_name} is the alternative that was left disabled here.
# RoleArn uses the notebook's execution role (`role`) instead of a hard-coded,
# account-specific ARN so the notebook is portable across accounts. That role must
# trust databrew.amazonaws.com, as described under "Prerequisites".
response = db.create_recipe_job(
    Name=job_name,
    LogSubscription='ENABLE',
    MaxCapacity=10,
    MaxRetries=0,
    Outputs=[
        {
            'Format': 'CSV',
            'PartitionColumns': [],
            'Location': {
                'Bucket': output_bucket,
                'Key': output_key
            },
            # Replace any previous output at the same location.
            'Overwrite': True
        },
    ],
    ProjectName=project_name,
    RoleArn=role,
    Timeout=2880
)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "Name": "amazon-customer-reviews-dataset-recipe-job",
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "53",
            "content-type": "application/json",
            "date": "Sat, 26 Dec 2020 18:26:35 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "YLD-QG0ooAMFigg=",
            "x-amz-cf-id": "oZ7SMZOWCSzLh9bLH8lADaghjf9DHQ3BvrkvCamggwXs9H6ZtfDvWw==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "ff4c0e10-6c18-4e5f-87bc-6b52b77c7bab",
            "x-amzn-trace-id": "Root=1-5fe7805b-65f21d0d7c76a5994878fe26",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "ff4c0e10-6c18-4e5f-87bc-6b52b77c7bab",
        "RetryAttempts": 0
    }
}


In [31]:
# Render a link to the recipe job's run history in the DataBrew console.
# `display` and `HTML` come from the public IPython.display module (see the imports cell);
# target="_blank" opens the console in a new browser tab.
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/databrew/home?region={}#job-details?job={}&tab=history">Recipe Job</a></b>'.format(region, job_name)))


# Start Job Run

In [32]:
# Kick off a run of the recipe job; the response carries the RunId of the new run.
response = db.start_job_run(Name=job_name)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "79",
            "content-type": "application/json",
            "date": "Sat, 26 Dec 2020 18:26:44 GMT",
            "via": "1.1 ddeb8679359f033dad405557c487bfdd.cloudfront.net (CloudFront)",
            "x-amz-apigw-id": "YLD_jGoIoAMFXMw=",
            "x-amz-cf-id": "JPhdLGMxw221AR4ixDav-jZHpFGtYakGBrQMk5rVp8Rzd5DHCrx-Jg==",
            "x-amz-cf-pop": "IAD89-C3",
            "x-amzn-requestid": "8d4ad6d3-63c8-42ee-a129-db5498d269d7",
            "x-amzn-trace-id": "Root=1-5fe78063-7fedff736f2e2eb71717407c",
            "x-cache": "Miss from cloudfront"
        },
        "HTTPStatusCode": 200,
        "RequestId": "8d4ad6d3-63c8-42ee-a129-db5498d269d7",
        "RetryAttempts": 0
    },
    "RunId": "db_e013ce63e5a8e2111571cbe92ca4bc779735f15db562e7c033ad2f195ac631dd"
}


# Get Job Run ID

In [33]:
# Keep the identifier of the run started by start_job_run.
job_run_id = response['RunId']
print(job_run_id)

db_e013ce63e5a8e2111571cbe92ca4bc779735f15db562e7c033ad2f195ac631dd


# List Job Runs

In [34]:
# List the runs recorded for the recipe job and pretty-print the response.
response = db.list_job_runs(Name=job_name)

print(json.dumps(response, indent=4, sort_keys=True, default=str))

{
    "JobRuns": [
        {
            "Attempt": 0,
            "DatasetName": "amazon-customer-reviews-dataset",
            "ExecutionTime": 0,
            "JobName": "amazon-customer-reviews-dataset-recipe-job",
            "LogGroupName": "/aws-glue-databrew/jobs-amazon-customer-reviews-dataset-recipe-job",
            "LogSubscription": "ENABLE",
            "Outputs": [
                {
                    "Format": "CSV",
                    "Location": {
                        "Bucket": "sagemaker-us-east-1-231218423789",
                        "Key": "databrew/"
                    },
                    "Overwrite": true,
                    "PartitionColumns": []
                }
            ],
            "RecipeReference": {
                "Name": "amazon-customer-reviews-dataset-recipe",
                "RecipeVersion": "LATEST_WORKING"
            },
            "RunId": "db_e013ce63e5a8e2111571cbe92ca4bc779735f15db562e7c033ad2f195ac631dd",
            "StartedBy

In [35]:
# State of the first entry in the list_job_runs response.
# NOTE(review): assumes the first entry is the run started above — confirm ordering.
status = response['JobRuns'][0]['State']
print(status)

RUNNING


# _Wait For The Job Run To Complete. The Job Runs For About 30 Minutes._

In [36]:
%%time

import time

response = db.list_job_runs(Name=job_name)

while response['JobRuns'][0]['State'] == 'RUNNING':
    response = db.list_job_runs(Name=job_name)
    status = response['JobRuns'][0]['State']
    print('Job Run State: {}'.format(status))
    time.sleep(15)

Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run State: RUNNING
Job Run Sta

# Review S3 Bucket With CSV File

In [37]:
# Render a link to the job's output prefix in the S3 console.
# `display` and `HTML` come from the public IPython.display module (see the imports cell);
# target="_blank" opens the console in a new browser tab.
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/s3/buckets/{}?region={}&prefix={}">S3 Bucket</a></b>'.format(output_bucket, region, output_key)))

# Show the CSV Files

In [50]:
# File name of the first CSV part, built from the job name.
part_file = '{}_part00000.csv'.format(job_name)
print(part_file)

amazon-customer-reviews-dataset-recipe-job_part00000.csv


In [51]:
# Bucket and key prefix joined as "<bucket>/<prefix>"; despite the name this is a
# location under the bucket, not a bare bucket name.
s3_output_bucket = '{}/{}'.format(output_bucket, output_key)
print(s3_output_bucket)

sagemaker-us-east-1-231218423789/databrew/


In [47]:
# Download the CSV part into the working directory; IPython expands $s3_output_bucket
# and $part_file from the notebook's Python variables before running the shell command.
!aws s3 cp s3://$s3_output_bucket$part_file ./

download: s3://sagemaker-us-east-1-231218423789/databrew/amazon-customer-reviews-dataset-recipe-job_part00000.csv to ./amazon-customer-reviews-dataset-recipe-job_part00000.csv


In [48]:
# Load the downloaded CSV part into a DataFrame. The path is derived from `part_file`
# rather than a hard-coded file name, so it stays in sync with the download step if
# the job name changes.
df_reviews = pd.read_csv('./{}'.format(part_file))

In [49]:
# Preview the first rows of the job's output.
df_reviews.head()

Unnamed: 0,star_rating,review_body
0,5,"After attending a few Qigong classes, I wanted..."
1,4,Krauss traces the remarkable transformation in...
2,4,"Rebecca, a dental hygienist, receives a call a..."
3,5,"\\""BARED TO YOU\\"" is a sizzling, red-hot pass..."
4,5,Good characters and plot line. I spent a pleas...
