# Setup Comprehend Through CLI/API:
https://docs.aws.amazon.com/comprehend/latest/dg/get-started-customclass.html

# Setup Comprehend Through AWS Console 

https://docs.aws.amazon.com/comprehend/latest/dg/getting-started-document-classification.html

Good example of using Comprehend for Positive/Negative Sentiment:  https://github.com/aws-samples/amazon-comprehend-custom-entity/blob/master/3-AWS-Comprehend-Negative-Custom-Classifier.ipynb

# Make sure this SageMakerNotebookExecutionRole has access to Comprehend

In [1]:
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [2]:
comprehend = boto3.client('comprehend')

In [3]:
%store -r noheader_train_s3_uri

print(noheader_train_s3_uri)

s3://sagemaker-us-east-1-835319576252/data/amazon_reviews_us_Digital_Software_v1_00_noheader.csv


In [4]:
!aws s3 ls $noheader_train_s3_uri

2020-05-16 05:20:30   13660650 amazon_reviews_us_Digital_Software_v1_00_noheader.csv


# Create Data Access Role for Comprehend

## Create Policy

In [5]:
assume_role_policy_doc = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "comprehend.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
} 

## Create Role and Attach Policies

In [6]:
iam_comprehend_role_name = 'DSOAWS_Comprehend'

In [7]:
import json
import boto3
from botocore.exceptions import ClientError

try:
    iam = boto3.client('iam')

    iam_role_comprehend = iam.create_role(
        RoleName=iam_comprehend_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Comprehend Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        iam_role_comprehend = iam.get_role(RoleName=iam_comprehend_role_name)
        print("Role already exists")
    else:
        print("Unexpected error: %s" % e)

Role already exists


In [8]:
comprehend_s3_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": [
                "s3:GetObject"
            ],
            "Resource": [
                "arn:aws:s3:::{}/*".format(bucket)
            ],
            "Effect": "Allow"
        },
        {
            "Action": [
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket)
            ],
            "Effect": "Allow"
        },
        {
            "Action": [
                "s3:PutObject"
            ],
            "Resource": [
                "arn:aws:s3:::{}/*".format(bucket)
            ],
            "Effect": "Allow"
        }
    ]
}

print(comprehend_s3_policy_doc)


{'Version': '2012-10-17', 'Statement': [{'Action': ['s3:GetObject'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-835319576252/*'], 'Effect': 'Allow'}, {'Action': ['s3:ListBucket'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-835319576252'], 'Effect': 'Allow'}, {'Action': ['s3:PutObject'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-835319576252/*'], 'Effect': 'Allow'}]}


# Attach Policy to Role

In [9]:
response = iam.put_role_policy(
    RoleName=iam_comprehend_role_name,
    PolicyName='DSOAWS_ComprehendPolicyToS3',
    PolicyDocument=json.dumps(comprehend_s3_policy_doc)
)

# Train the Model

In [10]:
prefix = 'models'

s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'comprehend/output')
print(s3_output_job)

s3://sagemaker-us-east-1-835319576252/models/comprehend/output


In [11]:
iam_role_comprehend_arn = iam_role_comprehend['Role']['Arn']

In [12]:
import datetime

id = str(datetime.datetime.now().strftime("%s"))

training_job = comprehend.create_document_classifier(
    DocumentClassifierName='Amazon-Customer-Reviews-Classifier-'+ id,
    DataAccessRoleArn=iam_role_comprehend_arn,
    InputDataConfig={
        'S3Uri': noheader_train_s3_uri
    },
    OutputDataConfig={
        'S3Uri': s3_output_job
    },
    LanguageCode='en'
)

In [13]:
import time

jobArn = training_job['DocumentClassifierArn']

max_time = time.time() + 3 * 60 * 60 # 3 hours
while time.time() < max_time:
    describe_custom_classifier = comprehend.describe_document_classifier(
        DocumentClassifierArn = jobArn
    )
    status = describe_custom_classifier["DocumentClassifierProperties"]["Status"]
    print("Custom classifier: {}".format(status))
    
    if status == "TRAINED" or status == "IN_ERROR":
        print('')
        print('Status {}'.format(status))
        print('')
        print(describe_custom_classifier["DocumentClassifierProperties"])
        break
        
    time.sleep(5)

Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: SUBMITTED
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRA

Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: T

Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINED

Status TRAINED

{'DocumentClassifierArn': 'arn:aws:comprehend:us-east-1:835319576252:document-classifier/Amazon-Customer-Reviews-Classifier-1589606450', 'LanguageCode': 'en', 'Status': 'TRAINED', 'SubmitTime': datetime.datetime(2020, 5, 16, 5, 20, 50, 733000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2020, 5, 16, 6, 11, 15, 762000, tzinfo=tzlocal()), 'TrainingStartTime': datetime.datetime(2020, 5, 16, 5, 25, 18, 570000, tzinfo=tzlocal()), 'TrainingEndTime': 

# _Please Wait Until the ^^ Classifier ^^ is Trained Above._

In [14]:
!aws s3 ls 

2020-01-08 11:27:57 amazon-forecast-data-835319576252
2020-01-03 18:12:44 aws-athena-query-results-us-east-1-835319576252
2019-12-30 14:34:28 aws-glue-scripts-835319576252-us-east-1
2019-12-30 14:34:29 aws-glue-temporary-835319576252-us-east-1
2019-11-19 00:07:42 aws-kubeflow-workshop
2019-12-11 18:28:37 aws-logs-835319576252-us-east-1
2020-01-01 13:57:23 cf-templates-wn3lggj1tszd-us-east-1
2020-01-07 21:28:57 cf-templates-wn3lggj1tszd-us-west-2
2020-01-14 19:12:29 cfregly-sfn-helloworld
2020-01-14 19:16:15 cfregly-sfn-helloworld-cloudtrail-log-storage
2019-10-06 20:49:39 cloudtrail-awslogs-835319576252-fykrpo1n-isengard-do-not-delete
2020-05-10 18:40:55 cloudtrail-dsoaws-835319576252
2019-12-30 14:29:35 data-science-on-aws-isengard
2020-02-17 18:45:47 data-science-on-aws-us-east-2
2019-10-06 21:04:32 do-not-delete-gatedgarden-audit-835319576252
2020-05-10 18:40:47 dsoaws-data-upload-835319576252
2020-01-03 17:40:30 explore-reviews-dataset-targetbucket-1hdsco0rhrsbb
20

# Show Results of the Classifier

In [15]:
print(describe_custom_classifier["DocumentClassifierProperties"])

{'DocumentClassifierArn': 'arn:aws:comprehend:us-east-1:835319576252:document-classifier/Amazon-Customer-Reviews-Classifier-1589606450', 'LanguageCode': 'en', 'Status': 'TRAINED', 'SubmitTime': datetime.datetime(2020, 5, 16, 5, 20, 50, 733000, tzinfo=tzlocal()), 'EndTime': datetime.datetime(2020, 5, 16, 6, 11, 15, 762000, tzinfo=tzlocal()), 'TrainingStartTime': datetime.datetime(2020, 5, 16, 5, 25, 18, 570000, tzinfo=tzlocal()), 'TrainingEndTime': datetime.datetime(2020, 5, 16, 6, 9, 29, 26000, tzinfo=tzlocal()), 'InputDataConfig': {'S3Uri': 's3://sagemaker-us-east-1-835319576252/data/amazon_reviews_us_Digital_Software_v1_00_noheader.csv'}, 'OutputDataConfig': {'S3Uri': 's3://sagemaker-us-east-1-835319576252/models/comprehend/output/835319576252-CLR-b402cd9f1511f8d543bc5aa1457e0055/output/output.tar.gz'}, 'ClassifierMetadata': {'NumberOfLabels': 5, 'NumberOfTrainedDocuments': 27905, 'NumberOfTestDocuments': 3100, 'EvaluationMetrics': {'Accuracy': 0.5284, 'Precision': 0.5257, 'Recall': 

In [16]:
model_arn = describe_custom_classifier["DocumentClassifierProperties"]["DocumentClassifierArn"]
print(model_arn)

arn:aws:comprehend:us-east-1:835319576252:document-classifier/Amazon-Customer-Reviews-Classifier-1589606450


In [17]:
import os
#Retrieve the S3URI from the model output and create jobkey variable.
job_output = describe_custom_classifier["DocumentClassifierProperties"]["OutputDataConfig"]["S3Uri"]
print(job_output)

path_prefix = 's3://{}/'.format(bucket)

job_key = os.path.relpath(job_output, path_prefix)

print(job_key)

s3://sagemaker-us-east-1-835319576252/models/comprehend/output/835319576252-CLR-b402cd9f1511f8d543bc5aa1457e0055/output/output.tar.gz
models/comprehend/output/835319576252-CLR-b402cd9f1511f8d543bc5aa1457e0055/output/output.tar.gz


In [18]:
!aws s3 cp $job_output ./output.tar.gz


Completed 289 Bytes/289 Bytes (5.0 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-1-835319576252/models/comprehend/output/835319576252-CLR-b402cd9f1511f8d543bc5aa1457e0055/output/output.tar.gz to ./output.tar.gz


In [19]:
#Download the model metrics
s3 = boto3.resource('s3')

s3.Bucket(bucket).download_file(job_key, './output.tar.gz')

In [20]:
#Unpack the gzip file
!tar xvzf ./output.tar.gz

output/
output/confusion_matrix.json


In [21]:
import json

with open('./output/confusion_matrix.json') as json_file:
    data = json.load(json_file)
print(json.dumps(data, indent=2, default=str))

{
  "confusion_matrix": [
    [
      400,
      155,
      44,
      10,
      11
    ],
    [
      151,
      265,
      163,
      24,
      17
    ],
    [
      56,
      148,
      282,
      107,
      27
    ],
    [
      17,
      27,
      107,
      277,
      192
    ],
    [
      15,
      14,
      31,
      146,
      414
    ]
  ],
  "labels": [
    "1",
    "2",
    "3",
    "4",
    "5"
  ],
  "type": "multi_class",
  "all_labels": [
    "1",
    "2",
    "3",
    "4",
    "5"
  ]
}


In [22]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.8.7-py3-none-any.whl (24 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.8.7


In [23]:
from IPython.display import HTML, display
import tabulate
table = [['', '1', '2', '3', '4', '5', '(Predicted)'],
         ['1', data['confusion_matrix'][0][0], data['confusion_matrix'][0][1], data['confusion_matrix'][0][2], data['confusion_matrix'][0][3], data['confusion_matrix'][0][4]],
         ['2', data['confusion_matrix'][1][0], data['confusion_matrix'][1][1], data['confusion_matrix'][1][2], data['confusion_matrix'][1][3], data['confusion_matrix'][1][4]],
         ['3', data['confusion_matrix'][2][0], data['confusion_matrix'][2][1], data['confusion_matrix'][2][2], data['confusion_matrix'][2][3], data['confusion_matrix'][2][4]],
         ['4', data['confusion_matrix'][3][0], data['confusion_matrix'][3][1], data['confusion_matrix'][3][2], data['confusion_matrix'][3][3], data['confusion_matrix'][3][4]],
         ['5', data['confusion_matrix'][4][0], data['confusion_matrix'][4][1], data['confusion_matrix'][4][2], data['confusion_matrix'][4][3], data['confusion_matrix'][4][4]],
         ['(Actual)']]
display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1,2,3,4,5,6
,1.0,2.0,3.0,4.0,5.0,(Predicted)
1,400.0,155.0,44.0,10.0,11.0,
2,151.0,265.0,163.0,24.0,17.0,
3,56.0,148.0,282.0,107.0,27.0,
4,17.0,27.0,107.0,277.0,192.0,
5,15.0,14.0,31.0,146.0,414.0,
(Actual),,,,,,


# Deploy Endpoint

In [24]:
inference_endpoint_response = comprehend.create_endpoint(
    EndpointName='comprehend-inference-endpoint',
    ModelArn = model_arn,
    DesiredInferenceUnits = 1
)

In [25]:
endpoint_arn = inference_endpoint_response["EndpointArn"]
print(endpoint_arn)

arn:aws:comprehend:us-east-1:835319576252:document-classifier-endpoint/comprehend-inference-endpoint


# Predict with Endpoint

In [26]:
describe_response = comprehend.describe_endpoint(
    EndpointArn = endpoint_arn
)
print(describe_response)

{'EndpointProperties': {'EndpointArn': 'arn:aws:comprehend:us-east-1:835319576252:document-classifier-endpoint/comprehend-inference-endpoint', 'Status': 'CREATING', 'ModelArn': 'arn:aws:comprehend:us-east-1:835319576252:document-classifier/Amazon-Customer-Reviews-Classifier-1589606450', 'DesiredInferenceUnits': 1, 'CurrentInferenceUnits': 0, 'CreationTime': datetime.datetime(2020, 5, 16, 6, 11, 20, 740000, tzinfo=tzlocal()), 'LastModifiedTime': datetime.datetime(2020, 5, 16, 6, 11, 20, 740000, tzinfo=tzlocal())}, 'ResponseMetadata': {'RequestId': '96832574-749d-4138-bd2d-c7181d7b7879', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '96832574-749d-4138-bd2d-c7181d7b7879', 'content-type': 'application/x-amz-json-1.1', 'content-length': '400', 'date': 'Sat, 16 May 2020 06:11:20 GMT'}, 'RetryAttempts': 0}}


In [27]:
import time

max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_response = comprehend.describe_endpoint(
        EndpointArn = endpoint_arn
    )
    status = describe_response["EndpointProperties"]["Status"]
    print("Endpoint: {}".format(status))
    
    if status == "IN_SERVICE" or status == "IN_ERROR":
        break
        
    time.sleep(5)

Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CREATING
Endpoint: CR

In [28]:
txt = """I loved it!  I will recommend this to everyone."""

response = comprehend.classify_document(
    Text= txt,
    EndpointArn = endpoint_arn
)

import json
print(json.dumps(response, indent=2, default=str))

{
  "Classes": [
    {
      "Name": "5",
      "Score": 0.8985000252723694
    },
    {
      "Name": "4",
      "Score": 0.07240000367164612
    },
    {
      "Name": "1",
      "Score": 0.010900000110268593
    }
  ],
  "ResponseMetadata": {
    "RequestId": "bb03f60f-7a09-4a04-9d79-ada00caab4e3",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "bb03f60f-7a09-4a04-9d79-ada00caab4e3",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "136",
      "date": "Sat, 16 May 2020 06:21:57 GMT"
    },
    "RetryAttempts": 0
  }
}


In [30]:
txt = """It's OK."""

response = comprehend.classify_document(
    Text= txt,
    EndpointArn = endpoint_arn
)

import json
print(json.dumps(response, indent=2, default=str))

{
  "Classes": [
    {
      "Name": "3",
      "Score": 0.73580002784729
    },
    {
      "Name": "2",
      "Score": 0.11640000343322754
    },
    {
      "Name": "4",
      "Score": 0.08299999684095383
    }
  ],
  "ResponseMetadata": {
    "RequestId": "36099dda-7e0d-4885-9949-0c8a46921930",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "36099dda-7e0d-4885-9949-0c8a46921930",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "133",
      "date": "Sat, 16 May 2020 06:21:57 GMT"
    },
    "RetryAttempts": 0
  }
}


In [29]:
txt = """Really bad.  I hope they don't make this anymore."""

response = comprehend.classify_document(
    Text= txt,
    EndpointArn = endpoint_arn
)

import json
print(json.dumps(response, indent=2, default=str))

{
  "Classes": [
    {
      "Name": "1",
      "Score": 0.5094000101089478
    },
    {
      "Name": "2",
      "Score": 0.27810001373291016
    },
    {
      "Name": "3",
      "Score": 0.1670999974012375
    }
  ],
  "ResponseMetadata": {
    "RequestId": "41e299b2-74d9-41d0-acf2-81bab495aaa6",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "x-amzn-requestid": "41e299b2-74d9-41d0-acf2-81bab495aaa6",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "134",
      "date": "Sat, 16 May 2020 06:21:57 GMT"
    },
    "RetryAttempts": 0
  }
}


# Navigate to Comprehend in the AWS Console
![Comprehend Console](img/comprehend-console.png)