# Setup Comprehend Through CLI/API:
https://docs.aws.amazon.com/comprehend/latest/dg/get-started-customclass.html

# Setup Comprehend Through AWS Console 

https://docs.aws.amazon.com/comprehend/latest/dg/getting-started-document-classification.html

Good example of using Comprehend for Positive/Negative Sentiment:  https://github.com/aws-samples/amazon-comprehend-custom-entity/blob/master/3-AWS-Comprehend-Negative-Custom-Classifier.ipynb

# Make sure this SageMakerNotebookExecutionRole has access to Comprehend

In [87]:
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)

In [88]:
comprehend = boto3.client('comprehend')

In [89]:
%store -r noheader_train_s3_uri

print(noheader_train_s3_uri)

s3://sagemaker-us-east-1-835319576252/data/amazon_reviews_us_Digital_Software_v1_00_noheader.csv


In [90]:
!aws s3 ls $noheader_train_s3_uri

2020-03-28 04:35:35   15164605 amazon_reviews_us_Digital_Software_v1_00_noheader.csv


# Setup IAM Roles and Policies
TODO:  Fix this.

In [81]:
data_access_role = 'arn:aws:iam::835319576252:role/service-role/AmazonComprehendServiceRole-dsoaws'

In [82]:
print(data_access_role)

arn:aws:iam::835319576252:role/service-role/AmazonComprehendServiceRole-dsoaws


# Create Data Access Role for Comprehend

## Create Policy

In [107]:
assume_role_policy_doc = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "comprehend.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
} 

## Create Role and Attach Policies

In [108]:
iam_comprehend_role_name = 'DSOAWS_Comprehend'

In [109]:
import json
import boto3
from botocore.exceptions import ClientError

try:
    iam = boto3.client('iam')

    iam_role_comprehend = iam.create_role(
        RoleName=iam_comprehend_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Comprehend Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Role already exists")
    else:
        print("Unexpected error: %s" % e)

Role already exists


In [106]:
comprehend_s3_policy_doc = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": [
                "s3:GetObject"
            ],
            "Resource": [
                "arn:aws:s3:::sagemaker-us-east-1-835319576252/*"
            ],
            "Effect": "Allow"
        },
        {
            "Action": [
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::sagemaker-us-east-1-835319576252"
            ],
            "Effect": "Allow"
        },
        {
            "Action": [
                "s3:PutObject"
            ],
            "Resource": [
                "arn:aws:s3:::sagemaker-us-east-1-835319576252/*"
            ],
            "Effect": "Allow"
        }
    ]
}

print(comprehend_s3_policy_doc)


{'Version': '2012-10-17', 'Statement': [{'Action': ['s3:GetObject'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-835319576252/*'], 'Effect': 'Allow'}, {'Action': ['s3:ListBucket'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-835319576252'], 'Effect': 'Allow'}, {'Action': ['s3:PutObject'], 'Resource': ['arn:aws:s3:::sagemaker-us-east-1-835319576252/*'], 'Effect': 'Allow'}]}


In [111]:
try:
    policy_comprehend_to_s3 = iam.create_policy(
      PolicyName='DSOAWS_ComprehendPolicyToS3',
      PolicyDocument=json.dumps(comprehend_s3_policy_doc)
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        print("Policy already exists")
    else:
        print("Unexpected error: %s" % e)

Policy already exists


# Train our Model

In [112]:
prefix = 'models'

s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'comprehend/output')
print(s3_output_job)

s3://sagemaker-us-east-1-835319576252/models/comprehend/output


In [114]:
import datetime

id = str(datetime.datetime.now().strftime("%s"))

training_job = comprehend.create_document_classifier(
    DocumentClassifierName='Amazon-Customer-Reviews-Classifier-'+ id,
    DataAccessRoleArn=iam_role_comprehend,
    InputDataConfig={
        'S3Uri': noheader_train_s3_uri
    },
    OutputDataConfig={
        'S3Uri': s3_output_job
    },
    LanguageCode='en'
)

ParamValidationError: Parameter validation failed:
Invalid type for parameter DataAccessRoleArn, value: {'Role': {'Path': '/', 'RoleName': 'DSOAWS_Comprehend', 'RoleId': 'AROA4E7HNG26H42NHPBXP', 'Arn': 'arn:aws:iam::835319576252:role/DSOAWS_Comprehend', 'CreateDate': datetime.datetime(2020, 3, 28, 21, 30, 24, tzinfo=tzlocal()), 'AssumeRolePolicyDocument': {'Version': '2012-10-17', 'Statement': [{'Effect': 'Allow', 'Principal': {'Service': 'comprehend.amazonaws.com'}, 'Action': 'sts:AssumeRole'}]}}, 'ResponseMetadata': {'RequestId': '97a6f043-c59d-4c59-866f-ec9c7d5a469d', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '97a6f043-c59d-4c59-866f-ec9c7d5a469d', 'content-type': 'text/xml', 'content-length': '792', 'date': 'Sat, 28 Mar 2020 21:30:23 GMT'}, 'RetryAttempts': 0}}, type: <class 'dict'>, valid types: <class 'str'>

In [None]:
jobArn = training_job['DocumentClassifierArn']

max_time = time.time() + 3 * 60 * 60 # 3 hours
while time.time() < max_time:
    describe_custom_classifier = comprehend.describe_document_classifier(
        DocumentClassifierArn = jobArn
    )
    status = describe_custom_classifier["DocumentClassifierProperties"]["Status"]
    print("Custom classifier: {}".format(status))
    
    if status == "TRAINED" or status == "IN_ERROR":
        break
        
    time.sleep(5)

Custom classifier: SUBMITTED
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: TRAINING
Custom classifier: 

In [None]:
model_arn = describe_custom_classifier["DocumentClassifierProperties"]["DocumentClassifierArn"]
print(model_arn)

In [None]:
#Retrieve the S3URI from the model output and create jobkey variable.
job_output = describe_custom_classifier["DocumentClassifierProperties"]["OutputDataConfig"]["S3Uri"]
path_prefix = 's3://{}/'.format(bucket)
job_key = os.path.relpath(job_output, path_prefix)


In [None]:
#Download the model metrics
s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file(job_key, './output.tar.gz')

In [None]:
#Unpack the gzip file
!tar xvzf ./output.tar.gz


In [None]:
import json

with open('./output/confusion_matrix.json') as json_file:
    data = json.load(json_file)
print(json.dumps(data, indent=2, default=str))

In [None]:
!pip install tabulate

In [None]:
from IPython.display import HTML, display
import tabulate
table = [["","NEGATIVE","POSITIVE","(Predicted)"],
         ["NEGATIVE",data['confusion_matrix'][0][0], data['confusion_matrix'][0][1]],
         ["POSTIVE",data['confusion_matrix'][1][0], data['confusion_matrix'][1][1]],
         ["(Actual)"]]
display(HTML(tabulate.tabulate(table, tablefmt='html')))

# Deploy Endpoint

In [None]:
inference_endpoint_response = comprehend.create_endpoint(
    EndpointName='comprehend-inference-endpoint',
    ModelArn = model_arn,
    DesiredInferenceUnits = 1
)

In [None]:
endpoint_arn = inference_endpoint_response["EndpointArn"]


# Predict with Endpoint

In [None]:
# #endpoint_arn = inference_endpoint_response["EndpointArn"]

# # TODO:  get account_id
# #account_id = sess.get_account??

# endpoint_arn = 'arn:aws:comprehend:{}:{}:document-classifier-endpoint/reviews-star-rating'.format(region, account_id)


In [None]:
describe_response = comprehend.describe_endpoint(
    EndpointArn = endpoint_arn
)

In [None]:
import time

max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_response = comprehend.describe_endpoint(
        EndpointArn = endpoint_arn
    )
    status = describe_response["EndpointProperties"]["Status"]
    print("Endpoint: {}".format(status))
    
    if status == "IN_SERVICE" or status == "IN_ERROR":
        break

In [None]:
txt = "This product is awesome."

response = comprehend.classify_document(
    Text= txt,
    EndpointArn = endpoint_arn
)

import json
print(json.dumps(response, indent=2, default=str))

In [None]:
txt = "This product is ok."

response = comprehend.classify_document(
    Text= txt,
    EndpointArn = endpoint_arn
)

import json
print(json.dumps(response, indent=2, default=str))

In [None]:
txt = "This product is terrible."

response = comprehend.classify_document(
    Text= txt,
    EndpointArn = endpoint_arn
)

import json
print(json.dumps(response, indent=2, default=str))