# Moderate Text in documents using Amazon Textract

***
This notebook provides a walkthrough of [Amazon Textract APIs](https://docs.aws.amazon.com/textract/latest/dg/API_Operations.html) to extract text, forms and tables. You can then search extracted text for unsafe content.
***

# Initialize the notebook

In [None]:
# Initialise Notebook
import boto3
from IPython.display import HTML, display, Image as IImage
from PIL import Image, ImageDraw, ImageFont
import time
import os
from trp import Document

In [None]:
# Current AWS Region. Use this to choose corresponding S3 bucket with sample content
# NOTE(review): region_name presumably is None when no default region is configured,
# which would make the later bucket-name concatenation fail — confirm the environment.

mySession = boto3.session.Session()
awsRegion = mySession.region_name

In [None]:
# Create the AWS service clients used by the rest of the notebook:
#   s3       - generates presigned URLs so sample images can be displayed
#   textract - runs text detection and form analysis on the sample documents
s3 = boto3.client("s3")
textract = boto3.client("textract")

In [None]:
# S3 bucket that contains sample images and videos

# We are providing sample images and videos in this bucket so
# you do not have to manually download/upload test images and videos.
# The bucket name is region-specific: it is the fixed prefix below followed by
# the region name resolved from the current boto3 session.

bucketName = "aws-workshops-" + awsRegion

In [None]:
# Create temporary directory
# This directory is not needed to call Textract APIs.
# It is only intended as a local scratch location for files downloaded
# from the S3 bucket while working through the notebook.
#
# os.makedirs(..., exist_ok=True) is used instead of the `!mkdir` shell escape
# so that re-running this cell is a no-op when the directory already exists
# and the cell does not depend on a shell `mkdir` command being available.

tempFolder = 'm1tmp/'
os.makedirs(tempFolder, exist_ok=True)

# Detect text in document
***

In [None]:
# S3 object key (inside bucketName) of the sample document image used in this section
imageName = "content-moderation/media/simple-document-image.jpg"

In [None]:
# Render the sample image inline: a presigned GET URL for the S3 object is
# generated and handed to IPython's Image display helper.
display(
    IImage(
        url=s3.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucketName, 'Key': imageName},
        )
    )
)

#### Call Textract to detect text in the image

In [None]:
# Ask Amazon Textract to detect text in the sample image.
# The image is referenced in place by its S3 bucket and key.
# The result is kept in `response`, which the following cells inspect.
response = textract.detect_document_text(
    Document={'S3Object': {'Bucket': bucketName, 'Name': imageName}}
)

#### Review the raw JSON response from Textract

In [None]:
# Show the JSON response returned by the Textract Detect Document Text API.
# In the JSON response below, you will see text, confidence score and additional information.
# The "Blocks" list in this response is what the following cells iterate over.

display(response)

#### Display detected lines

In [None]:
# Print every detected line of text.
# Only blocks of type LINE are printed; each one is wrapped in ANSI escape
# sequences ('\033[94m' ... '\033[0m') so it stands out in the cell output.
for block in response["Blocks"]:
    if block["BlockType"] != "LINE":
        continue
    print('\033[94m' + block["Text"] + '\033[0m')

In [None]:
# Words considered unsafe for this demo; matching is exact and case-sensitive.
unsafeWords = ["WA"]

# Scan the individual WORD blocks and report any whose text is in the list.
# The block type is tested first so "Text" is only read for WORD blocks.
for block in response["Blocks"]:
    if block["BlockType"] == "WORD" and block["Text"] in unsafeWords:
        print("Detected unsafe word: {}".format(block["Text"]))

# Analyze Form

In [None]:
# S3 object key (inside bucketName) of the sample form image used in this section.
# Note: this rebinds imageName, which earlier pointed at the simple document image.
imageName = "content-moderation/media/employmentapp.png"

In [None]:
# Render the form image inline via a presigned GET URL for the S3 object.
display(
    IImage(
        url=s3.generate_presigned_url(
            'get_object',
            Params={'Bucket': bucketName, 'Key': imageName},
        )
    )
)

In [None]:
# Ask Amazon Textract to analyze the form image, requesting the FORMS feature.
# The image is referenced in place by its S3 bucket and key.
# Note: this overwrites `response` from the text-detection section above.
response = textract.analyze_document(
    Document={'S3Object': {'Bucket': bucketName, 'Name': imageName}},
    FeatureTypes=["FORMS"],
)

In [None]:
# Show the JSON response returned by the Textract Analyze Document call (FORMS feature).
display(response)

In [None]:
# Substring that marks a form field key as personal information for this demo.
unsafeField = "Address"

# Wrap the raw Textract response with the trp Document helper, then walk every
# form field on every page and report those whose key text contains unsafeField.
doc = Document(response)
for page in doc.pages:
    for field in page.form.fields:
        keyText = "{}".format(field.key)
        if unsafeField not in keyText:
            continue
        print("Found personal information field => Key: {}, Value: {}".format(field.key, field.value))

***
### References
- https://docs.aws.amazon.com/textract/latest/dg/API_Operations.html

***