In [None]:
# Refresh the apt package index, then install poppler-utils; -y answers the
# confirmation prompts automatically.
# NOTE(review): presumably required by the PDF support of the library
# installed in the next cell -- confirm.
!apt-get update -y
!apt-get install poppler-utils -y

In [None]:
# Install the Textractor helper library together with its "pdf" extra.
# %pip (instead of !pip) installs into the environment of the running kernel,
# and the quotes stop the shell from treating "[pdf]" as a glob pattern.
%pip install -q "amazon-textract-textractor[pdf]"

In [None]:
import pandas as pd
from collections import OrderedDict
import boto3
from PIL import Image

In [None]:
# Low-level Textract client. No region or credentials are passed here, so
# boto3 resolves them from the ambient configuration.
textract = boto3.client('textract')

## Extraindo o texto de uma imagem

In [None]:
def load_image(filename):
    """Return the full binary contents of ``filename`` as a ``bytearray``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(filename, "rb") as handle:
        raw = handle.read()
    return bytearray(raw)

def save_words(textract_response):
    """Collect the WORD blocks of a Textract response.

    Args:
        textract_response: mapping with a ``'Blocks'`` entry holding a
            sequence of block dicts, each carrying a ``'BlockType'`` key.

    Returns:
        A new list with the block dicts whose ``'BlockType'`` equals
        ``'WORD'``, in their original order. The dicts themselves are not
        copied.
    """
    return [
        entry
        for entry in textract_response['Blocks']
        if entry['BlockType'] == 'WORD'
    ]

In [None]:
# Location of the sample image, relative to the notebook's working directory.
file_dir = 'documents'
sample_file = f'{file_dir}/emenda.jpg'
# Last expression of the cell: the opened image is shown as the cell output.
Image.open(sample_file)

In [None]:
# Send the raw bytes of the sample image to Textract's text-detection call.
document_payload = {'Bytes': load_image(sample_file)}
response = textract.detect_document_text(Document=document_payload)

# Bare expression so the notebook displays the whole API response.
response

In [None]:
# Keep only the WORD blocks of the response and display how many there are.
words = save_words(response)
len(words)

In [None]:
# Rebuild the plain text from the WORD blocks. Every word is followed by one
# space, so the resulting string ends with a trailing space.
text = ''.join(word['Text'] + ' ' for word in words)
print(text)

### Utilizando biblioteca de alto nível

In [None]:
from textractor import Textractor

In [None]:
# High-level Textractor client, pinned explicitly to the us-east-1 region.
extractor = Textractor(region_name="us-east-1")

In [None]:
# Run text detection on the sample image through Textractor.
# NOTE(review): save_image=True presumably keeps the page image on the result
# so it can be visualized later -- confirm against the Textractor docs.
document = extractor.detect_document_text(file_source=sample_file, save_image=True)

In [None]:
# Display the pages of the detected document.
document.pages

In [None]:
# Display the lines found on the first page.
document.pages[0].lines

In [None]:
# Display the words found on the first page.
document.pages[0].words

In [None]:
# Visualize all detected words of the document.
# NOTE(review): presumably draws the word boxes over the page image -- confirm.
document.words.visualize()

## Análise de grandes documentos

In [None]:
import sagemaker
from textractor.data.constants import TextractFeatures

# Use the default bucket of the current SageMaker session; the cells below
# pass it as the S3 upload location for the asynchronous Textract jobs.
sess = sagemaker.Session()
bucket = sess.default_bucket()
print(bucket)

In [None]:
# NOTE: this cell rebinds file_dir, sample_file and document, which earlier
# cells set for the image example; from here on they refer to the PDF.
file_dir = './documents'
sample_file = f'{file_dir}/acordao.pdf'
# Start text detection on the PDF; the local file is given as file_source and
# the temp/ prefix of the bucket as the S3 upload path.
document = extractor.start_document_text_detection(
    file_source=sample_file,
    s3_upload_path=f"s3://{bucket}/temp/"
)

In [None]:
# Display the pages of the PDF result.
document.pages

In [None]:
# Display the text of the first page of the PDF result.
document.pages[0].text

In [None]:
# Run document analysis on the same PDF, requesting only the FORMS feature.
# The result replaces the text-detection result previously held in document.
document = extractor.start_document_analysis(
    file_source=sample_file,
    s3_upload_path=f"s3://{bucket}/temp/",
    features=[TextractFeatures.FORMS]
)

In [None]:
# Look up "relator" in the analyzed document.
# NOTE(review): presumably matches against the extracted form keys -- confirm
# the matching rules in the Textractor Document.get documentation.
document.get("relator")