__Copyright 2023 Daavid Stein. All Rights Reserved__

In [121]:
import boto3
from botocore.exceptions import ClientError
from collections import defaultdict
import pandas as pd

In [122]:
class TextractWrapper:
    """Encapsulates Textract functions."""

    def __init__(self, textract_client, s3_resource, sqs_resource):
        """
        :param textract_client: A Boto3 Textract client.
        :param s3_resource: A Boto3 Amazon S3 resource.
        :param sqs_resource: A Boto3 Amazon SQS resource.
        """
        self.textract_client = textract_client
        self.s3_resource = s3_resource
        self.sqs_resource = sqs_resource
        # Invoice bucket URI; no method of this class reads it.
        self.BUCKET = "s3://costcurve-invoices/"

    def analyze_file(self, *, document_file_name=None, document_bytes=None):
        """
        Runs Textract expense analysis on a local document file or on
        in-memory byte data.

        Exactly one source is used: when ``document_file_name`` is given, the
        file is read and its contents replace any ``document_bytes`` passed in.

        :param document_file_name: The name of a document image file.
        :param document_bytes: In-memory byte data of a document image.
        :return: The response from the Textract client's ``analyze_expense``
                 call.
        :raises ValueError: If neither ``document_file_name`` nor
                            ``document_bytes`` is supplied.
        :raises ClientError: Re-raised unchanged when the Textract call fails.
        """
        if document_file_name is not None:
            with open(document_file_name, "rb") as document_file:
                document_bytes = document_file.read()

        # Fail fast with a readable message instead of sending ``Bytes: None``
        # to the service and surfacing an opaque validation error.
        if document_bytes is None:
            raise ValueError(
                "Either document_file_name or document_bytes must be provided."
            )

        try:
            response = self.textract_client.analyze_expense(
                Document={"Bytes": document_bytes},
            )
        except ClientError:
            print("Couldn't detect text.")
            raise
        else:
            return response

In [123]:
# AWS handles used by the rest of the notebook: two service resources and
# the Textract client that performs the document analysis.
sqs = boto3.resource(service_name="sqs")
s3 = boto3.resource(service_name="s3")
client = boto3.client(service_name="textract")

In [124]:
# Hand the AWS handles to the wrapper, then run expense analysis on the
# sample invoice image; the raw Textract response is kept in `expense`.
textract = TextractWrapper(
    textract_client=client,
    s3_resource=s3,
    sqs_resource=sqs,
)
expense = textract.analyze_file(
    document_file_name="../../data/invoices/jpg/1.jpg",
)
# response = textract.analyze_file(document_file_name="1.jpg")

In [125]:
# Flatten the Textract expense response into column lists keyed by field type
# (the "Type"/"Text" of each line-item field), with one entry per line item.
my_items = defaultdict(list)
row_count = 0  # number of line items fully processed so far
for expense_doc in expense["ExpenseDocuments"]:
    for line_item_group in expense_doc["LineItemGroups"]:
        for line_item in line_item_group["LineItems"]:
            for expense_field in line_item["LineItemExpenseFields"]:
                column = my_items[expense_field["Type"]["Text"]]
                # A field type seen for the first time (or skipped by earlier
                # line items) is back-filled with None so the value lands on
                # the current row rather than shifting up.
                column.extend([None] * (row_count - len(column)))
                column.append(expense_field["ValueDetection"]["Text"])
            row_count += 1

# Pad columns whose field type was absent from trailing line items so all
# columns have the same length and can be loaded into a DataFrame.
for column in my_items.values():
    column.extend([None] * (row_count - len(column)))

In [126]:
# One column per collected field type, one row per line item.
items_table = pd.DataFrame(my_items)
# Bare expression so the notebook renders the table as the cell output.
items_table

Unnamed: 0,PRODUCT_CODE,ITEM,QUANTITY,UNIT_PRICE,PRICE,EXPENSE_ROW
0,FRSQUID-,FRESH SQUID 5-8 R&T (LOLIGO) 1/10# (TUBS) WILD...,1.0,75.99,75.99,FRSQUID- FRESH SQUID 5-8 R&T (LOLIGO) 1/10# (T...
1,FRGRPRE,"RED GROUPER FILLET, SKIN OFF, 1-3 WILD-GULF (50#)",51.0,18.99,968.49,"FRGRPRE RED GROUPER FILLET, SKIN OFF, 1-3 WILD..."
2,FRSALFC,"FRESH SALMON FILLET, CFW, SKIN ON, FARM, SCOTT...",9.8,9.99,97.9,"FRSALFC FRESH SALMON FILLET, CFW, SKIN ON, FAR..."
3,FRMAHIP,"FRESH MAHI PORTIONS (6OZ BLOCK), WILD (20#)",20.9,10.99,229.69,"FRMAHIP FRESH MAHI PORTIONS (6OZ BLOCK), WILD ..."
4,FRSCAL-10,"FRESH SEA SCALLOPS, U/10 CT. DRY (SEA TRADE) 1...",2.0,289.99,579.98,"FRSCAL-10 FRESH SEA SCALLOPS, U/10 CT. DRY (SE..."
5,aCrab-Lu,Pasteurized Lump Blue Crab Meat (Pelagicus)-Pa...,12.0,32.99,395.88,aCrab-Lu Pasteurized Lump Blue Crab Meat (Pela...
6,1620PDT,16/20 P&D Tail-on Raw White Shrimp-Packer 5/2#...,20.0,7.99,159.8,1620PDT 16/20 P&D Tail-on Raw White Shrimp-Pac...
7,06-Mahi P,6oz. Mahi Portions (IVP)-Packer 1/10# Wild-Per...,0.0,9.49,0.0,06-Mahi P 6oz. Mahi Portions (IVP)-Packer 1/10...
8,SERVICE,SHIPPING AND HANDLING,1.0,3.0,3.0,SERVICE SHIPPING AND HANDLING\n1 3.00 3.00
