Use Bedrock to extract the data

In [None]:
import boto3
from langchain.llms import Bedrock

def get_bedrock_client(region="us-east-1"):
    """Build a boto3 client for the ``bedrock-runtime`` service.

    Args:
        region: AWS region name, forwarded to boto3 as ``region_name``.

    Returns:
        The client object created by ``boto3.client``.
    """
    return boto3.client("bedrock-runtime", region_name=region)

def create_bedrock_llm(bedrock_client, model_version_id):
    """Wrap a Bedrock runtime client in a LangChain ``Bedrock`` LLM.

    Args:
        bedrock_client: Client object handed to ``Bedrock`` as ``client``.
        model_version_id: Model identifier handed to ``Bedrock`` as ``model_id``.

    Returns:
        The configured ``Bedrock`` instance.
    """
    # Fixed model_kwargs: temperature 0 and maxTokenCount 2048.
    generation_settings = {"temperature": 0, "maxTokenCount": 2048}
    return Bedrock(
        client=bedrock_client,
        model_id=model_version_id,
        model_kwargs=generation_settings,
    )

# Build the Bedrock client and the Titan Text Express LLM that the
# extraction loop further down passes to extract_data().
client = get_bedrock_client()
llm = create_bedrock_llm(
    bedrock_client=client,
    model_version_id="amazon.titan-text-express-v1",
)

In [None]:
import re

# Matches a backslash that starts a valid JSON escape sequence (group 1
# captures the escape body) or, failing that, a lone backslash (group 1 is
# None). Consuming valid escapes as whole pairs keeps "\\" from being
# misread as two independent backslashes.
_JSON_BACKSLASH = re.compile(r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})|\\')


def _escape_lone_backslashes(text):
    """Double only the backslashes that do not start a valid JSON escape."""
    return _JSON_BACKSLASH.sub(
        lambda match: match.group(0) if match.group(1) is not None else "\\\\",
        text,
    )


def extract_data(model, document):
    """Ask the LLM to parse one invoice and return its cleaned-up JSON text.

    Args:
        model: Callable that takes the prompt string and returns the
            completion as a string.
        document: Invoice text substituted into the prompt.

    Returns:
        The model output as a string, with Markdown code fences and the
        ``tabular-data-json`` tag removed and lone backslashes escaped.
        The string is not parsed or validated here; callers are expected
        to run it through ``json.loads`` themselves.
    """
    # Create prompt to parse the invoice with Bedrock LLM
    prompt_template = """
    Parse the invoice below. Extract data following the instructions into a flat JSON object.
    Extract the following fields into a JSON record:
    - file name ## skip if not present
    - doit payer id ## skip if not present
    - tax invoice number ## can also be a (vat) credit note number; leave empty if not present
    - original invoice number ## or original tax invoice number; leave empty if not present
    - invoice date
    - original invoice date ## leave empty if not present; leave empty if not present
    - due date
    - tax registration number ## tax registration, ABN number or GST/HST Registration number; leave empty if not present,
    - billing period
    - aws account number
    - total amount ## without currency, formatted as float number, can be negative if in parentheses
    - total amount currency  ## use currency code instead of symbol
    - total VAT/tax amount ## without currency, formatted as float number, can be negative if in parentheses
    - vat/tax currency ## use currency code instead of symbol
    - exchange rate
    - address company
    - address ATTN
    - address country ## convert country code to a full country name
    
    Return the extracted fields in the valid JSON format: only JSON objects and arrays are allowed without any comments or other text. Keep it as simple as possible and ensure the JSON is valid.
    Skip the fields that are not present in the invoice. DO NOT OBFUSCATE DATA!
    Be careful with the currency symbols, which are not always in the invoice.
    Try to extract the fields even if the invoice format differs from the one below. and the fields are not in the same order. 
    My job depends on it! And I will be very grateful to you! Will pay you an extra 1000$ if you do it without errors!
    
    <invoice>
    {invoice}
    </invoice>
    """

    # Only the template is scanned for placeholders, so braces inside the
    # invoice text itself are harmless.
    prompt = prompt_template.format(invoice=document)
    result = model(prompt)

    # strip ``` from the beginning and end of the result matching the template
    result = result.replace("```", "")
    # strip tabular-data-json from the result
    result = result.replace("tabular-data-json", "")

    # Escape lone backslashes so json.loads accepts them, but leave valid JSON
    # escapes (\" \\ \/ \b \f \n \r \t \uXXXX) alone: doubling those would
    # corrupt otherwise valid output, e.g. \" would end the string early.
    # NOTE: a raw path such as C:\new is indistinguishable from a \n escape.
    return _escape_lone_backslashes(result)

In [None]:
def remove_footer(text):
    """Cut the invoice text off at the first footer marker that applies.

    Markers are tried in the order listed below. As soon as one of them occurs
    in ``text``, everything from its first occurrence onwards (the marker
    included) is dropped. Text containing none of the markers is returned
    unchanged.
    """
    footer_markers = (
        "* May include estimated US sales tax, VAT, ST, GST and CT.",
        "Amazon Web Services EMEA SARL",
        "Amazon Web Services Australia Pty Ltd",
        "AMAZON WEB SERVICES EMEA SARL",
        "Amazon Web Services Canada, Inc.",
        "Amazon Web Services EMEA SARL, Luxembourg, Zweigniederlassung Zürich",
    )
    # List order decides which marker wins, not its position in the text.
    matched = next((marker for marker in footer_markers if marker in text), None)
    if matched is None:
        return text
    return text[: text.index(matched)]

In [None]:
import os
from langchain.document_loaders import PyMuPDFLoader

# scan all documents in the folder (recursively)
def scan_folder(folder):
    """Load the first page of every PDF under ``folder`` as prompt-ready text.

    For each PDF the first page's text is passed through ``remove_footer`` and
    prefixed with a ``File name:`` line. When the PDF's parent folder name
    contains an underscore, its second underscore-separated part is added as a
    ``DoiT payer id:`` line as well.

    Args:
        folder: Directory that is walked recursively with ``os.walk``.

    Returns:
        A list with one text string per PDF that yielded at least one page.
    """
    documents = []
    for root, _dirs, files in os.walk(folder):
        for file in files:
            # match the extension case-insensitively so "INVOICE.PDF" is kept
            if not file.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(root, file)
            pages = PyMuPDFLoader(pdf_path).load()
            if not pages:
                # nothing to extract; skip instead of failing on pages[0]
                print(f"Skipping {pdf_path}: no pages could be loaded")
                continue

            # only the first page is used
            invoice = remove_footer(pages[0].page_content)

            # add file name to the invoice
            header = f"File name: {file}\n"

            # extract doit payer id from the parent folder name; folders
            # without an underscore carry no payer id, so the line is omitted
            # rather than aborting the whole scan with an IndexError
            parent_folder = os.path.basename(os.path.dirname(pdf_path))
            folder_parts = parent_folder.split("_")
            if len(folder_parts) > 1:
                header += f"DoiT payer id: {folder_parts[1]}\n"

            documents.append(header + invoice)
    return documents

# Load every invoice found under ./data and report how many there are.
documents = scan_folder(folder="./data")
print(f"Found {len(documents)} documents")

In [None]:
import json
import pandas
import time

# measure time
start = time.time()

# Initialize an empty list to store the results
results = []

# Upper bound on how many documents are processed in this run. Named
# MAX_DOCUMENTS instead of `max` so the builtin max() is not shadowed for the
# rest of the session.
MAX_DOCUMENTS = 228
batch = documents[:MAX_DOCUMENTS]
# Report progress against the real batch size, which is smaller than
# MAX_DOCUMENTS when fewer documents were found.
total = len(batch)

for i, document in enumerate(batch):
    try:
        # Extract data from the document
        result = extract_data(llm, document)
        # Append the parsed JSON record to the list
        results.append(json.loads(result))
        print(f"Processed document {i+1} of {total}")
    except Exception as e:
        # Best effort: one failing invoice (model error or invalid JSON) is
        # reported and skipped so it does not abort the whole batch.
        print(e)
        print(f"Failed to process document {i+1} of {total}")

# Convert the list of parsed records to a DataFrame
df = pandas.DataFrame(results)

# Export the DataFrame to a CSV file
df.to_csv("invoices.csv", index=False)

# measure time
end = time.time()
print(f"Time elapsed: {end - start} seconds")



In [None]:
# select random documents from the list
import random

# random.choices raises IndexError on an empty population, so fall back to an
# empty selection when no documents were loaded.
random_docs = random.choices(documents, k=1) if documents else []
for document in random_docs:
    print(document)
