In [22]:
import boto3
from langchain.llms import Bedrock

def get_bedrock_client(region="us-east-1"):
    """Create a boto3 client for the Bedrock runtime service.

    Args:
        region: AWS region name forwarded to boto3 as ``region_name``.
            Defaults to ``"us-east-1"``.

    Returns:
        The object returned by ``boto3.client`` for the
        ``"bedrock-runtime"`` service.
    """
    return boto3.client(service_name="bedrock-runtime", region_name=region)

def create_bedrock_llm(bedrock_client, model_version_id, model_kwargs=None):
    """Wrap a Bedrock runtime client in a LangChain ``Bedrock`` LLM.

    Args:
        bedrock_client: Client object passed to ``Bedrock`` as ``client``.
        model_version_id: Model identifier passed to ``Bedrock`` as
            ``model_id``.
        model_kwargs: Optional dict of inference parameters. When given, it
            is used *instead of* the defaults (it is copied, never mutated).
            When omitted, the defaults ``{'temperature': 0,
            'maxTokenCount': 2048}`` are used.

    Returns:
        The constructed ``Bedrock`` instance.
    """
    if model_kwargs is None:
        # NOTE(review): `maxTokenCount` looks like the Amazon Titan parameter
        # name; other model families may expect different keys - callers
        # targeting them should pass their own `model_kwargs`. Confirm against
        # the Bedrock inference-parameter docs.
        effective_kwargs = {'temperature': 0, 'maxTokenCount': 2048}
    else:
        # Copy so the caller's dict is never shared with the LLM object.
        effective_kwargs = dict(model_kwargs)

    return Bedrock(
        model_id=model_version_id,
        client=bedrock_client,
        model_kwargs=effective_kwargs,
    )

# Build the Bedrock runtime client (default region) and the Titan Text Express
# LLM wrapper that the extraction cells use.
client = get_bedrock_client()
llm = create_bedrock_llm(
    bedrock_client=client,
    model_version_id="amazon.titan-text-express-v1",
)

In [23]:
def extract_data(model, document):
    """Ask an LLM to extract invoice fields and return the JSON text it produced.

    The invoice text is embedded in a fixed prompt together with a worked
    example record; the model's reply is then trimmed to the outermost
    ``{...}`` span and stray backslashes are escaped so the text can be fed
    to ``json.loads``.

    Args:
        model: Callable that takes the prompt string and returns the model's
            reply as a string.
        document: Invoice text to parse.

    Returns:
        A string holding the JSON object found in the reply (not parsed), or
        an empty string when the reply contains no ``{...}`` span.
    """
    # Local import keeps this cell runnable on its own in the notebook.
    import re

    example = """
    {
        "file_name": "2023-09-14_Invoice_EUINIL23_454999.pdf",
        "doit_payer_id": "doitintl-payer-1053",
        "tax_invoice_number": "EUINIL23-454999",
        "original_invoice_number": "EUINIL23-443172",
        "invoice_date": "September 14, 2023",
        "original_invoice_date": "",
        "due_date": "October 14, 2023",
        "tax_registration_number": "",
        "billing_period": "September 1 - September 30, 2023",
        "aws_account": "123412341234",
        "total_amount": "11,633.41",
        "total_amount_currency": "USD",
        "total_vat_tax_amount": "1,690.33",
        "vat_tax_currency": "ILS",
        "exchange_rate": "3.819",
        "address_company": "Scylla DB Ltd",
        "address_attn": "Noam Ehrlich",
        "address_country": "Israel"
    }
    """
    # Create prompt to parse the invoice with Bedrock LLM
    # NOTE(review): the <example> and <invoice> sections are "closed" with a
    # second opening tag rather than </example> / </invoice>. The prompt text
    # is left untouched here because changing it changes model output;
    # consider fixing the tags and re-validating the extraction results.
    prompt_template = """
    Parse the invoice below. Extract data following the instructions into a flat JSON object.
    Extract the following fields into a JSON record:
    - file name ## skip if not present
    - doit payer id ## skip if not present
    - tax invoice number ## tax registration number; invoice number; (vat) credit note number; leave empty if not present
    - original invoice number ## original tax invoice number; leave empty if not present
    - invoice date
    - original invoice date ## leave empty if not present; leave empty if not present
    - due date
    - tax registration number ## tax registration; ABN number; GST/HST Registration number; leave empty if not present,
    - billing period
    - aws account number ## Account number; aws account number; cannot be empty
    - total amount ## without currency; add minus sign if parentheses around or has a minus prefix; float number
    - total amount currency  ## use currency code instead of symbol
    - total VAT/tax amount ## without currency; add minus sign if parentheses around or has a minus prefix; float number
    - vat/tax currency ## use currency code instead of symbol
    - exchange rate ## exchange rate (1 USD = ?); leave empty if not found
    - address company  ## bill to address; address; cannot contain Amazon Web Services
    - address ATTN
    - address country ## convert country code to a full country name
    
    Return the extracted fields in the valid JSON format: only JSON objects and arrays are allowed without any comments or other text. Keep it as simple as possible and ensure the JSON is valid.
    Skip the fields that are not present in the invoice. DO NOT OBFUSCATE ANY DATA!
    Be careful with the currency symbols, which are not always in the invoice.
    Try to extract the fields even if the invoice format differs from the one below. and the fields are not in the same order. 
    My job depends on it! And I will be very grateful to you! Will pay you an extra 1000$ if you do it without errors!
    
    <example>
    {example}
    <example>
    
    <invoice>
    {invoice}
    <invoice>
    
    JSON:
    """

    prompt = prompt_template.format(invoice=document, example=example)
    result = model(prompt)

    # Keep only the span from the first "{" to the last "}". When either brace
    # is missing (or they are out of order) there is no JSON object to return;
    # the empty string makes the caller's json.loads fail cleanly.
    start = result.find("{")
    end = result.rfind("}")
    if start == -1 or end < start:
        return ""
    result = result[start:end + 1]

    # Double only *stray* backslashes. Valid JSON escape sequences (\" \\ \/
    # \b \f \n \r \t \uXXXX) are matched first and passed through unchanged;
    # blindly doubling every backslash would turn e.g. \" into \\" and end the
    # string early. Trade-off: a literal Windows-style path such as C:\temp is
    # read as containing a tab escape.
    result = re.sub(
        r'\\(?:["\\/bfnrt]|u[0-9a-fA-F]{4})|\\',
        lambda m: m.group(0) if len(m.group(0)) > 1 else "\\\\",
        result,
    )

    return result

In [24]:
def remove_footer(text):
    """Cut an invoice's text at the first recognised footer marker.

    The markers are tried in the order listed below; the first one that
    occurs anywhere in ``text`` decides the cut, and everything from its
    first occurrence onward (marker included) is dropped.

    Args:
        text: Invoice text.

    Returns:
        The text preceding the chosen marker, or ``text`` unchanged when no
        marker occurs.
    """
    footer_markers = (
        "* May include estimated US sales tax, VAT, ST, GST and CT.",
        "Amazon Web Services EMEA SARL",
        "Amazon Web Services Australia Pty Ltd",
        "AMAZON WEB SERVICES EMEA SARL",
        "Amazon Web Services Canada, Inc.",
        "Amazon Web Services EMEA SARL, Luxembourg, Zweigniederlassung Zürich",
    )
    for marker in footer_markers:
        if marker not in text:
            continue
        # Slice up to the first occurrence of the winning marker.
        return text[:text.index(marker)]
    return text

In [25]:
import os
from langchain.document_loaders import PyMuPDFLoader

def scan_folder(folder):
    """Recursively load the first page of every PDF under ``folder``.

    For each PDF the first page's text is passed through ``remove_footer``
    and prefixed with a ``File name:`` line and, when available, a
    ``DoiT payer id:`` line. The payer id is the second ``_``-separated part
    of the PDF's parent folder name; when the folder name has no ``_`` the
    payer-id line is omitted instead of aborting the scan.

    Directories and files are visited in sorted order so the returned list
    (and therefore any index into it) is reproducible between runs. The
    ``.pdf`` extension check is case-insensitive.

    Args:
        folder: Root directory to walk.

    Returns:
        A list of annotated invoice strings, one per PDF that yielded at
        least one page.
    """
    documents = []
    for root, dirs, files in os.walk(folder):
        # os.walk yields entries in arbitrary filesystem order; sorting
        # `dirs` in place also fixes the order in which it descends.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            path = os.path.join(root, file)
            data = PyMuPDFLoader(path).load()
            if not data:
                # Loader produced no pages: nothing to extract from this file.
                continue
            invoice = remove_footer(data[0].page_content)
            # get parent folder name
            parent_folder = os.path.basename(os.path.dirname(path))
            # add file name to the invoice
            header = f"File name: {file}\n"
            # extract doit payer id from the parent folder name, if it has one
            name_parts = parent_folder.split("_")
            if len(name_parts) > 1:
                header += f"DoiT payer id: {name_parts[1]}\n"
            documents.append(header + invoice)
    return documents

# Load every invoice under ./data and report how many were picked up.
documents = scan_folder("./data")
document_count = len(documents)
print(f"Found {document_count} documents")

Found 228 documents


In [27]:
import json
import pandas
import time

# Wall-clock timer for the whole extraction run
start = time.time()

# Parsed JSON records, one per successfully processed invoice
results = []
# Zero-based indexes (into `documents`) of invoices that could not be processed
failed_indexes = []

# Cap on how many documents to process. Deliberately not named `max`:
# rebinding that name would shadow the builtin max() for every later cell
# in the kernel session.
MAX_DOCUMENTS = 228
batch = documents[:MAX_DOCUMENTS]
# Report against the real batch size, which is smaller than the cap when
# fewer documents were found.
total = len(batch)
for i, document in enumerate(batch):
    try:
        # Extract data from the document
        result = extract_data(llm, document)
        # json.loads raises on malformed/empty model output, which routes the
        # document to the failure branch below
        results.append(json.loads(result))
        print(f"Processed document {i+1} of {total}")
    except Exception as e:
        # Best-effort run: one bad invoice must not abort the batch, but keep
        # a record of which ones failed so they can be retried or inspected.
        print(e)
        print(f"Failed to process document {i+1} of {total}")
        failed_indexes.append(i)

# Convert the list of dictionaries to a DataFrame
df = pandas.DataFrame(results)

# Export the DataFrame to a CSV file
df.to_csv("invoices.csv", index=False)

# Report elapsed wall-clock time
end = time.time()
print(f"Time elapsed: {end - start} seconds")



Processed document 1 of 228
Processed document 2 of 228
Processed document 3 of 228
Processed document 4 of 228
Processed document 5 of 228
Processed document 6 of 228
Processed document 7 of 228
Processed document 8 of 228
Processed document 9 of 228
Processed document 10 of 228
Processed document 11 of 228
Processed document 12 of 228
Processed document 13 of 228
Processed document 14 of 228
Processed document 15 of 228
Processed document 16 of 228
Processed document 17 of 228
Processed document 18 of 228
Processed document 19 of 228
Processed document 20 of 228
Processed document 21 of 228
Processed document 22 of 228
Processed document 23 of 228
Processed document 24 of 228
Processed document 25 of 228
Processed document 26 of 228
Processed document 27 of 228
Processed document 28 of 228
Processed document 29 of 228
Processed document 30 of 228
Processed document 31 of 228
Processed document 32 of 228
Processed document 33 of 228
Processed document 34 of 228
Processed document 35 o

In [28]:
# Show one loaded document (fixed index, not a random pick) to inspect the
# text that is sent to the model.
sample_index = 70
print(documents[sample_index])


File name: 2023-09-19_Invoice_AUIN23_1695554.pdf
DoiT payer id: doitintl-payer-1493
Tax Invoice
Email or talk to us about your AWS account or bill, visit console.aws.amazon.com/support
More information regarding your service charges is available by accessing your Billing Management Console
Account number:
258034378100
ABN Number:
32112669726
Address:
EQUILIBRIUM INTERACTIVE PTY LTD
ATTN: Noam Ehrlich
Level 1
77 King Street
Perth, WA, 6000, AU
Invoice Summary
Tax Invoice Number:
AUIN23-1695554
Tax Invoice Date:
September 19, 2023
TOTAL AMOUNT DUE ON
October 19, 2023
AUD 817.76
TOTAL Tax
AUD 74.34
This Tax Invoice is for the billing period September 1 - September 30, 2023
You have selected AUD as your preferred payment currency.
Invoice Summary
AWS Service Charges (1 USD = 1.5683987 AUD)
USD 521.40
AUD 817.76
1 x Amazon Relational Database Service (one time fee)
USD 474.00
AUD 743.42
Net Charges (After Credits/Discounts, excl. Tax)
USD 474.00
AUD 743.42
Total GST Amount at 10%
USD 47.40
