Use Bedrock to extract the data

In [1]:
import boto3
from langchain.llms import Bedrock

def get_bedrock_client(region="us-east-1"):
    """Create a boto3 client for the "bedrock-runtime" service.

    Args:
        region: AWS region name passed to boto3 as ``region_name``
            (defaults to "us-east-1").

    Returns:
        The client object returned by ``boto3.client``.
    """
    return boto3.client("bedrock-runtime", region_name=region)

def create_bedrock_llm(bedrock_client, model_version_id):
    """Wrap a Bedrock runtime client in a LangChain ``Bedrock`` LLM.

    Args:
        bedrock_client: Client object handed to ``Bedrock`` as ``client``.
        model_version_id: Model identifier handed to ``Bedrock`` as ``model_id``.

    Returns:
        The ``Bedrock`` instance, configured with temperature 0 and a
        ``maxTokenCount`` of 1000.
    """
    # Generation settings forwarded to the model on every call
    generation_settings = {'temperature': 0, 'maxTokenCount': 1000}
    return Bedrock(
        model_id=model_version_id,
        client=bedrock_client,
        model_kwargs=generation_settings,
    )

# Create the shared Bedrock runtime client (default region) and the LLM wrapper.
# `llm` is the model object passed to extract_data() by the cells further down;
# the model id is given here as a literal string.
client = get_bedrock_client()
llm = create_bedrock_llm(client, "amazon.titan-text-express-v1")

In [46]:
def extract_data(model, document):
    """Ask the LLM to parse one invoice and return the extracted fields as JSON text.

    The invoice text is embedded in a fixed prompt, the model is called once,
    and the completion is cleaned up so that ``json.loads`` can consume it.

    Args:
        model: Callable that accepts the prompt string and returns the model's
            completion as a string (the notebook passes the Bedrock LLM).
        document: Invoice text to substitute into the prompt.

    Returns:
        str: JSON text. The first of the following candidates that parses is
        returned: the cleaned completion, the cleaned completion with every
        backslash doubled, the outermost ``{...}`` span of the completion, and
        that span with every backslash doubled. If none parses, the cleaned
        completion with backslashes doubled is returned so the caller's
        ``json.loads`` reports the failure.
    """
    # Local import so the function does not depend on which other cells have run
    import json

    # Create prompt to parse the invoice with Bedrock LLM
    # NOTE(review): the <example> and <invoice> sections end with a second
    # opening tag instead of </example> / </invoice>. The prompt is left
    # byte-identical here because changing it changes the model's output.
    prompt_template = """
    Parse the invoice below. Make sure to use the first address in the document as a billing address, and extract the following fields:
    - file name ## skip if not present
    - doit payer id ## skip if not present
    - tax invoice number ## can also be a (vat) credit note number
    - original invoice number  ## or original tax invoice number; leave empty if not present
    - invoice date
    - original invoice date  ## leave empty if not present
    - due date
    - tax registration number  ## tax registration, ABN number or GST/HST Registration number; leave empty if not present
    - billing period
    - aws account number
    - total amount ## without currency, formatted as float number, negative if in parentheses
    - total amount currency  ## use currency code instead of symbol
    - total VAT/tax amount ## without currency, formatted as float number, negative if in parentheses
    - vat/tax currency ## use currency code instead of symbol
    - exchange rate
    - address company
    - address ATTN
    - address country  ## convert country code to country name
    
    Return the extracted fields in the valid JSON format: only JSON objects and arrays are allowed without any comments or other text. Try to keep it as simple as possible and make sure that the JSON is valid.
    Skip the fields that are not present in the invoice. 
    Be careful with the currency symbols, they are not always present in the invoice.
    Try to extract the fields even if the invoice format is different from the one below. and the fields are not in the same order. 
    
    <example>
    {{
        "file_name": "2023-10-02_Invoice_EUINIL23_456031.pdf",
        "doit_payer_id": "doitintl-payer-837",
        "tax_invoice_number": "EUINIL23-456031",
        "original_invoice_number": "",
        "invoice_date": "October 2, 2023",
        "original_invoice_date": "",
        "due_date": "November 1, 2023",
        "tax_registration_number": "514554328",
        "billing_period": "September 1 - September 30, 2023",
        "aws_account_number": "925241040434",
        "total_amount": "72083.96",
        "total_amount_currency": "USD",
        "total_vat_tax_amount": "10473.71",
        "vat_tax_currency": "ILS",
        "exchange_rate": "3.824",
        "address_company": "DoiT International",
        "address_attn": "Noam Ehrlich",
        "address_country": "Israel"
    }}
    <example>
    
    My job depends on it! And I will be very grateful to you! Will pay you extra 1000$ if you do it without errors!
    
    <invoice>
    {invoice}
    <invoice>
    """

    prompt = prompt_template.format(invoice=document)
    completion = model(prompt)

    # Drop the code-fence markers and the "tabular-data-json" tag the model
    # may wrap around its answer
    cleaned = completion.replace("```", "").replace("tabular-data-json", "")

    # Original repair step: double every backslash so stray ones (e.g. copied
    # verbatim from the invoice) become valid JSON escapes
    escaped = cleaned.replace("\\", "\\\\")

    # Try the untouched text first: doubling backslashes in already-valid JSON
    # would break escapes such as \" and alter \\ sequences.
    candidates = [cleaned, escaped]

    # If the model added prose around the answer, fall back to the outermost
    # {...} span. It comes after the whole-text candidates so a valid top-level
    # JSON array is never truncated.
    first_brace = cleaned.find("{")
    last_brace = cleaned.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        snippet = cleaned[first_brace:last_brace + 1]
        candidates.append(snippet)
        candidates.append(snippet.replace("\\", "\\\\"))

    for candidate in candidates:
        try:
            json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        return candidate

    # Nothing parsed: return what the original implementation produced so the
    # caller's own json.loads raises and reports the failure as before
    return escaped

In [23]:
def remove_footer(text):
    """Cut the invoice text at the first known footer marker.

    The markers are checked in the order listed below; the first one that
    occurs anywhere in ``text`` wins, and everything from its first occurrence
    onwards (the marker included) is dropped.

    Args:
        text: Invoice text.

    Returns:
        str: The text before the chosen marker, or ``text`` unchanged when no
        marker is present.
    """
    footer_markers = (
        "* May include estimated US sales tax, VAT, ST, GST and CT.",
        "Amazon Web Services EMEA SARL",
        "Amazon Web Services Australia Pty Ltd",
        "AMAZON WEB SERVICES EMEA SARL",
        "Amazon Web Services Canada, Inc.",
        "Amazon Web Services EMEA SARL, Luxembourg, Zweigniederlassung Zürich",
    )
    # Priority follows list order, not position within the text
    marker = next((candidate for candidate in footer_markers if candidate in text), None)
    if marker is None:
        return text
    return text[:text.index(marker)]

In [24]:
import os
from langchain.document_loaders import PyMuPDFLoader

# scan all documents in the folder (recursively)
def scan_folder(folder):
    """Build one prompt-ready invoice text per PDF found under ``folder``.

    Each PDF is loaded with ``PyMuPDFLoader``; only the first loaded document
    (``data[0]``, presumably the first page - confirm against the loader) is
    used. Its text is passed through ``remove_footer`` and prefixed with a
    "File name" line and, when available, a "DoiT payer id" line.

    The payer id is the second "_"-separated part of the PDF's parent folder
    name. When the folder name contains no "_", the payer id line is omitted
    instead of raising ``IndexError``.

    Args:
        folder: Root directory to walk recursively.

    Returns:
        list[str]: Invoice texts in a deterministic order (directories and
        file names are visited in sorted order). Empty when no PDF is found.
    """
    documents = []
    for root, dirs, files in os.walk(folder):
        # os.walk yields entries in arbitrary filesystem order; sorting `dirs`
        # in place fixes the traversal order so list positions are reproducible
        dirs.sort()
        for file in sorted(files):
            # accept upper-case extensions such as ".PDF" as well
            if not file.lower().endswith(".pdf"):
                continue

            path = os.path.join(root, file)
            data = PyMuPDFLoader(path).load()
            if not data:
                # nothing to parse; report it rather than failing on data[0]
                print(f"Skipping {path}: no pages could be loaded")
                continue
            invoice = remove_footer(data[0].page_content)

            # add file name to the invoice
            header = f"File name: {file}\n"

            # extract doit payer id from the parent folder name
            parent_folder = os.path.basename(os.path.dirname(path))
            name_parts = parent_folder.split("_")
            if len(name_parts) > 1:
                header += f"DoiT payer id: {name_parts[1]}\n"

            documents.append(header + invoice)
    return documents

In [48]:
import json

# Load every invoice found under ./data (one prompt-ready text per PDF)
documents = scan_folder("./data")

# Spot-check a single invoice (the index 22 is hard-coded) and show the raw
# text returned by extract_data
result = extract_data(llm, documents[22])
print(result)

# Verify that the returned text parses as JSON; on failure the error is
# printed instead of raised so the cell always completes
try:
    j = json.loads(result)
    print(j)
except Exception as e:
    print(e)
    





{
    "file_name": "2023-10-02_Invoice_EUINIL23_456137.pdf",
    "doit_payer_id": "doitintl-payer-735",
    "tax_invoice_number": "EUINIL23-456137",
    "original_invoice_number": "",
    "invoice_date": "October 2, 2023",
    "original_invoice_date": "",
    "due_date": "November 1, 2023",
    "tax_registration_number": "514554328",
    "billing_period": "September 1 - September 30, 2023",
    "aws_account_number": "922054565647",
    "total_amount": "23,530.42",
    "total_amount_currency": "USD",
    "total_vat_tax_amount": "13,073.99",
    "vat_tax_currency": "ILS",
    "exchange_rate": "3.824",
    "address_company": "DoiT International",
    "address_attn": "Vadim Solovey",
    "address_country": "Israel"
}

{'file_name': '2023-10-02_Invoice_EUINIL23_456137.pdf', 'doit_payer_id': 'doitintl-payer-735', 'tax_invoice_number': 'EUINIL23-456137', 'original_invoice_number': '', 'invoice_date': 'October 2, 2023', 'original_invoice_date': '', 'due_date': 'November 1, 2023', 'tax_regis

In [38]:
import json
import pandas
import time

# measure time
start = time.time()

# Initialize an empty list to store the results
results = []

# Upper bound on the number of documents processed in this run.
# Named MAX_DOCUMENTS (not `max`) so the builtin max() is not shadowed for
# the rest of the kernel session.
MAX_DOCUMENTS = 50
batch = documents[:MAX_DOCUMENTS]
# Report progress against the real batch size, which is smaller than
# MAX_DOCUMENTS when fewer documents were found
total = len(batch)

for i, document in enumerate(batch, start=1):
    try:
        # Extract data from the document
        result = extract_data(llm, document)
        # Append the parsed result to the list
        results.append(json.loads(result))
        print(f"Processed document {i} of {total}")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next
        # document; failed documents are absent from the CSV
        print(e)
        print(f"Failed to process document {i} of {total}")

# Convert the list of dictionaries to a DataFrame
df = pandas.DataFrame(results)

# Export the DataFrame to a CSV file
df.to_csv("invoices.csv", index=False)

# measure time
end = time.time()
print(f"Time elapsed: {end - start} seconds")



Processed document 1 of 50
Processed document 2 of 50
Processed document 3 of 50
Processed document 4 of 50
Processed document 5 of 50
Processed document 6 of 50
Processed document 7 of 50
Processed document 8 of 50
Processed document 9 of 50
Processed document 10 of 50
Processed document 11 of 50
Processed document 12 of 50
Processed document 13 of 50
Processed document 14 of 50
Processed document 15 of 50
Processed document 16 of 50
Processed document 17 of 50
Processed document 18 of 50
Processed document 19 of 50
Processed document 20 of 50
Processed document 21 of 50
Processed document 22 of 50
Expecting value: line 3 column 1 (char 2)
Failed to process document 23 of 50
Processed document 24 of 50
Processed document 25 of 50
Processed document 26 of 50
Processed document 27 of 50
Processed document 28 of 50
Processed document 29 of 50
Processed document 30 of 50
Processed document 31 of 50
Processed document 32 of 50
Processed document 33 of 50
Processed document 34 of 50
Process