# Playground for invoice processing with OpenAI API

### Define AWS invoice and credit record model

In [None]:
from typing import Optional
from pydantic import BaseModel, Field

# Define a Pydantic model, with per-field descriptions, tailored to AWS invoice and credit note records.
# Structured record extracted from a single AWS invoice or credit note.
#
# Each field description doubles as an extraction instruction: the model is
# handed to PydanticOutputParser, whose format instructions are embedded in
# the LLM prompt. Field order is preserved because it determines the column
# order of the dumped record.
#
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose - pydantic exposes a model docstring as the JSON schema description,
# which would presumably end up in the prompt as well.
class AwsInvoiceCredit(BaseModel):
    # --- Source file and account identification ---
    file_name: str = Field(description="AWS Invoice PDF file name.")
    doit_payer_id: str = Field(description="Doit Payer ID. Can be extracted from the parent folder name.")
    aws_account_number: str = Field(description="AWS Account number.")

    # --- Bill-to address ---
    address_company: str = Field(description="Address or Bill to Address company name. Use first line of the address. Usually, it is the company name.")
    address_attn: str = Field(description="Address or Bill to Address ATTN. Use second line of the address. Usually, it is the name of the person.")
    address_country: str = Field(description="Bill to address country. Use last line of the address. Usually, it is the country name. Convert short country code to a full country name.")

    # --- Issuing Amazon entity ---
    amazon_company_name: str = Field(description="Amazon Web Services company name. Usually, it is Amazon Web Services, Inc. but can be different for different countries.")
    amazon_company_branch: Optional[str] = Field(default="", description="Amazon Web Services company branch. Usually, it is after Amazon Web Services EMEA SARL but can be different for different countries; leave empty if not present")

    # --- Document metadata ---
    document_type: str = Field(description="Document Type. Can be 'Invoice' or 'Credit Note' only. Credit Note can be Credit Memo or Credit Adjustment Note.")
    # Optional with an empty default: the description allows the value to be
    # absent, so a missing key must not fail validation of the whole record.
    billing_period: Optional[str] = Field(default="", description="Billing Period; Two dates separated by a dash; leave empty if not present")
    tax_registration_number: Optional[str] = Field(default="", description="Tax Registration Number; ABN Number; GST Number; GST/HST Registration number; Issued To; usually the next number after AWS Account Number; leave empty if not present")
    invoice_number: str = Field(description="Invoice Number from the Invoice Summary")
    invoice_date: Optional[str] = Field(default="", description="Invoice Date from the Invoice Summary.")
    original_invoice_number: Optional[str] = Field(default="", description="Original Invoice Number from the Invoice Summary of Credit Memo/Note; leave empty if not present")
    original_invoice_date: Optional[str] = Field(default="", description="Original Invoice Date from the Invoice Adjustment Summary of Credit Memo/Note.")

    # --- Amounts (numeric values are kept separate from their currency codes) ---
    total_amount: float = Field(description="Total Amount from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix")
    total_amount_currency: str = Field(description="Total Amount Currency from the Invoice Summary; use currency code instead of symbol")
    total_vat_tax_amount: Optional[float] = Field(default=None, description="Total VAT/Tax Amount from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix")
    total_vat_tax_currency: Optional[str] = Field(default="", description="VAT/Tax Currency from the Invoice Summary; use currency code instead of symbol")
    net_charges_usd: Optional[float] = Field(default=None, description="USD Net Charges (After Credits/Discounts, excl. Tax) in USD from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix; leave empty if not present")
    net_charges_non_usd: Optional[float] = Field(default=None, description="Net Charges (After Credits/Discounts, excl. Tax) in local currency (not USD) from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix; leave empty if not present")
    net_charges_currency: Optional[str] = Field(default="", description="Net Charges local currency (not USD); use currency code instead of symbol; leave empty if not present")
    vat_percentage: Optional[float] = Field(default=None, description="Extract VAT percent (without % sign) from one of these fields: VAT - <number>%; VAT in <percent>; GST amount at <percent>; HST Amount at <percent>; leave empty if not present or not a number between 0 and 100")
    exchange_rate: Optional[float] = Field(default=None, description="Exchange Rate from the Invoice Summary Table (1 USD = ?); leave empty if not found")

In [None]:
def remove_footer(text):
    """Return ``text`` cut off just before the first known footer marker.

    The marker itself and everything following it are dropped. Markers are
    tried in the order they are listed; when none occurs in ``text`` the
    input is returned unchanged.
    """
    footer_markers = (
        "* May include estimated US sales tax, VAT, ST, GST and CT.",
    )
    for marker in footer_markers:
        # partition() splits on the first occurrence; `found` is empty when
        # the marker is absent.
        head, found, _tail = text.partition(marker)
        if found:
            return head
    return text

In [None]:
import os
from langchain.document_loaders import PyMuPDFLoader

# scan all documents in the folder (recursively)
def scan_folder(folder):
    """Recursively collect the text of every PDF found under ``folder``.

    For each PDF only the first page is used. Its footer is stripped with
    ``remove_footer`` and two header lines are prepended: the file name and
    the DoiT payer id, taken from the second ``_``-separated part of the
    containing folder name.

    Args:
        folder: Root directory to walk.

    Returns:
        A list of strings, one per readable PDF.
    """
    documents = []
    for root, _dirs, files in os.walk(folder):
        for file in files:
            # Accept ".PDF" as well as ".pdf".
            if not file.lower().endswith(".pdf"):
                continue
            path = os.path.join(root, file)
            pages = PyMuPDFLoader(path).load()
            if not pages:
                # Nothing to extract; skip instead of failing on pages[0].
                print(f"Skipping {path}: no pages could be loaded")
                continue
            invoice = remove_footer(pages[0].page_content)
            # get parent folder name
            parent_folder = os.path.basename(os.path.dirname(path))
            # extract doit payer id from the parent folder name; fall back to
            # an empty id when the folder name has no "_" separator
            folder_parts = parent_folder.split("_")
            if len(folder_parts) > 1:
                payer_id = folder_parts[1]
            else:
                payer_id = ""
                print(f"Warning: no payer id in folder name '{parent_folder}' for {file}")
            # add file name and payer id to the invoice
            documents.append(f"File name: {file}\nDoiT payer id: {payer_id}\n" + invoice)
    return documents

# Load every invoice under the 12-2023 data folder; `all_documents` is the
# module-level list consumed by the processing cell further down.
all_documents = scan_folder("./data/12-2023")
print(f"Found {len(all_documents)} documents")

In [None]:
import asyncio
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
import textwrap

async def extract_data(model, document, sem=asyncio.Semaphore(1)):
    """Extract an ``AwsInvoiceCredit`` record from one document using an LLM.

    Args:
        model: LLM instance passed to ``LLMChain``.
        document: Invoice text whose first line is expected to look like
            ``File name: <name>``.
        sem: Semaphore limiting how many extractions run concurrently.
            NOTE: the default is created once, at definition time, and is
            shared by every call that does not pass its own semaphore.

    Returns:
        The parsed ``AwsInvoiceCredit`` on success. On failure an
        ``Exception`` instance is *returned* (never raised) so that callers
        can keep processing the remaining documents.
    """
    # Resolve the file name before entering the try block so that the error
    # path can always reference it. Split on the first ":" only, so names
    # that contain a colon are kept intact.
    first_line = document.split("\n", 1)[0]
    file_name = first_line.partition(":")[2].strip() or "<unknown>"
    async with sem:
        try:
            # Instantiate the parser with the target model.
            parser = PydanticOutputParser(pydantic_object=AwsInvoiceCredit)
            # Build the prompt; the format instructions come from the parser.
            prompt = PromptTemplate(
                template=textwrap.dedent(
                    """
                    Extract data from the AWS Invoice or Credit document into a flat JSON object.
                    {format_instructions}
                    {request}
                    <document>
                    {invoice}
                    </document>

                    JSON:
                    """
                ),
                input_variables=["request", "invoice"],
                partial_variables={
                    "format_instructions": parser.get_format_instructions(),
                },
            )
            # Additional extraction rules passed as the {request} variable.
            parsing_request = textwrap.dedent(
                """
                Return the extracted fields in the valid JSON format: only JSON objects and arrays are allowed without any comments or other text.
                Convert all dates to "Month name Day, Year" format with no leading zeros.
                Keep it as simple as possible and ensure the JSON is valid. 
                Skip the fields that are not present in the invoice.
                Be careful with the currency symbols, which are not always in the invoice.
                Try to extract the fields even if the invoice format differs and the fields are not in the same order. 
                My job depends on it! And I will be very grateful to you! Will pay you an extra 1000$ if you do it without errors!
                """
            )
            chain = LLMChain(llm=model, prompt=prompt)
            max_attempts = 2  # total number of attempts per document
            for attempt in range(max_attempts):
                try:
                    output = await chain.arun(request=parsing_request, invoice=document)
                    # remove everything before the first { and after the last }
                    output = output[output.find("{"):output.rfind("}") + 1]
                    return parser.parse(output)
                except Exception:
                    # Retry on any failure; after the last attempt let the
                    # error reach the outer handler, which wraps it once.
                    if attempt == max_attempts - 1:
                        raise
        except Exception as ex:
            # returning and not raising the exception to continue processing other documents
            return Exception(f"Error processing document {file_name}: {ex}")

### Select LLM model

In [None]:
import os
import boto3
from langchain.chat_models import ChatOpenAI
from langchain.llms import Bedrock

def get_model(model_name="gpt-4-1106-preview", model_kwargs=None):
    """Build the LLM client matching ``model_name``.

    Names starting with ``gpt-`` are served through the OpenAI chat API
    (key read from ``OPENAI_API_KEY``, temperature 0, up to 4096 output
    tokens). Any other name is treated as a Bedrock model id and served
    through a ``bedrock-runtime`` client in ``us-east-1``;
    ``model_kwargs`` is forwarded to Bedrock only.
    """
    if not model_name.startswith("gpt-"):
        # Bedrock API
        bedrock_client = boto3.client("bedrock-runtime", region_name="us-east-1")
        return Bedrock(
            model_id=model_name,
            client=bedrock_client,
            model_kwargs=model_kwargs,
        )

    # OpenAI API
    api_key = os.getenv("OPENAI_API_KEY")
    return ChatOpenAI(
        model=model_name,
        openai_api_key=api_key,
        temperature=0,
        max_tokens=4096,
    )

In [None]:
import asyncio
import pandas as pd
import time

# Instantiate the semaphore to limit the number of concurrent requests.
# Approximate number of tokens per request is 1000-1500, so 50 requests will be 75k tokens
# a single request takes 10 seconds, so 30 concurrent requests can lead to 180 requests per minute
# 180 * 1500 = 270k tokens per minute (TPM) should be within the 600k TPM limit
sem = asyncio.Semaphore(50)

# measure time
start = time.time()

# Instantiate the LLM model.
# llm = get_model("amazon.titan-text-express-v1")
llm = get_model()

# Initialize an empty DataFrame to store the results
df = pd.DataFrame()

# Loop over the max documents (all by default)
max_docs = len(all_documents)
tasks = []
for i, doc in enumerate(all_documents[:max_docs]):
    # Extract data from the document (async)
    tasks.append(extract_data(llm, doc, sem))

# measure time
start = time.time()

# Create a CSV file and write the results as they become available
with open('invoices.csv', 'w') as f:
    for future in asyncio.as_completed(tasks):
        result = await future
        if isinstance(result, Exception):
            print(result)
        else:
            # Convert the result to a DataFrame and append it to the CSV file
            try:
                record = result.model_dump()
                df_temp = pd.DataFrame.from_dict(record, orient='index').transpose()
                df_temp.to_csv(f, header=f.tell()==0, index=False)
                print(f"Added record for: {record['file_name']}")
            except Exception as e:
                print(f"Error saving record: {e}")
                

# measure time
end = time.time()
print(f"Time elapsed: {end - start} seconds")

### DEBUG functions

In [None]:
# DEBUG: process a single document
async def process_document_file(file):
    """DEBUG helper: run the extraction pipeline on a single PDF and print the outcome.

    Prepares the first page of ``file`` with the same "File name" and
    "DoiT payer id" header lines the batch pipeline uses, prints the
    prepared text, then prints either the extracted record or the error
    returned by ``extract_data``.

    Args:
        file: Path to the PDF; the payer id is taken from the second
            ``_``-separated part of its parent folder name.
    """
    # llm = get_model("meta.llama2-70b-chat-v1", {'temperature': 0, 'top_p': 1, 'max_gen_len': 2048})
    llm = get_model()
    pages = PyMuPDFLoader(file).load()
    if not pages:
        # Nothing to extract; report instead of failing on pages[0].
        print(f"No pages could be loaded from {file}")
        return
    invoice = remove_footer(pages[0].page_content)
    # get parent folder name
    parent_folder = os.path.basename(os.path.dirname(file))
    # get file name only
    file_name = os.path.basename(file)
    # extract doit payer id from the parent folder name; fall back to an
    # empty id when the folder name has no "_" separator
    folder_parts = parent_folder.split("_")
    payer_id = folder_parts[1] if len(folder_parts) > 1 else ""
    # add file name and payer id to the invoice
    invoice = f"File name: {file_name}\nDoiT payer id: {payer_id}\n" + invoice
    print(invoice)
    result = await extract_data(llm, invoice)
    if isinstance(result, Exception):
        print(result)
    else:
        print(result.model_dump())
    
# Run the debug helper on one sample invoice. Top-level `await` only works
# where an event loop is already running (e.g. a notebook cell).
await process_document_file("data/12-2023/457849337198_doitintl-payer-1919/2023-12-01_Invoice_EUINNL23_655056.pdf")

In [None]:
# Un-escape a prompt file in place: literal "\n" sequences become real line
# breaks and escaped double quotes become plain double quotes.
prompt_path = 'prompts/llama.txt'

with open(prompt_path, 'r') as file:
    content = file.read()

# Same order as before: newlines first, then quotes
content = content.replace('\\n', '\n').replace('\\"', '"')

with open(prompt_path, 'w') as file:
    file.write(content)


In [None]:
import csv

# Define the path to the input CSV file and the path to the output sorted CSV file
input_file_path = 'invoices-300.csv'
output_file_path = 'sorted_invoices-300.csv'

# Read the CSV file into a list of dictionaries, where each dictionary represents a row.
# newline='' leaves newline handling to the csv module, so quoted fields that
# contain line breaks are parsed correctly.
with open(input_file_path, 'r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    header = csv_reader.fieldnames
    rows = list(csv_reader)

# Fail early, before the output file is created, when there is no header row to sort by
if not header:
    raise ValueError(f"{input_file_path} is empty: no header row found")

# Sort the list of dictionaries based on the value of the first column (string comparison)
sorted_rows = sorted(rows, key=lambda x: x[header[0]])

# Write the header and the sorted rows to a new CSV file
with open(output_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=header)
    csv_writer.writeheader()
    csv_writer.writerows(sorted_rows)

print(f"The CSV file has been sorted alphabetically based on the first column and saved to {output_file_path}.")
