In [1]:
from main import AwsInvoiceCredit
from langchain.output_parsers import PydanticOutputParser

# Build an output parser bound to the AwsInvoiceCredit Pydantic model (imported from main).
parser = PydanticOutputParser(pydantic_object=AwsInvoiceCredit)
# Print the format-instruction text the parser produces for that model
# (the JSON-schema prompt shown in this cell's output).
print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"file_name": {"description": "Name of the AWS invoice PDF file.", "title": "File Name", "type": "string"}, "doit_payer_id": {"description": "Doit Payer ID: Unique identifier for the payer in the DoiT system.", "title": "Doit Payer Id", "type": "string"}, "document_type": {"description": "Determine the document type based on content analysis. Classify as 'Invoice' if it primarily details charges, or 'Credit Note' if it contains references to 'Credit Memo', 'Credit Adjustment Note', 'Tax Invoice Adjustment', or similar terms. Add

In [2]:
import os
from main import remove_footer
from langchain_community.document_loaders import PyMuPDFLoader

def read_invoice(invoice_document):
    """Load an invoice PDF and return its text prefixed with file metadata.

    The text comes from the first Document returned by ``PyMuPDFLoader`` and is
    passed through ``remove_footer`` (imported from ``main``). Two header lines
    are then prepended: the PDF's file name and the DoiT payer id, where the
    payer id is the second ``_``-separated token of the parent folder name.

    Args:
        invoice_document: Path to the invoice PDF. Relative paths are resolved
            against the current working directory before the parent folder
            name is read, so a bare file name or a path ending in ``..`` still
            yields a real folder name.

    Returns:
        str: ``"File name: <name>\\nDoiT payer id: <id>\\n"`` followed by the
        footer-stripped text.

    Raises:
        ValueError: If the loader returns no documents, or if the parent folder
            name contains no ``_`` and therefore no payer id token.
    """
    loader = PyMuPDFLoader(invoice_document)
    pages = loader.load()
    if not pages:
        raise ValueError(f"No content could be loaded from {invoice_document!r}")

    # Only the first loaded Document is used.
    # NOTE(review): PyMuPDFLoader appears to yield one Document per PDF page by
    # default, so later pages are ignored here - confirm this is intended.
    invoice = remove_footer(pages[0].page_content)

    # Normalise the path first so the parent folder name is never "" or "..".
    normalized_path = os.path.abspath(invoice_document)
    parent_folder = os.path.basename(os.path.dirname(normalized_path))
    file_name = os.path.basename(invoice_document)

    # The payer id is the second underscore-separated token of the folder name.
    folder_tokens = parent_folder.split("_")
    if len(folder_tokens) < 2:
        raise ValueError(
            f"Cannot extract DoiT payer id: parent folder {parent_folder!r} "
            "does not contain an underscore-separated payer id"
        )
    payer_id = folder_tokens[1]

    # Prepend the metadata header to the invoice text.
    invoice = f"File name: {file_name}\nDoiT payer id: {payer_id}\n" + invoice
    return invoice
  
# Ask for the invoice PDF path. The prompt ends with ": " so the typed path is
# visually separated from it, and surrounding whitespace from a pasted path is
# stripped before the path is used.
file_name = input("Enter invoice file path: ").strip()
# Show the text that read_invoice builds for this file.
print(read_invoice(file_name))

File name: 2024-03-28_Invoice_SGIN24_195445.pdf
DoiT payer id: doitintl-payer-2276
Tax Invoice
Email or talk to us about your AWS account or bill, visit console.aws.amazon.com/support
More information regarding your service charges is available by accessing your Billing Management Console
Account number:
577224445833
GST number:
202236074C
Address:
DoiT International Xinjiapo Pte.Ltd.
ATTN: Vadim Solovey
135 Cecil Street #10–01
Philippine Airlines Building
Singapore, N/A, 069536, SG
Invoice Summary
Tax Invoice Number:
SGIN24-195445
Tax Invoice Date:
March 28, 2024
TOTAL AMOUNT DUE ON April
27, 2024
USD 14,198.27
This Tax Invoice is for the billing period March 1 - March 31, 2024
You have selected USD as your preferred payment currency. Certain services sold and provided in Singapore by AMCS
SG Private Limited (GST No: 201922646H) will be invoiced by Amazon Web Services Singapore Private Limited. See
Service Terms (https://aws.amazon.com/service-terms/) for further details.
Invoice Summ

In [3]:
import asyncio
from langchain_openai import ChatOpenAI
from main import extract_data

async def process_invoice(file_path):
    """Run structured extraction on one invoice PDF and print the outcome.

    The invoice text is produced by ``read_invoice``, a ``ChatOpenAI`` client
    is created, and both are handed to ``extract_data`` (imported from
    ``main``). Exactly one line is printed: the parsed ``AwsInvoiceCredit`` as
    JSON, or a message describing what went wrong. Nothing is returned.

    Args:
        file_path: Path to the invoice PDF, forwarded to ``read_invoice``.

    Note:
        Errors raised while reading the PDF or constructing the client are not
        caught here; only the extraction and reporting step is guarded.
    """
    invoice_text = read_invoice(file_path)

    # Model configuration; the API key is read from the environment.
    # NOTE(review): temperature and top_p of 0 are presumably meant to make
    # responses as repeatable as possible - confirm.
    model_settings = {
        "model": "gpt-4o-mini",
        "openai_api_key": os.getenv("OPENAI_API_KEY"),
        "temperature": 0.0,
        "max_tokens": 16384,
        "top_p": 0.0,
    }
    chat_model = ChatOpenAI(**model_settings)

    try:
        # extract_data takes a semaphore; a fresh single-permit one is supplied.
        result = await extract_data(
            chat_model,
            document=invoice_text,
            sem=asyncio.Semaphore(1),
        )
        # extract_data may hand back an exception object instead of raising it.
        if isinstance(result, Exception):
            report = f"Error processing invoice: {result}"
        elif isinstance(result, AwsInvoiceCredit):
            # Serialise the parsed model as a JSON object.
            report = result.model_dump_json()
        else:
            report = f"Unexpected result type: {type(result)}"
        print(report)
    except Exception as error:
        print(f"Unexpected error processing invoice: {error}")

# Replace 'your_invoice_file_path.pdf' with the actual file path
file_path = input("Enter invoice file path: ")
await process_invoice(file_path)

{"file_name":"2024-03-28_Invoice_SGIN24_195445.pdf","doit_payer_id":"doitintl-payer-2276","document_type":"Invoice","ri_invoice":true,"aws_account_number":"577224445833","address_company":"DoiT International Xinjiapo Pte.Ltd.","address_attn":"Vadim Solovey","address_country":"Singapore","tax_registration_number":"202236074C","invoice_number":"SGIN24-195445","invoice_date":"March 28, 2024","allocation_number":null,"original_invoice_number":null,"original_invoice_date":null,"total_amount":14198.27,"total_amount_currency":"USD","total_vat_tax_amount":1172.33,"total_vat_tax_currency":"USD","billing_period":"March 1, 2024 - March 31, 2024","net_charges_usd":13025.94,"net_charges_non_usd":17524.84,"net_charges_currency":"SGD","vat_percentage":9.0,"exchange_rate":1.34538,"amazon_company_name":"Amazon Web Services Singapore Private Limited","amazon_company_branch":null}
