In [2]:
from typing import Optional
from pydantic import BaseModel, Field

# Define a Pydantic model, with a description on every field, tailored to AWS Invoice/Credit Note records.
class AwsInvoiceCredit(BaseModel):
    # Flat record describing one AWS invoice or credit note.
    #
    # Elsewhere in this notebook the model is handed to PydanticOutputParser, and the
    # parser's format instructions (built from this model's schema) are inserted into
    # the LLM prompt. The Field descriptions below are therefore runtime prompt text,
    # not just documentation: rewording one changes what the model is asked to extract.
    #
    # NOTE: documented with `#` comments on purpose. A class docstring would be added
    # to the generated JSON schema as its description and would alter the prompt.
    #
    # Fields declared without a default are required; Optional fields default to None.
    # Dates and the billing period are kept as strings; monetary amounts, the VAT
    # percentage and the exchange rate are floats.

    # --- Source file and payer ---
    file_name: str = Field(description="AWS Invoice PDF file name")
    doit_payer_id: str = Field(description="Doit Payer ID")
    # Free-form string; the restriction to 'Invoice' / 'Credit Note' is stated only in
    # the description and is not enforced by the type.
    document_type: str = Field(description="Document Type: can be 'Invoice' or 'Credit Note' only. Credit Note can be Credit Memo or Credit Adjustment Note.")

    # --- Account and bill-to address ---
    aws_account_number: str = Field(description="AWS Account number")
    address_company: str = Field(description="Address or Bill to Address company name. Use first line of the address. Usually, it is the company name.")
    address_attn: str = Field(description="Address or Bill to Address ATTN (skip the ATTN prefix). Use second line of the address. Usually, it is the name of the person.")
    address_country: str = Field(description="Bill to address country. Use last line of the address. Usually, it is the country name. Convert country code to a full country name.")
    tax_registration_number: Optional[str] = Field(default=None, description="Tax Registration Number or ABN Number or GST Number or GST/HST Registration number or  Issued To; usually the next number after AWS Account Number")

    # --- Invoice identity and dates ---
    billing_period: str = Field(description="Billing Period; Two dates separated by a dash")  
    invoice_number: str = Field(description="Invoice Number from the Invoice Summary")
    invoice_date: str = Field(description="Invoice Date from the Invoice Summary")
    # Only meaningful for credit memos/notes; None otherwise.
    original_invoice_number: Optional[str] = Field(default=None, description="Original Invoice Number from the Invoice Summary of Credit Memo/Note; leave empty if not present")
    original_invoice_date: Optional[str] = Field(default=None, description="Original Invoice Date from the Invoice Adjustment Summary of Credit Memo/Note; leave empty if not present")

    # --- Amounts (numeric value and currency code are stored in separate fields) ---
    total_amount: float = Field(description="Total Amount from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix")
    total_amount_currency: str = Field(description="Total Amount Currency from the Invoice Summary; use currency code instead of symbol")
    total_vat_tax_amount: Optional[float] = Field(default=None, description="Total VAT/Tax Amount from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix")
    total_vat_tax_currency: Optional[str] = Field(default=None, description="VAT/Tax Currency from the Invoice Summary; use currency code instead of symbol")
    net_charges_usd: Optional[float] = Field(default=None, description="(Net) Charges (USD) (After Credits/Discounts, excl. Tax) from the (Invoice) Summary; without currency; add minus sign if parentheses around or has a minus prefix")
    net_charges_non_usd: Optional[float] = Field(default=None, description="Net Charges (non-USD) (After Credits/Discounts, excl. Tax) in local currency from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix")
    net_charges_currency: Optional[str] = Field(default=None, description="Net Charges (non-USD) local currency; use currency code instead of symbol")
    vat_percentage: Optional[float] = Field(default=None, description="Extract VAT percent (without % sign) from one of these fields: VAT - <number>% or VAT in <percent> or GST amount at <percent> or HST Amount at <percent>")
    exchange_rate: Optional[float] = Field(default=None, description="Exchange Rate from the (1 USD = <rate> currency) formula")

    # --- Issuing Amazon entity ---
    amazon_company_name: str = Field(description="Amazon Web Services company name. Usually, it is Amazon Web Services, Inc. but can be different for different countries")
    amazon_company_branch: Optional[str] = Field(default=None, description="Amazon Web Services company branch. Usually, it is after Amazon Web Services EMEA SARL but can be different for different countries")

In [3]:
 from langchain.output_parsers import PydanticOutputParser


 # Bind an output parser to the AwsInvoiceCredit schema defined above.
 parser = PydanticOutputParser(pydantic_object=AwsInvoiceCredit)
 # Show the format instructions the parser generates from that schema; this is the
 # text that extract_data() later injects into the prompt as {format_instructions}.
 print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"file_name": {"description": "AWS Invoice PDF file name", "title": "File Name", "type": "string"}, "doit_payer_id": {"description": "Doit Payer ID", "title": "Doit Payer Id", "type": "string"}, "document_type": {"description": "Document Type: can be 'Invoice' or 'Credit Note' only. Credit Note can be Credit Memo or Credit Adjustment Note.", "title": "Document Type", "type": "string"}, "aws_account_number": {"description": "AWS Account number", "title": "Aws Account Number", "type": "string"}, "address_company": {"description": 

In [None]:
import csv
from pathlib import Path


def sort_csv_by_first_column(input_path, output_path=None):
    """Write a copy of a CSV file with its data rows sorted by the first column.

    The first line of the input is treated as the header. Rows are compared as
    plain strings (so "10" sorts before "9") and the sort is stable.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Where to write the sorted copy. When omitted, the copy is
            placed next to the input with "sorted_" prepended to the file name
            (for example "data/in.csv" -> "data/sorted_in.csv").

    Returns:
        pathlib.Path: the path of the file that was written.

    Raises:
        ValueError: if the input has no header row (for example an empty file).
            Nothing is written in that case.
    """
    source = Path(input_path)
    if output_path is None:
        # Prefix the file NAME only. Prefixing the whole path string would turn
        # "data/in.csv" into "sorted_data/in.csv", a directory that does not exist.
        target = source.with_name('sorted_' + source.name)
    else:
        target = Path(output_path)

    # newline='' is what the csv module asks for on both read and write; without it,
    # newlines embedded in quoted fields can be mangled.
    with open(source, 'r', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        header = csv_reader.fieldnames
        rows = list(csv_reader)

    if not header:
        # Fail before the output file is created, with a message that names the cause.
        raise ValueError(f"{source} has no header row; nothing to sort.")

    first_column = header[0]
    sorted_rows = sorted(rows, key=lambda row: row[first_column])

    with open(target, 'w', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=header)
        csv_writer.writeheader()
        csv_writer.writerows(sorted_rows)

    return target


# Inside a notebook kernel __name__ is "__main__", so the cell still prompts and runs
# as before; the guard only keeps the prompt from firing if this code is imported.
if __name__ == "__main__":
    input_file_path = input("Sort CSV file")
    output_file_path = str(sort_csv_by_first_column(input_file_path))
    print(f"The CSV file has been sorted alphabetically based on the first column and saved to {output_file_path}.")


In [None]:
# Un-escape the stored prompt in place: turn the two-character sequences \n and \"
# into a real line break and a plain double quote, then overwrite the same file.
with open('prompts/prompt.txt', 'r') as file:
    content = file.read().replace('\\n', '\n').replace('\\"', '"')

with open('prompts/prompt.txt', 'w') as file:
    file.write(content)


In [4]:
import os
from langchain_community.document_loaders import PyMuPDFLoader

def read_invoice(file):
    """Return invoice text from a PDF, prefixed with its file name and DoiT payer id.

    The payer id does not come from the PDF: it is the second "_"-separated token
    of the name of the folder that contains the file.

    Args:
        file: Path to the invoice PDF. It may be relative or a bare file name; the
            containing folder is then resolved against the current working directory.

    Returns:
        str: a "File name: ..." line, a "DoiT payer id: ..." line, then the
        page_content of the first document returned by PyMuPDFLoader.

    Raises:
        IndexError: if the loader returns no documents, or if the containing
            folder's name has no "_"-separated second token.
    """
    loader = PyMuPDFLoader(file)
    data = loader.load()
    if not data:
        raise IndexError(f"No content could be loaded from {file!r}")
    # Only the first loaded document is used (presumably the first PDF page --
    # confirm against the PyMuPDFLoader documentation); later ones are ignored.
    invoice = data[0].page_content

    # abspath first: for a bare or "./"-relative file name, dirname() alone yields
    # "" or ".", which can never contain the payer id.
    parent_folder = os.path.basename(os.path.dirname(os.path.abspath(file)))
    file_name = os.path.basename(file)

    # extract doit payer id from the parent folder name
    folder_parts = parent_folder.split("_")
    if len(folder_parts) < 2:
        raise IndexError(
            f"Cannot extract the payer id: folder name {parent_folder!r} "
            f"(containing {file_name!r}) has no '_'-separated second part"
        )
    payer_id = folder_parts[1]

    # Prepend file name and payer id so they travel with the invoice text.
    return f"File name: {file_name}\nDoiT payer id: {payer_id}\n" + invoice
  
# Interactive check of read_invoice(): prompt for a PDF path and print the text that
# will later be sent to the model (file name and payer id lines, then the invoice text).
file_name = input("Enter invoice file path")
print(read_invoice(file_name))

File name: 2024-01-02_Invoice_1539428769.pdf
DoiT payer id: doitintl-payer-1577
Account number:
640369046937
Bill to Address:
DoiT International USA, Inc
ATTN: Noam Ehrlich
5201 Great America Parkway
Suite 320
Santa Clara, CA, 95054, US
Amazon Web Services, Inc. Invoice
Email or talk to us about your AWS account or bill, visit aws.amazon.com/contact-us/
Submit feedback on your Invoice Experience here.
Invoice Summary
Invoice Number:
1539428769
Please include this invoice number with your payment
Invoice Date:
January 2 , 2024
TOTAL AMOUNT DUE ON February 1 , 2024
$178,874.36
This invoice is for the billing period December 1 - December 31 , 2023
Greetings from Amazon Web Services, we're writing to provide you with an electronic invoice for your use of AWS services. Additional information
about your bill, individual service charge details, and your account history are available on the Account Activity Page.
Summary
AWS Service Charges
$178,874.36
Charges
$265,988.24
Discount (Enterprise 

## Parse AWS Invoice/Credit Record from PDF

In [9]:
import textwrap
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

def extract_data(model, document):
    """Ask an LLM to extract an AwsInvoiceCredit record from invoice text.

    Args:
        model: The language model handed to LLMChain as ``llm``.
        document: Invoice text whose first line is expected to look like
            "File name: <name>" (the shape produced by read_invoice()).

    Returns:
        The object returned by PydanticOutputParser.parse() for the
        AwsInvoiceCredit model.

    Raises:
        Exception: if running the chain or parsing its output fails. The message
            names the file and the original error is attached as ``__cause__``.
    """
    # Instantiate the parser with the new model.
    parser = PydanticOutputParser(pydantic_object=AwsInvoiceCredit)
    # File name from the first line; it is used only in the error message below.
    # partition() keeps names that contain ":" intact and yields "" (instead of
    # raising IndexError) when the line has no colon.
    file_name = document.split("\n", 1)[0].partition(":")[2].strip()
    # Update the prompt to match the new query and desired format.
    prompt = PromptTemplate(
        template=textwrap.dedent(
            """
            The following document is a plain text extracted from AWS Invoice or Credit Note PDF file.

            <document>
            {invoice}
            </document>

            Act as an accountant and extract data from the above document into a flat JSON object.
            {format_instructions}
            {request}

            JSON:
            """
        ),
        input_variables=["request", "invoice"],
        partial_variables={
            "format_instructions": parser.get_format_instructions(),
        },
    )
    # Generate the input using the updated prompt.
    parsing_request = textwrap.dedent(
        """
        Tips:
        - Convert all dates to "Month name Day, Year" format with no leading zeros
        - Format all dates according to "Month name Day, Year" format with no leading zeros
        - Convert alpha-2 country code to a full country name
        - Branch name should not contain a full company name
        - Be careful with charges and amount signs, they are usually negative for credits
        - Extract exchange rate (X) from (1 USD = X currency) pattern
        """
    )
    chain = LLMChain(llm=model, prompt=prompt)
    try:
        output = chain.run(request=parsing_request, invoice=document)
        # Keep only the span from the first { to the last }. If there is no such
        # span, pass the raw completion on so the parser's error shows what the
        # model actually said instead of an empty string.
        start = output.find("{")
        end = output.rfind("}")
        if start != -1 and end > start:
            output = output[start:end + 1]
        return parser.parse(output)
    except Exception as e:
        raise Exception(f"Error processing document {file_name}: {e}") from e

# Prompt for a PDF path and build the text (file name and payer id lines, then the
# invoice text) that is sent to the model.
file_name = input("Enter invoice file path")
invoice = read_invoice(file_name)
# Instantiate the model.
# The API key is read from the OPENAI_API_KEY environment variable.
# temperature=0.0 and top_p=0.0 are presumably there to keep the extraction as
# repeatable as possible -- confirm before changing.
llm = ChatOpenAI(
    model="gpt-4-1106-preview",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    max_tokens=4096,
    model_kwargs={"top_p": 0.0}
)
# Run the extraction; extract_data() raises if the chain or the parse fails.
parsed = extract_data(llm, invoice)
# print parsed data as a JSON object
print(parsed.model_dump_json(indent=2))

{
  "file_name": "2024-01-02_Invoice_1539428769.pdf",
  "doit_payer_id": "doitintl-payer-1577",
  "document_type": "Invoice",
  "aws_account_number": "640369046937",
  "address_company": "DoiT International USA, Inc",
  "address_attn": "Noam Ehrlich",
  "address_country": "United States",
  "tax_registration_number": "M90373009E",
  "billing_period": "December 1 - December 31, 2023",
  "invoice_number": "1539428769",
  "invoice_date": "January 2, 2024",
  "original_invoice_number": null,
  "original_invoice_date": null,
  "total_amount": 178874.36,
  "total_amount_currency": "USD",
  "total_vat_tax_amount": 17726.34,
  "total_vat_tax_currency": "USD",
  "net_charges_usd": null,
  "net_charges_non_usd": null,
  "net_charges_currency": null,
  "vat_percentage": null,
  "exchange_rate": null,
  "amazon_company_name": "Amazon Web Services, Inc.",
  "amazon_company_branch": null
}
