In [None]:
from typing import Optional
from pydantic import BaseModel, Field

# Pydantic model for a single AWS invoice or credit note record.
# Every field carries a description worded as an extraction instruction: it
# says where on the document the value is found and how to normalise it.
# Fields declared with `default=None` are optional; all others are required.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic uses a model's docstring as the schema description, which
# would presumably alter any format instructions generated from this model.
# Confirm before adding one.
class AwsInvoiceCredit(BaseModel):
    # --- Source file and document identity ---
    file_name: str = Field(
        description="AWS Invoice PDF file name",
    )
    doit_payer_id: str = Field(
        description="Doit Payer ID",
    )
    document_type: str = Field(
        description="Document Type: can be 'Invoice' or 'Credit Note' only. Credit Note can be Credit Memo or Credit Adjustment Note.",
    )
    aws_account_number: str = Field(
        description="AWS Account number",
    )

    # --- Bill-to address ---
    address_company: str = Field(
        description="Address or Bill to Address company name. Use first line of the address. Usually, it is the company name.",
    )
    address_attn: str = Field(
        description="Address or Bill to Address ATTN (skip the ATTN prefix). Use second line of the address. Usually, it is the name of the person.",
    )
    address_country: str = Field(
        description="Bill to address country. Use last line of the address. Usually, it is the country name. Convert country code to a full country name.",
    )
    tax_registration_number: Optional[str] = Field(
        default=None,
        description="Tax Registration Number or ABN Number or GST Number or GST/HST Registration number or  Issued To; usually the next number after AWS Account Number",
    )

    # --- Invoice summary ---
    billing_period: str = Field(
        description="Billing Period; Two dates separated by a dash",
    )
    invoice_number: str = Field(
        description="Invoice Number from the Invoice Summary",
    )
    invoice_date: str = Field(
        description="Invoice Date from the Invoice Summary",
    )

    # --- Reference to the original invoice (credit memo/note only) ---
    original_invoice_number: Optional[str] = Field(
        default=None,
        description="Original Invoice Number from the Invoice Summary of Credit Memo/Note; leave empty if not present",
    )
    original_invoice_date: Optional[str] = Field(
        default=None,
        description="Original Invoice Date from the Invoice Adjustment Summary of Credit Memo/Note; leave empty if not present",
    )

    # --- Totals and taxes ---
    total_amount: float = Field(
        description="Total Amount from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix",
    )
    total_amount_currency: str = Field(
        description="Total Amount Currency from the Invoice Summary; use currency code instead of symbol",
    )
    total_vat_tax_amount: Optional[float] = Field(
        default=None,
        description="Total VAT/Tax Amount from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix",
    )
    total_vat_tax_currency: Optional[str] = Field(
        default=None,
        description="VAT/Tax Currency from the Invoice Summary; use currency code instead of symbol",
    )

    # --- Net charges, VAT rate and exchange rate ---
    net_charges_usd: Optional[float] = Field(
        default=None,
        description="(Net) Charges (USD) (After Credits/Discounts, excl. Tax) from the (Invoice) Summary; without currency; add minus sign if parentheses around or has a minus prefix",
    )
    net_charges_non_usd: Optional[float] = Field(
        default=None,
        description="Net Charges (non-USD) (After Credits/Discounts, excl. Tax) in local currency from the Invoice Summary; without currency; add minus sign if parentheses around or has a minus prefix",
    )
    net_charges_currency: Optional[str] = Field(
        default=None,
        description="Net Charges (non-USD) local currency; use currency code instead of symbol",
    )
    vat_percentage: Optional[float] = Field(
        default=None,
        description="Extract VAT percent (without % sign) from one of these fields: VAT - <number>% or VAT in <percent> or GST amount at <percent> or HST Amount at <percent>",
    )
    exchange_rate: Optional[float] = Field(
        default=None,
        description="Exchange Rate from the (1 USD = <rate> currency) formula",
    )

    # --- Issuing Amazon entity ---
    amazon_company_name: str = Field(
        description="Amazon Web Services company name. Usually, it is Amazon Web Services, Inc. but can be different for different countries",
    )
    amazon_company_branch: Optional[str] = Field(
        default=None,
        description="Amazon Web Services company branch. Usually, it is after Amazon Web Services EMEA SARL but can be different for different countries",
    )

In [None]:
 from langchain.output_parsers import PydanticOutputParser


 # Build an output parser bound to the AwsInvoiceCredit model.
 parser = PydanticOutputParser(pydantic_object=AwsInvoiceCredit)
 # Print the format instructions the parser generates for that model.
 print(parser.get_format_instructions())

In [None]:
import csv
import os

# Sort a CSV file by its first column and save the result next to the input
# file, under the same file name prefixed with "sorted_".
input_file_path = input("Sort CSV file")

# Prefix only the file name, not the whole path: for "data/in.csv" the output
# must be "data/sorted_in.csv", not "sorted_data/in.csv". For a bare file name
# the result is unchanged ("in.csv" -> "sorted_in.csv").
output_file_path = os.path.join(
    os.path.dirname(input_file_path),
    'sorted_' + os.path.basename(input_file_path),
)

# Read every row as a dictionary keyed by the header names. newline='' leaves
# line-ending handling to the csv module, as its documentation requires, so
# quoted fields that contain line breaks are read back intact.
with open(input_file_path, 'r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    header = csv_reader.fieldnames
    rows = list(csv_reader)

# An empty input has no header, hence no first column to sort on. Stop here,
# before an output file is created, instead of failing later in writeheader().
if not header:
    raise ValueError(f"{input_file_path} is empty: no header row found.")

# Sort on the value of the first column; values are compared as strings.
sorted_rows = sorted(rows, key=lambda x: x[header[0]])

# Write the header followed by the sorted rows to the new CSV file.
with open(output_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=header)
    csv_writer.writeheader()
    csv_writer.writerows(sorted_rows)

print(f"The CSV file has been sorted alphabetically based on the first column and saved to {output_file_path}.")


In [None]:
# Rewrite prompts/prompt.txt in place, turning escaped sequences into the
# characters they stand for: a literal backslash followed by "n" becomes a real
# line break, and a backslash followed by a double quote becomes a plain quote.
with open('prompts/prompt.txt', 'r') as file:
    content = file.read().replace('\\n', '\n').replace('\\"', '"')

# Overwrite the same file with the unescaped text.
with open('prompts/prompt.txt', 'w') as file:
    file.write(content)


In [None]:
import os
from langchain_community.document_loaders import PyMuPDFLoader

def print_file(file):
    """Print the text of an invoice PDF, prefixed with its file name and DoiT payer id.

    Only the first document returned by ``PyMuPDFLoader(file).load()`` is
    printed (presumably the first page of the PDF -- confirm against the
    loader's documentation).

    The payer id is read from the name of the folder that directly contains
    ``file``: the folder name is split on "_" and the second piece is used.
    The folder naming scheme is not defined in this notebook; presumably it
    looks like ``<prefix>_<payer id>`` -- verify against the data layout.

    Args:
        file: Path to the invoice PDF.

    Raises:
        IndexError: If the loader returns no documents, or if the parent
            folder name contains no "_" (this includes a bare file name with
            no folder part). The exception type matches what the plain
            indexing raised before; only the message is now descriptive.
    """
    documents = PyMuPDFLoader(file).load()
    if not documents:
        raise IndexError(f"No pages could be loaded from {file!r}.")
    first_page_text = documents[0].page_content

    # File name without any directory part.
    file_name = os.path.basename(file)
    # Name of the folder that directly contains the file.
    parent_folder = os.path.basename(os.path.dirname(file))

    # The payer id is the second "_"-separated piece of the folder name.
    folder_parts = parent_folder.split("_")
    if len(folder_parts) < 2:
        raise IndexError(
            f"Cannot extract the DoiT payer id: parent folder {parent_folder!r} "
            f"of {file!r} contains no '_' separator."
        )
    payer_id = folder_parts[1]

    # Prepend the file name and payer id to the extracted text.
    invoice = f"File name: {file_name}\nDoiT payer id: {payer_id}\n" + first_page_text
    print(invoice)
    
# Ask for the path of an invoice PDF, then print it through print_file.
file_name = input("Enter invoice file path")
print_file(file_name)