In [None]:
from main import AwsInvoiceCredit
from langchain.output_parsers import PydanticOutputParser

# Build an output parser around the AwsInvoiceCredit model and print the format
# instructions it generates, so they can be inspected by eye.
parser = PydanticOutputParser(pydantic_object=AwsInvoiceCredit)
print(parser.get_format_instructions())

In [None]:
import csv
import os

# Define the path to the input CSV file and the path to the output sorted CSV file.
input_file_path = input("Sort CSV file")
# Prefix only the file name, not the whole path: "data/in.csv" must be written to
# "data/sorted_in.csv", not to a (usually non-existent) "sorted_data/" directory.
input_dir, input_name = os.path.split(input_file_path)
output_file_path = os.path.join(input_dir, 'sorted_' + input_name)

# Read the CSV file into a list of dictionaries, where each dictionary represents a row.
# newline='' hands line-ending handling to the csv module, which keeps quoted
# fields that contain embedded newlines intact.
with open(input_file_path, 'r', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    header = csv_reader.fieldnames
    rows = list(csv_reader)

# A file without a header row has no column to sort by; fail before the output
# file is created instead of leaving a truncated file behind.
if not header:
    raise ValueError(f"{input_file_path} has no header row; nothing to sort.")

# Sort the list of dictionaries based on the value of the first column (string)
sorted_rows = sorted(rows, key=lambda x: x[header[0]])

# Write the header followed by the sorted rows to a new CSV file
with open(output_file_path, 'w', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=header)
    csv_writer.writeheader()
    csv_writer.writerows(sorted_rows)

print(f"The CSV file has been sorted alphabetically based on the first column and saved to {output_file_path}.")


In [None]:
# Rewrite prompts/prompt.txt in place, turning the literal two-character
# sequences \n and \" into a real newline and a plain double quote.
with open('prompts/prompt.txt', 'r') as file:
    content = file.read().replace('\\n', '\n').replace('\\"', '"')

with open('prompts/prompt.txt', 'w') as file:
    file.write(content)


In [None]:
from main import remove_footer

def read_invoice(invoice_document):
    """Load the first page of an invoice PDF and prepend file metadata to its text.

    The payer id is the second underscore-separated part of the name of the
    folder that contains the PDF (e.g. ``acme_12345`` gives ``12345``).

    Args:
        invoice_document: Path to the invoice PDF file.

    Returns:
        A string made of a ``File name: ...`` line, a ``DoiT payer id: ...``
        line, and the first page's text after it has been passed through
        ``remove_footer``.

    Raises:
        IndexError: If no page could be loaded from the PDF, or the parent
            folder name has no underscore-separated second part.
    """
    # Imported locally because the notebook only imports these names in a cell
    # below this one; relying on that cell breaks a fresh top-to-bottom run.
    import os
    from langchain_community.document_loaders import PyMuPDFLoader

    pages = PyMuPDFLoader(invoice_document).load()
    if not pages:
        raise IndexError(f"No pages could be read from {invoice_document}")
    # Only the first page is used.
    invoice = remove_footer(pages[0].page_content)

    # get file name only
    file_name = os.path.basename(invoice_document)
    # get parent folder name and extract the payer id from it
    parent_folder = os.path.basename(os.path.dirname(invoice_document))
    folder_parts = parent_folder.split("_")
    if len(folder_parts) < 2:
        raise IndexError(
            f"Cannot extract payer id: parent folder name {parent_folder!r} "
            f"of {invoice_document} contains no '_' separator"
        )
    payer_id = folder_parts[1]

    # add file name and payer id to the invoice
    return f"File name: {file_name}\nDoiT payer id: {payer_id}\n" + invoice
  
# Ask for a PDF path and print the invoice text that read_invoice builds for it
# (first page plus the "File name" / "DoiT payer id" header lines).
file_name = input("Enter invoice file path")
print(read_invoice(file_name))

In [None]:
import os
from langchain_community.document_loaders import PyMuPDFLoader

def read_file(file):
    """Return the text of every page of a PDF as one newline-joined string.

    Args:
        file: Path to the PDF, passed straight to ``PyMuPDFLoader``.

    Returns:
        The ``page_content`` of all loaded pages, separated by ``"\\n"``.
    """
    pages = PyMuPDFLoader(file).load()
    return "\n".join(page.page_content for page in pages)
  
# Ask for a PDF path and print the text of all of its pages.
file_name = input("Enter PDF file path")
print(read_file(file_name))

## Parse AWS Invoice/Credit Record from PDF

In [None]:
import textwrap
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

def _document_file_name(document):
    """Return the value after the first ':' on the document's first line, or "<unknown>"."""
    first_line = document.split("\n", 1)[0]
    # partition keeps everything after the first colon, so names that contain
    # ':' themselves are not truncated, and a missing colon cannot raise.
    _, separator, value = first_line.partition(":")
    return (value.strip() if separator else "") or "<unknown>"


def extract_data(model, document):
    """Ask the LLM to extract an AwsInvoiceCredit record from invoice text.

    Args:
        model: Language model passed to ``LLMChain`` as ``llm``.
        document: Plain text of the invoice. Its first line is expected to look
            like ``File name: <name>``; the name is used only in error messages.

    Returns:
        Whatever ``PydanticOutputParser(pydantic_object=AwsInvoiceCredit).parse``
        returns for the JSON object found in the model output.

    Raises:
        Exception: If running the chain or parsing its output fails. The message
            names the document, and the original error is attached as the cause.
    """
    # Instantiate the parser with the target model.
    parser = PydanticOutputParser(pydantic_object=AwsInvoiceCredit)
    # The file name is only needed for error reporting, so a document without
    # the expected header line must not abort the extraction.
    file_name = _document_file_name(document)
    # Build the prompt: document text, extraction instruction, output format, tips.
    prompt = PromptTemplate(
        template=textwrap.dedent(
            """
            The following document is a plain text extracted from AWS Invoice or Credit Note PDF file.

            <document>
            {invoice}
            </document>

            Act as an accountant and extract data from the above document into a flat JSON object.
            {format_instructions}
            {request}

            JSON:
            """
        ),
        input_variables=["request", "invoice"],
        partial_variables={
            "format_instructions": parser.get_format_instructions(),
        },
    )
    # Extra guidance appended to the prompt through the {request} variable.
    # NOTE(review): the two date tips repeat the same instruction; kept
    # unchanged - confirm whether the repetition is intentional.
    parsing_request = textwrap.dedent(
        """
        Tips:
        - Convert ALL dates to "Month name Day, Year" format with no leading zeros
        - Format ALL dates according to "Month name Day, Year" format with no leading zeros
        - Convert ALL instances of alpha-2 country code to a full country name
        - Branch name should not contain a full company name
        - Be careful with charges and amount signs, they are usually negative for credits
        - Extract exchange rate (X) from (1 USD = X currency) pattern
        """
    )
    chain = LLMChain(llm=model, prompt=prompt)
    try:
        output = chain.run(request=parsing_request, invoice=document)
        # Keep only the text from the first { to the last }. When no such span
        # exists the raw output is handed to the parser, so its error shows
        # what the model actually returned.
        start = output.find("{")
        end = output.rfind("}")
        if start != -1 and end > start:
            output = output[start:end + 1]
        return parser.parse(output)
    except Exception as e:
        raise Exception(f"Error processing document {file_name}: {e}") from e

# Ask for an invoice PDF and turn it into the text form extract_data expects
# (read_invoice prepends the "File name: ..." line that extract_data reads).
file_name = input("Enter invoice file path")
invoice = read_invoice(file_name)
# Instantiate the model. The API key is read from the OPENAI_API_KEY environment
# variable; temperature 0.0 and top_p 0.01 are presumably chosen to keep the
# extraction output as repeatable as possible - TODO confirm.
llm = ChatOpenAI(
    model="gpt-4o",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    max_tokens=4096,
    model_kwargs={"top_p": 0.01}
)
# Run the extraction on the invoice text with the model configured above.
parsed = extract_data(llm, invoice)
# print parsed data as a JSON object
print(parsed.model_dump_json(indent=2))

## Compare two CSV files

In [None]:
import pandas as pd

# Columns that identify an invoice row; both files are ordered by them before
# the positional comparison below.
key_columns = ['doit_payer_id', 'file_name']

# Read CSV files
test = pd.read_csv('invoices-test.csv')
result = pd.read_csv('invoices-result.csv')

# Compare only the columns present in both files. Keeping the order of the test
# file makes the report order stable (a set intersection has no defined order).
common_columns = [col for col in test.columns if col in result.columns]

missing_keys = [col for col in key_columns if col not in common_columns]
if missing_keys:
    raise KeyError(f"Key column(s) {missing_keys} must be present in both CSV files")

# Sort both dataframes by doit_payer_id and then file_name, and renumber the rows
# so that position i refers to the same sorted rank in each frame.
test = test[common_columns].sort_values(by=key_columns).reset_index(drop=True)
result = result[common_columns].sort_values(by=key_columns).reset_index(drop=True)

# The comparison is positional, so it can only cover rows that exist in both
# frames; report a length mismatch instead of failing or silently ignoring rows.
compared_rows = min(len(test), len(result))
if len(test) != len(result):
    print(
        f"Row count mismatch: Test={len(test)}, Result={len(result)}; "
        f"comparing the first {compared_rows} sorted rows only"
    )

# compare row by row
for i in range(compared_rows):
    for col in common_columns:
        expected = test.at[i, col]
        actual = result.at[i, col]
        # a value missing on both sides is not a difference
        if pd.isna(expected) and pd.isna(actual):
            continue
        # compare values as strings so differently typed but identically
        # printed values are treated as equal
        if str(expected) != str(actual):
            print(f"Invoice {test.at[i, 'file_name']}, {col}: Test={expected}, Result={actual}")
