In [11]:
from dotenv import load_dotenv
import os

# Load the project-specific secrets from the local .env file; override=True
# makes values from the file win over anything already in the environment.
load_dotenv("/home/dario/Dropbox/ai/key.env", override=True)


def _export_api_key(source_var, target_var):
    """Copy the project-specific key `source_var` into `target_var`.

    The client libraries used later read their credentials from the
    conventional variable names, so the project-scoped keys are re-exported
    under those names.

    Args:
        source_var (str): Name of the variable holding the key (from the .env file).
        target_var (str): Name of the variable to populate.

    Raises:
        RuntimeError: If `source_var` is not set. Assigning ``None`` into
            ``os.environ`` would otherwise fail with an opaque
            ``TypeError: str expected, not NoneType``.
    """
    key = os.environ.get(source_var)
    if key is None:
        raise RuntimeError(
            f"Environment variable {source_var!r} is not set; cannot populate "
            f"{target_var!r}. Check the .env file passed to load_dotenv()."
        )
    os.environ[target_var] = key


_export_api_key("INVOICE_ANALYST_OPENAI_KEY", "OPENAI_API_KEY")
_export_api_key("INVOICE_ANALYST_LLAMAPARSE_KEY", "LLAMA_CLOUD_API_KEY")


In [6]:
import PyPDF2

def process_pdfs(directory):
    """
    Truncate every multi-page PDF in `directory` to its first page, in place.

    Files whose name ends with ".pdf" are opened with PyPDF2. When a file has
    more than one page, a new PDF containing only page 0 is written and then
    swapped in for the original. Files with zero or one page are left alone.
    A status line is printed for every PDF examined.

    The replacement is written to a sibling temporary file and moved over the
    original only after the reader's file handle is closed. Opening the
    original path with "wb" while the reader is still attached to it would
    truncate the source before the writer has necessarily read all the page
    data it needs, which can leave a corrupted file.

    Args:
       directory (str): The directory to process (not recursive).

    Returns:
       None
    """
    for filename in os.listdir(directory):
        # Only PDF files are considered (same case-sensitive check as before).
        if not filename.endswith(".pdf"):
            continue

        filepath = os.path.join(directory, filename)
        # The ".tmp" suffix keeps the scratch file out of the ".pdf" filter
        # and in the same directory, so the final rename stays on one filesystem.
        temp_path = filepath + ".tmp"
        replaced = False

        with open(filepath, "rb") as file:
            reader = PyPDF2.PdfReader(file)

            if len(reader.pages) > 1:
                writer = PyPDF2.PdfWriter()
                writer.add_page(reader.pages[0])

                try:
                    # Write while the source is still open and intact.
                    with open(temp_path, "wb") as new_file:
                        writer.write(new_file)
                except Exception:
                    # Do not leave a half-written scratch file behind.
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
                    raise
                replaced = True

        if replaced:
            # The source handle is closed now; swap the new file in.
            os.replace(temp_path, filepath)
            print(f"Processed {filename}: Original replaced with first page only.")
        else:
            print(f"{filename} has only one page or is empty. No changes made.")



In [9]:
# Directory containing the invoice PDFs. The INVOICE_ANALYST_PDF_DIR
# environment variable overrides the default, so the notebook can run on
# another machine without editing this cell.
pdf_directory = os.environ.get(
    "INVOICE_ANALYST_PDF_DIR",
    "/home/dario/Dropbox/ai/invoice_analyst/facturas",
)

# Destructive step: multi-page PDFs in this directory are rewritten in place.
process_pdfs(pdf_directory)


super_oficina.pdf has only one page or is empty. No changes made.
FA-2-19346.pdf has only one page or is empty. No changes made.
27345649137_011_00003_00000022.pdf has only one page or is empty. No changes made.
FA-2-19504.pdf has only one page or is empty. No changes made.
27328814949_011_00002_00000057.pdf has only one page or is empty. No changes made.
F Aguilar 27345649137_011_00003_00000023.pdf has only one page or is empty. No changes made.
la anonima.pdf has only one page or is empty. No changes made.
98431 62558 MZP Tecnologia SAU.pdf has only one page or is empty. No changes made.
micaela_testa.pdf has only one page or is empty. No changes made.
nafta.pdf has only one page or is empty. No changes made.
nafta2.pdf has only one page or is empty. No changes made.
contadora.pdf has only one page or is empty. No changes made.
27387838436_011_00001_00000016.pdf has only one page or is empty. No changes made.
bari wireless FA-2-19177.pdf has only one page or is empty. No changes made

### Docs:

https://docs.llamaindex.ai/en/stable/module_guides/loading/connector/llama_parse/

In [12]:
import nest_asyncio

# Patch asyncio before the parser is used from inside the notebook.
nest_asyncio.apply()

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader


def filename_fn(filename):
    """Return the metadata dict for one document: the value given by the reader, stored under "file_name"."""
    return {"file_name": filename}


# Parser configured for Spanish-language documents, returning markdown.
parser = LlamaParse(result_type="markdown", verbose=True, language="es")

# Route every ".pdf" file through the parser above.
file_extractor = {".pdf": parser}

# Load only the PDFs from the invoice directory; filename_fn supplies the metadata.
documents = SimpleDirectoryReader(
    input_dir=pdf_directory,
    required_exts=[".pdf"],
    file_extractor=file_extractor,
    file_metadata=filename_fn,
).load_data()


Started parsing the file under job_id 17f16d04-a625-44cb-8822-a84ca1010e9a
Started parsing the file under job_id 2c595cb3-c207-407e-85cb-45d8e941f3f8
Started parsing the file under job_id 3bcd1df4-eb8d-4251-93e2-501306744457
Started parsing the file under job_id 6a2bc280-e7a7-493a-8997-1137d2a71d7c
Started parsing the file under job_id f995a440-a500-4fc5-8db8-4bd3bdce3884
Started parsing the file under job_id 9f560a91-b830-4401-b5fe-562fc37d15c6
Started parsing the file under job_id b0f8f9ee-cd0a-43bd-838a-7ccaf317d470
Started parsing the file under job_id e8a31408-3506-4faf-8313-b3d6419f2a5e
Started parsing the file under job_id 48a7313f-e3ce-45ba-9b00-61149d700d06
Started parsing the file under job_id 7a8f8094-7927-440d-a4a8-2021c4534993
Started parsing the file under job_id 985c8af9-e2ab-498e-953d-7e5dd40bbd95
Started parsing the file under job_id 821de539-4924-4981-8451-fffaa5b1ec71
Started parsing the file under job_id 07013d32-4b79-4da4-b2c3-24caec358264
Started parsing the file 

Let's look at the output:

In [None]:
# Show the first 1000 characters of the first parsed document as a sanity check.
first_doc_preview = documents[0].text[:1000]
print(first_doc_preview)

Right away we can see that some kind of structure is being retained!

### Docs

https://docs.llamaindex.ai/en/stable/examples/llm/openai_json_vs_function_calling/?h=basemodel

In [14]:
from llama_index.core import Settings
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.llms.openai import OpenAI

In [16]:
# OpenAI LLM client pinned to the "gpt-3.5-turbo-1106" model.
llm = OpenAI(model="gpt-3.5-turbo-1106")

In [17]:
from pydantic import BaseModel, Field
from typing import List


class Invoice(BaseModel):
    """Data model for an invoice.

    Every field is kept as a string exactly as extracted; separators are
    normalised by the code that consumes the model. The `description` texts
    are part of the schema sent to the LLM, so they are written as adjacent
    string literals: a backslash continuation inside a single literal would
    embed the source indentation in the description.
    """

    date: str = Field(
        description="Date of the invoice. It can be in the format Day Month Year or Month Day Year or Year Month Day"
    )
    # Issuer of the invoice (the vendor), never the buyer.
    name: str = Field(
        description=(
            "Name of the company or the person that issues the invoice. "
            "Never use MZP Tecnologia or anything similar to "
            "MZP Tecnologia, because this is the name of the buyer. "
            "Look for the name of the vendor."
        )
    )
    # Vendor tax id; the buyer's own CUIT also appears on the invoice.
    cuit: str = Field(
        description=(
            "Tax identification number (named CUIT or C.U.I.T.) of the vendor. "
            "It has 11 digits that may appear as DDDDDDDDDDD or DD-DDDDDDDD-D, where D is a digit. "
            "do not confuse with MZP Tecnologia's cuit which also appears in the invoice "
            "and is 30715264249 or 30-71526424-9. "
            "Look for the cuit of the vendor."
        )
    )
    product: str = Field(description="Service or product purchased. If there are several items provide the type or category of the product.")
    subtotal: str = Field(description="Subtotal amount of the invoice, before adding taxes, it may use a dot or a comma as a decimal separator")
    iva: str = Field(description="The IVA refers to the Value Added Tax, it is generally the 21% of the subtotal, it may use a dot or a comma as a decimal separator. Some invoices do not have an iva or the value might be zero")
    other_taxes: str = Field(description="Other taxes. They are usually lower than the IVA. A dot or a comma as decimal separators. Some invoices do not have this field.")
    total: str = Field(description="The total amount of the invoice. A dot or a comma as decimal separators.")


In [18]:
# Prompt template for the extraction program; `{invoice}` is filled with the
# parsed invoice text. It is built from adjacent string literals so that line
# joins are explicit. The previous backslash-continued version fused
# "total." and "Invoice:" into "total.Invoice:" and had no separator between
# the cuit clause and "product purchased".
prompt = (
    "This is an invoice made for MZP Tecnologia, by a vendor. "
    "Extract the following information: date of the invoice, "
    "name of the vendor (it cannot be MZP Tecnologia or anything similar), "
    "cuit of the vendor (do not confuse with MZP Tecnologia's cuit which is 30715264249 or 30-71526424-9), "
    "product purchased (if there are several items provide the type or category of the product), "
    "subtotal, iva, other_taxes, and total.\n"
    "Invoice:\n"
    "\n"
    "{invoice}\n"
)

In [19]:
# Structured-extraction program: fills `prompt` with an invoice and returns an
# `Invoice` instance. `llm` is passed explicitly; without it, from_defaults
# falls back to the library's default LLM and the model pinned when `llm` was
# created would never be used.
program = OpenAIPydanticProgram.from_defaults(
    output_cls=Invoice,
    llm=llm,
    prompt_template_str=prompt,
    verbose=True,
)

In [20]:
import csv

In [21]:
# Path to the CSV file where the data will be written
file_path = pdf_directory+'/output.csv'

# Open the file with the context manager
with open(file_path, mode='w', newline='') as file:
    # Create a CSV writer object
    # Here, specify the delimiter and quote character if necessary
    writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    # Write the header row
    writer.writerow(['Archivo', 'Fecha', 'Nombre', 'CUIT', 'Concepto', 'Subtotal', 'IVA', 'Otros impuestos', 'Total'])
    
    # Write data rows
    for doc in documents:
        output = program(invoice=doc.get_content())
        writer.writerow([os.path.basename(doc.metadata['file_name']), output.date, output.name, output.cuit.replace('-', ''), output.product, output.subtotal.replace(',', '.'), output.iva.replace(',', '.'), output.other_taxes.replace(',', '.'), output.total.replace(',', '.')])

print(f"CSV file '{file_path}' has been written successfully.")

Function call: Invoice with args: {"date":"07/02/2024","name":"GARCIA NOELIA SANDRA","cuit":"27328814949","product":"Honorarios Contables del mes de Enero 2024","subtotal":"90000,00","iva":"0,00","other_taxes":"0,00","total":"90000,00"}
Function call: Invoice with args: {"date":"05/03/2024","name":"GARCIA NOELIA SANDRA","cuit":"27328814949","product":"Honorarios contables del mes de Febrero de 2024","subtotal":"90000,00","iva":"0,00","other_taxes":"0,00","total":"90000,00"}
Function call: Invoice with args: {"date":"02/01/2024","name":"AGUILAR MARIA FERNANDA","cuit":"27345649137","product":"Servicios profesionales de biotecnología","subtotal":"1404381,00","iva":"0,00","other_taxes":"0,00","total":"1404381,00"}
Function call: Invoice with args: {"date":"04/03/2024","name":"TESTA MICAELA ANALIA","cuit":"27387838436","product":"Consultoría en Ingeniería","subtotal":"431500,00","iva":"0,00","other_taxes":"0,00","total":"431500,00"}
Function call: Invoice with args: {"date":"01/02/2024","na