In [None]:
!pip install openai
!pip install python-dotenv
!pip install llama-index
!pip install astrapy
!pip install pymupdf
!pip install tqdm

In [None]:
# Download the PatronusAIFinanceBenchDataset llama-dataset into ./finance_bench.
# A later cell reads PDFs from ./finance_bench/source_files, so this must run first.
!llamaindex-cli download-llamadataset PatronusAIFinanceBenchDataset --download-dir ./finance_bench

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from rag_system import RAGSystem
import pymupdf
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials used by the
# embedding model and RAGSystem -- confirm against rag_system.
load_dotenv()


In [None]:
def read_pdf(file_path):
    """Extract the text of every page of a PDF as one string.

    Args:
        file_path: Path of the PDF file to open with pymupdf.

    Returns:
        The text of all pages in document order, each page preceded by a
        newline character. An empty document yields an empty string.
    """
    # The context manager closes the document even if text extraction fails.
    with pymupdf.open(file_path) as doc:
        # Each page is prefixed with a real newline ("\n"); the previous
        # "/n" literal injected the two characters '/' and 'n' into the text.
        return "".join("\n" + page.get_text() for page in doc)

def read_directory(directory_path):
    """Recursively collect the text of every PDF below a directory.

    Args:
        directory_path: Root directory to scan; sub-directories are descended
            into as they are encountered.

    Returns:
        A list of ``(document_text, metadata)`` tuples, one per file whose
        name ends in ``.pdf`` (case-insensitive). ``metadata`` is a dict with
        a single ``"file_name"`` key holding the file's base name.
    """
    def iter_pdfs(folder):
        # Yield results lazily in the order os.scandir reports entries,
        # descending into a sub-directory at the point it is encountered.
        for item in os.scandir(folder):
            if item.is_file() and item.name.lower().endswith('.pdf'):
                yield read_pdf(item.path), {"file_name": item.name}
            elif item.is_dir():
                yield from iter_pdfs(item.path)

    return list(iter_pdfs(directory_path))

In [None]:
# Embedding model handed to the RAG system.
embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

# Load every PDF from the downloaded dataset as (text, metadata) pairs.
loaded_pdfs = read_directory("./finance_bench/source_files")
if not loaded_pdfs:
    # Without this check, zip(*[]) fails with an opaque
    # "not enough values to unpack" ValueError.
    raise FileNotFoundError(
        "No PDF files found under ./finance_bench/source_files; "
        "run the dataset download cell first."
    )

# Split the pairs into two parallel tuples: texts and their metadata dicts.
documents, metadatas = zip(*loaded_pdfs)
print(f"Number of loaded documents: {len(documents)}")

# Create the RAG system for the "patronus_ai_finance_bench" collection name
# and insert the loaded documents together with their metadata.
rag_system = RAGSystem(embed_model, "patronus_ai_finance_bench")
rag_system.insert_documents(documents, metadatas)

In [None]:
# Ask one sample question against the indexed documents and show the answer.
response = rag_system.query(
    "What is the FY2018 capital expenditure amount (in USD millions) for 3M? "
    "Give a response to the question by relying on the details shown in the cash flow statement."
)
print(response)