In [3]:

import os
import json
from IPython.display import display, Markdown
import time

from langchain_ollama import OllamaLLM as LCOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangChainDocument
import uuid
import pyarrow as pa
from phi.model.ollama import Ollama as PhiOllama
from phi.vectordb.lancedb import LanceDb, SearchType
from phi.embedder.ollama import OllamaEmbedder as PhiOllamaEmbedder
from phi.document import Document as PhiDocument

import PyPDF2

from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas as r_canvas

# Confirmation banner emitted once the imports above have completed.
print("All libraries imported successfully!")

# Ollama model tag. It is used here for the embedder and again in
# query_invoice_data as the chat model that writes the answers.
model_id = "gemma:2b"
# Module-level embedder shared by setup_vector_db (indexing) and
# query_invoice_data (query embedding).
phi_ollama_embedder = PhiOllamaEmbedder(model=model_id)

print(f"Ollama model '{model_id}' initialized for LLM and Embedder.")

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read a PDF with PyPDF2 and return the text of all pages concatenated.

    Pages for which PyPDF2 yields no text contribute an empty string.
    Any failure (missing file, unreadable PDF, ...) is reported on stdout
    and results in an empty string being returned.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            page_texts = [(pg.extract_text() or "") for pg in pdf_reader.pages]
    except Exception as err:
        print(f"Error extracting text from PDF: {err}")
        return ""
    return "".join(page_texts)


print("PDF extraction function defined.")

def process_invoice_data(pdf_path: str) -> list[PhiDocument]:
    """
    Extract the text of a PDF and split it into PhiDocument chunks.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One PhiDocument per text chunk (named ``chunk_<index>``, with the
        source path recorded in its metadata), or an empty list when no
        usable text could be extracted.
    """
    print(f"Processing PDF: {pdf_path}")
    text = extract_text_from_pdf(pdf_path)
    # Whitespace-only output (e.g. a scanned, image-only PDF) is as useless
    # as no output at all, so treat both the same way.
    if not text or not text.strip():
        print("No text extracted from PDF.")
        return []

    # Overlapping chunks keep values that straddle a chunk boundary
    # retrievable from at least one chunk.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )

    text_chunks = text_splitter.split_text(text)
    phi_chunks = [
        PhiDocument(
            name=f"chunk_{i}",
            content=chunk,
            # phi's Document model calls this field ``meta_data``; an unknown
            # ``meta`` keyword is not stored on the document, which loses
            # the source path.
            meta_data={"source": pdf_path},
        )
        for i, chunk in enumerate(text_chunks)
    ]

    print(f"Split PDF into {len(phi_chunks)} chunks.")
    return phi_chunks


print("Invoice data processing function defined.")


# Cell 5: Set up the vector database (LanceDB)

def _embed_chunks(chunks, dim):
    """Embed every chunk and build the row dicts for the ``invoice_data`` table.

    Chunks that fail to embed, or whose vector length differs from ``dim``
    (the table's vector column is fixed-size), are reported and skipped.
    """
    rows = []
    for chunk in chunks:
        try:
            embedding = phi_ollama_embedder.get_embedding(chunk.content)
            if len(embedding) != dim:
                raise ValueError(
                    f"embedding has length {len(embedding)}, expected {dim}"
                )
            # phi Documents expose ``meta_data``; ``meta`` is kept as a
            # fallback for objects that use that attribute name instead.
            meta = (
                getattr(chunk, "meta_data", None)
                or getattr(chunk, "meta", None)
                or {}
            )
            rows.append({
                "id": str(uuid.uuid4()),
                "content": chunk.content,
                "vector": embedding,
                "metadata": json.dumps(meta),
            })
        except Exception as e:
            # Best effort: one bad chunk must not abort the whole index.
            print(f"Error processing chunk: {e}")
    return rows


def setup_vector_db(chunks: list[PhiDocument], db_path: str) -> LanceDb: # Added db_path parameter
    """
    Rebuild the ``invoice_data`` LanceDB table at ``db_path`` from ``chunks``.

    Any existing directory at ``db_path`` is removed first, so only pass a
    directory that is dedicated to this database.

    Args:
        chunks: Documents to embed and store; each needs a ``content``
            attribute. May be empty, in which case an empty table is created.
        db_path: Directory that holds the LanceDB database.

    Returns:
        The table object returned by ``create_table`` (this is the raw
        LanceDB table, which is what ``query_invoice_data`` searches).

    Raises:
        ValueError: If the embedder returns an empty vector, so the vector
            dimension of the schema cannot be determined.
        Exception: Whatever the last database attempt raised once all
            retries are exhausted.
    """
    import shutil

    import lancedb

    db_uri = db_path # Use the passed db_path directly

    if os.path.exists(db_uri):
        try:
            shutil.rmtree(db_uri)
            print(f"Cleaned existing database directory: {db_uri}")
        except OSError as e:
            print(f"Warning: Could not clean database directory {db_uri}: {e}")

    os.makedirs(db_uri, exist_ok=True)

    # Embeddings are computed once, outside the retry loop, so a transient
    # database error does not trigger re-embedding of every chunk.
    test_embedding = phi_ollama_embedder.get_embedding("test")
    dim = len(test_embedding) if test_embedding else 0
    if dim == 0:
        raise ValueError(
            "Embedder returned an empty vector; cannot determine the "
            "embedding dimension."
        )
    print(f"Using embedding dimension: {dim}")

    schema = pa.schema([
        pa.field("id", pa.string()),
        pa.field("content", pa.string()),
        pa.field("vector", pa.list_(pa.float32(), dim)),
        pa.field("metadata", pa.string()),
    ])

    data = _embed_chunks(chunks, dim) if chunks else []

    # Connect to LanceDB and write the table, with retry logic
    max_retries = 3
    batch_size = 5  # insert in small batches to prevent timeouts
    for attempt in range(max_retries):
        try:
            db = lancedb.connect(db_uri)

            # "overwrite" lets a retry (or a run where the cleanup above
            # failed) replace a table left behind by an earlier attempt
            # instead of failing because the table already exists.
            table = db.create_table("invoice_data", schema=schema, mode="overwrite")
            print("Created new table with proper schema")

            if data:
                for start in range(0, len(data), batch_size):
                    table.add(data[start:start + batch_size])
                print(f"Successfully inserted {len(data)} chunks into {db_uri}")
            elif chunks:
                print("No valid chunks to insert")

            return table

        except Exception as e:
            if attempt == max_retries - 1:
                raise
            print(f"Attempt {attempt + 1} failed, retrying... Error: {e}")
            time.sleep(1)


# Cell 6: Define the RAG prompt (answers are requested as plain text, not JSON)

# System message: role, grounding rules and output format, followed by the
# retrieved invoice text, which is substituted for {context}.
_RAG_SYSTEM_MESSAGE = (
    "You are an expert AI assistant specialized in extracting precise details from invoice documents. "
    "Your goal is to accurately answer questions based *only* on the provided invoice context. "
    "If the information is not present in the context, state that you cannot find it. "
    "Provide your answer directly as plain text, concisely and clearly, without any JSON formatting."
    "\n---CONTEXT---\n"
    "{context}"
)

# Two-message chat prompt; expects the variables `context` and `query`.
rag_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", _RAG_SYSTEM_MESSAGE),
        ("human", "Query: {query}"),
    ]
)

print("RAG prompt template defined (modified for plain text output).")

# Cell 7: Main execution logic

def main():
    """
    Index ``invoice.pdf`` from the current working directory into LanceDB.

    The PDF is expected next to the notebook/script (the current working
    directory); the database is written to ``lancedb_invoices`` in the same
    directory.

    Returns:
        The table returned by ``setup_vector_db``, or ``None`` when the PDF
        is missing or no text could be extracted from it.
    """
    project_root_dir = os.getcwd()

    # Full path to the invoice PDF
    pdf_path = os.path.join(project_root_dir, "invoice.pdf")

    # Directory that holds the LanceDB database
    lancedb_dir = os.path.join(project_root_dir, "lancedb_invoices")

    # Stop right away when the PDF is missing (or is not a regular file):
    # continuing would only produce a misleading extraction error.
    if not os.path.isfile(pdf_path):
        print(f"File not found: {pdf_path}")
        return None

    print(f"Using existing invoice.pdf at: {pdf_path}")

    # Process PDF and set up the vector database
    chunks = process_invoice_data(pdf_path)
    if not chunks:
        print("Exiting: No text found in PDF.")
        return None

    vector_db_instance = setup_vector_db(chunks, db_path=lancedb_dir) # Pass the explicit LanceDB path
    return vector_db_instance

def query_invoice_data(vector_db: LanceDb, question: str):
    """
    Answer one question about the indexed invoice and display the result.

    The question is embedded, the five nearest rows are fetched from
    ``vector_db``, their ``content`` values are joined into the prompt
    context, and the model's reply is rendered as Markdown together with
    the question. Any error is printed instead of being raised; nothing is
    returned.
    """
    try:
        # Retrieve the rows closest to the question
        question_vector = phi_ollama_embedder.get_embedding(question)
        hits = vector_db.search(question_vector).limit(5).to_list()
        context = "\n".join(hit["content"] for hit in hits)

        # prompt -> model -> plain string
        answer_chain = rag_prompt_template | LCOllama(model=model_id) | StrOutputParser()
        answer = answer_chain.invoke({"context": context, "query": question})

        display(Markdown(f"**Question:** {question}"))
        display(Markdown(f"**Answer:** {answer}"))
    except Exception as err:
        print(f"Error querying data: {err}")

if __name__ == "__main__":
    # Build the index first; main() returns None when that is not possible.
    vector_db = main()

    if not vector_db:
        print("Failed to initialize vector database")
    else:
        print("\nInvoice Query System Ready!")
        # Sample questions, asked in this order against the indexed invoice.
        demo_questions = (
            "What is the total amount due for this invoice?",
            "Who is the customer and what is their address?",
            "What is the Invoice Number and Date?",
            "What are the payment terms?",
            "Who is the salesperson mentioned on the invoice?",
            "What is the description of the item purchased and its quantity?",
            "What is the unit price of the decorative clay pottery?",
            "What is the subtotal before taxes and shipping?",
            "How much is the sales tax and shipping & handling?",
            "Are there any special instructions or comments about the shipment?",
            "What is the phone number of Pottery & Co.?",
        )
        for demo_question in demo_questions:
            query_invoice_data(vector_db, demo_question)

All libraries imported successfully!
Ollama model 'gemma:2b' initialized for LLM and Embedder.
PDF extraction function defined.
Invoice data processing function defined.
RAG prompt template defined (modified for plain text output).
Using existing invoice.pdf at: C:\Users\ALISH\Agent Practice\invoice.pdf
Processing PDF: C:\Users\ALISH\Agent Practice\invoice.pdf
Split PDF into 2 chunks.
Cleaned existing database directory: C:\Users\ALISH\Agent Practice\lancedb_invoices
Using embedding dimension: 2048
Created new table with proper schema
Successfully inserted 2 chunks into C:\Users\ALISH\Agent Practice\lancedb_invoices

Invoice Query System Ready!


**Question:** What is the total amount due for this invoice?

**Answer:** The total amount due for this invoice is $1389.99.

**Question:** Who is the customer and what is their address?

**Answer:** The customer's name and address are not explicitly mentioned in the context, so I cannot answer this question from the provided context.

**Question:** What is the Invoice Number and Date?

**Answer:** Invoice Number: #100
Date: 1/1/23

**Question:** What are the payment terms?

**Answer:** The payment terms are not explicitly mentioned in the context, so I cannot answer this question from the provided context.

**Question:** Who is the salesperson mentioned on the invoice?

**Answer:** The context does not provide the salesperson's name, so I cannot answer this question from the provided context.

**Question:** What is the description of the item purchased and its quantity?

**Answer:** The item purchased is not explicitly mentioned in the context, so I cannot answer this question from the provided context.

**Question:** What is the unit price of the decorative clay pottery?

**Answer:** The unit price of the decorative clay pottery is not explicitly stated in the context, so I cannot answer this question from the provided context.

**Question:** What is the subtotal before taxes and shipping?

**Answer:** The subtotal before taxes and shipping is $1300.00.

**Question:** How much is the sales tax and shipping & handling?

**Answer:** Sales tax: 65.00
Shipping & handling: 24.99

Error querying data: lance error: LanceError(IO): Execution error: Not found: C:/Users/ALISH/Agent Practice/lancedb_invoices/invoice_data.lance/data/ac0a73ce-2e7d-4262-adcd-cec363a839a9.lance, C:\Users\runneradmin\.cargo\registry\src\index.crates.io-1949cf8c6b5b557f\lance-io-0.31.1\src\local.rs:122:31, C:\Users\runneradmin\.cargo\registry\src\index.crates.io-1949cf8c6b5b557f\lance-0.31.1\src\dataset\scanner.rs:2955:83
Error querying data: lance error: LanceError(IO): Execution error: Not found: C:/Users/ALISH/Agent Practice/lancedb_invoices/invoice_data.lance/data/ac0a73ce-2e7d-4262-adcd-cec363a839a9.lance, C:\Users\runneradmin\.cargo\registry\src\index.crates.io-1949cf8c6b5b557f\lance-io-0.31.1\src\local.rs:122:31, C:\Users\runneradmin\.cargo\registry\src\index.crates.io-1949cf8c6b5b557f\lance-0.31.1\src\dataset\scanner.rs:2955:83
