In [None]:
# Cell 1: Imports and Setup (UPDATED)

import os
import json
from IPython.display import display, Markdown

# LangChain components for LLM and RAG
# from langchain_community.llms import Ollama as LCOllama <-- OLD
# from langchain_ollama import Ollama as LCOllama <-- Incorrect Name
from langchain_ollama import OllamaLLM as LCOllama # <-- NEW: Corrected Class Name import
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangChainDocument
import uuid  
import pyarrow as pa 
# phi-agent components
from phi.model.ollama import Ollama as PhiOllama
from phi.vectordb.lancedb import LanceDb, SearchType
from phi.embedder.ollama import OllamaEmbedder as PhiOllamaEmbedder
from phi.document import Document as PhiDocument
import tkinter as tk
from tkinter import messagebox
from tkinter import ttk, scrolledtext
from tkinter import font as tkfont
# PDF processing
import PyPDF2

# For creating a dummy PDF (optional)
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas as r_canvas

print("All libraries imported successfully!")

# Cell 2: Ollama Model Initialization (No Changes)

# Ollama model tag used throughout this cell: it is passed to the phi chat
# model and the phi embedder below, and to the LangChain LLM that
# query_invoice_data builds for answering.
model_id = "gemma:2b"
# Initialize Ollama model for phi-agent
# NOTE(review): phi_ollama_model is not referenced anywhere else in this
# cell -- confirm whether it is still needed.
phi_ollama_model = PhiOllama(id=model_id)
# Initialize Ollama embedder for phi-agent's vector DB
# (setup_vector_db embeds the chunks with it; query_invoice_data embeds the
# question with it, so both sides share one embedding space.)
phi_ollama_embedder = PhiOllamaEmbedder(model=model_id)

print(f"Ollama model '{model_id}' initialized for LLM and Embedder.")


# Cell 3: PDF Text Extraction Function (No Changes)

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    A page for which PyPDF2 yields no text contributes an empty string.
    Any failure (missing file, unreadable PDF, ...) is reported on stdout and
    an empty string is returned instead of raising.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pages = PyPDF2.PdfReader(pdf_file).pages
            # Extraction happens while the file is still open.
            return "".join(page.extract_text() or "" for page in pages)
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

print("PDF extraction function defined.")


def process_invoice_data(pdf_path: str) -> list[PhiDocument]:
    """Read the PDF at *pdf_path* and split its text into PhiDocument chunks.

    The text is split into pieces of at most 1000 characters with a
    200-character overlap. Each piece becomes a PhiDocument named
    ``chunk_<index>`` whose ``meta`` records the source path.

    Returns an empty list when no text could be extracted.
    """
    print(f"Processing PDF: {pdf_path}")
    raw_text = extract_text_from_pdf(pdf_path)
    if not raw_text:
        print("No text extracted from PDF.")
        return []

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    documents = []
    for index, piece in enumerate(splitter.split_text(raw_text)):
        documents.append(
            PhiDocument(
                name=f"chunk_{index}",
                content=piece,
                meta={"source": pdf_path},
            )
        )

    print(f"Split PDF into {len(documents)} chunks.")
    return documents
print("Invoice data processing function defined.")


# Cell 5: Setup Vector Database (LanceDB) (No Changes)

def setup_vector_db(chunks: list[PhiDocument]) -> LanceDb:
    """Create a fresh LanceDB table holding one embedded row per chunk.

    The database directory ``tmp/lancedb_invoices`` is wiped and recreated on
    every call so a schema left over from an earlier run cannot conflict.
    The wipe happens *before* the connection is opened; the previous version
    connected first and then deleted the directory underneath the open
    connection.

    Args:
        chunks: Documents whose ``content`` is embedded and stored. ``meta``
            (if present) is stored as a JSON string.

    Returns:
        The object returned by ``db.create_table``. NOTE: the annotation says
        ``LanceDb`` (the phi wrapper), but callers in this file use the
        returned object's ``.search(...)`` API directly.
    """
    import shutil

    import lancedb

    db_uri = "tmp/lancedb_invoices"

    # Always start from an empty directory to avoid schema conflicts.
    if os.path.exists(db_uri):
        shutil.rmtree(db_uri)
    os.makedirs(db_uri, exist_ok=True)

    db = lancedb.connect(db_uri)

    # Probe the embedder once to learn the vector width for the schema.
    try:
        probe = phi_ollama_embedder.get_embedding("test")
        if not probe:
            # An empty vector would yield a zero-width column; treat it as a
            # failed probe and use the fallback below.
            raise ValueError("embedder returned an empty vector")
        dim = len(probe)
        print(f"Using embedding dimension: {dim}")
    except Exception as e:
        print(f"Error getting embedding dimension: {e}")
        dim = 2048  # Default for gemma:2b
        print(f"Using default dimension: {dim}")

    schema = pa.schema([
        pa.field("id", pa.string()),
        pa.field("content", pa.string()),
        pa.field("vector", pa.list_(pa.float32(), dim)),
        pa.field("metadata", pa.string()),
    ])

    # The directory was just emptied, so "overwrite" is only a safety net.
    table = db.create_table("invoice_data", schema=schema, mode="overwrite")
    print("Created new table with proper schema")

    if chunks:
        rows = []
        for chunk in chunks:
            try:
                embedding = phi_ollama_embedder.get_embedding(chunk.content)
            except Exception as e:
                print(f"Error processing chunk: {e}")
                continue
            # NOTE(review): assumes the embedder can hand back a vector of the
            # wrong length (e.g. empty) on failure -- confirm against the
            # embedder implementation. One such row would make the whole
            # fixed-width insert fail, so it is skipped instead.
            if len(embedding) != dim:
                print(
                    "Error processing chunk: "
                    f"embedding has {len(embedding)} values, expected {dim}"
                )
                continue
            rows.append({
                "id": str(uuid.uuid4()),
                "content": chunk.content,
                "vector": embedding,
                "metadata": json.dumps(getattr(chunk, "meta", {})),
            })

        if rows:
            table.add(rows)
            print(f"Successfully inserted {len(rows)} chunks")
        else:
            print("No valid chunks to insert")

    return table


# Cell 6: Define RAG Prompt (modified: natural-language answers)

# Chat prompt for the RAG chain: the system message carries the instructions
# followed by the retrieved invoice text ({context}); the human message
# carries the user's question ({query}).
rag_prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert AI assistant specialized in extracting precise details from invoice documents. "
            "Your goal is to accurately answer questions based *only* on the provided invoice context. "
            "If the information is not present in the context, state that you cannot find it."
            "\n---CONTEXT---\n{context}",
        ),
        ("human", "Query: {query}"),
    ]
)

print("RAG prompt template defined (for natural language output).")

print("RAG prompt template defined.")

# Cell 7: Main Execution Flow (updated)
def main():
    """Index ``<cwd>/tmp/invoice.pdf`` and return the resulting vector table.

    The ``tmp`` directory is created if missing. Returns ``None`` (after
    printing the reason) when the invoice file does not exist or when no text
    could be extracted from it; otherwise returns whatever
    ``setup_vector_db`` returns for the extracted chunks.
    """
    invoice_dir = os.path.join(os.getcwd(), "tmp")
    os.makedirs(invoice_dir, exist_ok=True)
    pdf_path = os.path.join(invoice_dir, "invoice.pdf")

    if os.path.exists(pdf_path):
        print(f"Using invoice at: {pdf_path}")

        chunks = process_invoice_data(pdf_path)
        if chunks:
            return setup_vector_db(chunks)

        print("Exiting: No text found in PDF.")
        return None

    # No invoice to work with: tell the user where one is expected.
    print(f"Invoice PDF not found at: {pdf_path}")
    print("Please provide an invoice PDF file named 'invoice.pdf' in the tmp directory")
    return None
    
def query_invoice_data(vector_db: LanceDb, question: str, text_widget: scrolledtext.ScrolledText):
    """Answer *question* from the indexed invoice and append it to *text_widget*.

    The question is embedded, the three nearest chunks are fetched from
    *vector_db* and joined into the prompt context, and the LLM chain is
    invoked. If the (whitespace-stripped) reply is a JSON object it is shown
    as a bullet list of ``Key: value`` lines; otherwise it is shown verbatim.

    Args:
        vector_db: Table-like object exposing ``search(vec).limit(n).to_list()``
            whose rows carry a ``"content"`` field.
        question: Natural-language question about the invoice.
        text_widget: Output pane; it is made writable for the insert and is
            always left read-only and scrolled to the end afterwards.

    No exception escapes: any failure is written to the widget with the
    ``'error'`` tag.
    """
    try:
        # Retrieve the chunks most similar to the question.
        query_embedding = phi_ollama_embedder.get_embedding(question)
        results = vector_db.search(query_embedding).limit(3).to_list()
        context = "\n".join(result["content"] for result in results)

        llm = LCOllama(model=model_id)
        chain = rag_prompt_template | llm | StrOutputParser()

        # Strip first: a leading/trailing newline from the model would
        # otherwise defeat the "{...}" JSON detection below.
        response = chain.invoke({"context": context, "query": question}).strip()

        answer = f"Answer: {response}\n\n"
        if response.startswith("{") and response.endswith("}"):
            try:
                data = json.loads(response)
            except json.JSONDecodeError:
                # Looked like JSON but is not: keep the verbatim answer.
                pass
            else:
                formatted_response = "\n".join(
                    f"  • {k.replace('_', ' ').title()}: {v}"
                    for k, v in data.items()
                )
                answer = f"Answer:\n{formatted_response}\n\n"

        text_widget.configure(state='normal')
        text_widget.insert(tk.END, f"Question: {question}\n", 'question')
        text_widget.insert(tk.END, answer, 'answer')

    except Exception as e:
        text_widget.configure(state='normal')
        text_widget.insert(tk.END, f"Error processing question: {e}\n\n", 'error')

    finally:
        # Single place that re-locks the pane and keeps the newest text in
        # view, for both the success and the error path.
        text_widget.configure(state='disabled')
        text_widget.see(tk.END)

# Cell 8: Tkinter GUI entry point
if __name__ == "__main__":
    # Entry point: build the Tk window, index the invoice, then run a fixed
    # list of questions and show each answer in the output pane.
    root = tk.Tk()
    root.title("Invoice Query System")
    root.geometry("800x600")

    # Configure styles
    style = ttk.Style()
    style.configure('TFrame', background='#f0f0f0')
    style.configure('TLabel', background='#f0f0f0', font=('Arial', 10, 'bold'))

    # Create fonts
    title_font = tkfont.Font(family='Helvetica', size=12, weight='bold')
    question_font = tkfont.Font(family='Helvetica', size=10, weight='bold')
    answer_font = tkfont.Font(family='Helvetica', size=10)

    # Main container
    main_frame = ttk.Frame(root)
    main_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

    # Header
    header_frame = ttk.Frame(main_frame)
    header_frame.pack(fill=tk.X, pady=(0, 10))

    ttk.Label(header_frame, text="Invoice Query System", font=title_font).pack(side=tk.TOP)

    # Output area
    output_frame = ttk.Frame(main_frame)
    output_frame.pack(fill=tk.BOTH, expand=True)

    output_text = scrolledtext.ScrolledText(
        output_frame,
        wrap=tk.WORD,
        width=80,
        height=25,
        font=answer_font
    )
    output_text.pack(fill=tk.BOTH, expand=True)

    # Configure tags for text styling. 'title' is used for the status lines
    # below; it was previously used without ever being configured.
    output_text.tag_configure('title', font=title_font)
    output_text.tag_configure('question', foreground='blue', font=question_font)
    output_text.tag_configure('answer', foreground='green')
    output_text.tag_configure('error', foreground='red')

    def _append_output(text, tag):
        """Append *text* with *tag* and leave the pane read-only."""
        output_text.configure(state='normal')
        output_text.insert(tk.END, text, tag)
        output_text.configure(state='disabled')
        output_text.see(tk.END)

    def _run_next_query(remaining):
        """Answer the first question in *remaining*, then schedule the rest.

        Running one question per event-loop callback lets Tk repaint between
        LLM calls, so answers appear one by one instead of the window staying
        blank until every query has finished.
        """
        if not remaining:
            _append_output("\nAll queries completed.\n", 'title')
            return
        query_invoice_data(vector_db, remaining[0], output_text)
        root.after(10, _run_next_query, remaining[1:])

    # Get the vector db instance
    vector_db = main()

    # main() signals failure by returning None; compare against None rather
    # than relying on the truthiness of the returned object.
    if vector_db is not None:
        _append_output("Invoice Query System Ready!\n\n", 'title')

        # List of questions to ask
        questions = [
            "What is the total amount due for this invoice?",
            "Who is the customer and what is their address?",
            "What is the Invoice Number and Date?",
            "What are the payment terms?",
            "Who is the salesperson mentioned on the invoice?",
            "What is the description of the item purchased and its quantity?",
            "What is the unit price of the decorative clay pottery?",
            "What is the subtotal before taxes and shipping?",
            "How much is the sales tax and shipping & handling?",
            "Are there any special instructions or comments about the shipment?"
        ]

        # Start processing once the event loop is running.
        root.after(100, _run_next_query, questions)
    else:
        _append_output("Failed to initialize vector database\n", 'error')

    root.mainloop()


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or, if you are working in a code base that has not been fully upgraded to pydantic 2 yet, with the v1 compatibility namespace: `from pydantic.v1 import BaseModel`

  from langchain_ollama.chat_models import ChatOllama


All libraries imported successfully!
Ollama model 'gemma:2b' initialized for LLM and Embedder.
PDF extraction function defined.
Invoice data processing function defined.
RAG prompt template defined (for natural language output).
RAG prompt template defined.
Using invoice at: C:\Users\ALISH\Agent Practice\tmp\invoice.pdf
Processing PDF: C:\Users\ALISH\Agent Practice\tmp\invoice.pdf
Split PDF into 2 chunks.
Using embedding dimension: 2048
Created new table with proper schema
Successfully inserted 2 chunks


In [9]:
# Cell 1: Imports and Setup (UPDATED)

import os
import json
from IPython.display import display, Markdown

# LangChain components for LLM and RAG
# from langchain_community.llms import Ollama as LCOllama <-- OLD
# from langchain_ollama import Ollama as LCOllama <-- Incorrect Name
from langchain_ollama import OllamaLLM as LCOllama # <-- NEW: Corrected Class Name import
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LangChainDocument
import uuid  
import pyarrow as pa 
# phi-agent components
from phi.model.ollama import Ollama as PhiOllama
from phi.vectordb.lancedb import LanceDb, SearchType
from phi.embedder.ollama import OllamaEmbedder as PhiOllamaEmbedder
from phi.document import Document as PhiDocument

# PDF processing
import PyPDF2

# For creating a dummy PDF (optional)
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas as r_canvas

print("All libraries imported successfully!")

# Cell 2: Ollama Model Initialization (No Changes)

# Ollama model tag used throughout this cell: it is passed to the phi chat
# model and the phi embedder below, and to the LangChain LLM that
# query_invoice_data builds for answering.
model_id = "gemma:2b"
# Initialize Ollama model for phi-agent
# NOTE(review): phi_ollama_model is not referenced anywhere else in this
# cell -- confirm whether it is still needed.
phi_ollama_model = PhiOllama(id=model_id)
# Initialize Ollama embedder for phi-agent's vector DB
# (used by setup_vector_db for the chunks and by query_invoice_data for the
# question.)
phi_ollama_embedder = PhiOllamaEmbedder(model=model_id)

print(f"Ollama model '{model_id}' initialized for LLM and Embedder.")


# Cell 3: PDF Text Extraction Function (No Changes)

def extract_text_from_pdf(pdf_path: str) -> str:
    """Collect the text of all pages of the PDF at *pdf_path* into one string.

    Pages without extractable text add nothing. On any error the problem is
    printed and ``""`` is returned, so callers only need to check for an
    empty result.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as handle:
            for page in PyPDF2.PdfReader(handle).pages:
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return ""
    return "".join(page_texts)

print("PDF extraction function defined.")


def process_invoice_data(pdf_path: str) -> list[PhiDocument]:
    """Turn the PDF at *pdf_path* into a list of PhiDocument text chunks.

    Text is extracted with ``extract_text_from_pdf`` and split into chunks of
    up to 1000 characters overlapping by 200. Every chunk is wrapped in a
    PhiDocument named ``chunk_<n>`` with the source path in ``meta``.
    An empty list is returned if the PDF produced no text.
    """
    print(f"Processing PDF: {pdf_path}")

    pdf_text = extract_text_from_pdf(pdf_path)
    if pdf_text:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        pieces = splitter.split_text(pdf_text)

        phi_docs = list(
            PhiDocument(name=f"chunk_{n}", content=piece, meta={"source": pdf_path})
            for n, piece in enumerate(pieces)
        )

        print(f"Split PDF into {len(phi_docs)} chunks.")
        return phi_docs

    print("No text extracted from PDF.")
    return []
print("Invoice data processing function defined.")


# Cell 5: Setup Vector Database (LanceDB) (No Changes)

def setup_vector_db(chunks: list[PhiDocument]) -> LanceDb:
    """Create the ``invoice_data`` LanceDB table and fill it with *chunks*.

    The database directory ``tmp/lancedb_invoices`` is cleaned on a
    best-effort basis (a failure only prints a warning), then connect /
    create / insert is attempted up to three times with a one-second pause
    between attempts.

    Args:
        chunks: Documents whose ``content`` is embedded and stored. ``meta``
            (if present) is stored as a JSON string.

    Returns:
        The object returned by ``db.create_table``. NOTE: the annotation says
        ``LanceDb`` (the phi wrapper), but callers in this file use the
        returned object's ``.search(...)`` API directly.

    Raises:
        Exception: whatever the final attempt raised, once all retries are
            exhausted.
    """
    import shutil
    # `time` is needed for the retry delay; it was previously used without
    # being imported, so the first retry died with NameError.
    import time

    import lancedb

    db_uri = "tmp/lancedb_invoices"

    # Clean up any existing database (best effort).
    if os.path.exists(db_uri):
        try:
            shutil.rmtree(db_uri)
        except Exception as e:
            print(f"Warning: Could not clean database directory: {e}")

    os.makedirs(db_uri, exist_ok=True)

    max_retries = 3
    for attempt in range(max_retries):
        try:
            db = lancedb.connect(db_uri)

            # Probe the embedder once to learn the vector width.
            test_embedding = phi_ollama_embedder.get_embedding("test")
            dim = len(test_embedding)
            print(f"Using embedding dimension: {dim}")

            schema = pa.schema([
                pa.field("id", pa.string()),
                pa.field("content", pa.string()),
                pa.field("vector", pa.list_(pa.float32(), dim)),
                pa.field("metadata", pa.string()),
            ])

            # "overwrite" because the table may already exist: the cleanup
            # above is only best-effort, and an earlier attempt in this loop
            # may have created the table before failing later on.
            table = db.create_table("invoice_data", schema=schema, mode="overwrite")
            print("Created new table with proper schema")

            if chunks:
                data = []
                for chunk in chunks:
                    try:
                        embedding = phi_ollama_embedder.get_embedding(chunk.content)
                        data.append({
                            "id": str(uuid.uuid4()),
                            "content": chunk.content,
                            "vector": embedding,
                            "metadata": json.dumps(getattr(chunk, "meta", {})),
                        })
                    except Exception as e:
                        print(f"Error processing chunk: {e}")
                        continue

                if data:
                    # Insert in batches to prevent timeouts
                    batch_size = 5
                    for start in range(0, len(data), batch_size):
                        table.add(data[start:start + batch_size])
                    print(f"Successfully inserted {len(data)} chunks")
                else:
                    print("No valid chunks to insert")

            return table

        except Exception as e:
            if attempt == max_retries - 1:
                raise
            print(f"Attempt {attempt + 1} failed, retrying... Error: {e}")
            time.sleep(1)  # Add delay between retries
# Cell 6: Define RAG Prompt (No Changes)

# Chat prompt for the RAG chain. The system message holds the instructions
# (including the request for JSON-formatted answers) followed by the
# retrieved invoice text ({context}); the human message holds the question
# ({query}).
rag_prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert AI assistant specialized in extracting precise details from invoice documents. "
            "Your goal is to accurately answer questions based *only* on the provided invoice context. "
            "If the information is not present in the context, state that you cannot find it. "
            "For all extractions, return the response in a clear JSON format with appropriate keys for the extracted data."
            "\n---CONTEXT---\n{context}",
        ),
        ("human", "Query: {query}"),
    ]
)

print("RAG prompt template defined.")

# Cell 7: Main Execution Flow (updated)
def main():
    """Index ``<cwd>/tmp/invoice.pdf``, creating a demo invoice if none exists.

    The ``tmp`` directory is created if needed. When ``invoice.pdf`` is
    missing, a small one-page dummy invoice is written with reportlab. The PDF
    is then chunked and stored via ``setup_vector_db``.

    Returns the value from ``setup_vector_db``, or ``None`` when no text
    could be extracted from the PDF.
    """
    tmp_dir = os.path.join(os.getcwd(), "tmp")
    os.makedirs(tmp_dir, exist_ok=True)
    pdf_path = os.path.join(tmp_dir, "invoice.pdf")

    if os.path.exists(pdf_path):
        print(f"Using existing invoice.pdf at: {pdf_path}")
    else:
        print("Creating a dummy 'invoice.pdf' for demonstration.")
        dummy_lines = (
            "Invoice No: INV-2023-001",
            "Invoice Date: October 26, 2023",
            "Customer: ABC Corp",
            "Total Amount Due: $1234.56 USD",
            "Payment Terms: Net 30",
            "Line Item 1: Product X - Quantity: 2 - Price: $300.00 - Total: $600.00",
            "Line Item 2: Service Y - Quantity: 1 - Price: $634.56 - Total: $634.56",
        )
        pdf = r_canvas.Canvas(pdf_path, pagesize=letter)
        # Lines start at y=750 and step down 20 points each.
        for row, line in enumerate(dummy_lines):
            pdf.drawString(100, 750 - 20 * row, line)
        pdf.save()

    chunks = process_invoice_data(pdf_path)
    if chunks:
        return setup_vector_db(chunks)

    print("Exiting: No text found in PDF.")
    return None

def query_invoice_data(vector_db: LanceDb, question: str):
    """Answer *question* from the indexed invoice and render it as Markdown.

    The question is embedded, the three closest rows are fetched from
    *vector_db*, their ``"content"`` fields are joined into the prompt
    context, and the prompt | LLM | string-parser chain produces the answer.
    Question and answer are shown with IPython ``display``.

    Any exception is caught and reported on stdout; nothing is returned.
    """
    try:
        question_vector = phi_ollama_embedder.get_embedding(question)

        # Nearest-neighbour lookup, top 3 rows.
        hits = vector_db.search(question_vector).limit(3).to_list()
        context = "\n".join(hit["content"] for hit in hits)

        llm = LCOllama(model=model_id)
        rag_chain = rag_prompt_template | llm | StrOutputParser()
        payload = {"context": context, "query": question}
        response = rag_chain.invoke(payload)

        display(Markdown(f"**Question:** {question}"))
        display(Markdown(f"**Answer:** {response}"))
    except Exception as e:
        print(f"Error querying data: {e}")

if __name__ == "__main__":
    # Entry point: index the invoice, then run a fixed list of questions.
    vector_db = main()

    if not vector_db:
        print("Failed to initialize vector database")
    else:
        print("\nInvoice Query System Ready!")
        # Questions are asked in this order, one chain invocation each.
        questions = (
            "What is the total amount due for this invoice?",
            "Who is the customer and what is their address?",
            "What is the Invoice Number and Date?",
            "What are the payment terms?",
            "Who is the salesperson mentioned on the invoice?",
            "What is the description of the item purchased and its quantity?",
            "What is the unit price of the decorative clay pottery?",
            "What is the subtotal before taxes and shipping?",
            "How much is the sales tax and shipping & handling?",
            "Are there any special instructions or comments about the shipment?",
            "What is the phone number of Pottery & Co.?",
        )
        for question in questions:
            query_invoice_data(vector_db, question)

All libraries imported successfully!
Ollama model 'gemma:2b' initialized for LLM and Embedder.
PDF extraction function defined.
Invoice data processing function defined.
RAG prompt template defined.
Using existing invoice.pdf at: C:\Users\ALISH\Agent Practice\tmp\invoice.pdf
Processing PDF: C:\Users\ALISH\Agent Practice\tmp\invoice.pdf
Split PDF into 2 chunks.
Using embedding dimension: 2048
Created new table with proper schema
Successfully inserted 2 chunks

Invoice Query System Ready!


**Question:** What is the total amount due for this invoice?

**Answer:** {
  "total_due": 1389.99
}

**Question:** Who is the customer and what is their address?

**Answer:** {
  "customer_name": "Mollie Grau",
  "customer_address": "210 Stars Avenue, Berkeley, CA 78910"
}

**Question:** What is the Invoice Number and Date?

**Answer:** {
  "invoice_number": "100",
  "invoice_date": "1/1/23"
}

**Question:** What are the payment terms?

**Answer:** {
  "payment_terms": "Due on receipt"
}

**Question:** Who is the salesperson mentioned on the invoice?

**Answer:** The context does not provide any information about the salesperson, so I cannot extract the requested data from the context.

**Question:** What is the description of the item purchased and its quantity?

**Answer:** {
  "description": "Decorative clay pottery (LG)",
  "quantity": 100
}

**Question:** What is the unit price of the decorative clay pottery?

**Answer:** {
  "unit_price": 13.00
}

**Question:** What is the subtotal before taxes and shipping?

**Answer:** {
  "subtotal": 1300.00,
  "sales_tax": 65.00,
  "shipping_handling": 24.99,
  "total_due": 1389.99
}

**Question:** How much is the sales tax and shipping & handling?

**Answer:** {
  "sales_tax": 65,
  "shipping_handling": 24.99
}

**Question:** Are there any special instructions or comments about the shipment?

**Answer:** {
  "special_instructions": "Shipment contains fragile goods",
  "comments": "Shipment contains fragile goods"
}

**Question:** What is the phone number of Pottery & Co.?

**Answer:** {
  "phone_number": "(123) 456 -7890"
}