In [2]:
!pip install google-generativeai langchain-google-genai phidata

Collecting google-generativeai
  Using cached google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting langchain-google-genai
  Using cached langchain_google_genai-2.1.8-py3-none-any.whl.metadata (7.0 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.25.1-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.177.0-py3-none-any.whl.metadata (7.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting protobuf (from google-generativeai)
  Downloading protobuf-5.29.5-cp310-abi3-win_amd64.whl.metadata (592 bytes)
INFO: pip is looking at multiple versions of langchain

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.30.0 requires protobuf<5,>=3.20, but you have protobuf 5.29.5 which is incompatible.


In [1]:
# Cell 1: Imports and Setup (Premium Optimized)
import getpass
import json
import os
import time
import uuid

import google.generativeai as genai
import pyarrow as pa
import PyPDF2
from IPython.display import display, Markdown
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from phi.document import Document as PhiDocument
from phi.vectordb.lancedb import LanceDb
from tenacity import retry, stop_after_attempt, wait_exponential

print("All libraries imported successfully!")

# Credentials: an API key must never be written into the notebook source.
# Use the GOOGLE_API_KEY environment variable when it is already set; otherwise
# prompt for the key without echoing it, so it is not saved in the file or its
# outputs.
# SECURITY: the key that was previously hardcoded on this line has been exposed
# to anyone who can read the notebook and should be revoked and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Model identifiers used by the chat model and the embedder.
gemini_model = "models/gemini-2.5-pro"
embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Chat model shared by every query in this notebook.
llm = ChatGoogleGenerativeAI(
    model=gemini_model,
    temperature=0,
    max_retries=5,
    request_timeout=120,
    convert_system_message_to_human=False,
    streaming=True
)

print("Initialized Gemini 2.5 Pro with 1M token context window")

# PDF processing with larger chunk size for premium
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Pages are read in order through ``PyPDF2.PdfReader``. A page whose
    ``extract_text()`` result is falsy contributes an empty string, and no
    separator is inserted between pages.

    Any exception raised while opening or reading the file is reported with
    ``print`` and an empty string is returned instead of propagating the error.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            pdf = PyPDF2.PdfReader(handle)
            return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as exc:
        print(f"PDF extraction error: {exc}")
        return ""

def process_invoice_data(pdf_path: str) -> list[PhiDocument]:
    """Split the text of the PDF at ``pdf_path`` into ``PhiDocument`` chunks.

    The text comes from ``extract_text_from_pdf``; when that yields nothing an
    empty list is returned. Otherwise the text is split with a
    ``RecursiveCharacterTextSplitter`` (5000-character chunks, 500-character
    overlap) and each chunk becomes a document named ``chunk_<index>``.

    Each document's ``meta_data`` holds the source path and a ``page`` value.
    NOTE: ``page`` is simply ``index // 3`` -- a rough heuristic, not a page
    number read from the PDF.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    if not raw_text:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=5000,
        chunk_overlap=500,
        length_function=len,
    )

    documents = []
    for index, piece in enumerate(text_splitter.split_text(raw_text)):
        details = {
            "source": pdf_path,
            "page": index // 3,  # heuristic only, see docstring
        }
        documents.append(
            PhiDocument(name=f"chunk_{index}", content=piece, meta_data=details)
        )
    return documents

# Vector DB with premium optimizations
def setup_vector_db(chunks: list[PhiDocument], db_path: str) -> LanceDb:
    """Rebuild the ``invoice_data`` table under ``db_path`` from ``chunks``.

    Steps:
      1. Delete ``db_path`` if it already exists, then recreate it.
      2. Connect with ``lancedb`` and embed the string ``"test"`` once to learn
         the embedding dimension used for the ``vector`` column.
      3. Create the table with columns ``id``, ``content``, ``vector``,
         ``metadata`` (JSON string of the chunk's ``meta_data``) and
         ``page_ref``.
      4. Embed and insert the chunks 50 at a time. A batch that raises is
         reported with ``print`` and skipped; the remaining batches still run.

    Returns the object produced by ``create_table``. NOTE: the signature is
    annotated ``LanceDb``, but the value handed back is that table object.
    """
    import shutil

    import lancedb

    # Always start from an empty directory so stale rows never survive a rerun.
    if os.path.exists(db_path):
        shutil.rmtree(db_path)
    os.makedirs(db_path, exist_ok=True)

    connection = lancedb.connect(db_path)

    # One probe embedding tells us the fixed vector width for the schema.
    vector_dim = len(embedder.embed_query("test"))

    table_schema = pa.schema([
        pa.field("id", pa.string()),
        pa.field("content", pa.string()),
        pa.field("vector", pa.list_(pa.float32(), vector_dim)),
        pa.field("metadata", pa.string()),
        pa.field("page_ref", pa.int32()),
    ])
    table = connection.create_table("invoice_data", schema=table_schema)

    if not chunks:
        return table

    batch_size = 50
    for batch_no, start in enumerate(range(0, len(chunks), batch_size)):
        batch = chunks[start:start + batch_size]
        try:
            vectors = embedder.embed_documents([doc.content for doc in batch])
            rows = []
            for doc, vector in zip(batch, vectors):
                rows.append({
                    "id": str(uuid.uuid4()),
                    "content": doc.content,
                    "vector": vector,
                    "metadata": json.dumps(doc.meta_data),
                    "page_ref": doc.meta_data.get("page", 0),
                })
            table.add(rows)
        except Exception as exc:
            # Best effort: report the failed batch and move on to the next one.
            print(f"Batch {batch_no} failed: {exc!s}")

    return table

# Premium-optimized RAG prompt
# Two-message prompt: the system message carries the instructions plus the
# retrieved {context}; the human message carries the {query} and the expected
# answer formats. Each message is assembled from its individual lines.
rag_prompt_template = ChatPromptTemplate.from_messages([
    ("system", "\n".join((
        "INVOICE ANALYTICS ENGINE (Gemini 2.5 Pro)",
        "Leverage your 1M token context to:",
        "1. Extract exact values from complex layouts",
        "2. Cross-reference across document sections",
        "3. Validate numerical consistency",
        "4. Structure output for direct database insertion",
        "",
        "CONTEXT:",
        "{context}",
    ))),
    ("human", "\n".join((
        "Query: {query}",
        "Respond with:",
        "- Direct extracted values",
        "- JSON if requesting multiple fields",
        "- Markdown tables for comparative analysis",
    ))),
])

# Main execution with premium features
def main(pdf_path=None, lancedb_dir=None):
    """Build the invoice vector table and return it, or ``None`` on failure.

    Args:
        pdf_path: Path of the PDF to index. Defaults to ``invoice.pdf`` in the
            current working directory.
        lancedb_dir: Directory for the LanceDB data. Defaults to
            ``lancedb_invoices_premium`` in the current working directory.

    Returns:
        Whatever ``setup_vector_db`` returns when the PDF exists and yields at
        least one chunk; ``None`` when the PDF is missing (a message is
        printed) or when no chunks could be produced from it.
    """
    if pdf_path is None:
        pdf_path = os.path.join(os.getcwd(), "invoice.pdf")
    if lancedb_dir is None:
        lancedb_dir = os.path.join(os.getcwd(), "lancedb_invoices_premium")

    if not os.path.exists(pdf_path):
        # For the default path this prints the same message as before.
        print(f"Error: {os.path.basename(pdf_path)} not found!")
        return None

    chunks = process_invoice_data(pdf_path)
    return setup_vector_db(chunks, lancedb_dir) if chunks else None

@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def query_invoice_data(vector_db: LanceDb, question: str):
    """Answer ``question`` from the indexed invoice and display the result.

    The question is embedded, the 8 nearest rows are fetched from
    ``vector_db``, and their ``page_ref``/``content`` fields are joined into a
    context string. That context and the question are sent through
    ``rag_prompt_template | llm | StrOutputParser()``, and the question and
    answer are rendered as Markdown. Nothing is returned.

    On any exception a one-line message is printed and the exception is
    re-raised, which lets the ``@retry`` decorator run the whole function
    again (up to 3 attempts, exponential wait).

    NOTE: ``vector_db`` is annotated ``LanceDb``; the code only requires an
    object with a ``search(...).limit(...).to_list()`` chain.
    """
    try:
        question_vector = embedder.embed_query(question)
        hits = vector_db.search(question_vector).limit(8).to_list()

        sections = [
            f"PAGE {hit['page_ref']}:\n{hit['content']}"
            for hit in hits
        ]
        context = "\nDOCUMENT SECTIONS:\n" + "\n---\n".join(sections)

        chain = rag_prompt_template | llm | StrOutputParser()
        response = chain.invoke({"context": context, "query": question})

        display(Markdown(f"**Question:** {question}"))
        display(Markdown(f"**Analysis:**\n{response}"))

    except Exception as exc:
        print(f"Query failed: {type(exc).__name__}: {exc}")
        raise

# Driver: build the index, then run a fixed list of demo questions against it.
if __name__ == "__main__":
    # main() returns None when the PDF is missing or produced no chunks.
    vector_db = main()
    # NOTE(review): this is a truthiness test, not `is not None`. If the
    # returned table type defines `__len__`, an empty table would also skip
    # the questions -- confirm that is the intended behavior.
    if vector_db:
        premium_questions = [
            "Extract all line items as JSON with fields: description, quantity, unit_price, total",
            "Analyze payment terms across all invoices and identify the most common pattern",
            "Compare invoice dates versus payment due dates and calculate average days early/late",
            "Generate a markdown table of all customers and their total amounts due",
            "Verify tax calculations for all line items and flag discrepancies"
        ]
        # Each call displays its own question/answer pair; an exception that
        # escapes query_invoice_data stops the loop.
        for q in premium_questions:
            query_invoice_data(vector_db, q)

All libraries imported successfully!
Initialized Gemini 2.5 Pro with 1M token context window


**Question:** Extract all line items as JSON with fields: description, quantity, unit_price, total

**Analysis:**
```json
[
  {
    "description": "Decorative clay pottery (LG)",
    "quantity": 100,
    "unit_price": 13.00,
    "total": 1300.00
  }
]
```

**Question:** Analyze payment terms across all invoices and identify the most common pattern

**Analysis:**
Due on receipt

**Question:** Compare invoice dates versus payment due dates and calculate average days early/late

**Analysis:**
| Invoice Date | Payment Due Date | Days to Pay (Early/Late) |
| :--- | :--- | :--- |
| 1/1/23 | Due on receipt | 0 |

**Question:** Generate a markdown table of all customers and their total amounts due

**Analysis:**
| Customer | Total Amount Due |
| :--- | :--- |
| Mollie Grau | 1389.99 |

**Question:** Verify tax calculations for all line items and flag discrepancies

**Analysis:**
| Description | Extracted Subtotal | Extracted Tax | Calculated Tax (5.00% Rate) | Discrepancy |
| :--- | :--- | :--- | :--- | :--- |
| Invoice Total | $1,300.00 | $65.00 | $65.00 | $0.00 |

**Conclusion:** The sales tax of $65.00 is correctly calculated at a 5.00% rate on the subtotal of $1,300.00. There are no discrepancies.