# In [2]:
# Cell 1: Imports and Setup (Professional Presentation)
import os
import json
from IPython.display import display, Markdown
import time
from tenacity import retry, stop_after_attempt, wait_exponential

# --- New Imports for Professional UI ---
from rich import print
from rich.panel import Panel
from rich.syntax import Syntax
from rich.table import Table
from rich.markdown import Markdown as RichMarkdown
# -----------------------------------------

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
import uuid
import pyarrow as pa
# LanceDB is a third-party package (not part of the standard library); install it with `pip install lancedb`.
from phi.vectordb.lancedb import LanceDb
from phi.document import Document as PhiDocument
import PyPDF2
import google.generativeai as genai

print("All libraries imported successfully!")

# Credentials: the Google API key is read from the GOOGLE_API_KEY environment
# variable and must never be written into this file. Any key that was
# previously committed here should be treated as leaked and revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in your shell (or set it in the "
        "notebook environment) before running this cell."
    )

# Model selection: one chat model for answering, one embedding model for retrieval.
gemini_model = "models/gemini-2.5-pro"
embedder = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


# Chat model configuration. temperature=0 asks for the least random sampling;
# the remaining keyword arguments are passed through to the client unchanged.
llm = ChatGoogleGenerativeAI(
    model=gemini_model,
    temperature=0,
    max_retries=5,
    request_timeout=120,
    convert_system_message_to_human=False,
    streaming=True
)

# Report the model that was actually configured instead of a hard-coded name.
print(f"Initialized {gemini_model} with 1M token context window")

# PDF text extraction and chunking
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the concatenated text of every page in a PDF.

    Each page is passed through ``PyPDF2``'s ``extract_text``; a page that
    yields nothing contributes an empty string. No layout handling is done
    here beyond whatever ``PyPDF2`` performs itself.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order with no separator, or ``""``
        when the file cannot be opened or parsed (the error is printed).
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Build the result with a single join instead of growing a
            # string with ``+=`` once per page.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberately best-effort: callers treat an empty string as
        # "nothing to process", so report the problem and return "".
        print(f"PDF extraction error: {e}")
        return ""

def process_invoice_data(pdf_path: str) -> list[PhiDocument]:
    """Split the text of a PDF into overlapping chunks wrapped as documents.

    The PDF text is obtained from ``extract_text_from_pdf`` and split into
    pieces of up to 5000 characters with a 500-character overlap (lengths
    measured with ``len``).

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        One ``PhiDocument`` per chunk, named ``chunk_<index>``, whose
        ``meta_data`` holds the source path and a ``page`` value. Returns an
        empty list when no text could be extracted.
    """
    full_text = extract_text_from_pdf(pdf_path)
    if not full_text:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=5000,
        chunk_overlap=500,
        length_function=len,
    )

    documents = []
    for i, piece in enumerate(text_splitter.split_text(full_text)):
        # NOTE(review): "page" is just the chunk index divided by 3, not a
        # page number read from the PDF -- confirm this estimate is
        # acceptable wherever the value is shown to users or the model.
        chunk_meta = {"source": pdf_path, "page": i // 3}
        documents.append(
            PhiDocument(name=f"chunk_{i}", content=piece, meta_data=chunk_meta)
        )
    return documents

# Vector DB setup
def setup_vector_db(chunks: list[PhiDocument], db_path: str) -> LanceDb:
    """Create a fresh LanceDB table and load the embedded chunks into it.

    Any existing directory at ``db_path`` is deleted first, so every call
    rebuilds the store from scratch. The vector width is discovered by
    embedding a probe string with the module-level ``embedder``.

    Args:
        chunks: Documents to embed and store; each must expose ``content``
            and ``meta_data``.
        db_path: Directory that will hold the LanceDB database.

    Returns:
        The table object returned by ``create_table`` for ``"invoice_data"``.
        NOTE(review): the signature is annotated ``LanceDb`` although the
        value comes from ``lancedb.connect(...).create_table(...)`` --
        confirm which type callers should rely on.

    A batch whose embedding or insertion raises is reported with ``print``
    and skipped, so the table may hold fewer rows than ``chunks``.
    """
    import shutil

    import lancedb

    # Wipe any previous database directory, then recreate it empty.
    if os.path.exists(db_path):
        shutil.rmtree(db_path)
    os.makedirs(db_path, exist_ok=True)
    connection = lancedb.connect(db_path)

    # Embed a throwaway string purely to learn the vector dimension.
    dim = len(embedder.embed_query("test"))

    columns = [
        pa.field("id", pa.string()),
        pa.field("content", pa.string()),
        pa.field("vector", pa.list_(pa.float32(), dim)),
        pa.field("metadata", pa.string()),
        pa.field("page_ref", pa.int32()),
    ]
    table = connection.create_table("invoice_data", schema=pa.schema(columns))

    if not chunks:
        return table

    batch_size = 50
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        try:
            vectors = embedder.embed_documents([doc.content for doc in batch])
            rows = []
            for doc, vector in zip(batch, vectors):
                rows.append({
                    "id": str(uuid.uuid4()),
                    "content": doc.content,
                    "vector": vector,
                    # Metadata is stored as a JSON string column.
                    "metadata": json.dumps(doc.meta_data),
                    "page_ref": doc.meta_data.get("page", 0),
                })
            table.add(rows)
        except Exception as e:
            # Report the failed batch and move on to the next one.
            print(f"Batch {i//batch_size} failed: {str(e)}")
    return table

# RAG prompt template
# System message: sets the model's role and carries the retrieved text in the
# {context} placeholder.
_RAG_SYSTEM_MESSAGE = (
    "INVOICE ANALYTICS ENGINE (Gemini 2.5 Pro)\n"
    "Leverage your 1M token context to:\n"
    "1. Extract exact values from complex layouts\n"
    "2. Cross-reference across document sections\n"
    "3. Validate numerical consistency\n"
    "4. Structure output for direct database insertion\n\n"
    "CONTEXT:\n{context}"
)

# Human message: carries the user's question in the {query} placeholder and
# states the expected answer formats.
_RAG_HUMAN_MESSAGE = (
    "Query: {query}\n"
    "Respond with:\n"
    "- A direct, concise answer.\n"
    "- A valid JSON object when multiple fields are requested.\n"
    "- A valid Markdown table for comparative analysis."
)

# Two-message chat prompt; invoke it with "context" and "query" values.
rag_prompt_template = ChatPromptTemplate.from_messages([
    ("system", _RAG_SYSTEM_MESSAGE),
    ("human", _RAG_HUMAN_MESSAGE),
])

# --- Rich-based display helper ---
def display_professional_output(question: str, response: str) -> None:
    """Print a question and the model's answer as a bordered ``rich`` panel.

    A response that starts with ``{`` or ``[`` and parses as JSON is shown
    with JSON syntax highlighting and line numbers; anything else is
    rendered as Markdown.

    Args:
        question: The user's query, shown in the panel title.
        response: The model's answer text.
    """
    # Local import: the helper is only needed here and the package is
    # already a dependency of this file.
    from rich.markup import escape

    looks_like_json = response.strip().startswith(("{", "["))
    is_json = False
    if looks_like_json:
        try:
            # Parse only to confirm validity; the original text is displayed.
            json.loads(response)
            is_json = True
        except json.JSONDecodeError:
            is_json = False

    if is_json:
        response_content = Syntax(response, "json", theme="monokai", line_numbers=True)
    else:
        response_content = RichMarkdown(response)

    # The title is parsed as Rich markup, so the question is escaped; without
    # this, square brackets in a question would be read as style tags.
    print(Panel(
        response_content,
        title=f"[bold cyan]Query[/bold cyan]: {escape(question)}",
        title_align="left",
        border_style="green",
        expand=True
    ))

# --- Query function (retrieve, generate, display) ---
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
def query_invoice_data(vector_db: LanceDb, question: str):
    """Answer a question from the stored invoice chunks and display the result.

    The question is embedded, the 8 closest rows are fetched from
    ``vector_db``, and their text is placed in the prompt context. The answer
    is printed through ``display_professional_output``; nothing is returned.

    Args:
        vector_db: Object exposing ``search(...).limit(...).to_list()`` whose
            rows contain ``page_ref`` and ``content``.
        question: The natural-language query to answer.

    Raises:
        Exception: Any failure is printed and re-raised, which hands control
            to the ``retry`` decorator (configured for up to 3 attempts).
    """
    try:
        question_vector = embedder.embed_query(question)
        hits = vector_db.search(question_vector).limit(8).to_list()

        # One labelled section per retrieved row, separated by "---" lines.
        sections = [f"PAGE {r['page_ref']}:\n{r['content']}" for r in hits]
        context = "\nDOCUMENT SECTIONS:\n" + "\n---\n".join(sections)

        chain = rag_prompt_template | llm | StrOutputParser()
        response = chain.invoke({"context": context, "query": question})

        display_professional_output(question, response)

    except Exception as e:
        print(f"Query failed: {type(e).__name__}: {e}")
        raise

# Main execution
def main():
    """Build the invoice vector store from ``invoice.pdf`` in the working directory.

    Returns:
        The value returned by ``setup_vector_db``, or ``None`` when the PDF
        is missing (an error panel is printed) or yields no chunks.
    """
    # Both the input PDF and the database directory live in the current
    # working directory.
    working_dir = os.getcwd()
    pdf_path = os.path.join(working_dir, "invoice.pdf")
    lancedb_dir = os.path.join(working_dir, "lancedb_invoices_premium")

    if not os.path.exists(pdf_path):
        print(Panel("[bold red]Error:[/bold red] 'invoice.pdf' not found in the current directory!", border_style="red"))
        return None

    chunks = process_invoice_data(pdf_path)
    if not chunks:
        return None
    return setup_vector_db(chunks, lancedb_dir)

# Script entry point: build the vector store, then run a fixed set of sample
# questions against it.
if __name__ == "__main__":
    # main() returns None when invoice.pdf is missing or produced no chunks.
    vector_db = main()
    # Only query when a usable store came back. This is a truthiness test,
    # so a falsy store object is skipped as well as None.
    if vector_db:
        # Sample questions covering JSON extraction, free-text analysis,
        # date arithmetic, a Markdown table and a tax check.
        premium_questions = [
            "Extract all line items as a single JSON object with fields: description, quantity, unit_price, total",
            "Analyze payment terms and state them clearly.",
            "Compare the invoice date versus the payment due date and calculate the days to pay.",
            "Generate a markdown table of all customers and their total amounts due.",
            "Verify the tax calculation based on a 5% rate and flag any discrepancy."
        ]
        # Each call prints its own formatted answer; no return value is used.
        for q in premium_questions:
            query_invoice_data(vector_db, q)