# Smart Document Analyst

A multi-modal AI-powered business application that allows users to upload and interrogate any business document through a conversational Q&A interface. Built in Jupyter Notebook with a Gradio UI, the app supports invoices, contracts, forms, PDFs, Word documents, audio recordings, and plain text files.

In [None]:
# Install / upgrade dependencies (run once)
!pip -q install -U gradio pillow openai anthropic google-genai numpy

In [None]:
# Imports + initialize LLM clients

import os
import io
import json
import time
from typing import Optional, Dict, Any, Generator, List, Tuple

import gradio as gr
import numpy as np
from PIL import Image

from openai import OpenAI
from anthropic import Anthropic
from google import genai
from google.genai import types

# Clients (keys assumed already set in your environment)


def describe_api_key(provider: str, key: Optional[str], prefix_len: int = 8) -> str:
    """Return a one-line status message for a provider's API key.

    Args:
        provider   : display name used in the message (e.g. "OpenAI")
        key        : the key value read from the environment, or None/"" if absent
        prefix_len : how many leading characters of the key to reveal

    Returns:
        "<provider> API Key exists and begins <prefix>" when a key is present,
        otherwise "<provider> API Key not set".
    """
    if key:
        return f"{provider} API Key exists and begins {key[:prefix_len]}"
    return f"{provider} API Key not set"


openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GEMINI_API_KEY')

# Report which keys are available; only a short prefix is echoed, never the full key.
print(describe_api_key("OpenAI", openai_api_key))
print(describe_api_key("Anthropic", anthropic_api_key, prefix_len=7))
print(describe_api_key("Google", google_api_key))


# One client object per provider, used by the rest of the notebook.
# No credentials are passed explicitly here; presumably each SDK picks its key
# up from the environment — confirm against the SDK documentation.
# NOTE(review): `openai` and `anthropic` reuse the names of the packages the
# client classes were imported from; from here on they refer to client instances.
openai = OpenAI()
anthropic = Anthropic()
gemini =  genai.Client()



In [None]:
# System Prompt & Model Configuration


# System prompt shared by every provider: it defines the analyst persona, the
# structured overview expected right after an upload, and the formatting rules
# for answers (markdown tables, risk labels, concise business tone).
# NOTE(review): the risk-label line below has doubled spaces where marker
# symbols may have been lost — confirm the intended labels.
SYSTEM_PROMPT = """You are an expert Business Document Analyst with deep expertise in:
- Invoices & receipts: extracting vendor details, line items, totals, tax, payment terms
- Contracts & agreements: identifying parties, key dates, obligations, risks, termination clauses
- Forms & applications: parsing fields, flagging missing or inconsistent data, summarizing intent
- General business documents: summarizing content, spotting anomalies, answering precise questions

Your behaviour:
- When a document image is first uploaded, automatically provide a structured overview:
  * Document type detected
  * Key parties or entities involved
  * Top 3-5 key findings or extracted fields
  * Any anomalies, missing info, or red flags 
- After the overview, invite the user to ask follow-up questions
- Answer questions precisely, referencing specific sections of the document
- When extracting structured data, return it as a clean markdown table
- For contract risk items use:  High Risk /  Medium Risk /  Low Risk
- Be concise, professional, and business-focused at all times
- If something in the document is unclear or illegible, say so honestly
"""

# Model registry — maps each UI display name to its provider and model ID.
# Declared as (display name, provider, model ID) rows; the row order is the
# key order of the resulting dict.
_MODEL_ROWS = (
    ("GPT-4o mini (OpenAI)", "openai", "gpt-4o-mini"),
    ("Claude Sonnet (Anthropic)", "anthropic", "claude-sonnet-4-5"),
    ("Gemini 2.5 Flash (Google)", "gemini", "gemini-2.5-flash"),
)

MODELS = {
    label: {"provider": provider, "model_id": model_id}
    for label, provider, model_id in _MODEL_ROWS
}



In [None]:
# Image Utility


import base64
from io import BytesIO

def pil_to_base64(pil_image: Image.Image, format: str = "JPEG") -> str:
    """
    Encode a PIL Image in the given file format and return it as base64 text.

    Args:
        pil_image : image to encode
        format    : Pillow format name passed to ``Image.save`` (default "JPEG")

    Returns:
        The encoded image bytes as a base64 string (UTF-8 decoded).

    RGBA, palette ("P") and LA images are always converted to RGB first.
    When the target format is JPEG, any other mode outside the set the JPEG
    writer is expected to accept is converted to RGB as well, instead of
    failing inside ``save``.
    """
    # Modes Pillow's JPEG writer is expected to accept as-is — see the Pillow
    # JPEG format notes; anything else would raise on save.
    jpeg_ready_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    always_convert = pil_image.mode in ("RGBA", "P", "LA")
    jpeg_incompatible = (
        format.upper() == "JPEG" and pil_image.mode not in jpeg_ready_modes
    )
    if always_convert or jpeg_incompatible:
        pil_image = pil_image.convert("RGB")

    buffer = BytesIO()
    pil_image.save(buffer, format=format)
    # getvalue() returns the whole buffer regardless of the stream position.
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


# --- Quick Test ---
# Smoke check: build a small 100x100 solid-red RGB image and run it through
# pil_to_base64 so an encoding failure surfaces when this cell runs.
# NOTE(review): the result is only stored in test_b64; nothing here asserts on
# or displays it.
test_img = Image.new("RGB", (100, 100), color=(255, 0, 0))  # solid red square
test_b64 = pil_to_base64(test_img)






In [None]:
# Streaming Inference Engine


# Text sent to the model in place of a past turn that carried only a file.
_ATTACHMENT_PLACEHOLDER = "[attachment]"


def _history_text(content: Any) -> str:
    """Reduce one history ``content`` value to plain text ("" if it has none)."""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # A text content block carries its string under "text"; other dicts
        # (e.g. file references) contribute no text.
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        # A list of content blocks: keep the text-bearing ones, in order.
        parts = [_history_text(block) for block in content]
        return "\n".join(part for part in parts if part.strip())
    return ""


def _text_turns(history: Optional[List[Dict]]) -> List[Tuple[str, str]]:
    """Convert chat history into plain (role, text) pairs usable by every provider."""
    turns = []
    for turn in history or []:
        content = turn.get("content")
        text = _history_text(content)
        if not text.strip():
            if content is None or isinstance(content, str):
                continue  # blank message: nothing to forward
            # Non-text content (an uploaded file): keep the turn so the
            # conversation order is preserved, but send a short marker.
            text = _ATTACHMENT_PLACEHOLDER
        turns.append((turn["role"], text))
    return turns


def _stream_openai(
    model_id: str,
    turns: List[Tuple[str, str]],
    message: str,
    b64: Optional[str]
) -> Generator[str, None, None]:
    """Stream the accumulated reply from the OpenAI chat completions API."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend({"role": role, "content": text} for role, text in turns)

    # Current user message: image (as a data URL) plus text, or text alone.
    if b64:
        user_content = [
            {"type": "image_url",
             "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            {"type": "text", "text": message}
        ]
    else:
        user_content = message
    messages.append({"role": "user", "content": user_content})

    stream = openai.chat.completions.create(
        model=model_id,
        messages=messages,
        stream=True
    )

    accumulated = ""
    for chunk in stream:
        if not chunk.choices:
            continue  # skip chunks that carry no choices instead of raising IndexError
        delta = chunk.choices[0].delta.content
        if delta:
            accumulated += delta
            yield accumulated


def _stream_anthropic(
    model_id: str,
    turns: List[Tuple[str, str]],
    message: str,
    b64: Optional[str]
) -> Generator[str, None, None]:
    """Stream the accumulated reply from the Anthropic messages API."""
    messages = [{"role": role, "content": text} for role, text in turns]

    # Current user message: optional base64 image block followed by the text block.
    user_content = []
    if b64:
        user_content.append(
            {"type": "image",
             "source": {
                 "type": "base64",
                 "media_type": "image/jpeg",
                 "data": b64
             }}
        )
    user_content.append({"type": "text", "text": message})
    messages.append({"role": "user", "content": user_content})

    with anthropic.messages.stream(
        model=model_id,
        max_tokens=2048,
        system=SYSTEM_PROMPT,
        messages=messages
    ) as stream:
        accumulated = ""
        for text_chunk in stream.text_stream:
            accumulated += text_chunk
            yield accumulated


def _stream_gemini(
    model_id: str,
    turns: List[Tuple[str, str]],
    message: str,
    b64: Optional[str]
) -> Generator[str, None, None]:
    """Stream the accumulated reply from the Gemini generate-content API."""
    # Gemini uses "model" instead of "assistant" for the reply role.
    contents = [
        types.Content(
            role="model" if role == "assistant" else "user",
            parts=[types.Part.from_text(text=text)]
        )
        for role, text in turns
    ]

    # Current user turn: optional raw image bytes followed by the text.
    current_parts = []
    if b64:
        current_parts.append(
            types.Part.from_bytes(data=base64.b64decode(b64), mime_type="image/jpeg")
        )
    current_parts.append(types.Part.from_text(text=message))
    contents.append(types.Content(role="user", parts=current_parts))

    stream = gemini.models.generate_content_stream(
        model=model_id,
        contents=contents,
        config=types.GenerateContentConfig(
            system_instruction=SYSTEM_PROMPT,
            max_output_tokens=2048,
        )
    )

    accumulated = ""
    for chunk in stream:
        if chunk.text:
            accumulated += chunk.text
            yield accumulated


def stream_response(
    message: str,
    history: List[Dict],
    model_key: str,
    image: Optional[Image.Image] = None
) -> Generator[str, None, None]:
    """
    Stream a response from the selected model.

    Args:
        message   : current user message/question
        history   : prior turns as dicts with "role" and "content"; content may
                    be a plain string, a list of content blocks, or a file
                    reference — it is flattened to text before being sent
        model_key : key from MODELS dict (matches UI dropdown)
        image     : PIL image attached to the current turn, if any

    Yields:
        str: the full response text accumulated so far (not just the newest
        delta), so each yield can replace the previously displayed text.

    Raises:
        KeyError   : model_key is not in MODELS
        ValueError : the model's provider is not one of openai/anthropic/gemini
    """
    model_cfg = MODELS[model_key]
    provider  = model_cfg["provider"]
    model_id  = model_cfg["model_id"]
    # Explicit None check: do not depend on the truthiness of an image object.
    b64       = pil_to_base64(image) if image is not None else None
    turns     = _text_turns(history)

    if provider == "openai":
        yield from _stream_openai(model_id, turns, message, b64)
    elif provider == "anthropic":
        yield from _stream_anthropic(model_id, turns, message, b64)
    elif provider == "gemini":
        yield from _stream_gemini(model_id, turns, message, b64)
    else:
        # Fail loudly rather than returning an empty stream.
        raise ValueError(f"Unknown provider {provider!r} for model {model_key!r}")





In [None]:
# File Processing Utility

import pathlib

# Install required libraries if not already present.
# Invoke pip through the running interpreter so the packages are installed
# into this kernel's environment rather than whichever `pip` is first on PATH.
import subprocess
import sys

_pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "pypdf", "python-docx", "-q"],
    capture_output=True,
    text=True,
)
if _pip_result.returncode != 0:
    # Output is captured, so report a failed install here instead of leaving
    # it to show up only as an ImportError further down.
    print(f"pip install failed (exit {_pip_result.returncode}):\n{_pip_result.stderr}")

from pypdf import PdfReader
from docx import Document as DocxDocument


def process_uploaded_file(file_path: str) -> Dict[str, Any]:
    """
    Process any uploaded file and return normalized content.

    The handler is chosen from the file extension (case-insensitive).

    Args:
        file_path : path of the uploaded file on disk

    Returns a dict with:
      - type    : "image" | "pdf" | "docx" | "audio" | "text" | "error"
      - content : extracted text (non-image types) or the error message
      - image   : PIL Image object converted to RGB (image type only), else None
      - name    : original filename

    Read/parse failures are never raised; they are reported as an "error"
    result so the caller can show the message to the user.
    """
    path = pathlib.Path(file_path)
    ext  = path.suffix.lower()
    name = path.name

    def _error(text: str) -> Dict[str, Any]:
        # Uniform shape for every failure path.
        return {"type": "error", "content": text, "image": None, "name": name}

    # -- IMAGE ------------------------------------------------------------
    if ext in (".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".gif"):
        try:
            # convert() returns an independent image, so the file handle can
            # be closed as soon as the conversion is done.
            with Image.open(file_path) as source:
                img = source.convert("RGB")
        except Exception as e:
            return _error(f"Image read error: {e}")
        return {"type": "image", "image": img, "content": None, "name": name}

    # -- PDF --------------------------------------------------------------
    if ext == ".pdf":
        try:
            reader = PdfReader(file_path)
            pages  = []
            for number, page in enumerate(reader.pages, start=1):
                text = page.extract_text()
                if text and text.strip():
                    pages.append(f"[Page {number}]\n{text.strip()}")
            full_text = (
                "\n\n".join(pages)
                if pages
                else "⚠️ No extractable text found in PDF (may be scanned image)."
            )
            return {"type": "pdf", "content": full_text, "image": None, "name": name}
        except Exception as e:
            return _error(f"PDF read error: {e}")

    # -- DOCX -------------------------------------------------------------
    if ext in (".docx", ".doc"):
        try:
            doc        = DocxDocument(file_path)
            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
            full_text  = "\n\n".join(paragraphs) if paragraphs else "⚠️ No text found in document."
            return {"type": "docx", "content": full_text, "image": None, "name": name}
        except Exception as e:
            # The parser targets .docx; give legacy .doc uploads an actionable hint.
            hint = " (legacy .doc files may not be readable; try saving as .docx)" if ext == ".doc" else ""
            return _error(f"DOCX read error: {e}{hint}")

    # -- AUDIO ------------------------------------------------------------
    if ext in (".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"):
        try:
            with open(file_path, "rb") as audio_file:
                transcript = openai.audio.transcriptions.create(
                    model="whisper-1",
                    file=audio_file,
                    response_format="text"
                )
            # The result may be a plain string or an object exposing .text.
            transcribed = transcript if isinstance(transcript, str) else transcript.text
            full_text   = f"[Audio Transcription — {name}]\n\n{transcribed}"
            return {"type": "audio", "content": full_text, "image": None, "name": name}
        except Exception as e:
            return _error(f"Audio transcription error: {e}")

    # -- PLAIN TEXT / CSV -------------------------------------------------
    if ext in (".txt", ".csv", ".md"):
        try:
            # Undecodable bytes are dropped rather than failing the whole read.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                full_text = f.read()
            return {"type": "text", "content": full_text, "image": None, "name": name}
        except Exception as e:
            return _error(f"Text read error: {e}")

    # -- UNSUPPORTED ------------------------------------------------------
    return _error(
        f"⚠️ Unsupported file type: {ext}. Supported: images, PDF, DOCX, audio, TXT, CSV"
    )




In [None]:
# Gradio UI — multi-file-type support


def chat(
    message: Dict,
    history: List[Dict],
    model_key: str
) -> Generator[str, None, None]:
    """
    Chat handler supporting all file types.

    - Images   → passed as a PIL image to the vision pathway
    - PDF/DOCX → text extracted, injected into prompt
    - Audio    → transcribed, injected into prompt
    - TXT/CSV  → read directly, injected into prompt

    Args:
        message   : multimodal textbox value with optional "text" and "files"
                    entries; only the first file is processed
        history   : prior conversation turns, forwarded unchanged
        model_key : key from MODELS selecting the model to query

    Yields:
        str: either a single warning/error message, or the progressively
        accumulated model response.
    """
    # Tolerate a missing or None "text"/"files" entry instead of raising.
    user_text  = (message.get("text") or "").strip()
    files      = message.get("files") or []
    image      = None
    extra_text = ""   # extracted content from non-image files

    # --- Process uploaded file if any ---
    if files:
        first     = files[0]
        file_path = first["path"] if isinstance(first, dict) else first
        result    = process_uploaded_file(file_path)

        if result["type"] == "error":
            # Show the processing error to the user and stop.
            yield result["content"]
            return

        if result["type"] == "image":
            # Visual pathway — pass the PIL image to the model
            image = result["image"]
        else:
            # Text pathway — prepend extracted content to the prompt
            doc_type_label = {
                "pdf"  : "📄 PDF Document",
                "docx" : "📝 Word Document",
                "audio": "🎙️ Audio Transcript",
                "text" : "📃 Text File"
            }.get(result["type"], "Document")

            separator = "=" * 60
            extra_text = (
                f"The user has uploaded a {doc_type_label} named '{result['name']}'.\n"
                f"Here is its content:\n\n"
                f"{separator}\n"
                f"{result['content']}\n"
                f"{separator}\n\n"
            )

    # --- Require at least some input ---
    if not user_text and not extra_text and image is None:
        yield "⚠️ Please upload a document and/or type a question."
        return

    # --- Build final prompt ---
    default_request = "Please analyze this document and provide a structured overview."
    if extra_text and user_text:
        # File + question together
        final_prompt = extra_text + f"User question: {user_text}"
    elif extra_text:
        # File uploaded with no question → auto analyze
        final_prompt = extra_text + default_request
    else:
        # Text only, or image with/without a question
        final_prompt = user_text or default_request

    # --- Stream response ---
    yield from stream_response(
        message=final_prompt,
        history=history,
        model_key=model_key,
        image=image
    )


# ------------------------------------------------------------------ #
# UI Layout                                                            #
# ------------------------------------------------------------------ #
# Page layout: header text, model dropdown, multimodal chat interface, tips footer.
with gr.Blocks(title="Smart Document Analyst") as demo:

    gr.Markdown("""
    #  Smart Document Analyst
    **Upload any business document** and ask questions about it.  
    Supports:  Images ·  PDF ·  DOCX · 🎙️ Audio ·  TXT/CSV  
    Multi-turn Q&A · Switch models anytime to compare responses.
    """)

    # The selected value is passed to chat() as its model_key argument.
    model_selector = gr.Dropdown(
        choices=list(MODELS.keys()),
        value="Claude Sonnet (Anthropic)",
        label="🤖 Select AI Model",
        interactive=True
    )

    gr.ChatInterface(
        fn=chat,
        multimodal=True,
        additional_inputs=[model_selector],
        chatbot=gr.Chatbot(
            label="Document Q&A",
            height=520,
            placeholder=(
                " Upload a document using the **paperclip icon**, then ask:\n\n"
                "- *What is the total amount due?*\n"
                "- *Who are the parties in this contract?*\n"
                "- *Summarize the key clauses*\n"
                "- *Extract all line items as a table*\n"
                "- *Are there any red flags?*"
            )
        ),
        textbox=gr.MultimodalTextbox(
            placeholder="Upload a document and/or type your question...",
            # Kept in step with the extensions process_uploaded_file handles.
            file_types=[
                ".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tiff", ".gif",  # images
                ".pdf",                                                      # PDF
                ".docx", ".doc",                                             # Word
                ".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm",            # audio
                ".txt", ".csv", ".md"                                        # plain text
            ],
            file_count="single"
        ),
    )

    gr.Markdown("""
    ---
    💡 **Tips:** Switch models mid-conversation to compare analysis.  
    Audio files are auto-transcribed before analysis.  
    For scanned PDFs (image-based), upload as image file instead.
    """)

def _auth_users_from_env(raw: Optional[str]) -> List[Tuple[str, str]]:
    """Parse ``"user:password,user2:password2"`` into (user, password) pairs."""
    pairs = []
    for entry in (raw or "").split(","):
        user, sep, password = entry.strip().partition(":")
        if sep and user and password:
            pairs.append((user, password))
    return pairs


# Prefer login credentials supplied through the DOC_ANALYST_AUTH environment
# variable. The built-in pairs are kept only as a fallback so the notebook
# still starts as before.
_auth_users = _auth_users_from_env(os.getenv("DOC_ANALYST_AUTH"))
if not _auth_users:
    # SECURITY: these defaults are weak and visible in the source, while
    # share=True exposes the app through a public link.
    print(
        "WARNING: DOC_ANALYST_AUTH is not set; falling back to built-in demo "
        "credentials on a public share link. Set DOC_ANALYST_AUTH="
        "'user:password[,user2:password2]' before sharing."
    )
    _auth_users = [
        ("admin", "admin123"),
        ("analyst", "docs2024"),
    ]

# NOTE(review): whether launch() accepts `theme=` depends on the installed
# Gradio major version — confirm against the version in use.
demo.launch(
    theme=gr.themes.Soft(),
    debug=False,
    share=True,
    auth=_auth_users,
    auth_message="🔐 Please log in to access the Smart Document Analyst",
    prevent_thread_lock=True,
)