In [223]:
import json
import os
from typing import List, Optional
from typing import TypedDict

import fitz  # PyMuPDF
import openai
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from langgraph.graph import START, END, StateGraph, MessagesState
from pydantic import BaseModel, Field
from pypdf import PdfReader


load_dotenv(override=True)


True

In [224]:
# Baseline extraction with pypdf: dump the plain text of the invoice's first
# page so it can be compared with the HTML extraction used further down.
reader = PdfReader(
    "./sample-pdf/invoice_Aaron Bergman_36258.pdf"
)
page = reader.pages[0]  # only the first page is inspected

print(text := page.extract_text())

INVOICE
# 36258
SuperStore
Bill To
:
Aaron Bergman
Ship To
:
98103, Seattle,
Washington, United
States
Mar 06 2012
First Class
$50.10
Date
:
Ship Mode
:
Balance Due
:
Item
Quantity
Rate
Amount
Global Push Button Manager's Chair, Indigo
1
$48.71
$48.71
Chairs, Furniture, FUR-CH-4421
$48.71
$9.74
$11.13
$50.10
Subtotal
:
Discount (20%)
:
Shipping
:
Total
:
Notes
:
Thanks for your business!
Terms
:
Order ID : CA-2012-AB10015140-40974


In [234]:
# Preview the first page of the sample invoice as positioned HTML (PyMuPDF).
# The context manager releases the file handle as soon as the text has been
# extracted instead of leaving the document open for the life of the kernel.
with fitz.open("./sample-pdf/sample_invoice_usd.pdf") as doc:
    page = doc.load_page(0)  # Load the first page

    # Extract text from the page
    text = page.get_text("html")

print(text)

<div id="page0" style="width:595.3pt;height:841.9pt">
<p style="top:39.2pt;left:36.0pt;line-height:16.0pt"><span style="font-family:Arial,sans-serif;font-size:16.0pt;color:#000000">INVOICE</span></p>
<p style="top:70.0pt;left:36.0pt;line-height:10.0pt"><b><span style="font-family:Arial,sans-serif;font-size:10.0pt;color:#000000">Invoice No:</span></b><span style="font-family:Arial,sans-serif;font-size:10.0pt;color:#000000"> INV-1001</span></p>
<p style="top:88.0pt;left:36.0pt;line-height:10.0pt"><b><span style="font-family:Arial,sans-serif;font-size:10.0pt;color:#000000">Date:</span></b><span style="font-family:Arial,sans-serif;font-size:10.0pt;color:#000000"> 2026-01-01</span></p>
<p style="top:120.8pt;left:42.0pt;line-height:9.0pt"><b><span style="font-family:Arial,sans-serif;font-size:9.0pt;color:#000000">From:</span></b></p>
<p style="top:132.8pt;left:42.0pt;line-height:9.0pt"><span style="font-family:Arial,sans-serif;font-size:9.0pt;color:#000000">Acme Software Solutions</span></p>

In [251]:
class AgentState(TypedDict):
    """Shared state handed from node to node in the invoice-processing graph."""

    # LLM-simplified HTML of the invoice page (set by parse_document).
    parsed_text: str
    # Raw text of the LLM's field extraction (set by extract_fields).
    invoice_data: str
    # JSON text produced by InvoiceData.model_dump_json() in
    # validate_extracted_fields. Annotated as str because that is what is
    # stored and later sent as message content; it also avoids referencing
    # InvoiceData before the cell that defines it has run.
    structured_invoice_data: str
    # Markdown report written by validate_business_rules.
    business_rules_validation_result: str
    # Review decision recorded by decision_node.
    flag_for_review: bool
    # Final structured result recorded by finalization_node.
    final_json_output: str

def parse_document(
    state: AgentState,
    pdf_path: str = "./sample-pdf/sample_invoice_usd_with_discrepancies.pdf",
) -> AgentState:
    """Parse an invoice PDF into simplified HTML.

    The first page of the PDF is rendered to HTML with PyMuPDF and an LLM is
    asked to simplify that markup. Only page 0 is read, so content on later
    pages is not included.

    Args:
        state: Current graph state; its "parsed_text" entry is overwritten.
        pdf_path: Path of the invoice PDF to read. Defaults to the sample
            invoice this notebook was written against.

    Returns:
        The same state object with "parsed_text" set to the LLM's reply.
    """
    # The context manager closes the PDF handle even if extraction raises.
    with fitz.open(pdf_path) as doc:
        # Extract the first page as HTML
        text = doc.load_page(0).get_text("html")

    llm = ChatGroq(
        model="openai/gpt-oss-120b",
        temperature=0,
        max_tokens=None,
        reasoning_format="parsed",
        timeout=None,
        max_retries=2,
    )

    print("Parsing Document...")

    response = llm.invoke([
        {"role": "system", "content": "Simplify the HTML text. Output a well structured HTML text."},
        {"role": "user", "content": text}
    ])

    state["parsed_text"] = response.content

    print(f"HTML Formatted Text: {state['parsed_text']}")

    return state


In [252]:


class LineItem(BaseModel):
    # One billed row of an invoice.
    # In Pydantic v2 an Optional[...] annotation without a default is still a
    # required field, so every nullable field gets an explicit None default.
    # Otherwise a model reply that simply omits the key fails validation.
    item_name: str = Field(description="The name of the item")
    item_category: Optional[str] = Field(default=None, description="The category of the item")
    quantity: Optional[float] = None
    unit_price: Optional[float] = None
    amount: Optional[float] = None


class InvoiceData(BaseModel):
    # Structured invoice used as the schema for the LLM's structured output.
    # In Pydantic v2 an Optional[...] annotation without a default is still a
    # required field, so every nullable field gets an explicit None default.
    # Otherwise a model reply that omits a key (e.g. no shipping fee on the
    # invoice) raises a ValidationError instead of yielding None.
    invoice_number: Optional[str] = Field(default=None, description="The unique identifier for the invoice")
    date: Optional[str] = Field(default=None, description="The date the invoice was issued")
    vendor_name: Optional[str] = Field(default=None, description="The name of the vendor or service provider")
    shipping_fee: Optional[float] = Field(default=None, description="The shipping fee amount")
    discount: Optional[float] = Field(default=None, description="The discount amount")
    total_amount: Optional[float] = Field(default=None, description="The total amount due including taxes")
    tax_amount: Optional[float] = Field(default=None, description="The total tax amount")
    currency: Optional[str] = Field(default=None, description="The currency code (e.g., USD, EUR)")
    line_items: List[LineItem] = Field(default_factory=list)
    flag_for_review: Optional[bool] = Field(default=None, description="Flag to indicate if the invoice requires review")
    
    

def extract_fields(state: AgentState) -> AgentState:
    """
    Extracts structured invoice data from the parsed document text.

    Sends state["parsed_text"] to the LLM with an extraction prompt, prints the
    reply (pretty-printed when it is valid JSON) and returns a copy of the
    state whose "invoice_data" entry holds the reply as text.
    """

    system_prompt = """

    You are an invoice data extraction assistant. Your task is to extract structured information from the invoice text.
    The structure should be in JSON format. Identify well the currency and give the associated currency code.

    """

    llm = ChatGroq(
        model="openai/gpt-oss-120b",
        temperature=0,
        max_tokens=None,
        reasoning_format="parsed",
        timeout=None,
        max_retries=2,
    )

    print("Extracting invoice data...")

    response = llm.invoke([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": state["parsed_text"]},
    ])

    invoice_data = response.content

    # The reply is text, and json.dumps() on a str only produces an escaped
    # string literal. Parse it first so the preview is actually indented.
    try:
        pretty_json = json.dumps(json.loads(invoice_data), indent=4, sort_keys=True)
    except (TypeError, ValueError):
        # Not bare JSON (e.g. wrapped in markdown fences or prose): show as-is.
        pretty_json = invoice_data
    print(pretty_json)

    return {**state, "invoice_data": str(invoice_data)}


In [253]:
def validate_extracted_fields(state: AgentState) -> AgentState:
    """Have the LLM review the raw extraction and return it as InvoiceData.

    Reads state["invoice_data"], asks the model (bound to the InvoiceData
    schema) to check it, prints the result and returns a copy of the state
    whose "structured_invoice_data" entry holds the result as JSON text.
    """

    system_prompt = """
    You are a invoice data validator. Check any inconsistencies in the extracted fields. 
    Remove irrelevant fields if needed. Return the final invoice data in JSON format.
    """

    # Bind the schema so the reply comes back as a parsed InvoiceData instance.
    validator = ChatGroq(
        model="openai/gpt-oss-120b",
        temperature=0,
        max_tokens=None,
        reasoning_format="parsed",
        timeout=None,
        max_retries=2,
    ).with_structured_output(InvoiceData)

    print("Validating Fields...")

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Evaluate the extracted fields: {state['invoice_data']}"},
    ]
    validated_invoice = validator.invoke(conversation)

    print(validated_invoice.model_dump_json(indent=4))

    updated_state = dict(state)
    updated_state["structured_invoice_data"] = validated_invoice.model_dump_json(indent=2)
    return updated_state

In [254]:
def validate_business_rules(state: AgentState) -> AgentState:
    """Check the structured invoice against the business rules via the LLM.

    Sends state["structured_invoice_data"] with the rule prompt to the model,
    renders the markdown reply in the notebook and records it under
    "business_rules_validation_result". The incoming state is updated in place
    and returned.
    """

    business_rules = """
    You are an business rules expert. Validate the invoice data against the following business rules:

    Currency Standardization: All extracted amounts must be converted or tagged with an ISO currency code (e.g., USD, EUR). If multiple currencies are present, flag for manual review.
    Threshold:Flag Invoices >$5,000
    Validate the total amount of list items, shipping fees, discounts, and taxes into actual total amount.

    Respond in markdown.

    """

    rules_llm = ChatGroq(
        model="openai/gpt-oss-120b",
        temperature=0,
        max_tokens=None,
        reasoning_format="parsed",
        timeout=None,
        max_retries=2,
    )

    print("Validating against businesss rules...")

    conversation = [
        {"role": "system", "content": business_rules},
        {"role": "user", "content": state["structured_invoice_data"]},
    ]
    report = rules_llm.invoke(conversation).content

    # Imported locally, as in the original cell; used to render the report.
    from IPython.display import Markdown, display

    display(Markdown(report))

    state.update(business_rules_validation_result=report)
    return state

In [255]:
def decision_node(state: AgentState) -> AgentState:
    """Decide whether the invoice must be flagged for manual review.

    The business-rules report in state["business_rules_validation_result"] is
    shown to the LLM, which is asked to answer True (flag) or False (accept).
    The textual answer is converted to a real bool before it is stored, because
    the string "False" would otherwise be truthy in any later `if` check.

    Returns:
        The same state object with "flag_for_review" set to a bool. Any reply
        that is not an explicit "false" is treated as True, so an unclear
        answer sends the invoice to review rather than accepting it.
    """

    system_prompt = f""" 

    Given the Business Rules Validation Result: 
    
    {state["business_rules_validation_result"]}

    Provide a decision whether to accept or flag the invoice for review. Only respond with True or False.
    True if the invoice should be flagged for review.
    False if the invoice should be accepted.
    
    """

    llm = ChatGroq(
        model="openai/gpt-oss-120b",
        temperature=0,
        max_tokens=None,
        reasoning_format="parsed",
        timeout=None,
        max_retries=2,
    )

    print("Deciding...")

    response = llm.invoke(system_prompt)

    # Normalise the reply: drop whitespace and common markdown/quote wrappers.
    answer = str(response.content).strip().strip("*`'\".").lower()

    # Fail safe: only an explicit "false" accepts the invoice.
    state["flag_for_review"] = not answer.startswith("false")

    print("Decision: ", state["flag_for_review"])

    return state


    

In [256]:
def finalization_node(state: AgentState) -> AgentState:
    """Finalization node for the agent. Gives a final JSON response that will be added to the database.

    The structured invoice, the business-rules report and the review flag are
    sent to the LLM, which must answer through the InvoiceData tool schema. The
    resulting model is serialised and stored as JSON text, matching the `str`
    type that AgentState declares for "final_json_output".

    Returns:
        The same state object with "final_json_output" set to the JSON text.
    """

    system_prompt = """
    You are a professional invoice data validator. 
You MUST respond ONLY by calling the provided tool with the structured invoice data. 
Do not provide any conversational text or preamble.
    """

    llm = ChatGroq(
        model="openai/gpt-oss-120b",
        temperature=0,
        max_tokens=None,
        reasoning_format="parsed",
        timeout=None,
        max_retries=2,
    )

    print("Finalizing...")

    # Single quotes inside the f-strings keep this valid on Python < 3.12,
    # where reusing the enclosing quote character is a SyntaxError.
    user_message = (
        f"Invoice Data: {state['structured_invoice_data']}, "
        f"Business Rules Validation Result: {state['business_rules_validation_result']}, "
        f"Flag for review: {state['flag_for_review']}"
    )

    structured_llm = llm.with_structured_output(InvoiceData, method="function_calling")
    response = structured_llm.invoke(
        [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ]
    )

    final_json = response.model_dump_json(indent=4)
    state["final_json_output"] = final_json

    print("Final JSON output:", final_json)

    return state

In [257]:
# Assemble the pipeline as a straight line: START -> each stage in order -> END.
graph = StateGraph(AgentState)

# Every node is registered under its function's own name.
_stages = (
    parse_document,
    extract_fields,
    validate_extracted_fields,
    validate_business_rules,
    decision_node,
    finalization_node,
)
for _stage in _stages:
    graph.add_node(_stage.__name__, _stage)

# Connect consecutive stops along the route.
_route = [START, *(stage.__name__ for stage in _stages), END]
for _src, _dst in zip(_route, _route[1:]):
    graph.add_edge(_src, _dst)

agent = graph.compile()

In [258]:
# Run the whole pipeline from an empty starting state; each node fills in
# its own entry as the graph executes.
result = agent.invoke(
    dict(
        parsed_text="",
        invoice_data="",
        structured_invoice_data={},
        business_rules_validation_result="",
    )
)

Parsing Document...
HTML Formatted Text: ```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Invoice – INV‑1002</title>
    <style>
        body {
            font-family: Arial, Helvetica, sans-serif;
            margin: 2rem;
            color: #000;
        }
        h1 {
            font-size: 1.6rem;
            margin-bottom: 0.5rem;
        }
        .meta,
        .addresses,
        .terms {
            margin-top: 1rem;
        }
        .meta span,
        .addresses div {
            display: inline-block;
            min-width: 8rem;
            font-weight: bold;
        }
        .addresses {
            display: flex;
            justify-content: space-between;
        }
        .addresses div {
            font-weight: normal;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin-top: 1.5rem;
        }
        th,
        td {
            border: 1px solid #ddd;
            padding

## Invoice Validation Report – **INV‑1002**  

| Business Rule | Requirement | Result |
|---------------|-------------|--------|
| **Currency Standardization** | All amounts must be tagged with a single ISO‑4217 code (e.g., USD). | ✅ **USD** is provided and no other currency appears. |
| **Threshold** | Flag invoices whose **total amount** exceeds **$5,000**. | ✅ Total = **$4,900** → below threshold (no automatic flag). |
| **Total‑Amount Consistency** | `total_amount` must equal **Σ(line‑item amounts) + shipping_fee – discount + tax_amount**. | ❌ **Inconsistent** – calculated total = **$4,600**, but `total_amount` = **$4,900** (difference **$300**). |
| **Line‑Item Amount Accuracy** | For each line item: `amount` = `quantity` × `unit_price`. | ❌ All three items have mismatches:  <br>1. **Web Application Development** – 2 × $2,000 = $4,000 (recorded $2,000) <br>2. **AI Feature Integration** – 1 × $1,200 = $1,200 (recorded $1,500) <br>3. **Maintenance & Support** – 3 × $300 = $900 (recorded $800) |
| **Manual Review Flag** | `flag_for_review` set to **true**. | ✅ Already flagged – justified by the multiple inconsistencies above. |

---

### Detailed Calculations  

1. **Sum of line‑item amounts (as recorded):**  
   - $2,000 + $1,500 + $800 = **$4,300**  

2. **Add tax, shipping, discount:**  
   - Shipping fee = $0 (null)  
   - Discount = $0 (null)  
   - Tax = $300  

   **Calculated total** = $4,300 + $300 = **$4,600**  

3. **Declared `total_amount`** = **$4,900** → **$300** higher than expected.

4. **Per‑item expected amounts (quantity × unit_price):**  

| Item | Quantity | Unit Price | Expected Amount | Recorded Amount | Δ |
|------|----------|------------|----------------|----------------|---|
| Web Application Development | 2 | $2,000 | $4,000 | $2,000 | **‑$2,000** |
| AI Feature Integration | 1 | $1,200 | $1,200 | $1,500 | **+$300** |
| Maintenance & Support | 3 | $300 | $900 | $800 | **‑$100** |

---

## Recommendations  

1. **Correct line‑item amounts** to reflect the true quantity × unit‑price values.  
2. **Re‑calculate the invoice total** after fixing line items, shipping, discounts, and tax.  
3. **Update `total_amount`** to match the recomputed total.  
4. Since the invoice already has `flag_for_review: true`, retain the manual‑review status until the above corrections are made.  

> **Overall Verdict:** The invoice fails the **total‑amount consistency** and **line‑item accuracy** rules and therefore **requires manual review and correction** before it can be approved.

Deciding...
Decision:  True
Finalizing...
Final JSON output: {
    "invoice_number": "INV-1002",
    "date": "2026-01-01",
    "vendor_name": "Acme Software Solutions",
    "shipping_fee": null,
    "discount": null,
    "total_amount": 4900.0,
    "tax_amount": 300.0,
    "currency": "USD",
    "line_items": [
        {
            "item_name": "Web Application Development",
            "item_category": null,
            "quantity": 2.0,
            "unit_price": 2000.0,
            "amount": 2000.0
        },
        {
            "item_name": "AI Feature Integration",
            "item_category": null,
            "quantity": 1.0,
            "unit_price": 1200.0,
            "amount": 1500.0
        },
        {
            "item_name": "Maintenance & Support",
            "item_category": null,
            "quantity": 3.0,
            "unit_price": 300.0,
            "amount": 800.0
        }
    ],
    "flag_for_review": true
}


In [215]:
# Display the final state dict returned by agent.invoke(...).
result

{'parsed_text': '```html\n<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <title>Invoice #36258</title>\n    <style>\n        body {font-family: Arial, Helvetica, sans-serif; margin: 2rem; color:#333;}\n        header {display:flex; justify-content:space-between; align-items:flex-start;}\n        header h1 {margin:0; font-size:2rem; color:#3a3a3a;}\n        header .invoice-number {font-size:1rem; color:#777;}\n        .company {font-weight:bold; margin-top:0.5rem;}\n        .address-block {margin-top:1rem;}\n        .address-block h3 {margin:0 0 0.2rem 0; font-size:1rem; font-weight:normal;}\n        .address-block p {margin:0; font-size:0.9rem;}\n        .details {margin-top:1rem; display:flex; gap:2rem;}\n        .details div {font-size:0.9rem;}\n        .details div span {font-weight:bold; color:#333;}\n        table {width:100%; border-collapse:collapse; margin-top:2rem;}\n        th, td {border:1px solid #ddd; padding:0.5rem; text-align:left;}\n        t