In [None]:
# [REDACTED] A raw API key had been pasted into this cell. Credentials must never be stored in notebook cells; revoke and rotate the leaked key.

In [1]:
import os, json, time, sqlite3, warnings
from typing import Dict, Any, List
import re
import easyocr
from euriai.langgraph import EuriaiLangGraph

In [None]:
# The API key is read from the environment so the secret never lands in the
# notebook file, its outputs, or version control.
EURI_API_KEY  = os.environ.get("EURI_API_KEY", "")
if not EURI_API_KEY:
    warnings.warn(
        "EURI_API_KEY environment variable is not set; calls to the Euri API will likely fail.",
        stacklevel=2,
    )
# Model name passed as default_model to the EuriaiLangGraph instances.
MODEL         = "gpt-4.1-nano"

In [3]:
# Folder that is scanned for invoice images.
INPUT_DIR = "invoices"

# SQLite database file that receives the extracted invoice rows.
DB_PATH = "invoice.sqlite"

# JSON file listing the file names that were already processed.
PROCESSED_LOG = "processed.json"

# Number of seconds to sleep between two scans of INPUT_DIR.
POLL_SEC = 5

In [4]:
# Create the watched folder up front; exist_ok=True keeps re-running this cell harmless.
os.makedirs(INPUT_DIR, exist_ok=True)

In [5]:
def load_seen(path: str = "") -> set:
    """Load the set of already-processed file names from a JSON log.

    Args:
        path: JSON file to read. When empty (the default) the module-level
            PROCESSED_LOG is used.

    Returns:
        The logged entries as a set. An empty set is returned when the log
        does not exist yet. An empty set is also returned when the log cannot
        be read or does not hold a JSON list, but in that case a warning is
        printed: an empty result makes every file look unprocessed, so the
        reason should be visible rather than swallowed.
    """
    log_path = path or PROCESSED_LOG
    if not os.path.exists(log_path):
        return set()
    try:
        with open(log_path, "r", encoding="utf-8") as f:
            entries = json.load(f)
        if not isinstance(entries, list):
            raise ValueError(f"expected a JSON list, got {type(entries).__name__}")
        return set(entries)
    except Exception as exc:
        # Stay best-effort (never crash start-up), but say what went wrong.
        print(f"[SEEN:WARN] could not read '{log_path}': {exc}; starting with an empty set")
        return set()


In [6]:
def save_seen(seen: set, path: str = "") -> None:
    """Persist the set of processed file names as a sorted JSON list.

    The data is written to a sibling temporary file first and then moved over
    the real log with os.replace. An interruption in the middle of a write
    (for example the Ctrl-C that stops the watcher) therefore cannot leave a
    truncated log behind.

    Args:
        seen: Names to store; they must be mutually sortable and JSON
            serialisable.
        path: Destination file. When empty (the default) the module-level
            PROCESSED_LOG is used.

    Raises:
        OSError: If the file cannot be written or replaced.
        TypeError: If the entries cannot be sorted or serialised.
    """
    log_path = path or PROCESSED_LOG
    tmp_path = f"{log_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(sorted(seen), f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, log_path)
    except BaseException:
        # Drop the partial temp file; the previous log stays untouched.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
        
# File names recorded in the processed log; the watch loop skips these and adds new ones.
seen = load_seen() 

In [7]:
def ensure_schema(db_path: str = ""):
    """Create the ``invoices`` table if needed and add any columns it lacks.

    Args:
        db_path: SQLite file to prepare. When empty (the default) the
            module-level DB_PATH is used.

    A table left over from an older schema is upgraded in place: every
    expected column that is missing gets added, so later INSERTs that name
    all columns cannot fail on a partially migrated table. The connection is
    closed even when a statement raises.
    """
    # Column name -> declared type for everything except the primary key.
    expected_columns = {
        "file_name": "TEXT",
        "vendor": "TEXT",
        "number": "TEXT",
        "date": "TEXT",
        "total": "REAL",
        "currency": "TEXT",
        "raw_json": "TEXT",
    }
    con = sqlite3.connect(db_path or DB_PATH)
    try:
        cur = con.cursor()
        # Create if missing
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS invoices (
              id INTEGER PRIMARY KEY AUTOINCREMENT,
              file_name TEXT,
              vendor TEXT,
              number TEXT,
              date TEXT,
              total REAL,
              currency TEXT,
              raw_json TEXT
            )
            """
        )
        # Add columns if the table existed with an older schema
        cur.execute("PRAGMA table_info(invoices);")
        existing = {row[1] for row in cur.fetchall()}
        for name, sql_type in expected_columns.items():
            if name not in existing:
                # Identifiers come from the fixed dict above, never from user input.
                cur.execute(f"ALTER TABLE invoices ADD COLUMN {name} {sql_type};")
        con.commit()
    finally:
        con.close()

# Prepare the database once, before any invoice is persisted.
ensure_schema()


In [8]:
# Shared EasyOCR reader for English ("en"), created once and reused by NODE_OCR; gpu=False runs it on the CPU.
ocr = easyocr.Reader(["en"], gpu=False)

Using CPU. Note: This module is much faster with a GPU.


In [10]:
# Graph with a single AI node that tidies raw OCR text.
clean_graph = EuriaiLangGraph(api_key=EURI_API_KEY, default_model=MODEL)
# The {ocr_text} placeholder in the prompt matches the "ocr_text" key that
# NODE_CLEAN passes to clean_graph.run().
clean_graph.add_ai_node(
    "CLEAN",
    """You clean noisy OCR to plain text.
- Keep facts.
- No guessing.
- Keep table rows readable.

OCR:
{ocr_text}"""
)
# CLEAN is both the entry and the finish point, i.e. the graph consists of this one node.
clean_graph.set_entry_point("CLEAN")
clean_graph.set_finish_point("CLEAN")


Added AI node: CLEAN (model: gpt-4.1-nano)
Set entry point: CLEAN
Set finish point: CLEAN


In [12]:
# Graph with a single AI node that turns cleaned text into structured invoice JSON.
extract_graph = EuriaiLangGraph(api_key=EURI_API_KEY, default_model=MODEL)
# The {clean_text} placeholder matches the "clean_text" key that NODE_EXTRACT
# passes to extract_graph.run().
# NOTE(review): the doubled braces around the line_items fields presumably
# escape literal braces in the prompt template — confirm against euriai.
extract_graph.add_ai_node(
    "EXTRACT",
    """From CLEAN_TEXT, return STRICT JSON with keys exactly:
vendor, number, date, total, currency,
line_items (list of {{description, quantity, unit_price, amount}}).

Unknown → null. Numbers numeric. Dates YYYY-MM-DD if possible.
JSON ONLY, no extra text.

CLEAN_TEXT:
{clean_text}"""
)
# EXTRACT is both the entry and the finish point, i.e. the graph consists of this one node.
extract_graph.set_entry_point("EXTRACT")
extract_graph.set_finish_point("EXTRACT")

Added AI node: EXTRACT (model: gpt-4.1-nano)
Set entry point: EXTRACT
Set finish point: EXTRACT


In [13]:
def pick_text(x, *, prefer_key=None):
    """Reduce a string, a dict, or any other value to a plain string.

    Strings are returned unchanged. For a dict, the first string value found
    under ``prefer_key`` (when given) or under one of the well-known keys is
    returned; a dict without such a value is serialised as JSON. Every other
    value is passed through ``str``.
    """
    if isinstance(x, str):
        return x
    if not isinstance(x, dict):
        return str(x)

    # Lookup order: the caller's preferred key first, then the common ones.
    lookup_order = ("output", "text", "CLEAN_output", "EXTRACT_output")
    if prefer_key:
        lookup_order = (prefer_key,) + lookup_order

    for key in lookup_order:
        value = x.get(key)
        if isinstance(value, str):
            return value
    return json.dumps(x, ensure_ascii=False)


In [14]:
def parse_json_safe(raw):
    """Parse model output into a dict without ever raising.

    Args:
        raw: A dict (returned unchanged), a JSON string, or anything else.

    Returns:
        Always a dict. For strings, the whole text is tried first and then the
        span from the first "{" to the last "}" (which covers JSON wrapped in
        code fences or surrounding prose). Only a parsed JSON *object* is
        accepted: callers use ``.get`` on the result, so a top-level list,
        number, string or null is not a usable answer. When nothing usable is
        found the input is returned wrapped as ``{"__raw__": raw}``.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        return {"__raw__": raw}

    candidates = [raw]
    start, end = raw.find("{"), raw.rfind("}")
    if start != -1 and end > start:
        candidates.append(raw[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return {"__raw__": raw}

In [15]:
def _heuristic_extract(clean_text: str) -> dict:
    """Regex-based extractor that keeps the DB flowing when the AI is down.

    Args:
        clean_text: Invoice text with labelled lines such as ``Vendor: ...``
            or ``Total: 553.88 EUR``.

    Returns:
        A dict with the keys vendor, number, date, total, currency and
        line_items. Fields that cannot be found are None; line_items is
        always an empty list.
    """
    def find(pat, s):
        m = re.search(pat, s, re.IGNORECASE | re.MULTILINE)
        return m.group(1).strip() if m else None

    # Digits with optional thousands separators and an optional fraction.
    amount = r"[0-9][0-9,]*(?:\.[0-9]+)?"

    vendor = find(r"\bVendor:\s*(.+)", clean_text)
    number = find(r"\b(?:Invoice Number|Invoice No\.?):\s*([A-Za-z0-9\-]+)", clean_text)
    # Prefer the explicit invoice date. A bare "Date:" label only counts at the
    # start of a line, so "Due Date:" is never mistaken for the invoice date.
    date = (
        find(r"\bInvoice Date:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", clean_text)
        or find(r"^[ \t]*Date:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})", clean_text)
    )
    # \b stops the case-insensitive search from matching inside "Subtotal:".
    total = find(r"\bTotal:\s*(" + amount + r")", clean_text)
    # The currency code must sit on the same line as the amount; allowing a
    # newline here would pick up the first three letters of the next label.
    curr = (
        find(r"\bTotal:\s*" + amount + r"[ \t]*([A-Za-z]{3})\b", clean_text)
        or find(r"\bCurrency:\s*([A-Za-z]{3})\b", clean_text)
    )

    try:
        total = float(total.replace(",", "")) if total is not None else None
    except ValueError:
        total = None

    return {
        "vendor": vendor,
        "number": number,
        "date":   date,
        "total":  total,
        "currency": curr,
        "line_items": []
    }


In [16]:
def NODE_OCR(file_path: str) -> Dict[str, Any]:
    """Run the shared OCR reader on one image file.

    The pieces returned by ``ocr.readtext(..., detail=0)`` are joined with
    newlines and handed back under the ``ocr_text`` key.
    """
    fragments = ocr.readtext(file_path, detail=0)
    joined = "\n".join(fragments)
    return {"ocr_text": joined}

In [17]:
def NODE_CLEAN(ocr_text: str) -> Dict[str, Any]:
    """Tidy noisy OCR text with the CLEAN graph.

    On success the cleaned text is returned together with the raw graph
    result. If the graph raises or yields blank text, the reason is printed
    and the original OCR text is passed through unchanged, flagged as a
    fallback.
    """
    try:
        graph_result = clean_graph.run({"ocr_text": ocr_text})
        cleaned = pick_text(graph_result, prefer_key="CLEAN_output")
        has_content = isinstance(cleaned, str) and bool(cleaned.strip())
        if not has_content:
            raise RuntimeError("Empty CLEAN output")
    except Exception as err:
        print(f"[CLEAN:FALLBACK] {err}")
        return {"clean_text": ocr_text, "CLEAN_raw": {"fallback": True}}
    return {"clean_text": cleaned, "CLEAN_raw": graph_result}

In [18]:
def NODE_EXTRACT(clean_text_any) -> dict:
    """Turn cleaned invoice text into a JSON string of structured fields.

    The EXTRACT graph is tried first. If it raises, the reason is printed and
    the regex heuristic supplies the fields instead, flagged as a fallback.
    """
    text_in = pick_text(clean_text_any, prefer_key="CLEAN_output")
    try:
        graph_result = extract_graph.run({"clean_text": text_in})
        payload = pick_text(graph_result, prefer_key="EXTRACT_output")
    except Exception as err:
        print(f"[EXTRACT:FALLBACK] {err}")
        fallback_fields = _heuristic_extract(text_in)
        return {
            "raw_json": json.dumps(fallback_fields, ensure_ascii=False),
            "EXTRACT_raw": {"fallback": True},
        }
    return {"raw_json": payload, "EXTRACT_raw": graph_result}


In [19]:
def NODE_VALIDATE(data: Dict[str, Any]) -> Dict[str, Any]:
    """Minimal schema checks and numeric sanity for demo purposes.

    Args:
        data: Parsed invoice fields. Anything that is not a dict is reported
            as invalid instead of raising.

    Returns:
        ``{"valid": bool, "issues": [str, ...]}``; ``valid`` is True only
        when no issue was found.
    """
    import math

    if not isinstance(data, dict):
        return {"valid": False, "issues": [f"data is not a JSON object: {type(data).__name__}"]}

    issues: List[str] = []
    for k in ("vendor", "number", "date", "currency"):
        # .get() yields None for absent keys, so this covers missing and empty.
        if data.get(k) in (None, ""):
            issues.append(f"missing key: {k}")

    total = data.get("total")
    if total is None:
        issues.append("total is null")
    else:
        try:
            # bool is an int subclass, so float(True) would pass; reject it,
            # together with NaN and infinity, as not a usable amount.
            numeric_ok = not isinstance(total, bool) and math.isfinite(float(total))
        except (TypeError, ValueError, OverflowError):
            numeric_ok = False
        if not numeric_ok:
            issues.append(f"total not numeric: {total}")

    if not isinstance(data.get("line_items", []), list):
        issues.append("line_items not a list")
    return {"valid": len(issues) == 0, "issues": issues}


In [20]:
def NODE_PERSIST(file_name: str, data: dict, raw_json_any, db_path: str = ""):
    """Insert one invoice row into the ``invoices`` table.

    Args:
        file_name: Name of the source image, stored in ``file_name``.
        data: Parsed invoice fields; vendor, number, date, total and currency
            are read with ``.get``. A non-dict value stores NULL in all five.
        raw_json_any: The raw extraction result; stored verbatim when it is a
            string, otherwise serialised to JSON.
        db_path: SQLite file to write to. When empty (the default) the
            module-level DB_PATH is used.

    Returns:
        ``{"db": "sqlite", "rowid": <id of the new row>}``.

    Raises:
        sqlite3.Error: If the insert fails. The connection is closed first.
    """
    raw_json_str = raw_json_any if isinstance(raw_json_any, str) else json.dumps(raw_json_any, ensure_ascii=False)
    fields = data if isinstance(data, dict) else {}

    def as_sql_value(value):
        # sqlite3 binds only None, int, float, str and bytes. A model may
        # answer with a nested object or list for a field; store such values
        # as JSON text rather than failing the whole insert.
        if value is None or isinstance(value, (int, float, str, bytes)):
            return value
        return json.dumps(value, ensure_ascii=False, default=str)

    target = db_path or DB_PATH
    con = sqlite3.connect(target)
    try:
        cur = con.cursor()
        cur.execute(
            """
            INSERT INTO invoices(file_name, vendor, number, date, total, currency, raw_json)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                file_name,
                as_sql_value(fields.get("vendor")),
                as_sql_value(fields.get("number")),
                as_sql_value(fields.get("date")),
                as_sql_value(fields.get("total")),
                as_sql_value(fields.get("currency")),
                raw_json_str,
            ),
        )
        rowid = cur.lastrowid
        con.commit()
    finally:
        # Close on failure too, so an error does not leave the connection open.
        con.close()
    print(f"[DB] Inserted row id={rowid} at {os.path.abspath(target)}")
    return {"db": "sqlite", "rowid": rowid}

In [21]:
def NODE_NOTIFY(file_name: str, data: Dict[str, Any], valid: bool, issues: List[str]) -> None:
    """Print a one-line summary of a processed invoice.

    The line starts with ``[OK]`` for a valid invoice, or with ``[WARN: ...]``
    listing the validation issues otherwise, followed by the file name and the
    vendor, number, total and currency fields.
    """
    if valid:
        status = "OK"
    else:
        status = f"WARN: {issues}"
    vendor, number = data.get("vendor"), data.get("number")
    total, currency = data.get("total"), data.get("currency")
    print(f"[{status}] file='{file_name}' vendor={vendor} number={number} total={total} {currency}")

In [None]:
# Pipeline stages in execution order. These two names describe the flow; they
# are not referenced by the other cells visible in this notebook.
NODES = [
    "WATCH",
    "OCR",
    "CLEAN",
    "EXTRACT",
    "VALIDATE",
    "PERSIST",
    "NOTIFY",
]
# Every stage feeds the next one, so the edges are the consecutive pairs of NODES.
EDGES = list(zip(NODES, NODES[1:]))

In [23]:
def run_pipeline_for_file(file_path: str) -> None:
    """Push one invoice image through the whole pipeline.

    Stages, in order: OCR, CLEAN, EXTRACT, JSON parsing, VALIDATE, PERSIST,
    NOTIFY. The row is persisted whether or not validation passed; the
    validation outcome only affects the notification line.
    """
    file_name = os.path.basename(file_path)

    # Text stages: image -> OCR text -> cleaned text -> raw JSON string.
    ocr_text = pick_text(NODE_OCR(file_path).get("ocr_text"))
    clean_text = NODE_CLEAN(ocr_text).get("clean_text")
    raw_json = NODE_EXTRACT(clean_text)["raw_json"]

    # Structured stages: parse, then check the parsed fields.
    data = parse_json_safe(raw_json)
    verdict = NODE_VALIDATE(data)

    # Store the row, then report.
    NODE_PERSIST(file_name, data, raw_json)
    NODE_NOTIFY(file_name, data, verdict["valid"], verdict["issues"])


In [None]:
# Poll INPUT_DIR forever and run the pipeline on every image not yet in `seen`.
# Stop with Kernel -> Interrupt (KeyboardInterrupt).
print(f"\nWatching '{INPUT_DIR}' every {POLL_SEC}s. Only NEW .png/.jpg will be processed.")

# A file that keeps failing would otherwise be retried on every poll forever,
# repeating the OCR and API calls each time. Failures are therefore counted
# per file *version* (mtime + size): after MAX_ATTEMPTS the file is skipped
# until it changes on disk, e.g. once a partially copied image is complete.
MAX_ATTEMPTS = 3
_failed = {}  # file name -> ((mtime_ns, size), failed attempts)

try:
    while True:
        for fname in sorted(os.listdir(INPUT_DIR)):
            if not fname.lower().endswith((".png", ".jpg", ".jpeg")):
                continue
            if fname in seen:
                continue  # already processed (persists across restarts)
            fpath = os.path.join(INPUT_DIR, fname)
            try:
                st = os.stat(fpath)
            except OSError:
                continue  # vanished between listdir() and stat()
            signature = (st.st_mtime_ns, st.st_size)
            last_signature, attempts = _failed.get(fname, (None, 0))
            if last_signature != signature:
                attempts = 0  # new or modified file: start counting afresh
            elif attempts >= MAX_ATTEMPTS:
                continue  # unchanged file that already used up its attempts
            try:
                run_pipeline_for_file(fpath)          # trigger full pipeline
                seen.add(fname); save_seen(seen)      # mark as processed
                _failed.pop(fname, None)
            except Exception as e:
                attempts += 1
                _failed[fname] = (signature, attempts)
                print(f"[ERROR] {fname}: {e}")
                if attempts >= MAX_ATTEMPTS:
                    print(f"[SKIP] {fname}: giving up after {attempts} failed attempts (retried if the file changes)")
        time.sleep(POLL_SEC)
except KeyboardInterrupt:
    print("Stopped watching.")


Watching 'invoices' every 5s. Only NEW .png/.jpg will be processed.




Graph compiled successfully
[1m[values][0m {'ocr_text': 'Vendor: Bright Future Co_\nInvoice Number: INV-1001\nInvoice Date: 2025-09-10\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 469.39 EUR\nTax: 84.49 EUR\nTotal: 553.88 EUR\nDue Date: 2025-10-28\nPO Numberz\nPO-2001\nNotes: Payment due within 30 days_\nbillingc'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Bright Future Co_\nInvoice Number: INV-1001\nInvoice Date: 2025-09-10\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 469.39 EUR\nTax: 84.49 EUR\nTotal: 553.88 EUR\nDue Date: 2025-10-28\nPO Numberz\nPO-2001\nNotes: Payment due within 30 days_\nbillingc', 'CLEAN_output': 'Vendor: Bright Future Co  \nInvoice Number: INV-1001  \nInvoice Date: 2025-09-10  \nBilling Address: 123 Example Street, City  \nBilling Email: example.com  \nSubtotal: 469.39 EUR  \nTax: 84.49 EUR  \nTotal: 553.88 EUR  \nDue Date: 2025-10-28  \nPO Number: PO-2001  \nNotes: Payment d



[1m[values][0m {'ocr_text': 'Vendor: Tech Solutions Ltd.\nInvoice Number: INV-1002\nInvoice Date: 2025-09-12\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 518.54 INR\nTax: 93.34 INR\nTotal: 611.88 INR\nDue Date: 2025-10-23\nPO Numberz\nPO-2002\nNotes: Payment due within 30 days_\nbillingc'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Tech Solutions Ltd.\nInvoice Number: INV-1002\nInvoice Date: 2025-09-12\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 518.54 INR\nTax: 93.34 INR\nTotal: 611.88 INR\nDue Date: 2025-10-23\nPO Numberz\nPO-2002\nNotes: Payment due within 30 days_\nbillingc', 'CLEAN_output': 'Vendor: Tech Solutions Ltd.  \nInvoice Number: INV-1002  \nInvoice Date: 2025-09-12  \nBilling Address: 123 Example Street, City  \nBilling Email: @example.com  \nSubtotal: 518.54 INR  \nTax: 93.34 INR  \nTotal: 611.88 INR  \nDue Date: 2025-10-23  \nPO Number: PO-2002  \nNotes: Payment due within 30 days', 



[1m[values][0m {'ocr_text': 'Vendor: Global Supplies Inc.\nInvoice Number: INV-1003\nInvoice Date: 2025-09-25\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 864.1 EUR\nTax: 155.54 EUR\nTotal: 1019.64 EUR\nDue Date: 2025-10-03\nPO Number: PO-2003\nNotes: Payment due within 30 days_\nbillingc'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Global Supplies Inc.\nInvoice Number: INV-1003\nInvoice Date: 2025-09-25\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 864.1 EUR\nTax: 155.54 EUR\nTotal: 1019.64 EUR\nDue Date: 2025-10-03\nPO Number: PO-2003\nNotes: Payment due within 30 days_\nbillingc', 'CLEAN_output': 'Vendor: Global Supplies Inc.  \nInvoice Number: INV-1003  \nInvoice Date: 2025-09-25  \nBilling Address: 123 Example Street, City  \nBilling Email: @example.com  \nSubtotal: 864.10 EUR  \nTax: 155.54 EUR  \nTotal: 1019.64 EUR  \nDue Date: 2025-10-03  \nPO Number: PO-2003  \nNotes: Payment due within 30 da



[1m[values][0m {'ocr_text': 'Vendor: Bright Future Co_\nInvoice Number: INV-1005\nInvoice Date: 2025-09-25\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 710.55 EUR\nTax: 127.9 EUR\nTotal: 838.4499999999999 EUR\nDue Date: 2025-10-04\nPO Numberz\nPO-2005\nNotes: Payment due within 30\nbillingc\ndays:'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Bright Future Co_\nInvoice Number: INV-1005\nInvoice Date: 2025-09-25\nBilling Address: 123 Example Street, City\nBilling Email:\n@example.com\nSubtotal: 710.55 EUR\nTax: 127.9 EUR\nTotal: 838.4499999999999 EUR\nDue Date: 2025-10-04\nPO Numberz\nPO-2005\nNotes: Payment due within 30\nbillingc\ndays:', 'CLEAN_output': 'Vendor: Bright Future Co  \nInvoice Number: INV-1005  \nInvoice Date: 2025-09-25  \nBilling Address: 123 Example Street, City  \nBilling Email: @example.com  \nSubtotal: 710.55 EUR  \nTax: 127.90 EUR  \nTotal: 838.55 EUR  \nDue Date: 2025-10-04  \nPO Number: PO-2005  \nNotes: Payment due



[1m[values][0m {'ocr_text': 'Vendor: Bright Future Co_\nInvoice Number: INV-1006\nInvoice Date: 2025-09-09\nBilling Address: 123 Example Street, City\nBilling Email: billing@example.com\nSubtotal: 669.39 INR\n120.49 INR\nTotal: 789.88 INR\nDue Date: 2025-10-20\nPO Number: PO-2006\nNotes: Payment due within 30 days_\nTax:'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Bright Future Co_\nInvoice Number: INV-1006\nInvoice Date: 2025-09-09\nBilling Address: 123 Example Street, City\nBilling Email: billing@example.com\nSubtotal: 669.39 INR\n120.49 INR\nTotal: 789.88 INR\nDue Date: 2025-10-20\nPO Number: PO-2006\nNotes: Payment due within 30 days_\nTax:', 'CLEAN_output': 'Vendor: Bright Future Co  \nInvoice Number: INV-1006  \nInvoice Date: 2025-09-09  \nBilling Address: 123 Example Street, City  \nBilling Email: billing@example.com  \nSubtotal: 669.39 INR  \nTax: 120.49 INR  \nTotal: 789.88 INR  \nDue Date: 2025-10-20  \nPO Number: PO-2006  \nNotes: Payment due within 30 days', 'CLEA



[1m[values][0m {'ocr_text': 'Vendor: Alpha Traders\nInvoice Number: INV-1007\nInvoice Date: 2025-09-01\nBilling Address: 123 Example Street, City\nEmail: billing@example.com\nSubtotal: 311.48 INR\n56.07 INR\nTotal: 367.55 INR\nDue Date: 2025-10-03\nPO Number: PO-2007\nNotes: Payment due within 30 days_\nBilling\nTax:'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Alpha Traders\nInvoice Number: INV-1007\nInvoice Date: 2025-09-01\nBilling Address: 123 Example Street, City\nEmail: billing@example.com\nSubtotal: 311.48 INR\n56.07 INR\nTotal: 367.55 INR\nDue Date: 2025-10-03\nPO Number: PO-2007\nNotes: Payment due within 30 days_\nBilling\nTax:', 'CLEAN_output': 'Vendor: Alpha Traders  \nInvoice Number: INV-1007  \nInvoice Date: 2025-09-01  \nBilling Address: 123 Example Street, City  \nEmail: billing@example.com  \nSubtotal: 311.48 INR  \nTax: 56.07 INR  \nTotal: 367.55 INR  \nDue Date: 2025-10-03  \nPO Number: PO-2007  \nNotes: Payment due within 30 days', 'CLEAN_raw_response': 'Ve



[1m[values][0m {'ocr_text': 'Vendor: Global Supplies Inc.\nInvoice Number: INV-1008\nInvoice Date: 2025-09-08\nBilling Address: 123 Example Street, City\nBilling Email: billing@example.com\nSubtotal: 129.72 USD\n23.35 USD\nTotal: 153.07 USD\nDue Date: 2025-10-12\nPO Number: PO-2008\nNotes: Payment due within 30 days_\nTax:'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Global Supplies Inc.\nInvoice Number: INV-1008\nInvoice Date: 2025-09-08\nBilling Address: 123 Example Street, City\nBilling Email: billing@example.com\nSubtotal: 129.72 USD\n23.35 USD\nTotal: 153.07 USD\nDue Date: 2025-10-12\nPO Number: PO-2008\nNotes: Payment due within 30 days_\nTax:', 'CLEAN_output': 'Vendor: Global Supplies Inc.  \nInvoice Number: INV-1008  \nInvoice Date: 2025-09-08  \nBilling Address: 123 Example Street, City  \nBilling Email: billing@example.com  \nSubtotal: 129.72 USD  \nTax: 23.35 USD  \nTotal: 153.07 USD  \nDue Date: 2025-10-12  \nPO Number: PO-2008  \nNotes: Payment due within 30 days'



[1m[values][0m {'ocr_text': 'Vendor: Euron Tech\nInvoice Number: INV-1009\nInvoice Date: 2025-09-08\nBilling Address: 123 Example Street, City\nEmail: billing@example.com\nSubtotal: 827.08 INR\n148.87 INR\nTotal: 975.95 INR\nDue Date: 2025-10-16\nPO Number: PO-2009\nNotes: Payment due within 30 days_\nBilling\nTax:'}
[1m[updates][0m {'CLEAN': {'ocr_text': 'Vendor: Euron Tech\nInvoice Number: INV-1009\nInvoice Date: 2025-09-08\nBilling Address: 123 Example Street, City\nEmail: billing@example.com\nSubtotal: 827.08 INR\n148.87 INR\nTotal: 975.95 INR\nDue Date: 2025-10-16\nPO Number: PO-2009\nNotes: Payment due within 30 days_\nBilling\nTax:', 'CLEAN_output': 'Vendor: Euron Tech  \nInvoice Number: INV-1009  \nInvoice Date: 2025-09-08  \nBilling Address: 123 Example Street, City  \nEmail: billing@example.com  \n\nSubtotal: 827.08 INR  \nTax: 148.87 INR  \nTotal: 975.95 INR  \n\nDue Date: 2025-10-16  \nPO Number: PO-2009  \nNotes: Payment due within 30 days', 'CLEAN_raw_response': 'Vend