# Local Model Sanity Notebook

This notebook sanity-checks the **fuzzy** and **TF-IDF** scorers and the **MiniLM**, **DistilBERT**, and **LayoutLMv3** models, with all model files loaded from local disk only.
- No internet calls
- Explicit error printing
- Draws token boxes at page scale for a visual check

In [None]:
import os, json, traceback
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from rapidfuzz import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor, LayoutLMv3Model
from sentence_transformers import SentenceTransformer

# -------- config --------
# Root folder holding one sub-folder per document.
DATA_ROOT = Path("data")
# Folder name of the document to inspect.
DOC_ID = "sample_doc"  # change to your doc id folder under data/
# Absolute path of the directory that contains the locally stored models.
MODELS_ROOT = Path("src", "models").resolve()

def norm(s):
    """Return *s* lower-cased with whitespace runs collapsed to single spaces.

    A falsy input (``None`` or ``""``) yields ``""``. Non-breaking spaces
    (U+00A0) are turned into ordinary spaces before collapsing, and leading
    and trailing whitespace is dropped.
    """
    text = s if s else ""
    lowered = text.lower().replace("\u00A0", " ")
    words = lowered.split()
    return " ".join(words)

In [None]:
# ---- load token boxes/meta ----
# Both files live in data/<DOC_ID>/. They are decoded explicitly as UTF-8 so
# the result does not depend on the platform's locale encoding.
docdir = DATA_ROOT / DOC_ID
boxes = json.loads((docdir/"boxes.json").read_text(encoding="utf-8"))
meta  = json.loads((docdir/"meta.json").read_text(encoding="utf-8"))
print(f"Loaded {len(boxes)} tokens from {DOC_ID}")

In [None]:
# ---- fuzzy + tfidf scorers ----
def score_fuzzy(q, c):
    """Return the rapidfuzz ``QRatio`` of *q* and *c* divided by 100."""
    ratio = fuzz.QRatio(q, c)
    return ratio / 100.0

def score_tfidf(q, c):
    """Return the cosine similarity of the TF-IDF vectors of *q* and *c*.

    A fresh unigram+bigram vectorizer is fitted on just these two strings.
    If that fit raises ``ValueError`` (presumably the empty-vocabulary case;
    TODO confirm), the vectorizer is fitted on a placeholder document
    instead so that the transform step can still run.
    """
    docs = [q, c]
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    try:
        vectorizer.fit(docs)
    except ValueError:
        vectorizer.fit(["placeholder"])
    matrix = vectorizer.transform(docs)
    similarity = cosine_similarity(matrix[0], matrix[1])
    return float(similarity[0, 0])

In [None]:
# ---- try load local models ----
# Each loader sits in its own try/except so that one missing or broken model
# does not stop the others from being checked. A failure is recorded in
# `errors` (model key -> traceback text) and printed right away.
errors = {}

# Computed once. The transformers models are moved to this device because the
# scoring cell sends the tokenized inputs to the device stored in each tuple;
# leaving the model on CPU would cause a device mismatch on CUDA machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

minilm = None
try:
    path = MODELS_ROOT/"sentence-transformers_all-MiniLM-L6-v2"
    # Unlike the loaders below, this call passes no local_files_only flag, so
    # verify the directory ourselves and fail with a clear local error.
    if not path.is_dir():
        raise FileNotFoundError(f"model directory not found: {path}")
    minilm = SentenceTransformer(str(path))
    print("[OK] MiniLM loaded")
except Exception:
    errors['minilm'] = traceback.format_exc()
    print("[FAIL] MiniLM:\n" + errors['minilm'])

distil = None
try:
    path = MODELS_ROOT/"distilbert-base-uncased"
    tok = AutoTokenizer.from_pretrained(str(path), local_files_only=True)
    mdl = AutoModel.from_pretrained(str(path), local_files_only=True)
    # (tokenizer, model in eval mode on `device`, device name)
    distil = (tok, mdl.to(device).eval(), device)
    print("[OK] DistilBERT loaded")
except Exception:
    errors['distil'] = traceback.format_exc()
    print("[FAIL] DistilBERT:\n" + errors['distil'])

layout = None
try:
    path = MODELS_ROOT/"microsoft_layoutlmv3-base"
    proc = AutoProcessor.from_pretrained(str(path), local_files_only=True)
    mdl  = LayoutLMv3Model.from_pretrained(str(path), local_files_only=True)
    # (processor, model in eval mode on `device`, device name)
    layout = (proc, mdl.to(device).eval(), device)
    print("[OK] LayoutLMv3 loaded")
except Exception:
    errors['layoutlmv3'] = traceback.format_exc()
    print("[FAIL] LayoutLMv3:\n" + errors['layoutlmv3'])

# Last expression of the cell: the notebook displays the collected errors.
errors

In [None]:
# ---- run one test key/value ----
key = "Invoice Number"
value = "12345"
# Normalized "key value" string; the embedding models compare it to `value`.
combo = norm(key+" "+value)

# NOTE(review): both calls compare `value` with itself, so they only confirm
# that the scorers run; confirm whether `combo` was intended as the query.
print("Fuzzy:", score_fuzzy(value, value))
print("TFIDF:", score_tfidf(value, value))

# Compare against None explicitly instead of relying on object truthiness.
if minilm is not None:
    try:
        E = minilm.encode([combo, value], convert_to_numpy=True, normalize_embeddings=True)
        # Embeddings are normalized, so the dot product is the cosine similarity.
        print("MiniLM score:", float(np.dot(E[0],E[1])))
    except Exception as e:
        print("MiniLM failed:", e)

if distil is not None:
    tok, mdl, dev = distil
    try:
        # The inputs are moved to `dev` below, so the model has to be there too.
        mdl = mdl.to(dev)
        t = tok([combo,value], return_tensors='pt', padding=True).to(dev)
        with torch.no_grad():
            hidden = mdl(**t).last_hidden_state
            # Mean-pool over real tokens only: the two strings differ in
            # length, so the shorter one is padded, and a plain mean over the
            # sequence axis would mix padding positions into its embedding.
            mask = t["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            h = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1.0)
            h = torch.nn.functional.normalize(h, dim=1)
            print("DistilBERT score:", float(torch.dot(h[0], h[1])))
    except Exception as e:
        print("DistilBERT failed:", e)

if layout is not None:
    print("LayoutLMv3 loaded; not running scoring here (needs image features)")

In [None]:
# ---- draw one page with dummy boxes ----
page = 0
fig, ax = plt.subplots(figsize=(8,10))
ax.set_title(f"Doc {DOC_ID} page {page}")

# Axes limits come from the page size recorded in meta.json.
page_info = meta['pages'][page]
ax.set_xlim(0, page_info['width'])
ax.set_ylim(0, page_info['height'])
# Flip the y axis so that y=0 is at the top of the plot
# (presumably the box coordinates use a top-left origin; confirm).
ax.invert_yaxis()

# Per-scorer colors; not used by the loop below, which draws every box in gray.
colors = {"fuzzy":"green","tfidf":"blue","minilm":"purple","distil":"red"}
for i, box in enumerate(boxes[:20]):  # just first 20 tokens
    left, top = box['x0'], box['y0']
    rect = Rectangle(
        xy=(left, top),
        width=box['x1'] - left,
        height=box['y1'] - top,
        edgecolor='gray',
        facecolor='none',
        linewidth=0.5,
    )
    ax.add_patch(rect)
plt.show()