# 03 — Validate documents against the policy

Now the inspector walks every street with two things in hand: the census (the per-document feature CSVs) and the policy. For each text run (one CSV row) it checks the rules and writes a one-line violation whenever a rule is broken. At the end it writes a short verdict for the whole file: its violation count. Small, predictable, boring. Exactly what you want from a judge.

In [None]:
from pathlib import Path
import pandas as pd, json

CWD = Path.cwd()
# When the kernel was started inside notebooks/, the project root is one level up.
ROOT = (CWD.parent if CWD.name == "notebooks" else CWD).resolve()

# Input features, the policy file, and the directory validation results go to.
FEAT = ROOT.joinpath("reports", "features")
POLICY = ROOT.joinpath("reports", "policy", "policy.json")
OUT = ROOT.joinpath("reports", "validation")
OUT.mkdir(parents=True, exist_ok=True)

# The policy is a JSON document; it is parsed once and shared by every validation call.
policy = json.loads(POLICY.read_text(encoding="utf-8"))

def ok_color(c, allowed):
    """Return True when colour ``c`` is acceptable under the ``allowed`` palette.

    A colour is accepted when the palette is empty, when the colour is
    missing (``None`` or NaN), or when its upper-cased string form matches
    one of the upper-cased palette entries.
    """
    # NaN is the only value that is not equal to itself.
    if not allowed or c is None or c != c:
        return True
    palette = {entry.upper() for entry in allowed}
    candidate = str(c).upper()
    return candidate in palette

def validate_file(csv_path: Path, policy: dict) -> pd.DataFrame:
    """Check every row of one feature CSV against the policy.

    Rows whose ``kind`` is ``"docx_run"`` are compared with
    ``policy["fonts"]["docx"]["body"]`` (``family``, ``size_pt``). Rows whose
    ``kind`` is ``"pptx_run"`` are compared with ``policy["fonts"]["pptx"]``
    (``family``, ``body_min_pt``, ``title_min_pt``). Both kinds are also
    checked against the ``policy["colors_rgb"]`` palette via ``ok_color``.
    A rule that is absent from the policy is skipped instead of raising.

    Args:
        csv_path: Feature CSV to read. It must have a ``file`` column; the
            ``kind``, ``font_family``, ``font_size_pt`` and ``color_rgb``
            columns are read when present.
        policy: Parsed policy dictionary.

    Returns:
        A DataFrame with one row per violation and the columns
        ``file``, ``rule``, ``observed``, ``expected``.
    """
    df = pd.read_csv(csv_path)

    # Policy lookups do not depend on the row, so resolve them once up front.
    fonts = policy.get("fonts") or {}
    palette = policy.get("colors_rgb", [])
    docx_rule = (fonts.get("docx") or {}).get("body") or {}
    pptx_rule = fonts.get("pptx") or {}
    pptx_family = pptx_rule.get("family")
    body_min = pptx_rule.get("body_min_pt")
    title_min = pptx_rule.get("title_min_pt")
    # Heuristic: a PPTX run at or above this size is treated as heading-like.
    heading_from_pt = 22

    v = []
    for _, r in df.iterrows():
        kind = r.get("kind")
        fam = r.get("font_family")
        size = r.get("font_size_pt")
        col = r.get("color_rgb")

        if kind == "docx_run":
            if docx_rule.get("family") and pd.notna(fam) and fam != docx_rule["family"]:
                v.append((r["file"], "docx.font_family", fam, docx_rule["family"]))
            if (
                docx_rule.get("size_pt")
                and pd.notna(size)
                and float(size) != float(docx_rule["size_pt"])
            ):
                v.append((r["file"], "docx.font_size_pt", size, docx_rule["size_pt"]))
            if not ok_color(col, palette):
                v.append((r["file"], "docx.color_rgb", col, "palette"))

        if kind == "pptx_run":
            if pptx_family and pd.notna(fam) and fam != pptx_family:
                v.append((r["file"], "pptx.font_family", fam, pptx_family))
            if pd.notna(size):
                s = float(size)
                # The body minimum applies to every run; a missing minimum
                # means "no such rule" rather than a crash in float(None).
                if body_min is not None and s < float(body_min):
                    v.append((r["file"], "pptx.body_too_small", s, f">= {body_min}"))
                # The title minimum only applies to heading-like runs.
                if title_min is not None and heading_from_pt <= s < float(title_min):
                    v.append((r["file"], "pptx.title_too_small", s, f">= {title_min}"))
            if not ok_color(col, palette):
                v.append((r["file"], "pptx.color_rgb", col, "palette"))
    return pd.DataFrame(v, columns=["file", "rule", "observed", "expected"])

In [None]:
# Run validation over all feature CSVs
summ = []
# Sorting the paths makes the processing order independent of the filesystem.
for csv_path in sorted(FEAT.glob("*.features.csv")):
    violations = validate_file(csv_path, policy)
    # Path.stem drops only the final ".csv", so "x.features.csv" is written
    # as "x.features.violations.csv".
    violations.to_csv(OUT / f"{csv_path.stem}.violations.csv", index=False)
    summ.append({"file": csv_path.name, "violations": len(violations)})
# Explicit columns keep the sort and the summary.csv header valid even when
# no feature CSVs were found (an empty frame would otherwise have no
# "violations" column and sort_values would raise KeyError).
summary = pd.DataFrame(summ, columns=["file", "violations"]).sort_values(
    "violations", ascending=False
)
summary.to_csv(OUT / "summary.csv", index=False)
print("Wrote per-file violations and summary to", OUT)
display(summary.head(10))

In [None]:
# Peek into worst offender, if any
if not summary.empty and summary["violations"].max() > 0:
    # summary is sorted by violation count, descending, so row 0 is the worst file.
    worst = summary.iloc[0]["file"]
    # Violation files are written as "<feature-csv stem>.violations.csv"
    # (e.g. "x.features.csv" -> "x.features.violations.csv"), so derive the
    # name the same way instead of stripping ".features".
    dfv = pd.read_csv(OUT / f"{Path(worst).stem}.violations.csv")
    display(dfv.head(20))