# 02 — Build a tiny policy from the good examples

Let the correct files teach the computer what “good” looks like. Read the feature tables for the files marked as correct (file names containing “juist”; if none are tagged, all feature files are used), find the most common font family, estimate sensible size thresholds, collect the colours in use, and write a tiny `policy.json`. Keep it human: if you cannot read it in one minute, simplify it.

In [None]:
from pathlib import Path
import pandas as pd, json, statistics
from collections import Counter

# Work out the project root: when the working directory is itself named
# "notebooks", the root is its parent; otherwise it is the working directory.
CWD = Path.cwd()
if CWD.name == "notebooks":
    ROOT = CWD.parent.resolve()
else:
    ROOT = CWD.resolve()

# Folder scanned for feature tables, and folder that receives the policy.
FEAT = ROOT.joinpath("reports", "features")
POLICY_DIR = ROOT.joinpath("reports", "policy")

# Create the output folder (and missing parents) now; an existing one is fine.
POLICY_DIR.mkdir(parents=True, exist_ok=True)

def stats(nums):
    """Summarise numeric values as a small dict of order statistics.

    Args:
        nums: A list (or other sized sequence) of numbers.

    Returns:
        An empty dict when ``nums`` is empty. Otherwise a dict with the keys
        ``min``, ``p10``, ``p25``, ``median``, ``p75``, ``p90``, ``max`` and
        ``count``. Percentiles use the nearest-rank method, so each one is a
        value that actually occurs in ``nums``; ``median`` comes from
        ``statistics.median`` and may be the mean of the two middle values.
    """
    import math  # local import: the file-level imports do not include math

    if not nums:
        return {}
    s = sorted(nums)
    n = len(s)

    def pct(q):
        # Nearest-rank percentile: the 1-based rank is ceil(q * n). Truncating
        # instead of taking the ceiling lands one rank too low whenever q * n
        # is not a whole number (e.g. p75 of three values would be the median).
        # Rounding first keeps float noise (3.0000000000000004) from bumping
        # an exact rank up by one.
        rank = math.ceil(round(q * n, 9))
        return s[max(0, min(n - 1, rank - 1))]

    return {
        "min": s[0],
        "p10": pct(0.10),
        "p25": pct(0.25),
        "median": statistics.median(s),
        "p75": pct(0.75),
        "p90": pct(0.90),
        "max": s[-1],
        "count": n,
    }

# Find the feature tables in FEAT. Sorting makes the file order reproducible:
# glob order depends on the filesystem, and anything that later depends on
# the order in which files are read would otherwise vary between machines.
files = sorted(FEAT.glob("*.features.csv"))
if not files:
    # Explicit raise rather than `assert`, which is skipped under `python -O`.
    raise FileNotFoundError("No features found. Run 01 notebook first.")

# Files whose name contains "juist" (Dutch for "correct") are the known-good
# examples the rules are inferred from.
good = [f for f in files if "juist" in f.stem.lower()] or files  # fallback to all if none tagged
print("Using", len(good), "feature files to infer rules")

In [None]:
import re
# Gather the PowerPoint and Word run rows from every "good" feature table.
#
# Text-like columns are read as strings so pandas' type inference cannot
# reinterpret them. Presumably color_rgb holds hex strings (TODO confirm); an
# all-digit value such as "000000" would otherwise be parsed as the number 0
# and lose its leading zeros.
TEXT_DTYPES = {"kind": str, "font_family": str, "color_rgb": str}
RUN_COLUMNS = ["font_family", "font_size_pt", "color_rgb"]

ppt_runs = []
doc_runs = []
for f in good:
    df = pd.read_csv(f, dtype=TEXT_DTYPES)
    # Select rows by exact kind once, and keep only non-empty selections. This
    # avoids the `.str` accessor, which raises on a column with no string
    # values, and keeps the membership check consistent with the filter.
    for kind, bucket in (("pptx_run", ppt_runs), ("docx_run", doc_runs)):
        rows = df[df["kind"] == kind]
        if not rows.empty:
            bucket.append(rows)

# Fall back to an empty frame with the expected columns so later column
# lookups still work when a document type is absent.
ppt_df = pd.concat(ppt_runs, ignore_index=True) if ppt_runs else pd.DataFrame(columns=RUN_COLUMNS)
doc_df = pd.concat(doc_runs, ignore_index=True) if doc_runs else pd.DataFrame(columns=RUN_COLUMNS)

# Count how often each font family occurs; missing values and empty strings
# are left out of the tally.
ppt_fonts = Counter(
    name for name in ppt_df["font_family"].dropna().tolist() if name
)
doc_fonts = Counter(
    name for name in doc_df["font_family"].dropna().tolist() if name
)

# Font sizes as plain floats, with missing values dropped.
ppt_sizes = list(map(float, ppt_df["font_size_pt"].dropna().tolist()))
doc_sizes = list(map(float, doc_df["font_size_pt"].dropna().tolist()))

# Split slide font sizes at 22 pt: sizes up to and including 22 count as body
# text, anything larger as title text. Summarise all three groups.
body_sizes = []
title_sizes = []
for size in ppt_sizes:
    if size <= 22:
        body_sizes.append(size)
    elif size > 22:
        title_sizes.append(size)
ppt_stats = stats(ppt_sizes)
body_stats = stats(body_sizes)
title_stats = stats(title_sizes)

# Slide font: the most common slide family, else the most common Word family,
# else "Montserrat". The Word font falls back to the slide font.
if ppt_fonts:
    ppt_family = ppt_fonts.most_common(1)[0][0]
elif doc_fonts:
    ppt_family = doc_fonts.most_common(1)[0][0]
else:
    ppt_family = "Montserrat"

if doc_fonts:
    doc_family = doc_fonts.most_common(1)[0][0]
else:
    doc_family = ppt_family

# Floors chosen to be readable on slides; edit if your template says otherwise.
# The observed percentile can raise a floor but never lower it.
ppt_body_min = 18
if body_stats:
    ppt_body_min = int(max(18, round(body_stats.get("p25", 18))))

ppt_title_min = 32
if title_stats:
    ppt_title_min = int(max(32, round(title_stats.get("p10", 32))))

# Word: take the median run size, round it to a whole point, and snap it to
# the nearest of 10/11/12. For a whole number that is a clamp to [10, 12].
# With no sizes at all, default to 11.
if doc_sizes:
    med = round(statistics.median(doc_sizes))
    doc_body = max(10, min(12, med))
else:
    doc_body = 11

# Build the palette: upper-cased colour values in first-seen order (slides
# first, then Word), de-duplicated, capped at eight entries.
color_columns = (
    ppt_df.get("color_rgb", pd.Series(dtype=str)),
    doc_df.get("color_rgb", pd.Series(dtype=str)),
)
seen_colors = {}  # dict keys keep insertion order and drop repeats
for column in color_columns:
    for raw in column.dropna().tolist():
        seen_colors.setdefault(str(raw).upper(), None)
palette = list(seen_colors)[:8]

# Assemble the policy from its parts, then show it for review.
docx_fonts = {"body": {"family": doc_family, "size_pt": doc_body}}
pptx_fonts = {
    "family": ppt_family,
    "title_min_pt": ppt_title_min,
    "body_min_pt": ppt_body_min,
}

# Provenance: which files the rules came from and the size statistics behind
# the slide thresholds.
policy_notes = {
    "derived_from": [f.name for f in good],
    "ppt_stats": ppt_stats,
    "ppt_body_stats": body_stats,
    "ppt_title_stats": title_stats,
}

policy = {}
policy["fonts"] = {"docx": docx_fonts, "pptx": pptx_fonts}
policy["colors_rgb"] = palette
policy["logo"] = {"pptx_title_slide_required": False}
policy["notes"] = policy_notes

print(json.dumps(policy, indent=2))

If something looks off, change it in the next cell. Treat the JSON like a contract that a new teammate could read without asking you anything.

In [None]:
# Optional manual override example:
# policy["fonts"]["pptx"]["family"] = "Montserrat"
# policy["fonts"]["pptx"]["title_min_pt"] = 36
# policy["fonts"]["pptx"]["body_min_pt"] = 20
# policy["fonts"]["docx"]["body"]["size_pt"] = 11

In [None]:
# Save policy
out = POLICY_DIR / "policy.json"
# Serialise before touching the file: if the policy contains something JSON
# cannot encode, the error is raised here and an existing policy.json is not
# left truncated or half-written.
policy_text = json.dumps(policy, indent=2)
out.write_text(policy_text, encoding="utf-8")
print("Wrote", out)