
# BridgeLite — Offline Evaluation Notebook

This notebook evaluates the exported model bundle on the **train** and **production_sim** datasets and produces:
- macro-F1, accuracy, coverage@τ
- confusion matrices (saved as PNGs)
- a τ sweep (F1 vs coverage) plot to help choose the operating threshold
- a qualitative sample of predictions

> Expected repo layout:
> - `app/model_sklearn.pkl` (bundle with `vectorizer`, `model`, `label_encoder`)
> - `data/transactions_mock.csv`, `data/production_sim.csv`
> - `app/preproc.py` that defines `preprocess_record`

**Tip:** If running inside Docker, ensure these files exist inside the container.


In [None]:

import json
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

# Import your preprocessor from the app package
from app.preproc import preprocess_record

# Output directory, relative to the current working directory, that receives
# every plot and summary file written by this notebook. It is created up
# front (including missing parents) so later cells can save into it directly.
REPORT_DIR = Path("reports") / "eval"
REPORT_DIR.mkdir(exist_ok=True, parents=True)

print("Environment OK. Reports will be saved to:", REPORT_DIR.resolve())


In [None]:

# ---- Configuration ----
MODEL_PATH = Path("app/model_sklearn.pkl")
TRAIN_CSV  = Path("data/transactions_mock.csv")
PROD_CSV   = Path("data/production_sim.csv")

TAU = 0.6            # operating threshold for coverage calculation
N_SAMPLES_QUAL = 12  # qualitative sample size

# Fail fast when an input is missing. Explicit raises are used instead of
# `assert` because assertions are stripped under `python -O`, which would
# silently disable these checks.
for _required, _problem in (
    (MODEL_PATH, "Missing model bundle: "),
    (TRAIN_CSV, "Missing train CSV: "),
    (PROD_CSV, "Missing prod CSV:  "),
):
    if not _required.exists():
        raise FileNotFoundError(f"{_problem}{_required}")

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only point MODEL_PATH at a bundle produced by a trusted training run.
bundle = joblib.load(MODEL_PATH)
vec = bundle["vectorizer"]      # text -> feature matrix (used via .transform)
clf = bundle["model"]           # classifier (used via .predict_proba)
le  = bundle["label_encoder"]   # class name <-> integer code mapping
labels = list(le.classes_)

print("Loaded bundle with classes:", labels[:10], "..." if len(labels)>10 else "")


In [None]:

def build_text(row: pd.Series) -> str:
    """Assemble the model input string for one row.

    The result is ``"<masked text> op:<operation> mcc:<code|none>"``:
    the masked text and operation type come from ``preprocess_record``
    (fed the row's ``raw_label`` and ``channel``), and the MCC token comes
    from the row's ``mcc`` value.

    NOTE(review): only values whose string form is all digits yield
    ``mcc:<int>``. A float such as ``5411.0`` stringifies to ``"5411.0"``
    and therefore becomes ``mcc:none``; confirm this matches how the
    training text was built before changing it.
    """
    record = {
        "raw_label": row.get("raw_label", ""),
        "channel": row.get("channel", None),
    }
    processed = preprocess_record(record)
    operation = processed.operation_type or "unknown"

    raw_mcc = row.get("mcc", "")
    if str(raw_mcc).strip().isdigit():
        mcc_token = f"mcc:{int(raw_mcc)}"
    else:
        mcc_token = "mcc:none"

    return f"{processed.masked} op:{operation} {mcc_token}"


def predict_batch(texts: list[str]):
    """Classify a batch of already-built text strings.

    Vectorizes ``texts`` with the bundle's vectorizer, asks the classifier
    for class probabilities, and takes the arg-max column per row.

    Returns:
        A tuple ``(pred, pmax, proba)``: decoded labels for the arg-max
        column, the highest probability per row, and the full probability
        matrix returned by ``predict_proba``.

    NOTE(review): decoding with ``le.inverse_transform`` assumes the column
    order of ``predict_proba`` matches the label encoder's integer codes;
    confirm against the training script.
    """
    features = vec.transform(texts)
    proba = clf.predict_proba(features)
    top_columns = np.argmax(proba, axis=1)
    decoded = le.inverse_transform(top_columns)
    top_prob = proba.max(axis=1)
    return decoded, top_prob, proba


def eval_split(df: pd.DataFrame, tau: float = 0.6, name: str = "eval"):
    """Run the model over a dataframe and collect headline metrics.

    Args:
        df: Rows to score. If it has a ``category_label`` column, the
            values are encoded with the label encoder and used as ground
            truth.
        tau: Confidence threshold used for the coverage figure.
        name: Free-form tag echoed back in the result.

    Returns:
        A dict with keys ``pred``, ``pmax``, ``proba``, ``y_true``, ``f1``,
        ``acc``, ``coverage`` and ``name``. ``f1`` (macro) and ``acc`` are
        computed over all rows and are ``None`` when there is no
        ``category_label`` column, in which case ``y_true`` is ``None``
        too. ``coverage`` is the fraction of rows whose top probability is
        at least ``tau``.
    """
    texts = df.apply(build_text, axis=1).astype(str).tolist()

    has_labels = "category_label" in df.columns
    y_true = (
        le.transform(df["category_label"].astype(str).values)
        if has_labels
        else None
    )

    pred, pmax, proba = predict_batch(texts)

    f1 = None
    acc = None
    if y_true is not None:
        # Metrics are computed on encoded integer labels on both sides.
        y_pred = le.transform(pred)
        f1 = float(f1_score(y_true, y_pred, average="macro"))
        acc = float(accuracy_score(y_true, y_pred))

    coverage = float(np.mean(pmax >= tau))

    result = {
        "pred": pred,
        "pmax": pmax,
        "proba": proba,
        "y_true": y_true,
        "f1": f1,
        "acc": acc,
        "coverage": coverage,
        "name": name,
    }
    return result


def plot_confusion(y_true, y_pred, classes: list[str], title: str, out_png: Path):
    """Render a confusion matrix as an annotated heatmap and save it as a PNG.

    Args:
        y_true: True labels as integer codes ``0 .. len(classes) - 1``.
        y_pred: Predicted labels in the same integer encoding.
        classes: Class names; entry ``i`` labels row and column ``i``.
        title: Title drawn above the matrix.
        out_png: Destination path for the image.

    Returns:
        ``out_png``, so callers can log or chain the saved path.
    """
    # Pin the label set so classes absent from this split still get a row/column.
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(classes)))
    fig = plt.figure(figsize=(8,6))
    try:
        ax = fig.add_subplot(111)
        ax.imshow(cm, interpolation="nearest")
        ax.set_title(title)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_xticks(range(len(classes)))
        ax.set_yticks(range(len(classes)))
        ax.set_xticklabels(classes, rotation=45, ha="right")
        ax.set_yticklabels(classes)
        # Write each count into its cell (x is the column, y is the row).
        for (i, j), v in np.ndenumerate(cm):
            ax.text(j, i, str(v), ha="center", va="center")
        fig.tight_layout()
        fig.savefig(out_png, dpi=150)
    finally:
        # pyplot keeps every open figure registered until it is closed, so
        # close it even when drawing or saving fails.
        plt.close(fig)
    return out_png


In [None]:

# Score the training CSV, print the headline metrics, and (when ground truth
# is available) save its confusion matrix.
df_train = pd.read_csv(TRAIN_CSV)
print("Train shape:", df_train.shape)

res_train = eval_split(df_train, tau=TAU, name="train")
print({
    "f1": res_train["f1"],
    "acc": res_train["acc"],
    "coverage": res_train["coverage"],
})

train_truth = res_train["y_true"]
if train_truth is not None:
    # plot_confusion works on integer codes, so re-encode the label strings.
    y_pred_train = le.transform(res_train["pred"])
    png = REPORT_DIR / "confusion_train.png"
    plot_confusion(train_truth, y_pred_train, labels, "Confusion (train)", png)
    print("Saved:", png)


In [None]:

# Score the production-simulation CSV, print the headline metrics, and (when
# ground truth is available) save its confusion matrix.
df_prod = pd.read_csv(PROD_CSV)
print("Prod shape:", df_prod.shape)

res_prod = eval_split(df_prod, tau=TAU, name="prod")
print({
    "f1": res_prod["f1"],
    "acc": res_prod["acc"],
    "coverage": res_prod["coverage"],
})

prod_truth = res_prod["y_true"]
if prod_truth is not None:
    # plot_confusion works on integer codes, so re-encode the label strings.
    y_pred_prod = le.transform(res_prod["pred"])
    png = REPORT_DIR / "confusion_prod.png"
    plot_confusion(prod_truth, y_pred_prod, labels, "Confusion (prod)", png)
    print("Saved:", png)


In [None]:

taus = np.linspace(0.4, 0.9, 26)  # step 0.02
# Use prod if it has labels, else train
target = res_prod if res_prod["y_true"] is not None else res_train

# Coverage at each threshold: share of rows whose top probability reaches τ.
covs = [float(np.mean(target["pmax"] >= t)) for t in taus]

# Predictions are always the arg-max class, so macro-F1 does not change with
# τ (τ only affects coverage). Compute it once instead of once per threshold.
if target["y_true"] is not None:
    y_pred = np.argmax(target["proba"], axis=1)
    f1_all = float(f1_score(target["y_true"], y_pred, average="macro"))
else:
    f1_all = np.nan
f1s = [f1_all] * len(taus)

# Recommend the best-F1 τ among thresholds that keep coverage ≥ 0.90.
# Masking (rather than filtering) keeps indices aligned with `taus`, and the
# explicit emptiness check avoids the ValueError that np.nanargmax raises on
# an all-NaN input (no τ reaches the coverage floor, or no labels at all).
# NOTE(review): because F1 is constant across τ, this always resolves to the
# smallest qualifying τ. If the sweep should trade accuracy against coverage,
# consider scoring F1 on the covered rows (pmax >= τ) only.
f1_arr = np.asarray(f1s, dtype=float)
eligible = (np.asarray(covs, dtype=float) >= 0.90) & ~np.isnan(f1_arr)
if eligible.any():
    best_idx = int(np.argmax(np.where(eligible, f1_arr, -np.inf)))
    best_tau = float(taus[best_idx])
else:
    best_idx, best_tau = None, None

fig = plt.figure(figsize=(8,5))
out_png = REPORT_DIR / "tau_sweep.png"
try:
    ax = fig.add_subplot(111)
    ax.plot(taus, covs, label="coverage")
    ax.plot(taus, f1s, label="macro-F1")
    ax.set_title("τ sweep — coverage vs macro-F1")
    ax.set_xlabel("τ")
    ax.set_ylabel("value")
    ax.legend()
    fig.tight_layout()
    fig.savefig(out_png, dpi=150)
finally:
    # Release the figure even if plotting or saving fails.
    plt.close(fig)

summary_tau = {"taus": taus.tolist(), "coverage": covs, "macro_f1": f1s, "recommended_tau_if_cov90": best_tau}
print("Saved τ sweep:", out_png, "  recommended τ (coverage≥0.90):", best_tau)


In [None]:

# Qualitative spot check: draw a fixed-seed sample of rows (from prod when it
# has any rows, otherwise from train), score them, and list them from most to
# least confident.
source_df = df_prod if len(df_prod) else df_train
n_rows = min(N_SAMPLES_QUAL, len(df_prod) or len(df_train))
sample_df = source_df.sample(n_rows, random_state=7)

texts = sample_df.apply(build_text, axis=1).astype(str).tolist()
pred, pmax, _ = predict_batch(texts)

# assign() returns a new frame, leaving sample_df untouched.
out = sample_df.assign(pred=pred, conf=pmax)
if "category_label" in out.columns:
    out["true"] = out["category_label"]

wanted_cols = ["raw_label","channel","mcc","true","pred","conf"]
display_cols = [col for col in wanted_cols if col in out.columns]
out_sorted = out[display_cols].sort_values("conf", ascending=False)
# Last expression of the cell: rendered as the cell output in a notebook.
out_sorted.head(N_SAMPLES_QUAL)


In [None]:

# Collect the headline numbers from the earlier cells into one JSON document,
# write it next to the plots, and echo it to the cell output.
metric_keys = ("f1", "acc", "coverage")
summary = {
    "train": {key: res_train[key] for key in metric_keys},
    "prod": {key: res_prod[key] for key in metric_keys},
    "tau": float(TAU),
    "recommended_tau_if_cov90": summary_tau.get("recommended_tau_if_cov90", None),
    "classes": labels,
}

# Serialize once and reuse the same text for the file and the printout.
summary_json = json.dumps(summary, indent=2)
summary_path = REPORT_DIR / "offline_summary.json"
summary_path.write_text(summary_json, encoding="utf-8")
print(summary_json)
