# Error Analysis Workbench: Confusion Matrix and Threshold Decisions\n\nUse this notebook to explore how threshold choices change confusion-matrix outcomes and business risk.

## What You Should Practice\n- inspect confusion matrices at different thresholds,\n- connect threshold shifts to precision/recall tradeoffs,\n- justify a metric-driven deployment threshold,\n- document a deployment decision with explicit risk assumptions.

In [None]:
from pathlib import Path\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nfrom IPython.display import display\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import (\n    confusion_matrix,\n    precision_score,\n    recall_score,\n    f1_score,\n    roc_auc_score,\n    precision_recall_curve,\n)\nfrom sklearn.pipeline import make_pipeline\nfrom sklearn.preprocessing import StandardScaler\n\nfrom sota_supervised_showcase.data import load_digits_split, build_binary_target

In [None]:
# Load a binary task: digit == 0 vs not 0
split = load_digits_split()
y_train_binary = build_binary_target(split.y_train, positive_digit=0)
y_test_binary = build_binary_target(split.y_test, positive_digit=0)

# Scale features, then fit a class-balanced logistic regression.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42),
)
model.fit(split.x_train, y_train_binary)

# Raw decision scores; kept because ROC AUC only needs a ranking, not probabilities.
y_score = model.decision_function(split.x_test)

# Positive-class probabilities for thresholding. For a binary logistic regression
# predict_proba applies the logistic sigmoid to the decision scores itself, so this
# replaces the hand-written 1 / (1 + exp(-score)), which emits overflow warnings
# for large negative scores. Column 1 is the larger class label (the positive class).
y_proba = model.predict_proba(split.x_test)[:, 1]

print('Samples in test set:', len(y_test_binary))
print('Positive class proportion:', round(y_test_binary.mean(), 4))
print('ROC AUC (threshold-independent):', round(roc_auc_score(y_test_binary, y_score), 4))

In [None]:
def threshold_report(threshold: float) -> dict:
    """Summarize test-set classification metrics at one probability threshold.

    Reads the notebook-level ``y_proba`` (positive-class scores) and
    ``y_test_binary`` (true labels); a sample is predicted positive when its
    score is greater than or equal to ``threshold``.

    Args:
        threshold: Cutoff applied to ``y_proba``.

    Returns:
        A dict with the threshold, precision, recall, F1, specificity and the
        four confusion-matrix counts (``tp``, ``fp``, ``tn``, ``fn``) as ints.
    """
    y_pred = (y_proba >= threshold).astype(int)

    # Pin the label order so the matrix is always 2x2. Without it, a test set or
    # prediction vector containing a single class yields a 1x1 matrix and the
    # four-way unpack below raises. Assumes labels are encoded as 0/1, which is
    # what the default pos_label=1 of the score functions below also relies on.
    tn, fp, fn, tp = confusion_matrix(y_test_binary, y_pred, labels=[0, 1]).ravel()

    # zero_division=0 keeps extreme thresholds (no predicted positives) quiet.
    precision = precision_score(y_test_binary, y_pred, zero_division=0)
    recall = recall_score(y_test_binary, y_pred, zero_division=0)
    f1 = f1_score(y_test_binary, y_pred, zero_division=0)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    return {
        'threshold': threshold,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'specificity': specificity,
        'tp': int(tp),
        'fp': int(fp),
        'tn': int(tn),
        'fn': int(fn),
    }


def plot_confusion_at_threshold(threshold: float) -> pd.DataFrame:
    """Draw the confusion matrix at ``threshold`` and return its metrics.

    Args:
        threshold: Cutoff forwarded to ``threshold_report``.

    Returns:
        A one-row DataFrame holding the threshold, the four metrics and the
        four confusion-matrix counts.
    """
    report = threshold_report(threshold)
    # Rows are actual classes, columns are predicted classes (negative first).
    matrix = np.array([[report['tn'], report['fp']], [report['fn'], report['tp']]])

    fig, ax = plt.subplots(figsize=(5, 4))
    im = ax.imshow(matrix, cmap='Blues')
    ax.set_title(f'Confusion Matrix @ threshold={threshold:.2f}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_xticks([0, 1], labels=['Negative', 'Positive'])
    ax.set_yticks([0, 1], labels=['Negative', 'Positive'])

    # Cells above the colour-scale midpoint are dark blue, where black text is
    # hard to read; switch to white there so every count stays legible.
    midpoint = (matrix.min() + matrix.max()) / 2
    for i in range(2):
        for j in range(2):
            text_color = 'white' if matrix[i, j] > midpoint else 'black'
            ax.text(j, i, matrix[i, j], ha='center', va='center', color=text_color)

    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    plt.show()

    metric_row = pd.DataFrame([report])[['threshold', 'precision', 'recall', 'f1', 'specificity', 'tp', 'fp', 'tn', 'fn']]
    return metric_row

In [None]:
# Baseline view: start from the conventional 0.5 cutoff before exploring others
default_metrics = plot_confusion_at_threshold(threshold=0.5)
default_metrics

In [None]:
# Evaluate every candidate threshold once, then chart how each metric responds
thresholds = np.linspace(0.05, 0.95, 91)
rows = [threshold_report(value) for value in thresholds.tolist()]
sweep_df = pd.DataFrame(rows)

# One entry per curve: (sweep_df column, legend label, line styling)
metric_lines = (
    ('precision', 'Precision', {'linestyle': '--'}),
    ('recall', 'Recall', {'linestyle': '-'}),
    ('f1', 'F1', {'linewidth': 2}),
    ('specificity', 'Specificity', {'linestyle': ':'}),
)

fig, ax = plt.subplots(figsize=(8, 5))
for column, legend_label, line_kwargs in metric_lines:
    ax.plot(sweep_df['threshold'], sweep_df[column], label=legend_label, **line_kwargs)
ax.set(
    title='Metric values across thresholds',
    xlabel='Threshold',
    ylabel='Metric value',
    ylim=(0, 1.05),
)
ax.legend()
plt.show()

# Ten best thresholds ranked by F1, highest first
sweep_df.sort_values(by='f1', ascending=False).head(10)

In [None]:
# Optional interactive threshold control (uses ipywidgets if available)
def render_threshold(threshold: float = 0.5):
    """Plot the confusion matrix at ``threshold`` and display its metric row."""
    display(plot_confusion_at_threshold(threshold))


try:
    import ipywidgets as widgets

    # Slider spans the same 0.05-0.95 range, in 0.01 steps, starting at 0.5.
    slider = widgets.FloatSlider(
        description='Threshold',
        value=0.5,
        min=0.05,
        max=0.95,
        step=0.01,
    )
    widgets.interact(render_threshold, threshold=slider)
except Exception as exc:  # fallback path for environments without ipywidgets
    # Any failure while setting up the widget falls back to manual calls.
    print('Interactive widgets unavailable:', exc)
    print('Use render_threshold(<value>) manually, for example render_threshold(0.35)')

## Metric-Driven Deployment Decision Worksheet\nFill this table after exploring threshold behavior.

In [None]:
# Blank one-row worksheet: replace each placeholder with your own decision.
deployment_decision = pd.DataFrame(
    [
        dict(
            chosen_threshold=None,
            primary_metric='',
            target_metric_value='',
            false_negative_cost='',
            false_positive_cost='',
            monitoring_plan='',
            rollback_condition='',
            decision_rationale='',
        )
    ]
)
deployment_decision

In [None]:
def _is_filled_in(value) -> bool:
    """Return True when a worksheet cell holds a real, non-placeholder value."""
    if value is None:
        return False
    # NaN / NaT / pd.NA are missing values too. Their string forms ('nan',
    # '<NA>') are non-empty, so a purely string-based check would accept them.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    text = str(value).strip()
    # Strip before comparing so padded placeholders such as ' None ' are rejected.
    return text != '' and text.lower() != 'none'


def deployment_checklist(decision: pd.DataFrame) -> pd.Series:
    """Check which fields of the deployment worksheet have been completed.

    Only the first row of ``decision`` is inspected. A field counts as complete
    when it is not missing (None, NaN, pd.NA), not blank after stripping
    whitespace, and not the literal text 'none' in any letter case.

    Args:
        decision: Worksheet frame containing every required column.

    Returns:
        A boolean Series indexed by the required column names plus
        'all_fields_complete', which is True only if every field is complete.

    Raises:
        ValueError: If a required column is missing or the frame has no rows.
    """
    required_cols = [
        'chosen_threshold',
        'primary_metric',
        'target_metric_value',
        'false_negative_cost',
        'false_positive_cost',
        'monitoring_plan',
        'rollback_condition',
        'decision_rationale',
    ]
    missing_cols = [col for col in required_cols if col not in decision.columns]
    if missing_cols:
        raise ValueError(f'Missing columns: {missing_cols}')
    if decision.empty:
        # Fail with a clear message instead of an IndexError from iloc[0].
        raise ValueError('decision must contain at least one row')

    row = decision.iloc[0]
    checks = {col: _is_filled_in(row[col]) for col in required_cols}
    checks['all_fields_complete'] = all(checks.values())
    return pd.Series(checks)

# After filling deployment_decision, run:
# deployment_checklist(deployment_decision)

## Final Prompt\nWrite 5-8 lines with your deployment recommendation:\n- chosen threshold,\n- metric target and expected tradeoff,\n- what could fail in production,\n- what signal should trigger rollback.