## **Ablation B — URL-only manifest**
- (keep TLDLegitimateProb, exclude HTML/Title features)

### **Import Libraries**

In [3]:
from pathlib import Path
import os
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, average_precision_score, brier_score_loss
from xgboost import XGBClassifier
import mlflow
import yaml
from dotenv import load_dotenv


### **Set working directory**

In [4]:
os.chdir("../")
print(os.getcwd())

d:\MLops\NetworkSecurity


In [5]:
# Load environment variables from .env file
load_dotenv()
SEED = 42
THRESH_PATH = Path("configs/dev/thresholds.json")
MLFLOW_URI = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000")
EXPERIMENT = os.getenv("MLFLOW_EXPERIMENT", "phiusiil_baselines")
THRESH_PATH.parent.mkdir(parents=True, exist_ok=True)

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 3
python-dotenv could not parse statement starting at line 7
python-dotenv could not parse statement starting at line 10
python-dotenv could not parse statement starting at line 11
python-dotenv could not parse statement starting at line 12
python-dotenv could not parse statement starting at line 13
python-dotenv could not parse statement starting at line 14
python-dotenv could not parse statement starting at line 15


### **Load dataset and yml files**

In [6]:
DATA = Path("data/processed/phiusiil_clean_urlfeats.csv")
MANIFEST = Path("configs/dev/features_url_only.yaml")

# Show the fingerprint we wrote, handy for MLflow tags
fp_path = Path("outputs/url_features_fingerprint.json")

if fp_path.exists():
    print("Data fingerprint:", json.loads(fp_path.read_text()))
else:
    print("Fingerprint file not found; proceed anyway.")

assert DATA.exists(), f"Missing processed data: {DATA}"
assert MANIFEST.exists(), f"Missing manifest: {MANIFEST}"


cfg = yaml.safe_load(MANIFEST.read_text())
whitelist = cfg["include"]
blacklist = set(cfg.get("exclude", []))

df = pd.read_csv(DATA, encoding_errors="ignore")
label_col = next(
    (c for c in df.columns if c.lower() in {"label", "result", "y", "target"}), None
)
assert label_col, "No label column found in processed data"

Data fingerprint: {'rows': 235370, 'cols': 58, 'file': 'data\\processed\\phiusiil_clean_urlfeats.csv', 'md5': '30393b938e541b7b3cef650818740d20', 'added_features': ['url_len', 'url_digit_ratio', 'url_subdomains'], 'ranges': {'url_len': [14, 6097], 'url_digit_ratio': [0.0, 0.6842105263157895], 'url_subdomains': [0, 10]}}


### **Selects the features to include/exclude**

In [7]:
# Keep exactly the whitelist columns that actually exist; drop anything else
present = [c for c in whitelist if c in df.columns]
missing = [c for c in whitelist if c not in df.columns]
assert present, f"No manifest features found. Missing from data: {missing}"

# Never allow blacklisted or non-numeric columns to slip in
X = df[present].select_dtypes(include=["number"]).copy()
y = df[label_col].astype(int).values

print("URL-only manifest (present):", present)
if missing:
    print("Note: these manifest features were not found and are skipped:", missing)

# Save the resolved feature list for audit + MLflow logging later
Path("outputs").mkdir(exist_ok=True)
Path("outputs/feature_manifest_resolved.json").write_text(
    json.dumps({"features": present}, indent=2)
)

# Optionally extract URLs if needed for later use
if "URL" in df.columns:
    urls = df["URL"].astype(str).values
else:
    urls = np.array([""] * len(df))

# Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=SEED
)


URL-only manifest (present): ['url_len', 'url_digit_ratio', 'url_subdomains', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'CharContinuationRate', 'URLCharProb', 'TLDLegitimateProb']


### **Define candidates (uncalibrated base)**

In [8]:
logreg_base = Pipeline(
    [
        ("scaler", StandardScaler(with_mean=False)),  # sparse-safe; no harm if dense
        (
            "clf",
            LogisticRegression(
                max_iter=2000, class_weight="balanced", random_state=SEED
            ),
        ),
    ]
)

xgb_base = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=0,
    objective="binary:logistic",
    verbose=False,
)


candidates = {
    "logreg": logreg_base,
    "xgb": xgb_base,
}

### **Fit + calibrate + score**

In [9]:
def fit_calibrated(name, model):
    # isotonic calibration with 5-fold CV (robust on tabular)

    calib = CalibratedClassifierCV(
        model,
        method="isotonic",
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
    )
    calib.fit(X_train, y_train)

    # we need p_malicious = P(y=0). Most sklearn returns prob for class 1 -> P(y=1) (legit)
    p_legit = calib.predict_proba(X_val)[:, 1]
    p_mal = 1.0 - p_legit

    y_hat_phish = (p_mal >= 0.5).astype(int)  # 1 means "predict phish"
    y_pred = 1 - y_hat_phish  # map back to y-space (1=legit, 0=phish)

    # core metrics
    f1m = f1_score(y_val, y_pred, average="macro")  # temp decision at 0.5 on p_mal
    prauc = average_precision_score(
        (y_val == 0).astype(int), p_mal
    )  # AP wrt phishing as positive class
    brier = brier_score_loss((y_val == 0).astype(int), p_mal)  # smaller=better
    return (
        calib,
        {
            "f1_macro@0.5_on_p_mal": float(f1m),
            "pr_auc_phish": float(prauc),
            "brier_phish": float(brier),
        },
        p_mal,
    )


results, calibrated, pvals = {}, {}, {}
for name, model in candidates.items():
    cls, metrics, p_mal = fit_calibrated(name, model)
    calibrated[name] = cls
    pvals[name] = p_mal
    results[name] = metrics

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.

Parameters: { "verbose" } are not used.



### **Pick best by PR-AUC (tie-break F1)**

In [10]:
order = sorted(
    results.items(),
    key=lambda kv: (kv[1]["pr_auc_phish"], kv[1]["f1_macro@0.5_on_p_mal"]),
    reverse=True,
)
best_name, best_metrics = order[0]
best_model = calibrated[best_name]
p_mal = pvals[best_name]

### **Find single threshold (t) maximizing F1-macro**

In [11]:
grid = np.linspace(0.05, 0.95, 19)
f1s = []
for t in grid:
    y_hat = (p_mal >= t).astype(int)  # 1=phish prediction if p_mal>=t
    # but our y is 0=phish, 1=legit → map predictions to y-space:
    y_pred = 1 - y_hat
    f1s.append(f1_score(y_val, y_pred, average="macro"))
t_star = float(grid[int(np.argmax(f1s))])

### **Choose a symmetric band around t for a target gray-zone rate**

In [12]:
def pick_band_for_target(
    p_mal: np.ndarray, t_star: float, target=0.10, tol=0.002, max_iters=40
):
    lo, hi = 0.0, 0.5  # search bounds on half-width

    def gray(half_w):
        low = max(0.0, t_star - half_w)
        high = min(1.0, t_star + half_w)
        return ((p_mal >= low) & (p_mal < high)).mean(), low, high

    for _ in range(max_iters):
        half_w = (lo + hi) / 2
        g, low, high = gray(half_w)
        if g > target + tol:  # too wide -> shrink
            hi = half_w
        elif g < target - tol:  # too narrow -> widen
            lo = half_w
        else:
            low_f, high_f = float(low), float(high)
            return low_f, high_f, float(g)

    g, low, high = gray((lo + hi) / 2)
    return float(low), float(high), float(g)


# Compute the band
low, high, gray = pick_band_for_target(p_mal, t_star=t_star, target=0.10)
print({"t_star": float(t_star), "low": low, "high": high, "gray_zone_rate": gray})

{'t_star': 0.44999999999999996, 'low': 0.18828124999999996, 'high': 0.71171875, 'gray_zone_rate': 0.09958788290776224}


### **Expand to gray-zone band around t targeting ~10–15%**

In [13]:
target_lo, target_hi = 0.10, 0.15
band_candidates = np.linspace(0.05, 0.40, 8)  # half-widths
chosen = (t_star, max(0.0, t_star - 0.10), min(1.0, t_star + 0.10), 0.0)  # default
for w in band_candidates:
    low, high = max(0.0, t_star - w), min(1.0, t_star + w)
    gray = ((p_mal >= low) & (p_mal < high)).mean()
    if target_lo <= gray <= target_hi:
        chosen = (t_star, float(low), float(high), float(gray))
        break
t_star, low, high, gray_rate = chosen

### **Final metrics (forced decision and gray-zone rate)**

In [14]:
y_hat_star = (p_mal >= t_star).astype(int)
y_pred_star = 1 - y_hat_star
final_f1 = f1_score(y_val, y_pred_star, average="macro")
final_pr = average_precision_score((y_val == 0).astype(int), p_mal)

summary = {
    "data_file": str(DATA),
    "best_model": best_name,
    "metrics_val": {
        "pr_auc_phish": final_pr,
        "f1_macro@t_star": final_f1,
        "brier_phish": brier_score_loss((y_val == 0).astype(int), p_mal),
    },
    "thresholds": {
        "t_star": float(t_star),
        "low": float(low),
        "high": float(high),
        "gray_zone_rate": float(gray),
    },
    "class_mapping": {"phish": 0, "legit": 1},
    "seed": SEED,
}
print("Selection:", best_name, best_metrics)
print("Thresholds:", summary["thresholds"])

Selection: xgb {'f1_macro@0.5_on_p_mal': 0.9163419573449059, 'pr_auc_phish': 0.9577761979950676, 'brier_phish': 0.06481116607409061}
Thresholds: {'t_star': 0.44999999999999996, 'low': 0.14999999999999997, 'high': 0.75, 'gray_zone_rate': 0.14232909886561584}


## **log to MLflow + export thresholds.json**

In [16]:
# log to MLflow + export thresholds.json
mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment(EXPERIMENT)
with mlflow.start_run(run_name=f"{best_name}_calibrated"):
    mlflow.log_params(
        {
            "model": best_name,
            "calibration": "isotonic_cv5",
            "seed": SEED,
            "features": X.shape[1],
            "train_rows": int(len(X_train)),
            "val_rows": int(len(X_val)),
            "data_file": str(DATA),
        }
    )
    mlflow.log_metrics(
        {
            "val_pr_auc_phish": summary["metrics_val"]["pr_auc_phish"],
            "val_f1_macro_t_star": summary["metrics_val"]["f1_macro@t_star"],
            "val_brier_phish": summary["metrics_val"]["brier_phish"],
            "gray_zone_rate": summary["thresholds"]["gray_zone_rate"],
            "t_star": summary["thresholds"]["t_star"],
            "low": summary["thresholds"]["low"],
            "high": summary["thresholds"]["high"],
        }
    )
    # Save/export thresholds for serving
    with open(THRESH_PATH, "w", encoding="utf-8") as f:
        json.dump(
            {
                "model": best_name,
                "class_mapping": summary["class_mapping"],
                "calibration": {"method": "isotonic", "cv": 5},
                "thresholds": summary["thresholds"],
                "data": {"file": summary["data_file"]},
                "seed": summary["seed"],
            },
            f,
            indent=2,
        )
    mlflow.log_artifact(THRESH_PATH)

print(f"MLflow tracking URI: {MLFLOW_URI}")
print(f"Wrote thresholds → {THRESH_PATH.resolve()}")


2025/09/17 11:59:15 INFO mlflow.tracking.fluent: Experiment with name 'phiusiil_baselines' does not exist. Creating a new experiment.


🏃 View run xgb_calibrated at: http://localhost:5000/#/experiments/1/runs/c80b541349ed467095e1f1155b9bf8b8
🧪 View experiment at: http://localhost:5000/#/experiments/1
MLflow tracking URI: http://localhost:5000
Wrote thresholds → D:\MLops\NetworkSecurity\configs\dev\thresholds.json


## **Persist trained model + metadata for model_svc**

Save the calibrated model and metadata to `models/dev/` for serving.

In [17]:
# === Persist trained model + metadata for model_svc ===
from __future__ import annotations
import json, hashlib
from pathlib import Path
import joblib

# 1) Locate the fitted estimator (adjust the list if your var names differ)
candidates = [
    globals().get("calibrated_clf"),
    globals().get("calibrated_model"),
    globals().get("best_model"),
    globals().get("best_clf"),
    globals().get("final_model"),
    globals().get("clf"),
]
fitted = next((m for m in candidates if m is not None), None)
if fitted is None:
    raise ValueError(
        "Could not find a fitted estimator (expected one of: calibrated_clf, calibrated_model, best_model, best_clf, final_model, clf)"
    )

# 2) Infer feature order (priority: X_train -> X_val -> manifest include list)
feature_order = None
for X_name in ("X_train", "X_val", "X_features"):
    if X_name in globals() and hasattr(globals()[X_name], "columns"):
        feature_order = list(globals()[X_name].columns)
        break
if feature_order is None:
    # fallback to config manifest

    manifest_path = Path("configs/dev/features_url_only.yaml")
    if manifest_path.exists():
        data = yaml.safe_load(manifest_path.read_text(encoding="utf-8"))
        feature_order = list(data.get("include", []))
if not feature_order:
    raise ValueError(
        "Feature order not found; please expose X_train/X_val with columns or ensure configs/dev/features_url_only.yaml has 'include'."
    )

# 3) Determine which class id corresponds to 'phish'
#    Our convention so far: class 0 == 'phish', class 1 == 'legit'.
phish_class_id = 0
if hasattr(fitted, "classes_"):
    classes = list(getattr(fitted, "classes_"))
    if 0 in classes:
        phish_class_id = classes.index(0)  # index in predict_proba columns
    else:
        # If classes are strings or different mapping, prefer the first column but record classes
        phish_class_id = 0

# 4) Build metadata (small but sufficient for serving)
meta = {
    "feature_order": feature_order,
    "class_mapping": {"phish": 0, "legit": 1},  # training-time label mapping
    "phish_proba_col_index": phish_class_id,  # column index in predict_proba for P(phish)
    "model_type": type(fitted).__name__,
    "notes": "URL-only baseline; calibrated; saved for model_svc.",
}

# 5) Output paths
ART_DIR = Path("models/dev")
ART_DIR.mkdir(parents=True, exist_ok=True)
MODEL_PATH = ART_DIR / "model.pkl"
META_PATH = ART_DIR / "model_meta.json"

# 6) Save
joblib.dump(fitted, MODEL_PATH)
META_PATH.write_text(json.dumps(meta, indent=2), encoding="utf-8")

# 7) Fingerprint for auditability
m = hashlib.md5(MODEL_PATH.read_bytes()).hexdigest()
print("[artifact] model:", MODEL_PATH, "md5:", m)
print("[artifact] meta :", META_PATH)
print("[artifact] features:", len(feature_order), "→", feature_order[:8], "...")

[artifact] model: models\dev\model.pkl md5: f022f9a181bd7bb406ce544f2947ad1f
[artifact] meta : models\dev\model_meta.json
[artifact] features: 8 → ['url_len', 'url_digit_ratio', 'url_subdomains', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'CharContinuationRate', 'URLCharProb', 'TLDLegitimateProb'] ...


---
---
# PART 2: Retrain with Shared Feature Extraction

**Context:** After deploying the model, we discovered training/serving skew - the models were trained on features extracted by PhiUSIIL authors, but production used our `src/common/feature_extraction.py`. This caused all predictions to return 1.0.

**Goal:** Retrain models using features extracted with our shared library to ensure training/serving consistency.

**Validation Checkpoint:** Compare performance (PR-AUC, F1, Brier) with original models above.

## Load New Features

In [None]:
# Load features extracted with shared library
DATA_V2 = Path("data/processed/phiusiil_features_v2.csv")

if not DATA_V2.exists():
    raise FileNotFoundError(
        f"New features not found: {DATA_V2}\n"
        f"Run notebooks/feature_engineering.ipynb first"
    )

df_v2 = pd.read_csv(DATA_V2)

print(f"Loaded: {DATA_V2}")
print(f"Shape: {df_v2.shape}")
print(f"Columns: {list(df_v2.columns)}")

## Feature Distribution Comparison

In [None]:
# Compare distributions of key features
print("Feature distribution comparison (Old vs New):\n")

feature_cols_v2 = [c for c in df_v2.columns if c not in ["URL", "label"]]

comparison_data = []
for feat in feature_cols_v2:
    if feat in df.columns:  # If feature existed in old data
        old_mean = df[feat].mean()
        new_mean = df_v2[feat].mean()
        diff = abs(old_mean - new_mean)
        status = "✓" if diff < 0.1 else "⚠️"

        comparison_data.append(
            {
                "Feature": feat,
                "Old Mean": old_mean,
                "New Mean": new_mean,
                "Diff": diff,
                "Status": status,
            }
        )

        print(
            f"{status} {feat:35s} Old: {old_mean:7.4f}  New: {new_mean:7.4f}  Δ: {diff:7.4f}"
        )

# Save comparison for later reference
df_comparison = pd.DataFrame(comparison_data)
df_comparison.to_csv("outputs/feature_comparison_v1_vs_v2.csv", index=False)
print(f"\n✓ Saved comparison to outputs/feature_comparison_v1_vs_v2.csv")

## Prepare Data (Same 7 Features)

In [None]:
# Use same 7 features as original model
FEATURES_7 = [
    "TLDLegitimateProb",
    "CharContinuationRate",
    "SpacialCharRatioInURL",
    "URLCharProb",
    "LetterRatioInURL",
    "NoOfOtherSpecialCharsInURL",
    "DomainLength",
]

# Verify all features exist
missing = [f for f in FEATURES_7 if f not in df_v2.columns]
if missing:
    raise ValueError(f"Missing features in new data: {missing}")

X_v2 = df_v2[FEATURES_7].copy()
y_v2 = df_v2["label"].astype(int).values

print(f"Feature matrix shape: {X_v2.shape}")
print(f"Label distribution:\n{pd.Series(y_v2).value_counts()}")
print(f"\nFeatures: {FEATURES_7}")

## Train/Val Split

In [None]:
# Same split as original (20% validation, stratified, same seed)
X_train_v2, X_val_v2, y_train_v2, y_val_v2 = train_test_split(
    X_v2, y_v2, test_size=0.20, stratify=y_v2, random_state=SEED
)

print(f"Train: {X_train_v2.shape}")
print(f"Val:   {X_val_v2.shape}")

## Train 7-Feature Model (v2)

In [None]:
# Use same model architecture as original
xgb_base_v2 = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=0,
    objective="binary:logistic",
    verbose=False,
)

# Calibrate with isotonic (same as original)
calib_v2 = CalibratedClassifierCV(
    xgb_base_v2,
    method="isotonic",
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
)

print("Training 7-feature model (v2)...")
calib_v2.fit(X_train_v2, y_train_v2)
print("✓ Training complete")

## Evaluate 7-Feature Model (v2)

In [None]:
# Get predictions
p_legit_v2 = calib_v2.predict_proba(X_val_v2)[:, 1]  # P(legit)
p_mal_v2 = 1.0 - p_legit_v2  # P(phish)

# Calculate metrics
y_val_phish_v2 = (y_val_v2 == 0).astype(int)  # 1=phish for PR-AUC

pr_auc_v2 = average_precision_score(y_val_phish_v2, p_mal_v2)
brier_v2 = brier_score_loss(y_val_phish_v2, p_mal_v2)

# F1 at optimal threshold
grid = np.linspace(0.05, 0.95, 19)
f1s_v2 = []
for t in grid:
    y_hat = (p_mal_v2 >= t).astype(int)
    y_pred = 1 - y_hat
    f1s_v2.append(f1_score(y_val_v2, y_pred, average="macro"))
f1_v2 = max(f1s_v2)

print("\n" + "=" * 60)
print("7-FEATURE MODEL (V2) PERFORMANCE")
print("=" * 60)
print(f"PR-AUC (phish):  {pr_auc_v2:.6f}")
print(f"F1-Macro:        {f1_v2:.6f}")
print(f"Brier Score:     {brier_v2:.6f}")
print("=" * 60)

## Compare with Original Model

In [None]:
# Get original model metrics (from Part 1 above)
try:
    # These should be defined from Part 1
    pr_auc_original = final_pr  # or extract from summary dict
    f1_original = final_f1
    brier_original = summary["metrics_val"]["brier_phish"]

    print("\n" + "=" * 60)
    print("MODEL COMPARISON: ORIGINAL vs V2")
    print("=" * 60)
    print(f"{'Metric':<20} {'Original':>12} {'V2':>12} {'Δ':>12} {'Status':>10}")
    print("-" * 60)

    # PR-AUC comparison
    pr_diff = pr_auc_v2 - pr_auc_original
    pr_status = "✓" if pr_auc_v2 > 0.95 else "⚠️"
    print(
        f"{'PR-AUC':<20} {pr_auc_original:>12.6f} {pr_auc_v2:>12.6f} {pr_diff:>+12.6f} {pr_status:>10}"
    )

    # F1 comparison
    f1_diff = f1_v2 - f1_original
    f1_status = "✓" if f1_v2 > 0.95 else "⚠️"
    print(
        f"{'F1-Macro':<20} {f1_original:>12.6f} {f1_v2:>12.6f} {f1_diff:>+12.6f} {f1_status:>10}"
    )

    # Brier comparison
    brier_diff = brier_v2 - brier_original
    brier_status = "✓" if brier_v2 < 0.01 else "⚠️"
    print(
        f"{'Brier Score':<20} {brier_original:>12.6f} {brier_v2:>12.6f} {brier_diff:>+12.6f} {brier_status:>10}"
    )

    print("=" * 60)

    # Overall assessment
    if pr_auc_v2 > 0.95 and f1_v2 > 0.95:
        print("\n✅ V2 MODEL PERFORMANCE: ACCEPTABLE")
        print("   → Performance maintained with shared feature extraction")
        print("   → Safe to deploy v2 models")
    else:
        print("\n⚠️ V2 MODEL PERFORMANCE: BELOW THRESHOLD")
        print("   → Feature distribution shift impacted model quality")
        print("   → Recommendation: Redo feature selection with new features")

except NameError:
    print("⚠️ Original model metrics not found - run Part 1 first")

## Spot Check Predictions

In [None]:
# Test on known URLs using our feature extraction
import sys

sys.path.insert(0, str(Path.cwd() / "src"))
from common.feature_extraction import extract_features

test_urls = [
    ("https://google.com", "Legitimate", 0.02),
    ("https://github.com", "Legitimate", 0.02),
    ("https://microsoft.com", "Legitimate", 0.02),
    ("http://phishing.top/login", "Phishing", 0.95),
    ("http://secure-bank-verify.tk/account", "Phishing", 0.95),
    ("http://paypal-secure-login.ml/update", "Phishing", 0.95),
]

print("\n" + "=" * 60)
print("SPOT CHECK: KNOWN URLS")
print("=" * 60)
print(f"{'URL':<45} {'Expected':<12} {'p_mal':>8} {'Status':>8}")
print("-" * 60)

spot_check_results = []

for url, expected_class, expected_score in test_urls:
    try:
        # Extract features
        features = extract_features(url, include_https=False)

        # Create DataFrame with correct feature order
        X_test = pd.DataFrame([features])[FEATURES_7]

        # Predict
        p_legit = calib_v2.predict_proba(X_test)[0, 1]
        p_mal = 1.0 - p_legit

        # Check if prediction is correct
        if expected_class == "Legitimate":
            status = "✓" if p_mal < 0.3 else "✗"
        else:  # Phishing
            status = "✓" if p_mal > 0.7 else "✗"

        spot_check_results.append(
            {
                "url": url,
                "expected": expected_class,
                "p_malicious": p_mal,
                "correct": status == "✓",
            }
        )

        print(f"{url[:45]:<45} {expected_class:<12} {p_mal:>8.4f} {status:>8}")

    except Exception as e:
        print(f"{url[:45]:<45} {'ERROR':<12} {'N/A':>8} {'✗':>8}")
        print(f"  Error: {e}")

print("=" * 60)

# Summary
correct_count = sum(r["correct"] for r in spot_check_results)
total_count = len(spot_check_results)
accuracy = correct_count / total_count if total_count > 0 else 0

print(f"\nSpot Check Accuracy: {correct_count}/{total_count} ({accuracy:.1%})")

if accuracy == 1.0:
    print("✅ All spot checks passed!")
elif accuracy >= 0.8:
    print("⚠️ Most spot checks passed, but some failed")
else:
    print("❌ Many spot checks failed - model may not be working correctly")

## Save V2 Model (If Performance Acceptable)

In [None]:
# Only save if performance is acceptable
if pr_auc_v2 > 0.95 and f1_v2 > 0.95:
    # Save model
    MODEL_PATH_V2 = Path("models/dev/model_7feat_v2.pkl")
    joblib.dump(calib_v2, MODEL_PATH_V2)
    print(f"✓ Saved model to {MODEL_PATH_V2}")

    # Save metadata
    META_PATH_V2 = Path("models/dev/model_7feat_v2_meta.json")
    meta_v2 = {
        "feature_order": FEATURES_7,
        "class_mapping": {"phish": 0, "legit": 1},
        "phish_proba_col_index": 0,
        "model_type": "CalibratedClassifierCV",
        "calibration": "isotonic_cv5",
        "training_date": pd.Timestamp.now().isoformat(),
        "seed": SEED,
        "metrics": {
            "pr_auc": float(pr_auc_v2),
            "f1_macro": float(f1_v2),
            "brier": float(brier_v2),
        },
        "notes": "7-feature model trained with shared feature extraction library (v2)",
        "data_source": "phiusiil_features_v2.csv",
    }

    META_PATH_V2.write_text(json.dumps(meta_v2, indent=2), encoding="utf-8")
    print(f"✓ Saved metadata to {META_PATH_V2}")

    print("\n✅ V2 MODEL SAVED - Ready for deployment")
else:
    print("\n⚠️ V2 MODEL NOT SAVED - Performance below threshold")
    print("   Next steps: Redo feature selection with new features")

## Decision: Which Model to Use?

In [None]:
print("\n" + "=" * 60)
print("DEPLOYMENT RECOMMENDATION")
print("=" * 60)

if pr_auc_v2 > 0.95 and f1_v2 > 0.95:
    print("\n✅ DEPLOY V2 MODEL")
    print("\nReasons:")
    print("  1. Performance maintained (PR-AUC > 0.95, F1 > 0.95)")
    print("  2. Training/serving consistency (same feature extraction)")
    print("  3. Spot checks pass on known URLs")
    print("\nNext steps:")
    print("  1. Update model service to use model_7feat_v2.pkl")
    print("  2. Update config.yaml to point to v2 models")
    print("  3. Test end-to-end with model service")
    print("  4. Document the training/serving skew fix")
else:
    print("\n⚠️ DO NOT DEPLOY V2 MODEL")
    print("\nReasons:")
    print(f"  1. Performance degraded: PR-AUC={pr_auc_v2:.4f} (target: >0.95)")
    print("  2. Feature distribution shift too large")
    print("\nNext steps:")
    print("  1. Run feature selection on new features (phiusiil_features_v2.csv)")
    print("  2. Identify optimal feature set for NEW distributions")
    print("  3. Retrain with optimal features")
    print("  4. Re-evaluate performance")

print("=" * 60)