## **Ablation B — URL-only manifest**
- (keep TLDLegitimateProb, exclude HTML/Title features)

### **Import Libraries**

In [1]:
from pathlib import Path
import os, json, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score, average_precision_score, brier_score_loss
from xgboost import XGBClassifier
import mlflow
import yaml
from dotenv import load_dotenv


### **Set working directory**

In [2]:
os.chdir("../")
print(os.getcwd())

d:\MLops\NetworkSecurity


In [3]:
# Load environment variables from .env file
load_dotenv()
SEED = 42
THRESH_PATH = Path("configs/dev/thresholds.json")
MLFLOW_URI = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000")
EXPERIMENT = os.getenv("MLFLOW_EXPERIMENT", "phiusiil_baselines")
THRESH_PATH.parent.mkdir(parents=True, exist_ok=True)

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 3
python-dotenv could not parse statement starting at line 7
python-dotenv could not parse statement starting at line 10
python-dotenv could not parse statement starting at line 11
python-dotenv could not parse statement starting at line 12
python-dotenv could not parse statement starting at line 13
python-dotenv could not parse statement starting at line 14
python-dotenv could not parse statement starting at line 15


### **Load dataset and yml files**

In [4]:
DATA = Path("data/processed/phiusiil_clean_urlfeats.csv")
MANIFEST = Path("configs/dev/features_url_only.yaml")

# Show the fingerprint we wrote, handy for MLflow tags
fp_path = Path("outputs/url_features_fingerprint.json")

if fp_path.exists():
    print("Data fingerprint:", json.loads(fp_path.read_text()))
else:
    print("Fingerprint file not found; proceed anyway.")

assert DATA.exists(), f"Missing processed data: {DATA}"
assert MANIFEST.exists(), f"Missing manifest: {MANIFEST}"


cfg = yaml.safe_load(MANIFEST.read_text())
whitelist = cfg["include"]
blacklist = set(cfg.get("exclude", []))

df = pd.read_csv(DATA, encoding_errors="ignore")
label_col = next((c for c in df.columns if c.lower() in {"label","result","y","target"}), None)
assert label_col, "No label column found in processed data"

Data fingerprint: {'rows': 235370, 'cols': 58, 'file': 'data\\processed\\phiusiil_clean_urlfeats.csv', 'md5': '30393b938e541b7b3cef650818740d20', 'added_features': ['url_len', 'url_digit_ratio', 'url_subdomains'], 'ranges': {'url_len': [14, 6097], 'url_digit_ratio': [0.0, 0.6842105263157895], 'url_subdomains': [0, 10]}}


### **Selects the features to include/exclude**

In [5]:
# Keep exactly the whitelist columns that actually exist; drop anything else
present = [c for c in whitelist if c in df.columns]
missing = [c for c in whitelist if c not in df.columns]
assert present, f"No manifest features found. Missing from data: {missing}"

# Never allow blacklisted or non-numeric columns to slip in
X = df[present].select_dtypes(include=["number"]).copy()
y = df[label_col].astype(int).values

print("URL-only manifest (present):", present)
if missing:
    print("Note: these manifest features were not found and are skipped:", missing)

# Save the resolved feature list for audit + MLflow logging later
Path("outputs").mkdir(exist_ok=True)
Path("outputs/feature_manifest_resolved.json").write_text(json.dumps({"features": present}, indent=2))

# Optionally extract URLs if needed for later use
if "URL" in df.columns:
    urls = df["URL"].astype(str).values
else:
    urls = np.array([""] * len(df))

# Train/val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, stratify=y, random_state=SEED)


URL-only manifest (present): ['url_len', 'url_digit_ratio', 'url_subdomains', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'CharContinuationRate', 'URLCharProb', 'TLDLegitimateProb']


### **Define candidates (uncalibrated base)**

In [6]:
logreg_base = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),   # sparse-safe; no harm if dense
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=SEED))

])

xgb_base = XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1, subsample=0.9, colsample_bytree=0.9,
    reg_lambda=1.0, random_state=SEED, n_jobs=0, objective="binary:logistic", verbose=False
)


candidates = {
    "logreg": logreg_base,
    "xgb": xgb_base,
}

### **Fit + calibrate + score**

In [7]:
def fit_calibrated(name, model):
    # isotonic calibration with 5-fold CV (robust on tabular)
    calib = CalibratedClassifierCV(model, 
                                   method="isotonic", 
                                   cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
                                   )
    calib.fit(X_train, y_train)
    
    # we need p_malicious = P(y=0). Most sklearn returns prob for class 1 -> P(y=1) (legit)
    p_legit = calib.predict_proba(X_val)[:, 1]
    p_mal = 1.0 - p_legit
    
    y_hat_phish = (p_mal >= 0.5).astype(int)   # 1 means "predict phish"
    y_pred = 1 - y_hat_phish  # map back to y-space (1=legit, 0=phish)
    
    # core metrics                   
    f1m = f1_score(y_val, y_pred, average="macro")         # temp decision at 0.5 on p_mal
    prauc = average_precision_score((y_val==0).astype(int), p_mal)             # AP wrt phishing as positive class
    brier = brier_score_loss((y_val==0).astype(int), p_mal)                     # smaller=better
    return calib, {"f1_macro@0.5_on_p_mal": float(f1m), 
                   "pr_auc_phish": float(prauc), 
                   "brier_phish": float(brier)}, p_mal

results, calibrated, pvals = {}, {}, {}
for name, model in candidates.items():
    cls, metrics, p_mal = fit_calibrated(name, model)
    calibrated[name] = cls
    pvals[name] = p_mal
    results[name] = metrics

Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


### **Pick best by PR-AUC (tie-break F1)**

In [8]:
order = sorted(results.items(), key=lambda kv: (kv[1]["pr_auc_phish"], kv[1]["f1_macro@0.5_on_p_mal"]), reverse=True)
best_name, best_metrics = order[0]
best_model = calibrated[best_name]
p_mal = pvals[best_name]

### **Find single threshold (t) maximizing F1-macro**

In [9]:
grid = np.linspace(0.05, 0.95, 19)
f1s = []
for t in grid:
    y_hat = (p_mal >= t).astype(int)         # 1=phish prediction if p_mal>=t
    # but our y is 0=phish, 1=legit → map predictions to y-space:
    y_pred = 1 - y_hat
    f1s.append(f1_score(y_val, y_pred, average="macro"))
t_star = float(grid[int(np.argmax(f1s))])

### **Choose a symmetric band around t for a target gray-zone rate**

In [10]:
def pick_band_for_target(p_mal: np.ndarray, t_star: float, target=0.10, tol=0.002, max_iters=40):
    lo, hi = 0.0, 0.5  # search bounds on half-width
    def gray(half_w):
        low = max(0.0, t_star - half_w)
        high = min(1.0, t_star + half_w)
        return ((p_mal >= low) & (p_mal < high)).mean(), low, high

    for _ in range(max_iters):
        half_w = (lo + hi) / 2
        g, low, high = gray(half_w)
        if g > target + tol:   # too wide -> shrink
            hi = half_w
        elif g < target - tol: # too narrow -> widen
            lo = half_w
        else:
            low_f, high_f = float(low), float(high)
            return low_f, high_f, float(g)

    g, low, high = gray((lo + hi) / 2)
    return float(low), float(high), float(g)

# Compute the band 
low, high, gray = pick_band_for_target(p_mal, t_star=t_star, target=0.10)
print({"t_star": float(t_star), "low": low, "high": high, "gray_zone_rate": gray})

{'t_star': 0.44999999999999996, 'low': 0.19218749999999996, 'high': 0.7078125, 'gray_zone_rate': 0.09914177677698942}


### **Expand to gray-zone band around t targeting ~10–15%**

In [11]:
target_lo, target_hi = 0.10, 0.15
band_candidates = np.linspace(0.05, 0.40, 8)     # half-widths
chosen = (t_star, max(0.0, t_star-0.10), min(1.0, t_star+0.10), 0.0)  # default
for w in band_candidates:
    low, high = max(0.0, t_star - w), min(1.0, t_star + w)
    gray = ((p_mal >= low) & (p_mal < high)).mean()
    if target_lo <= gray <= target_hi:
        chosen = (t_star, float(low), float(high), float(gray)); break
t_star, low, high, gray_rate = chosen

### **Final metrics (forced decision and gray-zone rate)**

In [12]:
y_hat_star = (p_mal >= t_star).astype(int)
y_pred_star = 1 - y_hat_star
final_f1 = f1_score(y_val, y_pred_star, average="macro")
final_pr = average_precision_score((y_val==0).astype(int), p_mal)

summary = {
    "data_file": str(DATA),
    "best_model": best_name,
    "metrics_val": {
                    "pr_auc_phish": final_pr,
                    "f1_macro@t_star": final_f1,
                    "brier_phish": brier_score_loss((y_val==0).astype(int), p_mal),
    },
    "thresholds": {"t_star": float(t_star),
                    "low": float(low),
                    "high": float(high),
                    "gray_zone_rate": float(gray)},
    "class_mapping": {"phish": 0, 
                      "legit": 1},
    "seed": SEED,
}
print("Selection:", best_name, best_metrics)
print("Thresholds:", summary["thresholds"])

Selection: xgb {'f1_macro@0.5_on_p_mal': 0.9166721099659982, 'pr_auc_phish': 0.9580912093690597, 'brier_phish': 0.06462256688440587}
Thresholds: {'t_star': 0.44999999999999996, 'low': 0.14999999999999997, 'high': 0.75, 'gray_zone_rate': 0.14424098228321366}


## **log to MLflow + export thresholds.json**

In [13]:
# log to MLflow + export thresholds.json
mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment(EXPERIMENT)
with mlflow.start_run(run_name=f"{best_name}_calibrated"):
    mlflow.log_params({
        "model": best_name,
        "calibration": "isotonic_cv5",
        "seed": SEED,
        "features": X.shape[1],
        "train_rows": int(len(X_train)),
        "val_rows": int(len(X_val)),
        "data_file": str(DATA),
    })
    mlflow.log_metrics({
        "val_pr_auc_phish": summary["metrics_val"]["pr_auc_phish"],
        "val_f1_macro_t_star": summary["metrics_val"]["f1_macro@t_star"],
        "val_brier_phish": summary["metrics_val"]["brier_phish"],
        "gray_zone_rate": summary["thresholds"]["gray_zone_rate"],
        "t_star": summary["thresholds"]["t_star"],
        "low": summary["thresholds"]["low"],
        "high": summary["thresholds"]["high"],
    })
    # Save/export thresholds for serving
    with open(THRESH_PATH, "w", encoding="utf-8") as f:
        json.dump({
            "model": best_name,
            "class_mapping": summary["class_mapping"],
            "calibration": {"method": "isotonic", "cv": 5},
            "thresholds": summary["thresholds"],
            "data": {"file": summary["data_file"]},
            "seed": summary["seed"],
        }, f, indent=2)
    mlflow.log_artifact(THRESH_PATH)

print(f"MLflow tracking URI: {MLFLOW_URI}")
print(f"Wrote thresholds → {THRESH_PATH.resolve()}")


🏃 View run xgb_calibrated at: http://localhost:5000/#/experiments/1/runs/88a605b3a7ae4172aa40b01f9d541cc3
🧪 View experiment at: http://localhost:5000/#/experiments/1
MLflow tracking URI: http://localhost:5000
Wrote thresholds → D:\MLops\NetworkSecurity\configs\dev\thresholds.json
