<a href="https://colab.research.google.com/github/cheongyeechian/DLI/blob/main/chong_jia_wen_tp073941.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LBP Hyperfast Hybrid (with τ-sweep & optional class-weight sweep)

**Goal:** Beat the paper's metrics (Acc≈94.1, Prec≈94.12, Rec≈95.35, F1≈94.73) in under ~5 minutes on CPU.

**Pipeline:**
1) **Stage A** — Fast Linear Prior: Character n-gram `HashingVectorizer` + `SGDClassifier (log_loss)` + Platt scaling.
2) **Stage B** — LBP-lite on **uncertain** subset only: Compact URL↔Domain↔TokenBucket graph with min-sum BP for a few iterations.

**New in this notebook:**
- Prints **training time** (vectorization, model train, calibrator) and **testing time** (Stage A prediction, graph build, LBP, stitching, total).
- **τ-sweep** on a small validation split to automatically pick the best decision threshold τ for the LBP cost ratio.
- **Optional** lightweight **class-weight sweep** for the positive class (phishing). Disabled by default to meet time budget; enable in `CONFIG` if needed.

**Expected files** (locations are set via `train_csv` / `test_csv` in `CONFIG`; the defaults point to a mounted Google Drive folder):
- `Train_data.csv` (1.2M rows)
- `Test_data.csv`  (361k rows)

Columns expected: `url`, `label` where label is `good`/`bad` or `0`/`1`.

In [None]:
# ==========================
# Config
# ==========================
# Colab-only step: mount Google Drive at /content/drive, the prefix used by the
# CSV paths in CONFIG below.
from google.colab import drive
drive.mount('/content/drive')

# Single source of truth for every tunable value used by the later cells.
CONFIG = dict(
    # --- Data paths ---
    train_csv="/content/drive/My Drive/Colab Notebooks/Mendeley/Train_data.csv",
    test_csv="/content/drive/My Drive/Colab Notebooks/Mendeley/Test_data.csv",

    # --- Vectorizer & model ---
    char_ngram_range=(3, 5),
    n_features=2**20,          # try 2**18 for faster/smaller
    sgd_alpha=1e-5,            # regularization strength
    random_state=42,

    # --- Splits ---
    valid_size=0.05,           # validation carved from TRAIN for tau/class-weight tuning
    calib_size=0.03,           # small calibration split from the remaining train

    # --- Uncertain window handed off to LBP-lite ---
    uncertain_low=0.40,
    uncertain_high=0.60,

    # --- LBP-lite ---
    lbp_iters=6,
    ths_plus=0.6,
    ths_minus=1.0,

    # --- Tau sweep (validation only); the best tau is then used on TEST ---
    enable_tau_sweep=True,
    tau_grid=[round(0.45 + 0.01 * step, 2) for step in range(11)],

    # --- Optional class-weight sweep for the phishing class (positive class = 1) ---
    enable_class_weight_sweep=False,   # set True to enable (slower)
    pos_weight_grid=[1.0, 1.5],        # keep small to preserve time budget
)
CONFIG

Mounted at /content/drive


{'train_csv': '/content/drive/My Drive/Colab Notebooks/Mendeley/Train_data.csv',
 'test_csv': '/content/drive/My Drive/Colab Notebooks/Mendeley/Test_data.csv',
 'char_ngram_range': (3, 5),
 'n_features': 1048576,
 'sgd_alpha': 1e-05,
 'random_state': 42,
 'valid_size': 0.05,
 'calib_size': 0.03,
 'uncertain_low': 0.4,
 'uncertain_high': 0.6,
 'lbp_iters': 6,
 'ths_plus': 0.6,
 'ths_minus': 1.0,
 'enable_tau_sweep': True,
 'tau_grid': [0.45, 0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55],
 'enable_class_weight_sweep': False,
 'pos_weight_grid': [1.0, 1.5]}

In [None]:
# ==========================
# Imports & Utilities
# ==========================
import re
import time
import warnings
import zlib
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Shared seed passed as random_state to the splits and estimators below.
RS = CONFIG["random_state"]

def norm_url(u: str) -> str:
    """Normalise a URL for feature extraction.

    Non-string inputs are converted with ``str()``. The text is stripped of
    surrounding whitespace, lowercased, and a leading ``http://`` or
    ``https://`` scheme is removed.
    """
    text = u if isinstance(u, str) else str(u)
    cleaned = text.strip().lower()
    return re.sub(r'^https?://', '', cleaned)

def get_domain(u: str) -> str:
    """Return the part of ``u`` that precedes its first '/' (all of ``u`` if there is none)."""
    head, _, _ = u.partition('/')
    return head

def shingles(s, k=5):
    """Return the set of all length-``k`` substrings of ``s``.

    An empty/falsy ``s`` or a string shorter than ``k`` yields an empty set.
    """
    if not s:
        return set()
    window_count = len(s) - k + 1
    found = set()
    for start in range(max(0, window_count)):
        found.add(s[start:start + k])
    return found

def bucket_id(s):
    """Map a string to a reproducible 20-bit bucket id derived from its 5-char shingles.

    The id is the XOR of CRC32 checksums of up to 64 shingles, masked to 20 bits.
    Strings that produce no shingles (empty, or shorter than 5 characters) map to 0.

    The built-in ``hash()`` is deliberately not used: string hashes are salted
    per interpreter process, and set iteration order depends on them, so the
    bucket ids would change on every kernel restart. Sorting the shingles and
    using CRC32 makes the id stable across runs.
    """
    xs = sorted(shingles(s, 5))
    if not xs:
        return 0
    h = 0
    for w in xs[:64]:
        # 'surrogatepass' keeps this total even for lone surrogates in malformed input.
        h ^= zlib.crc32(w.encode('utf-8', 'surrogatepass'))
    return h & ((1 << 20) - 1)

def ensure_labels(series):
    """Convert a label column to a NumPy array of 0/1 integers.

    Numeric (or boolean) columns are cast to int with missing values set to 0.
    Any other column is treated as text: values are stripped, lowercased and
    mapped with ``good -> 0``, ``bad -> 1``, ``'0' -> 0``, ``'1' -> 1``.

    Values that cannot be mapped still fall back to 0, but a warning is emitted
    so that mislabelled rows do not go unnoticed.
    """
    if pd.api.types.is_numeric_dtype(series.dtype):
        return series.fillna(0).astype(int).values

    text = series.astype(str).str.strip().str.lower()
    mapped = text.map({'good': 0, 'bad': 1, '0': 0, '1': 1})
    n_unknown = int(mapped.isna().sum())
    if n_unknown:
        warnings.warn(
            f"ensure_labels: {n_unknown} unrecognised label value(s) were mapped to 0",
            stacklevel=2,
        )
    return mapped.fillna(0).astype(int).values

def metrics(y_true, y_pred):
    """Return ``(accuracy, precision, recall, f1)`` for binary predictions.

    Precision/recall/F1 use ``average='binary'``, and undefined ratios are
    reported as 0 (``zero_division=0``).
    """
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    return (accuracy_score(y_true, y_pred), prf[0], prf[1], prf[2])

def print_metrics(tag, y_true, y_pred):
    """Print one formatted line of metrics prefixed by ``tag`` and return ``(acc, prec, rec, f1)``."""
    scores = metrics(y_true, y_pred)
    acc, prec, rec, f1 = scores
    print(f"{tag:20s} Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  F1={f1:.4f}")
    return (acc, prec, rec, f1)

In [None]:
# ==========================
# Load & Split
# ==========================
# Read both CSVs and attach the normalised URL column 'u' used by every later stage.
train_full = pd.read_csv(CONFIG['train_csv'])
test = pd.read_csv(CONFIG['test_csv'])
train_full['u'] = train_full['url'].map(norm_url)
test['u'] = test['url'].map(norm_url)

# Integer label vectors for the full train set and the test set.
y_full = ensure_labels(train_full['label'])
y_test = ensure_labels(test['label'])

# Validation split from TRAIN for tau/class-weight tuning (stratified on the label).
train_rest, valid = train_test_split(
    train_full,
    test_size=CONFIG['valid_size'],
    stratify=y_full,
    random_state=RS,
)
y_rest = ensure_labels(train_rest['label'])
y_valid = ensure_labels(valid['label'])
print(f"Split: train_rest={len(train_rest)}, valid={len(valid)}")

Loaded data in 8.72s | train=1200000 test=361934
Split: train_rest=1140000, valid=60000


In [None]:
# ==========================
# Stage A training (with timing) — returns vectorizer, clf, calibrator
# ==========================
def train_stageA(train_df, y, pos_weight=1.0):
    """Train the Stage A linear prior and its Platt calibrator.

    Parameters
    ----------
    train_df : DataFrame with a normalised-URL column ``'u'``.
    y        : array of 0/1 labels aligned with ``train_df``.
    pos_weight : weight for the positive class (1), or the string
        ``'balanced'`` to let scikit-learn balance the classes.

    Returns
    -------
    (vec, clf, cal, timings)
        ``vec`` is the hashing vectorizer, ``clf`` the fitted ``SGDClassifier``,
        ``cal`` a ``LogisticRegression`` fitted on the classifier's decision
        scores for the held-out calibration split, and ``timings`` a dict with
        the seconds spent in ``'vectorize'``, ``'train'`` and ``'calibrate'``.
    """
    cfg = CONFIG
    timings = {}

    # Vectorize (HashingVectorizer is stateless, so transform() needs no fit).
    t0 = time.perf_counter()
    vec = HashingVectorizer(analyzer='char', ngram_range=cfg['char_ngram_range'],
                            n_features=cfg['n_features'], norm='l2', alternate_sign=False)
    X = vec.transform(train_df['u'])
    timings['vectorize'] = time.perf_counter() - t0

    # Hold out a small slice that the classifier never sees, for calibration.
    X_tr, X_cal, y_tr, y_cal = train_test_split(X, y, test_size=cfg['calib_size'], stratify=y, random_state=RS)

    # Model (class_weight either 'balanced' or manual)
    cw = {0:1.0, 1:pos_weight} if pos_weight!= 'balanced' else 'balanced'
    t0 = time.perf_counter()
    clf = SGDClassifier(loss='log_loss', alpha=cfg['sgd_alpha'], class_weight=cw,
                        early_stopping=True, n_iter_no_change=3, validation_fraction=0.02, random_state=RS)
    clf.fit(X_tr, y_tr)
    timings['train'] = time.perf_counter() - t0

    # Platt scaling: 1-D logistic regression on the raw decision scores.
    t0 = time.perf_counter()
    scores_cal = clf.decision_function(X_cal).reshape(-1,1)
    cal = LogisticRegression(max_iter=100, random_state=RS)
    cal.fit(scores_cal, y_cal)
    timings['calibrate'] = time.perf_counter() - t0

    return vec, clf, cal, timings

def predict_proba_stageA(vec, clf, cal, df):
    """Score ``df['u']`` with the Stage A model.

    Returns
    -------
    (P, seconds)
        ``P`` is an ``(n, 2)`` array whose columns are ``[1 - p1, p1]``, where
        ``p1`` is the calibrator's probability for class 1. ``seconds`` is the
        wall-clock time spent vectorizing and scoring; every caller in this
        notebook unpacks this two-value result.
    """
    t0 = time.perf_counter()
    X = vec.transform(df['u'])
    s = clf.decision_function(X).reshape(-1,1)
    p1 = cal.predict_proba(s)[:,1]
    P = np.c_[1-p1, p1]
    return P, time.perf_counter() - t0

In [None]:
# ==========================
# LBP-lite helpers (graph build + inference)
# ==========================
def build_graph_for(df_unknown, df_observed, P_unknown):
    """Build the compact URL / domain / bucket graph used by LBP-lite.

    Node layout: unknown URLs occupy ids ``[0, nu)``, followed by one node per
    distinct domain of the unknown URLs, followed by one node per distinct
    shingle bucket. Every URL is linked to its domain node and its bucket node.

    Parameters
    ----------
    df_unknown  : rows to classify; needs column ``'u'``.
    df_observed : labelled rows (columns ``'u'`` and ``'label'``) used only to
                  set the priors of domain nodes shared with ``df_unknown``.
    P_unknown   : ``(nu, 2)`` Stage A probabilities used as URL-node priors.

    Returns
    -------
    (prior, neighbors, edges, sim_cache, idx_maps)
        ``prior`` is ``(N, 2)``; ``neighbors`` is an adjacency list; ``edges``
        lists each undirected edge once as ``(url, other)``; ``sim_cache`` maps
        a directed pair to its similarity (0.8 for domain edges, 1.0 for bucket
        edges); ``idx_maps`` is ``{"nu": nu}``.
    """
    # Unknown set
    nu = len(df_unknown)
    U_dom_list = df_unknown['u'].map(get_domain).tolist()
    U_buc_list = df_unknown['u'].map(bucket_id).tolist()

    # Map domains/buckets to node ids (assigned in order of first appearance).
    dom2id, buc2id = {}, {}
    def get_id(m, key, base):
        if key not in m: m[key] = base + len(m)
        return m[key]
    dom_ids = [get_id(dom2id, d, nu) for d in U_dom_list]
    buc_base = nu + len(dom2id)
    buc_ids = [get_id(buc2id, b, buc_base) for b in U_buc_list]
    N = buc_base + len(buc2id)

    # Priors: URL nodes take Stage A probabilities; domain and bucket nodes
    # start uninformative and domains are overwritten below when labels exist.
    prior = np.full((N, 2), 0.5, dtype=float)
    prior[:nu,:] = P_unknown

    # Aggregate observed labels to domains. Skipped entirely when there is no
    # domain node to inform, which avoids scanning the observed frame.
    if len(df_observed) and dom2id:
        obs_dom = df_observed['u'].map(get_domain)
        y_obs = ensure_labels(df_observed['label'])
        dom_counts = defaultdict(lambda:[0,0])
        for d, y in zip(obs_dom, y_obs):
            if d in dom2id:
                dom_counts[d][y] += 1
        # Only domains with at least one observed label appear in dom_counts.
        for d, (g, b) in dom_counts.items():
            p1 = b/((g+b)+1e-9)
            prior[dom2id[d],:] = [1-p1, p1]

    # Edges and similarity cache. The similarity is stored for BOTH directions:
    # the BP loop looks it up by directed (sender, receiver) pair, so a one-way
    # cache would make every domain/bucket -> URL message use the fallback value.
    edges = []
    sim_cache = {}
    for i,(di,bi) in enumerate(zip(dom_ids, buc_ids)):
        edges.append((i, di)); edges.append((i, bi))
        sim_cache[(i, di)] = sim_cache[(di, i)] = 0.8
        sim_cache[(i, bi)] = sim_cache[(bi, i)] = 1.0

    # Neighbor list (forward direction first, then reverse).
    neighbors = [[] for _ in range(N)]
    for (u,v) in edges:
        neighbors[u].append(v)
    for (u,v) in edges:
        neighbors[v].append(u)

    return prior, neighbors, edges, sim_cache, {"nu":nu}

def lbp_infer(prior, neighbors, edges, sim_cache, iters=6, tau=0.5):
    """Run synchronous min-sum belief propagation and return the final messages.

    Parameters
    ----------
    prior     : ``(N, 2)`` array of node priors; the unary cost of label ``l``
                at node ``u`` is ``1 - prior[u, l]``.
    neighbors : adjacency list, ``neighbors[u]`` = nodes adjacent to ``u``.
    edges     : undirected edges; a message is kept for both directions.
    sim_cache : directed pair -> similarity; missing pairs default to 0.5.
    iters     : number of synchronous update rounds.
    tau       : unused here; kept so existing call sites remain valid.

    Returns
    -------
    dict mapping each directed pair ``(u, v)`` to a length-2 array holding the
    message cost from ``u`` to ``v`` for labels 0 and 1.
    """
    ths_plus, ths_minus = CONFIG['ths_plus'], CONFIG['ths_minus']

    def psi(sim, same):
        # Pairwise cost: small when labels agree on a similar pair, large otherwise.
        return max(1e-6, min(ths_plus, 1.0 - sim)) if same else max(ths_minus, sim)

    msg = {}
    for (u, v) in edges:
        msg[(u, v)] = np.zeros(2)
    for (u, v) in edges:
        msg[(v, u)] = np.zeros(2)

    for _ in range(iters):
        # All messages of a round are computed from the previous round's values.
        new_msg = {}
        for u, vs in enumerate(neighbors):
            unary = 1 - prior[u]
            for v in vs:
                # Sum of incoming messages to u, excluding the one from v.
                msum = np.zeros(2)
                for w in vs:
                    if w == v: continue
                    msum += msg[(w, u)]
                sim = sim_cache.get((u, v), 0.5)
                out = np.zeros(2)
                for l in (0, 1):
                    out[l] = min((unary[lp] + psi(sim, l == lp)) + msum[lp] for lp in (0, 1))
                new_msg[(u, v)] = out
        msg.update(new_msg)

    return msg


In [None]:
# ==========================
# Validation routine for a given Stage A model: run τ-sweep on VALID
# ==========================
def validate_tau(vec, clf, cal, train_rest_df, valid_df):
    """Pick the LBP decision threshold τ that maximises F1 on the validation set.

    Stage A scores ``valid_df``; rows whose class-1 probability lies inside the
    configured uncertain window are re-labelled by LBP-lite, using
    ``train_rest_df`` as the observed neighbourhood. BP runs once, after which
    each τ in ``CONFIG['tau_grid']`` (or just 0.5 when the sweep is disabled)
    is evaluated on the cached cost ratio.

    Returns the best τ (the first one wins on ties).
    """
    # Stage A on VALID
    P_valid, _ = predict_proba_stageA(vec, clf, cal, valid_df)
    y_valid = ensure_labels(valid_df['label'])
    low, high = CONFIG['uncertain_low'], CONFIG['uncertain_high']
    mask = (P_valid[:,1]>=low) & (P_valid[:,1]<=high)
    idx = np.where(mask)[0]
    print(f"VALID uncertain in window [{low:.2f},{high:.2f}]: {len(idx)} of {len(valid_df)}")

    # Build graph using: unknown=valid[mask], observed=train_rest
    t0 = time.perf_counter()
    prior, neighbors, edges, sim_cache, meta = build_graph_for(valid_df.iloc[idx], train_rest_df, P_valid[idx])
    t_build = time.perf_counter() - t0

    # Run LBP once and cache the per-URL costs; the tau sweep is then cheap.
    t0 = time.perf_counter()
    msg = lbp_infer(prior, neighbors, edges, sim_cache, iters=CONFIG['lbp_iters'])
    nu = len(idx)
    costs = np.zeros((nu,2))
    for i in range(nu):
        msum = np.zeros(2)
        for v in neighbors[i]:
            msum += msg[(v,i)]
        costs[i,0] = (1 - prior[i,0]) + msum[0]
        costs[i,1] = (1 - prior[i,1]) + msum[1]
    t_lbp = time.perf_counter() - t0

    # Both quantities are independent of tau, so compute them once.
    ratio = costs[:,1] / (costs[:,0] + 1e-9)
    base_pred = (P_valid[:,1] >= 0.5).astype(int)

    # Sweep tau
    best = {"tau": 0.5, "f1": -1}
    for tau in (CONFIG['tau_grid'] if CONFIG['enable_tau_sweep'] else [0.5]):
        y_pred = base_pred.copy()
        y_pred[idx] = (ratio < tau).astype(int)
        _,_,_,f1 = metrics(y_valid, y_pred)
        if f1 > best['f1']:
            best = {"tau": tau, "f1": f1}
    print(f"Best τ on VALID: {best['tau']:.2f} (F1={best['f1']:.4f}) | Build {t_build:.2f}s, LBP {t_lbp:.2f}s")
    return best['tau']

In [None]:
# ==========================
# Class-weight sweep (optional) — returns best (vec,clf,cal) and timings
# ==========================
def sweep_class_weight(train_rest_df, y_rest, valid_df):
    """Try each positive-class weight in ``CONFIG['pos_weight_grid']``.

    For every weight a Stage A model is trained, its Stage A F1 on the
    validation set is measured, and τ is tuned via ``validate_tau``. The weight
    with the strictly highest Stage A F1 wins (the first one on ties).

    Returns ``(vec, clf, cal, timings, tau)`` of the winning configuration.
    """
    best = {"f1": -1}
    y_valid = None
    for w in CONFIG['pos_weight_grid']:
        print(f"\n>>> Trying pos_class_weight={w}")
        vec, clf, cal, tim = train_stageA(train_rest_df, y_rest, pos_weight=w)

        # Quick Stage A sanity check on VALID at the default 0.5 cut-off.
        P_valid, _ = predict_proba_stageA(vec, clf, cal, valid_df)
        y_valid = ensure_labels(valid_df['label'])
        stage_a_pred = (P_valid[:,1]>=0.5).astype(int)
        f1A = metrics(y_valid, stage_a_pred)[3]
        print(f"Stage-A VALID F1={f1A:.4f}")

        # τ is tuned with LBP for this model (validate_tau prints its own summary).
        tau = validate_tau(vec, clf, cal, train_rest_df, valid_df)

        # Selection is driven by the Stage A F1; the LBP-picked τ rides along.
        if not (f1A > best['f1']):
            continue
        best = {"w": w, "vec": vec, "clf": clf, "cal": cal, "tim": tim, "tau": tau, "f1": f1A}

    print(f"\nBest class weight={best['w']} with VALID Stage-A F1={best['f1']:.4f}; τ={best['tau']:.2f}")
    return best['vec'], best['clf'], best['cal'], best['tim'], best['tau']

In [None]:
# ==========================
# Train Stage A (with optional class-weight sweep) and pick τ on VALID
# ==========================
if not CONFIG['enable_class_weight_sweep']:
    # Fast path: a single training run; the 'balanced' sentinel tells
    # train_stageA to use class_weight='balanced'.
    print("Training Stage A with class_weight='balanced' (fast path)...")
    vec, clf, cal, tim_train = train_stageA(train_rest, y_rest, pos_weight='balanced')
    tau_best = validate_tau(vec, clf, cal, train_rest, valid)
else:
    # Slower path: one training run per candidate positive-class weight.
    vec, clf, cal, tim_train, tau_best = sweep_class_weight(train_rest, y_rest, valid)

# Report how long each Stage A training step took, then the selected threshold.
print("\nStage A timing (train path):")
for k, v in tim_train.items():
    print(f"  {k:10s}: {v:.2f}s")
print(f"Chosen τ (from VALID): {tau_best}")

Training Stage A with class_weight='balanced' (fast path)...
VALID uncertain in window [0.40,0.60]: 193 of 60000
Best τ on VALID: 0.45 (F1=0.6421) | Build 1.30s, LBP 0.03s

Stage A timing (train path):
  vectorize : 50.93s
  train     : 6.25s
  calibrate : 0.09s
Chosen τ (from VALID): 0.45


In [None]:
# ==========================
# Final EVALUATION on TEST (with full timing breakdown)
# ==========================
print("\n=== Testing on TEST set ===")
t_test_start = time.perf_counter()

# Stage A on TEST; predict_proba_stageA also reports its own elapsed time.
P_test, t_predA = predict_proba_stageA(vec, clf, cal, test)
y_pred = (P_test[:,1]>=0.5).astype(int)

# Uncertain slice
low, high = CONFIG['uncertain_low'], CONFIG['uncertain_high']
mask = (P_test[:,1]>=low) & (P_test[:,1]<=high)
idx = np.where(mask)[0]
print(f"TEST uncertain in window [{low:.2f},{high:.2f}]: {len(idx)} of {len(test)}")

# Build graph for TEST: unknown = test[idx], observed = train_rest
t0 = time.perf_counter()
prior, neighbors, edges, sim_cache, meta = build_graph_for(test.iloc[idx], train_rest, P_test[idx])
t_build = time.perf_counter() - t0

# Run min-sum BP through the shared helper instead of a third inline copy.
t0 = time.perf_counter()
msg = lbp_infer(prior, neighbors, edges, sim_cache, iters=CONFIG['lbp_iters'])

# Decide labels for unknown URLs using chosen τ
nu = len(idx)
costs = np.zeros((nu,2))
for i in range(nu):
    msum = np.zeros(2)
    for v in neighbors[i]:
        msum += msg[(v,i)]
    costs[i,0] = (1 - prior[i,0]) + msum[0]
    costs[i,1] = (1 - prior[i,1]) + msum[1]
ratio = costs[:,1] / (costs[:,0] + 1e-9)
yU = (ratio < tau_best).astype(int)
t_lbp = time.perf_counter() - t0

# Stitch the LBP decisions back into the Stage A predictions.
t0 = time.perf_counter()
y_pred[idx] = yU
t_stitch = time.perf_counter() - t0
t_total = time.perf_counter() - t_test_start

acc,prec,rec,f1 = metrics(y_test, y_pred)
print(f"\nFINAL METRICS (TEST): Acc={acc:.4f}  Prec={prec:.4f}  Rec={rec:.4f}  F1={f1:.4f}")

# Timing breakdown announced by this cell's header.
print("\nTIMINGS (TEST path)")
print("-" * 40)
for stage_name, seconds in (
    ("Stage A predict", t_predA),
    ("Graph build (LBP)", t_build),
    ("LBP inference", t_lbp),
    ("Stitch predictions", t_stitch),
):
    print(f"{stage_name:20s} : {seconds:.2f}s")
print("-" * 40)
print(f"{'End-to-end (TEST)':20s} : {t_total:.2f}s")


=== Testing on TEST set ===
TEST uncertain in window [0.40,0.60]: 1045 of 361934

FINAL METRICS (TEST): Acc=0.9875  Prec=0.9217  Rec=0.4777  F1=0.6292

TIMINGS (TEST path)
----------------------------------------
Stage A predict      : 15.37s
Graph build (LBP)    : 1.30s
LBP inference        : 0.22s
Stitch predictions   : 0.00s
----------------------------------------
End-to-end (TEST)    : 16.94s
