# CNN Model 

Author: James Duong

In [1]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, regularizers
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
df = pd.read_csv("processed_urls.csv")
print(df['type'].value_counts())

  df = pd.read_csv("processed_urls.csv")


type
0    345738
1    159195
Name: count, dtype: int64


In [3]:
# Encode URLS
all_text = "\n".join(df['url'].astype(str)) # every char in the df
vocab = sorted(list(set(all_text)))         # get only the unique char and sort them 

# create list of unique char and assign a unique ID to each one
charToIdx = {}
idx = 1
for c in vocab:
    charToIdx[c] = idx
    idx = idx + 1

vocab_size = len(charToIdx) + 1
MAX_LEN = 200

# convert URL to fixed-length numeric array
def encode_url(s):
    s = str(s)
    seq = []

    for c in s:
        # if char not found in dictionary, use 0 as default value
        if c in charToIdx:
            seq.append(charToIdx[c])
        else:
            seq.append(0)

    seq = seq[:MAX_LEN]

    # pad with zeros if shorter than MAX_LEN
    while len(seq) < MAX_LEN:
        seq.append(0)

    return np.array(seq, dtype = np.int32)    # convert to a NumPy array with type int32

X = np.vstack(df['url'].apply(encode_url))
y = df['type'].astype(int).to_numpy()

In [4]:
X_pool, X_test, y_pool, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = SEED)

print("Pool size:", X_pool.shape)
print("Test size:", X_test.shape)

Pool size: (353453, 200)
Test size: (151480, 200)


In [5]:
def build_cnn():
    model = models.Sequential([
        layers.Embedding(vocab_size, 64, input_length=MAX_LEN), # convert each char into a 64-dim vector
        layers.Conv1D(128, 5, activation = 'relu'),   # 128 filters, looking at 5 char at a time   
        layers.GlobalMaxPooling1D(),
        layers.Dense(64, activation = 'relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(1, activation = 'sigmoid')
    ])
    model.compile(optimizer = 'adam',loss = 'binary_crossentropy')
    return model

In [6]:
def find_threshold(y_true, y_probs, recall_target=0.90):
    best_t, best_p = 0.5, 0
    for t in np.linspace(0, 1, 501):
        preds = (y_probs >= t).astype(int)  # convert prob into pred using threshold
        r = recall_score(y_true, preds)     # compute recall at this threshold
        p = precision_score(y_true, preds, zero_division=0) # compute precision at this threshold
        if r >= recall_target and p > best_p:
            best_t, best_p = t, p
    return best_t  

In [7]:
# K-Fold CV + Threshold Tuning
# split training pool into 3 folds
skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = SEED)

thresholds = [] # store best threshold from each split

for train_idx, val_idx in skf.split(X_pool, y_pool):
    X_train, X_val, y_train, y_val = X_pool[train_idx], X_pool[val_idx], y_pool[train_idx], y_pool[val_idx]
    cw = compute_class_weight('balanced', classes = np.unique(y_train), y = y_train)
    class_weights = {0: cw[0], 1: cw[1]}    # convert weights to dictionary form
    model = build_cnn()
    model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 3, batch_size = 128, class_weight = class_weights, verbose = 1)
    y_val_probs = model.predict(X_val).flatten()  # get prob predictions on validation set
    t = find_threshold(y_val, y_val_probs)
    thresholds.append(t)

final_threshold = np.mean(thresholds)
print("Mean threshold:", final_threshold)



Epoch 1/3
[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 37ms/step - loss: 0.0557 - val_loss: 0.0261
Epoch 2/3
[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 33ms/step - loss: 0.0293 - val_loss: 0.0223
Epoch 3/3
[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 36ms/step - loss: 0.0245 - val_loss: 0.0197
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step
Epoch 1/3




[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 35ms/step - loss: 0.0540 - val_loss: 0.0273
Epoch 2/3
[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 35ms/step - loss: 0.0275 - val_loss: 0.0247
Epoch 3/3
[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 38ms/step - loss: 0.0231 - val_loss: 0.0221
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
Epoch 1/3




[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 31ms/step - loss: 0.0538 - val_loss: 0.0280
Epoch 2/3
[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 33ms/step - loss: 0.0280 - val_loss: 0.0218
Epoch 3/3
[1m1841/1841[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 34ms/step - loss: 0.0233 - val_loss: 0.0196
[1m3682/3682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step
Mean threshold: 0.996


In [8]:
# Final training + timing
start_train = time.time()
final_model = build_cnn()
final_model.fit(X_pool, y_pool, validation_split = 0.1, epochs = 3, batch_size = 128, verbose = 1)
train_time = time.time() - start_train

Epoch 1/3




[1m2486/2486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 29ms/step - loss: 0.0426 - val_loss: 0.0239
Epoch 2/3
[1m2486/2486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 30ms/step - loss: 0.0214 - val_loss: 0.0207
Epoch 3/3
[1m2486/2486[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 34ms/step - loss: 0.0180 - val_loss: 0.0212


In [9]:
# Test prediction + timing
start_inf = time.time()
y_test_probs = final_model.predict(X_test).flatten()
inference_time = time.time() - start_inf
y_pred = (y_test_probs >= final_threshold).astype(int)

[1m4734/4734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step


In [10]:
# Print results
results = {
    "model": "cnn",
    "accuracy": accuracy_score(y_test, y_pred),
    "precision": precision_score(y_test, y_pred),
    "recall": recall_score(y_test, y_pred),
    "f1": f1_score(y_test, y_pred),
    "threshold": float(final_threshold),
    "train_time": train_time,           
    "inference_time": inference_time,
    "confusion_matrix": confusion_matrix(y_test, y_pred).tolist()
}

results

{'model': 'cnn',
 'accuracy': 0.9860509638236071,
 'precision': 0.9998247996145592,
 'recall': 0.9559245377834544,
 'f1': 0.9773819590884276,
 'threshold': 0.996,
 'train_time': 233.5655176639557,
 'inference_time': 15.924597024917603,
 'confusion_matrix': [[103713, 8], [2105, 45654]]}