In [1]:
#cell 1
import os
from google.colab import userdata

# Get kaggle.json from Colab Secrets
kaggle_json = userdata.get("KAGGLE_JSON")

os.makedirs("/root/.kaggle", exist_ok=True)
with open("/root/.kaggle/kaggle.json", "w") as f:
    f.write(kaggle_json)

!chmod 600 /root/.kaggle/kaggle.json
!pip -q install kaggle

# Download dataset
!kaggle datasets download -d naserabdullahalam/phishing-email-dataset
!unzip -o phishing-email-dataset.zip -d data


Dataset URL: https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset
License(s): CC-BY-SA-4.0
Downloading phishing-email-dataset.zip to /content
  0% 0.00/77.1M [00:00<?, ?B/s]
100% 77.1M/77.1M [00:00<00:00, 1.59GB/s]
Archive:  phishing-email-dataset.zip
  inflating: data/CEAS_08.csv        
  inflating: data/Enron.csv          
  inflating: data/Ling.csv           
  inflating: data/Nazario.csv        
  inflating: data/Nigerian_Fraud.csv  
  inflating: data/SpamAssasin.csv    
  inflating: data/phishing_email.csv  


In [2]:

#cell 2
# List the contents of the data/ directory that the previous cell unzipped into.
!ls data

CEAS_08.csv  Ling.csv	  Nigerian_Fraud.csv  SpamAssasin.csv
Enron.csv    Nazario.csv  phishing_email.csv


In [3]:
#cell 3
import pandas as pd

# Combine the per-corpus files into a single frame so all the body text is used.
# Label convention: 1 is a phishing email, 0 is non-phishing.

files = ["Enron.csv", "Ling.csv", "CEAS_08.csv", "Nazario.csv", "Nigerian_Fraud.csv", "SpamAssasin.csv"]

# Read only the two columns needed and tag each row with the corpus it came
# from (file name without the .csv suffix) -- optional, useful for analysis.
dfs = [
    pd.read_csv(f"/content/data/{csv_name}", usecols=["body", "label"]).assign(
        source=csv_name.replace(".csv", "")
    )
    for csv_name in files
]

df = pd.concat(dfs, ignore_index=True)

print(df.shape)
print(df["label"].value_counts())
df.head()


(82486, 3)
label
1    42891
0    39595
Name: count, dtype: int64


Unnamed: 0,body,label,source
0,( see attached file : hplno 525 . xls )\r\n- h...,0,Enron
1,- - - - - - - - - - - - - - - - - - - - - - fo...,0,Enron
2,"estimated actuals\r\nmarch 30 , 2001\r\nno flo...",0,Enron
3,( see attached file : hplno 530 . xls )\r\n- h...,0,Enron
4,( see attached file : hplno 601 . xls )\r\n- h...,0,Enron


In [4]:
#cell 4
import re

#clean up the text a bit

def clean_body(text):
    """Normalise one email body before tokenization.

    Anything that is not a ``str`` yields an empty string. Otherwise the text
    is lower-cased, URL-like tokens and e-mail addresses are replaced with the
    placeholders ``<URL>`` and ``<EMAIL>``, runs of whitespace are collapsed to
    a single space, and the result is stripped.
    """
    if isinstance(text, str):
        cleaned = text.lower()
        # Applied in order: URLs first, then e-mail addresses, then whitespace.
        substitutions = (
            (r"http\S+|www\.\S+", " <URL> "),
            (r"\b[\w\.-]+@[\w\.-]+\.\w+\b", " <EMAIL> "),
            (r"\s+", " "),
        )
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()
    return ""

# Clean every body, then keep only the rows that still have text afterwards.
cleaned_bodies = df["body"].apply(clean_body)
df["body"] = cleaned_bodies
has_text = df["body"].str.len() > 0
df = df[has_text].reset_index(drop=True)

# 20000 chars is about 3000 words; emails are usually shorter.
MAX_CHARS = 20000
df["body"] = df["body"].str[:MAX_CHARS]

In [21]:
#cell 5
# creating training and validation splits

from sklearn.model_selection import train_test_split

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    df["body"].tolist(),
    df["label"].tolist(),
    test_size=0.2,    # 80/20 train/validation split
    random_state=42,
    stratify=df["label"]
)

# Upper bounds on how many examples of each split are kept. Slicing simply
# takes the first N items of each split; if a split is already smaller than
# its cap, the slice leaves it unchanged.
# NOTE(review): the head slice relies on train_test_split having shuffled the
# rows (its default) -- confirm if shuffle is ever disabled.
MAX_TRAIN_SAMPLES = 50000
MAX_VAL_SAMPLES = 10000

X_train = X_train[:MAX_TRAIN_SAMPLES]
y_train = y_train[:MAX_TRAIN_SAMPLES]
X_val = X_val[:MAX_VAL_SAMPLES]
y_val = y_val[:MAX_VAL_SAMPLES]

# Report the final split sizes (the earlier bare `len(...), len(...)` expression
# was discarded mid-cell and displayed nothing, so it has been removed).
print(len(X_train), len(X_val))



30000 8000


In [22]:
#cell 6
from transformers import DistilBertTokenizerFast

# Checkpoint whose vocabulary the tokenizer is loaded from.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased"

print("Loading DistilBERT tokenizer...")
tokenizer = DistilBertTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)


Loading DistilBERT tokenizer...


In [23]:
#cell 7
import random

# Draw 2000 training texts at random and find the longest encoding among them.
# Each text is truncated to 512 tokens, so the reported maximum cannot exceed 512.
sample_texts = random.sample(X_train, 2000)

max_len = max(
    (
        len(tokenizer.encode(text, add_special_tokens=True, max_length=512, truncation=True))
        for text in sample_texts
    ),
    default=0,
)

print("Max sentence length (sample):", max_len)


Max sentence length (sample): 512


In [10]:
#cell 8

# Measure the untruncated token-length distribution on a sample, to be able to choose MAX_LEN
import random
import numpy as np

# Sample texts (adjust size if needed)
sample_size = 2000
sample_texts = random.sample(X_train, sample_size)

# Token count per sampled text, special tokens included and no truncation.
token_lengths = np.array(
    [len(tokenizer.encode(text, add_special_tokens=True)) for text in sample_texts]
)

print(f"Sample size: {sample_size}")
print(f"Average tokens: {token_lengths.mean():.1f}")
print(f"Median tokens: {np.median(token_lengths)}")
for pct in (90, 95, 99):
    print(f"{pct}th percentile: {np.percentile(token_lengths, pct)}")
print(f"Max tokens (sample): {token_lengths.max()}")


Sample size: 2000
Average tokens: 383.8
Median tokens: 183.0
90th percentile: 818.1000000000001
95th percentile: 1317.3499999999997
99th percentile: 3405.979999999999
Max tokens (sample): 7364


In [24]:
#cell 9
MAX_LEN = 256  # good tradeoff for emails + GPU memory

# Both splits are tokenized with identical settings: truncate to MAX_LEN,
# pad, and return PyTorch tensors.
tokenize_kwargs = dict(
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
    return_tensors="pt",
)

train_encodings = tokenizer(X_train, **tokenize_kwargs)
val_encodings = tokenizer(X_val, **tokenize_kwargs)


In [25]:
#cell 10

#importing and setting the device to utilize GPU
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import time

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from transformers import DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)


Device: cuda


In [26]:
#cell 11
BATCH_SIZE = 32  #small

# Tensors taken from each tokenizer output, in the order the batches expose
# them: batch[0] = input_ids, batch[1] = attention_mask, batch[2] = labels.
encoding_keys = ("input_ids", "attention_mask")

train_labels = torch.tensor(y_train, dtype=torch.long)
val_labels = torch.tensor(y_val, dtype=torch.long)

train_dataset = TensorDataset(*[train_encodings[key] for key in encoding_keys], train_labels)
val_dataset = TensorDataset(*[val_encodings[key] for key in encoding_keys], val_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    sampler=RandomSampler(train_dataset),
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    sampler=SequentialSampler(val_dataset),
)

print("Train batches:", len(train_loader))
print("Val batches:", len(val_loader))

Train batches: 938
Val batches: 250


In [27]:
#cell 12

# Seed PyTorch before building the model: the classification head is not part
# of the pretrained checkpoint and is freshly initialised, so without a seed
# here its starting weights differ from run to run.
torch.manual_seed(42)

#creating the model
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)
model.to(device)

#setting the number of epochs
EPOCHS = 3
lr = 2e-5

#optimizer
optimizer = AdamW(model.parameters(), lr=lr)

# One scheduler step is taken per training batch, so the schedule spans
# batches-per-epoch * EPOCHS steps; the first 10% of them are warm-up.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)

print("Total training steps:", total_steps)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total training steps: 1876


In [28]:
#cell 13
# Define the helpers (time formatting and evaluation) used by the training loop

def format_time(seconds):
    """Format an elapsed duration in seconds as ``HH:MM:SS``.

    Fractional seconds are truncated. The hour field is not wrapped at 24, so
    durations of a day or more are reported correctly (e.g. ``25:00:00``)
    rather than rolling over as a clock time would. Negative input is treated
    as zero.

    Args:
        seconds: elapsed time in seconds (int or float).

    Returns:
        The duration as a zero-padded ``HH:MM:SS`` string.
    """
    total_seconds = max(0, int(seconds))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}"

def evaluate(model, dataloader):
    """Score ``model`` on every batch of ``dataloader`` without tracking gradients.

    The model is switched to eval mode and is left in that mode on return, so
    a caller that keeps training must call ``model.train()`` again.

    Args:
        model: called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            the result must expose ``.loss`` and ``.logits``.
        dataloader: iterable of batches indexable as
            ``(input_ids, attention_mask, labels)``.

    Returns:
        Tuple ``(avg_loss, acc, prec, rec, f1, y_true, y_pred)``. ``avg_loss``
        is the per-sample mean loss (NaN if the dataloader yields no samples);
        precision/recall/F1 are computed for the positive class (label 1);
        ``y_true`` and ``y_pred`` are NumPy arrays of the gold and predicted
        labels in iteration order.
    """
    model.eval()
    all_preds = []
    all_labels = []

    total_loss = 0.0
    n_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            # Weight each batch loss by its size so a smaller final batch does
            # not count as much as a full one.
            # Assumes outputs.loss is the mean over the batch -- TODO confirm
            # if the loss reduction is ever customised.
            batch_size = labels.size(0)
            total_loss += outputs.loss.item() * batch_size
            n_samples += batch_size

            all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    # Guard against an empty dataloader instead of raising ZeroDivisionError.
    avg_loss = total_loss / n_samples if n_samples else float("nan")

    acc = accuracy_score(all_labels, all_preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", pos_label=1, zero_division=0
    )

    return avg_loss, acc, prec, rec, f1, np.array(all_labels), np.array(all_preds)

In [29]:
# cell 14

#training loop

# Seed the PyTorch and NumPy global RNGs before training starts.
# NOTE(review): the model was instantiated in an earlier cell, so this seed
# does not cover the initialisation of its classification head -- confirm
# whether that is intended.
torch.manual_seed(42)
np.random.seed(42)

# Early-stopping / checkpoint state.
best_f1 = -1.0        # below any attainable F1, so the first epoch always counts as an improvement
PATIENCE = 1          # consecutive non-improving epochs tolerated before stopping
MIN_DELTA = 0.001     # validation F1 must beat the best by more than this to count
patience_counter = 0  # non-improving epochs seen since the last improvement
SAVE_DIR = "/content/best_distilbert_model"  # where the best model + tokenizer are written

# Training loop: one pass over train_loader per epoch, followed by a validation
# run that decides whether to checkpoint or stop early.
for epoch in range(1, EPOCHS + 1):
    print(f"\n~~~ Epoch {epoch}/{EPOCHS} ~~~")
    t0 = time.time()

    # evaluate() leaves the model in eval mode, so re-enable train mode each epoch.
    model.train()
    total_train_loss = 0.0

    for step, batch in enumerate(train_loader, start=1):
        # Batches are (input_ids, attention_mask, labels) tuples.
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()

        # Gradient clipping: cap the global gradient norm at 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the update, then advance the learning-rate schedule (once per batch).
        optimizer.step()
        scheduler.step()

        # Progress line every 200 steps; shows the current batch loss, not a running mean.
        if step % 200 == 0:
            print(f"  Step {step}/{len(train_loader)}  |  Loss: {loss.item():.4f}")

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = total_train_loss / len(train_loader)
    train_time = format_time(time.time() - t0)

    print(f"\n  Avg Train Loss: {avg_train_loss:.4f}")
    print(f"  Training time:  {train_time}")

    # Evaluate after each epoch
    t1 = time.time()
    val_loss, val_acc, val_prec, val_rec, val_f1, y_true, y_pred = evaluate(model, val_loader)
    val_time = format_time(time.time() - t1)

    print(f"\n  Val Loss: {val_loss:.4f}")
    print(f"  Val Acc:  {val_acc:.4f}")
    print(f"  Val Prec: {val_prec:.4f}")
    print(f"  Val Rec:  {val_rec:.4f}")
    print(f"  Val F1:   {val_f1:.4f}")
    print(f"  Val time: {val_time}")

    # Save best model + early stopping.
    # An epoch only counts as an improvement when F1 beats the best by more than MIN_DELTA.
    if val_f1 > best_f1 + MIN_DELTA:
        best_f1 = val_f1
        patience_counter = 0
        print(f"New best F1 — saving model to {SAVE_DIR}")
        # Overwrites the previous checkpoint; the tokenizer is stored alongside the model.
        model.save_pretrained(SAVE_DIR)
        tokenizer.save_pretrained(SAVE_DIR)
    else:
        patience_counter += 1
        print(f"No meaningful improvement. Patience {patience_counter}/{PATIENCE}")

        # Stop once PATIENCE consecutive epochs failed to improve. The in-memory
        # model then holds the last epoch's weights; the best ones are in SAVE_DIR.
        if patience_counter >= PATIENCE:
            print("Early stopping triggered.")
            break



~~~ Epoch 1/2 ~~~
  Step 200/938  |  Loss: 0.0161
  Step 400/938  |  Loss: 0.0145
  Step 600/938  |  Loss: 0.0059
  Step 800/938  |  Loss: 0.2146

  Avg Train Loss: 0.1322
  Training time:  00:10:44

  Val Loss: 0.0543
  Val Acc:  0.9836
  Val Prec: 0.9779
  Val Rec:  0.9905
  Val F1:   0.9842
  Val time: 00:00:59
New best F1 — saving model to /content/best_distilbert_model

~~~ Epoch 2/2 ~~~
  Step 200/938  |  Loss: 0.0013
  Step 400/938  |  Loss: 0.0006
  Step 600/938  |  Loss: 0.0009
  Step 800/938  |  Loss: 0.0758

  Avg Train Loss: 0.0221
  Training time:  00:10:44

  Val Loss: 0.0533
  Val Acc:  0.9860
  Val Prec: 0.9881
  Val Rec:  0.9847
  Val F1:   0.9864
  Val time: 00:00:59
New best F1 — saving model to /content/best_distilbert_model


In [30]:
#cell 15

# the final accuracy statistics from the best epoch

from transformers import DistilBertForSequenceClassification

# Reload the checkpoint saved during training so the reported numbers come
# from the saved weights rather than from whatever is currently in memory.
best_model_dir = "/content/best_distilbert_model"
model = DistilBertForSequenceClassification.from_pretrained(best_model_dir)
model.to(device)

final_metrics = evaluate(model, val_loader)
val_loss, val_acc, val_prec, val_rec, val_f1, y_true, y_pred = final_metrics

print("Final Validation Metrics")
for metric_name, metric_value in (
    ("Accuracy", val_acc),
    ("Precision", val_prec),
    ("Recall", val_rec),
    ("F1", val_f1),
):
    print(f"{metric_name}:", metric_value)

print("\nConfusion Matrix (rows=true, cols=pred)")
print(confusion_matrix(y_true, y_pred))

print("\nClassification Report")
print(classification_report(y_true, y_pred, digits=4))

Final Validation Metrics
Accuracy: 0.986
Precision: 0.9880604288499025
Recall: 0.9847013113161729
F1: 0.9863780102164923

Confusion Matrix (rows=true, cols=pred)
[[3833   49]
 [  63 4055]]

Classification Report
              precision    recall  f1-score   support

           0     0.9838    0.9874    0.9856      3882
           1     0.9881    0.9847    0.9864      4118

    accuracy                         0.9860      8000
   macro avg     0.9859    0.9860    0.9860      8000
weighted avg     0.9860    0.9860    0.9860      8000



In [32]:
#cell 16

#saving the results to a text file

# Build a plain-text summary of the hyperparameters and the validation metrics
# from the most recent evaluate() call.
# NOTE(review): EPOCHS is the configured maximum; early stopping may have ended
# training sooner -- confirm before quoting it as the number of epochs run.
results_text = f"""DistilBERT Phishing Detection Results
MAX_LEN: {MAX_LEN}
BATCH_SIZE: {BATCH_SIZE}
EPOCHS: {EPOCHS}
LR: {lr}

Validation:
Accuracy:  {val_acc:.4f}
Precision: {val_prec:.4f}
Recall:    {val_rec:.4f}
F1:        {val_f1:.4f}

Confusion Matrix:
{confusion_matrix(y_true, y_pred)}
"""

# Explicit encoding so the file does not depend on the platform default.
with open("results.txt", "w", encoding="utf-8") as f:
    f.write(results_text)

print("Saved results.txt")

Saved results.txt
