In [None]:
!pip install datasets

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import random

In [None]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed to Python's ``random``, NumPy's global
            generator and PyTorch (default 42).
    """
    # CPU-side generators: stdlib random, NumPy, then PyTorch.
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
        seed_generator(seed)
    # The CUDA generators are only seeded when PyTorch can see a GPU.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [None]:
# Prefer the GPU whenever PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

**Loading**

- Load the Enron spam dataset from Hugging Face. The dataset has a "train" and "test" split.
- We further split the training set into training and validation sets (90/10 split).

In [None]:
# Download the two splits published for the Enron spam dataset.
raw_train = load_dataset("SetFit/enron_spam", split="train")
raw_test = load_dataset("SetFit/enron_spam", split="test")

# Hold out 10% of the training split for validation; the fixed seed keeps the
# partition the same across runs.
raw_train_val = raw_train.train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = raw_train_val["train"], raw_train_val["test"]
test_dataset = raw_test

# Report the size of each split.
print("Train samples:", len(train_dataset))
print("Validation samples:", len(val_dataset))
print("Test samples:", len(test_dataset))

**Tokenization**

- We define a tokenization function that uses the pre-trained model’s tokenizer. Here, we use a maximum sequence length of 128 tokens. The tokenize_function does truncation and padding so that each sequence has the same length.

In [None]:
def tokenize_function(examples, tokenizer, max_length=128):
    """Tokenize the ``"text"`` entry of ``examples`` to a fixed length.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of examples when
            used with a batched ``map``).
        tokenizer: Callable tokenizer accepting ``truncation``, ``padding``
            and ``max_length`` keyword arguments.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    # Truncate long inputs and pad short ones so all sequences share one length.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(examples["text"], **tokenizer_options)

**Data Collation**

- This function uses torch.stack() to assemble each batch from the per-example tensors, so that batches are created correctly without causing type conversion issues.

In [None]:
def collate_fn(batch):
    """Stack per-example tensors into a single batch dictionary.

    Args:
        batch: Sequence of examples, each holding tensors under the keys
            ``"input_ids"``, ``"attention_mask"`` and ``"label"``.

    Returns:
        Dict with stacked ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"`` tensors (note the plural key for the labels).
    """
    def stack_field(field):
        # torch.stack adds a new leading batch dimension.
        return torch.stack([example[field] for example in batch])

    return {
        "input_ids": stack_field("input_ids"),
        "attention_mask": stack_field("attention_mask"),
        "labels": stack_field("label"),
    }

**Custom Classifier Model Design**

- **Architecture:**  
  1. **Encoder:**  
     We load a pre-trained encoder (either DistilBERT or TinyBERT) using AutoModel.from_pretrained(). All encoder weights are frozen so that we rely on the encoder’s feature extraction capability without updating its parameters.
  2. **Classification Head (MLP):**  
     We add an MLP on top of the encoder. It has a linear layer that halves the dimensionality, a ReLU activation function, a Dropout layer for regularization, and a final linear layer that produces logits for binary classification.
- **Representation Extraction:**  
  This model uses the embedding of the CLS token from the encoder's output as the representation of the input sequence.

In [None]:
class CustomClassifier(nn.Module):
    """Binary classifier: a pre-trained encoder followed by a small MLP head.

    The head receives the encoder's last hidden state at sequence position 0
    (the CLS token position) and produces two logits.

    Args:
        pretrained_model_name: Checkpoint name or path passed to
            ``AutoModel.from_pretrained``.
        hidden_size: Width of the encoder's output embeddings; used as the
            input width of the MLP head.
        dropout_rate: Dropout probability used inside the MLP head.
        freeze_encoder: When True (the default, matching the previous
            behaviour) every encoder parameter gets ``requires_grad=False``
            and the encoder is kept in evaluation mode even while the
            classifier is being trained.
    """

    def __init__(self, pretrained_model_name, hidden_size, dropout_rate=0.1, freeze_encoder=True):
        super().__init__()
        self.freeze_encoder = freeze_encoder
        self.encoder = AutoModel.from_pretrained(pretrained_model_name)
        if self.freeze_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False
            # A frozen feature extractor should be deterministic.
            self.encoder.eval()
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size // 2, 2)
        )

    def train(self, mode=True):
        """Switch train/eval mode while keeping a frozen encoder in eval mode.

        ``nn.Module.train`` recurses into every submodule, which would turn
        the encoder's own dropout layers back on and make the frozen features
        stochastic during training. Only the classifier head follows ``mode``
        when the encoder is frozen.
        """
        super().train(mode)
        if self.freeze_encoder:
            self.encoder.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized inputs.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Tensor of logits with 2 entries per example.
        """
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state
        # Position 0 of every sequence is used as the sequence representation.
        cls_embedding = hidden_state[:, 0, :]
        logits = self.classifier(cls_embedding)
        return logits

**Training and Evaluation Process**

- **Training Loop:** We have used CrossEntropyLoss as it is suitable for classification tasks. The Adam optimizer is applied to update only the classifier head weights and to keep the encoder weights frozen. During training, the model’s performance can be tracked via accuracy, precision, recall, and F1 score.


In [None]:
def train_model(model, train_loader, val_loader, epochs=3, learning_rate=1e-3):
    """Train the classifier head of ``model`` and report validation metrics.

    Args:
        model: Module exposing a ``classifier`` submodule and accepting
            ``input_ids`` / ``attention_mask`` keyword arguments.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: Loader passed to ``evaluate_model`` after every epoch.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The same ``model`` object after training.
    """
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    # Only the head's parameters are handed to the optimizer.
    head_optimizer = torch.optim.Adam(model.classifier.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for step_batch in train_loader:
            head_optimizer.zero_grad()

            token_ids = step_batch['input_ids'].to(device)
            mask = step_batch['attention_mask'].to(device)
            targets = step_batch['labels'].to(device)

            step_loss = criterion(model(input_ids=token_ids, attention_mask=mask), targets)
            step_loss.backward()
            head_optimizer.step()

            running_loss += step_loss.item()

        # Mean of the per-batch losses for this epoch.
        avg_train_loss = running_loss / len(train_loader)
        val_metrics = evaluate_model(model, val_loader)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f} | Val Acc: {val_metrics['accuracy']:.4f}, "
              f"Precision: {val_metrics['precision']:.4f}, Recall: {val_metrics['recall']:.4f}, F1: {val_metrics['f1']:.4f}")

    return model

**Evaluation:**  
  - We are checking the performance on the validation set at each epoch and the test set is used for final evaluation.

In [None]:
def evaluate_model(model, data_loader):
    """Compute accuracy, precision, recall and F1 of ``model`` on a loader.

    Args:
        model: Module accepting ``input_ids`` / ``attention_mask`` keyword
            arguments and returning per-class logits.
        data_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for eval_batch in data_loader:
            token_ids = eval_batch['input_ids'].to(device)
            mask = eval_batch['attention_mask'].to(device)
            targets = eval_batch['labels'].to(device)

            scores = model(input_ids=token_ids, attention_mask=mask)
            # The predicted class is the index of the largest logit.
            predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    metric_functions = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        metric_name: metric_fn(expected, predicted)
        for metric_name, metric_fn in metric_functions.items()
    }

Preparing data for training

In [None]:
def prepare_data(tokenizer, train_ds, val_ds, test_ds, batch_size=16, max_length=128):
    """Tokenize the three splits and wrap each one in a DataLoader.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize_function``.
        train_ds: Training split; its loader shuffles the examples.
        val_ds: Validation split; iterated in order.
        test_ds: Test split; iterated in order.
        batch_size: Batch size shared by all three loaders.
        max_length: Sequence length every example is truncated/padded to.
            Defaults to 128, the value previously fixed by
            ``tokenize_function``'s default.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    def _build_loader(dataset, shuffle):
        # One tokenization + formatting pipeline shared by all splits.
        tokenized = dataset.map(
            lambda examples: tokenize_function(examples, tokenizer, max_length=max_length),
            batched=True,
        )
        # Expose only the columns collate_fn reads, as torch tensors.
        tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
        return DataLoader(tokenized, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

    train_loader = _build_loader(train_ds, shuffle=True)
    val_loader = _build_loader(val_ds, shuffle=False)
    test_loader = _build_loader(test_ds, shuffle=False)
    return train_loader, val_loader, test_loader

**Train and Evaluate with Two Encoder LLMs**

 Here, we are running experiments with two models:

 - Model 1: DistilBERT with a hidden size of 768.
 - Model 2: TinyBERT with a hidden size of 312.


 For each model, we load the corresponding tokenizer and tokenize the datasets, then create DataLoaders for the train, validation, and test splits. After that, we train the classifier head and evaluate it on the test set.

**Model 1**

In [None]:
print("\nExperiment 1: DistilBERT")

# Checkpoint name and the encoder width passed to the classifier head.
distilbert_model_name, distilbert_hidden_size = "distilbert-base-uncased", 768
tokenizer_distilbert = AutoTokenizer.from_pretrained(distilbert_model_name)

# Tokenize the shared splits with this model's tokenizer and build the loaders.
(
    train_loader_distilbert,
    val_loader_distilbert,
    test_loader_distilbert,
) = prepare_data(
    tokenizer_distilbert,
    train_dataset,
    val_dataset,
    test_dataset,
    batch_size=16,
)

model_distilbert = CustomClassifier(
    distilbert_model_name,
    hidden_size=distilbert_hidden_size,
    dropout_rate=0.1,
)

model_distilbert = train_model(
    model_distilbert,
    train_loader_distilbert,
    val_loader_distilbert,
    epochs=3,
    learning_rate=1e-3,
)

In [None]:
# Final evaluation of the trained DistilBERT classifier on the test loader.
test_metrics_distilbert = evaluate_model(
    model_distilbert,
    test_loader_distilbert,
)
print("\nDistilBERT Test Metrics:")
print(test_metrics_distilbert)

**Model 2**

In [None]:
print("\nExperiment 2: TinyBERT")

# Checkpoint name and the encoder width passed to the classifier head.
tinybert_model_name, tinybert_hidden_size = "huawei-noah/TinyBERT_General_4L_312D", 312
tokenizer_tinybert = AutoTokenizer.from_pretrained(tinybert_model_name)

# Tokenize the shared splits with this model's tokenizer and build the loaders.
(
    train_loader_tinybert,
    val_loader_tinybert,
    test_loader_tinybert,
) = prepare_data(
    tokenizer_tinybert,
    train_dataset,
    val_dataset,
    test_dataset,
    batch_size=16,
)

model_tinybert = CustomClassifier(
    tinybert_model_name,
    hidden_size=tinybert_hidden_size,
    dropout_rate=0.1,
)

model_tinybert = train_model(
    model_tinybert,
    train_loader_tinybert,
    val_loader_tinybert,
    epochs=3,
    learning_rate=1e-3,
)

In [None]:
# Final evaluation of the trained TinyBERT classifier on the test loader.
test_metrics_tinybert = evaluate_model(
    model_tinybert,
    test_loader_tinybert,
)
print("\nTinyBERT Test Metrics:")
print(test_metrics_tinybert)

**Evaluation Summary**

**DistilBERT:**  
The DistilBERT model shows very good learning curves, and its validation accuracy reaches 98.05% by the final epoch. Its test performance is also outstanding, with 98.3% accuracy, 98.4% precision, 98.2% recall, and a 98.3% F1 score. These results clearly show that the model is excellent at capturing the details of the spam classification task.

**TinyBERT:**  
The TinyBERT model started with a higher training loss, but its validation accuracy improved and reached 93.47% at the third epoch. Its test metrics are 93.95% accuracy, 92.6% precision, 95.6% recall, and 94.1% F1 score. This is clearly very good performance, even if slightly lower than DistilBERT. Because it is a lightweight model, it is a good choice when resources are limited.

So, both models show excellent performance for spam classification. For this assignment, however, we choose DistilBERT because it achieves higher performance.


Saving weights of both models

In [None]:
# Persist each model's state_dict to its own file.
torch.save(
    obj=model_distilbert.state_dict(),
    f="distilbert_classifier.pt",
)
torch.save(
    obj=model_tinybert.state_dict(),
    f="tinybert_classifier.pt",
)

**References**

- Enron Spam Dataset: https://huggingface.co/datasets/SetFit/enron_spam
- DistilBERT Documentation: https://huggingface.co/docs/transformers/en/model_doc/distilbert
- TinyBERT Documentation: https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D
- PyTorch Documentation: https://pytorch.org/docs/stable/index.html
- Transformers Library: https://huggingface.co/transformers/
- Datasets Library: https://huggingface.co/docs/datasets/
- scikit-learn: https://scikit-learn.org/stable/user_guide.html
- NumPy Documentation: https://numpy.org/doc/stable/user/index.html#user
- Python random module Documentation: https://docs.python.org/3/library/random.html