In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import os
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Hyperparameters
MAX_LENGTH = 128
BATCH_SIZE = 32
EPOCHS = 20
LEARNING_RATE = 3e-4
D_MODEL = 256  # Embedding dimension
N_HEADS = 8    # Number of attention heads
N_LAYERS = 2   # Number of transformer layers

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print(f"Using device: {DEVICE}")

Using device: cuda


In [3]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=MAX_LENGTH,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [4]:

def load_enron_data(spam_path, ham_path):
    """Load and preprocess the Enron spam/ham dataset."""
    spam_emails = []
    ham_emails = []

    # Load spam emails
    for filename in os.listdir(spam_path):
        with open(os.path.join(spam_path, filename), 'r', encoding='latin1') as f:
            try:
                content = f.read()
                spam_emails.append(content)
            except Exception as e:
                print(f"Error reading {filename}: {e}")

    # Load ham emails
    for filename in os.listdir(ham_path):
        with open(os.path.join(ham_path, filename), 'r', encoding='latin1') as f:
            try:
                content = f.read()
                ham_emails.append(content)
            except Exception as e:
                print(f"Error reading {filename}: {e}")

    # Create labels (1 for spam, 0 for ham)
    emails = spam_emails + ham_emails
    labels = [1] * len(spam_emails) + [0] * len(ham_emails)

    # Clean the text
    cleaned_emails = []
    for email in emails:
        # Remove email headers
        try:
            content = email.split('\n\n', 1)[1]
        except IndexError:
            content = email

        # Remove special characters and extra whitespace
        content = re.sub(r'[^\w\s]', ' ', content)
        content = ' '.join(content.split())
        cleaned_emails.append(content)

    return cleaned_emails, labels

In [5]:
import numpy as np

def load_data_from_directories(max_items=None):
    spam_emails = []
    ham_emails = []

    for i in range(1, 7):
        spam_path = f'data/enron{i}/spam'
        ham_path = f'data/enron{i}/ham'

        # Load spam and ham emails
        spam, ham = load_enron_data(spam_path, ham_path)
        spam_emails.extend(spam)
        ham_emails.extend(ham)

    # Combine corresponding spam and ham emails into pairs
    email_pairs = list(zip(spam_emails, ham_emails))

    # Shuffle email pairs
    np.random.seed(42)
    np.random.shuffle(email_pairs)

    if max_items is not None:
        email_pairs = email_pairs[:max_items]

    # Separate spam and ham emails after shuffling
    spam_emails, ham_emails = zip(*email_pairs)

    return list(spam_emails), list(ham_emails)


In [6]:
class SpamTransformer(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, D_MODEL)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=D_MODEL,
            nhead=N_HEADS,
            dim_feedforward=D_MODEL * 4,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N_LAYERS)
        self.classifier = nn.Linear(D_MODEL, 2)  # Binary classification

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)

        # Create padding mask for transformer
        mask = attention_mask.bool()

        # Pass through transformer
        x = self.transformer(x, src_key_padding_mask=~mask)

        # Pool the output (use [CLS] token or mean pooling)
        x = torch.mean(x * attention_mask.unsqueeze(-1), dim=1)

        # Classify
        return self.classifier(x)

In [7]:
def train_model(model, train_loader, val_loader, criterion, optimizer, patience=5):
    model.train()

    best_val_accuracy = 0.0
    best_model_state = None
    wait = 0

    for epoch in range(EPOCHS):
        total_loss = 0
        correct = 0
        total = 0

        for batch in train_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['label'].to(DEVICE)

                outputs = model(input_ids, attention_mask)
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        train_accuracy = 100 * correct / total
        val_accuracy = 100 * val_correct / val_total
        print(f'Epoch {epoch+1}:')
        print(f'Training Loss: {total_loss/len(train_loader):.4f}')
        print(f'Training Accuracy: {100 * correct/total:.2f}%')
        print(f'Validation Accuracy: {100 * val_correct/val_total:.2f}%')
        print('-' * 50)

        # Early Stopping Logic
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()  # Save the best model
            wait = 0  # Reset patience counter
        else:
            wait += 1  # Increment patience counter

        if wait >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}. Best Validation Accuracy: {best_val_accuracy:.2f}%")
            break

    # Load the best model state
    if best_model_state:
        model.load_state_dict(best_model_state)


In [8]:
def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    conf_matrix = confusion_matrix(true_labels, predictions)

    print("\nTest Set Evaluation:")
    print(f"Average Loss: {total_loss/len(test_loader):.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

In [9]:
class GaussianSpamTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Create GaussianBlock
        norm_axes = [1] * n_layers  # 1 for sequence dimension
        num_heads = [n_heads] * n_layers
        num_gaussians = [8] * n_layers  # You can experiment with this

        self.gaussian_block = GaussianBlock(
            norm_axes=norm_axes,
            num_heads=num_heads,
            num_gaussians=num_gaussians,
            num_layers=n_layers,
            padding_value=0  # Your padding token value
        )

        self.classifier = nn.Linear(d_model, 2)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        x = self.gaussian_block(x)  # Just use the block directly
        x = torch.mean(x * attention_mask.unsqueeze(-1), dim=1)
        return self.classifier(x)

## Code from Paper

Link: https://github.com/gioannides/Gaussian-Adaptive-Attention/blob/main/gaussian_adaptive_attention/GaussianBlock.py

In [10]:
import torch
import torch.nn as nn

class GaussianAdaptiveAttention(nn.Module):
    def __init__(self, norm_axis, num_heads, num_gaussians, padding_value, mean_offset_init=0, eps=1e-8):
        super().__init__()
        if not isinstance(norm_axis, int):
            raise ValueError("norm_axis must be an integer.")
        if num_heads <= 0 or not isinstance(num_heads, int):
            raise ValueError("num_heads must be a positive integer.")
        if num_gaussians <= 0 or not isinstance(num_gaussians, int):
            raise ValueError("num_gaussians must be a positive integer.")

        self.norm_axis = norm_axis
        self.eps = eps
        self.num_heads = num_heads
        self.padding_value = padding_value
        self.num_gaussians = num_gaussians

        self.mean_offsets = nn.Parameter(torch.zeros(num_gaussians, dtype=torch.float))
        self.c = nn.Parameter(torch.exp(torch.randn(num_gaussians, dtype=torch.float)))

    def forward(self, x, return_attention_details=False):
        if x.dim() < 2:
            raise ValueError(f"Input tensor must have at least 2 dimensions, got {x.dim()}.")
        if self.norm_axis >= x.dim() or self.norm_axis < -x.dim():
            raise ValueError(f"norm_axis {self.norm_axis} is out of bounds for input tensor with {x.dim()} dimensions.")

        mask = x != self.padding_value if self.padding_value is not None else None
        x_masked = torch.where(mask, x, torch.zeros_like(x)) if mask is not None else x

        mean = x_masked.mean(dim=self.norm_axis, keepdim=True)
        var = x_masked.var(dim=self.norm_axis, keepdim=True) + self.eps

        mixture = 1
        for i in range(self.num_gaussians):
            adjusted_mean = mean + self.mean_offsets[i]
            y_norm = (x - adjusted_mean) / torch.sqrt(var)
            gaussian = torch.exp(-((y_norm ** 2) / (2.0 * (self.c[i] ** 2)))) / torch.sqrt(2 * torch.pi * (self.c[i] ** 2))
            mixture *= gaussian

        mixture /= mixture.sum(dim=self.norm_axis, keepdim=True).clamp(min=self.eps)

        if return_attention_details:
            return torch.where(mask, x * mixture, x) if mask is not None else x * mixture, mixture.detach()
        else:
            return torch.where(mask, x * mixture, x) if mask is not None else x * mixture


class MultiHeadGaussianAdaptiveAttention(nn.Module):
    def __init__(self, norm_axis, num_heads, num_gaussians, padding_value=None, eps=1e-8):
        super().__init__()
        self.norm_axis = norm_axis
        self.num_heads = num_heads
        self.attention_heads = nn.ModuleList([
            GaussianAdaptiveAttention(norm_axis, num_heads, num_gaussians, padding_value, eps)
            for _ in range(num_heads)
        ])

    def forward(self, x, return_attention_details=False):
        chunk_size = x.shape[self.norm_axis] // self.num_heads
        if chunk_size == 0:
            raise ValueError(f"Input tensor size along norm_axis ({self.norm_axis}) must be larger than the number of heads ({self.num_heads}).")

        outputs, attention_details_ = [], []
        for i in range(self.num_heads):
            start_index = i * chunk_size
            end_index = start_index + chunk_size if i < self.num_heads - 1 else x.shape[self.norm_axis]
            chunk = x.narrow(self.norm_axis, start_index, end_index - start_index)
            if return_attention_details:
                out, mixture = self.attention_heads[i](chunk, return_attention_details=True)
                outputs.append(out)
                attention_details_.append(mixture)
            else:
                outputs.append(self.attention_heads[i](chunk))

        if return_attention_details:
            return torch.cat(outputs, dim=self.norm_axis), torch.cat(attention_details_, dim=self.norm_axis)
        else:
            return torch.cat(outputs, dim=self.norm_axis)



class GaussianBlock(nn.Module):
    def __init__(self, norm_axes, num_heads, num_gaussians, num_layers, padding_value=None, eps=1e-8):
        super().__init__()
        if len(norm_axes) != num_layers or len(num_heads) != num_layers or len(num_gaussians) != num_layers:
            raise ValueError("Lengths of norm_axes, num_heads, and num_gaussians must match num_layers.")

        self.layers = nn.ModuleList([
            MultiHeadGaussianAdaptiveAttention(norm_axes[i], num_heads[i], num_gaussians[i], padding_value, eps)
            for i in range(num_layers)
        ])

    def forward(self, x, return_attention_details=False):
        attention_details_ = {}
        for idx, layer in enumerate(self.layers):
            if return_attention_details:
                x_, attention_details = layer(x, return_attention_details=True)
                attention_details_['layer_'+str(idx)] = attention_details
                x = x_ + x
            else:
                x = layer(x) + x

        if return_attention_details:
            return x, attention_details_
        return x

In [11]:
for i in range(1, 7):
    !wget --no-check-certificate -P data https://www2.aueb.gr/users/ion/data/enron-spam/preprocessed/enron{i}.tar.gz
    !tar -xzf data/enron{i}.tar.gz -C data

--2025-01-15 15:41:32--  https://www2.aueb.gr/users/ion/data/enron-spam/preprocessed/enron1.tar.gz
Resolving www2.aueb.gr (www2.aueb.gr)... 195.251.255.230
Connecting to www2.aueb.gr (www2.aueb.gr)|195.251.255.230|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 1802573 (1.7M) [application/x-gzip]
Saving to: ‘data/enron1.tar.gz’


2025-01-15 15:41:34 (1.68 MB/s) - ‘data/enron1.tar.gz’ saved [1802573/1802573]

--2025-01-15 15:41:34--  https://www2.aueb.gr/users/ion/data/enron-spam/preprocessed/enron2.tar.gz
Resolving www2.aueb.gr (www2.aueb.gr)... 195.251.255.230
Connecting to www2.aueb.gr (www2.aueb.gr)|195.251.255.230|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 2905627 (2.8M) [application/x-gzip]
Saving to: ‘data/enron2.tar.gz’


2025-01-15 15:41:36 (2.60 MB/s) - ‘data/enron2.tar.gz’ saved [2905627/2905627]

--2025-01-15 15:41:36-

In [12]:
print("Loading Enron dataset...")
texts, labels = load_data_from_directories()

Loading Enron dataset...


In [13]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.3, random_state=42  # 70% train, 30% for val+test
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42  # Split temp 50-50
)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_dataset = SpamDataset(train_texts, train_labels, tokenizer)
val_dataset = SpamDataset(val_texts, val_labels, tokenizer)
test_dataset = SpamDataset(test_texts, test_labels, tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [15]:
model = SpamTransformer(tokenizer.vocab_size).to(DEVICE)

# Initialize loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [16]:
train_model(model, train_loader, val_loader, criterion, optimizer)

  output = torch._nested_tensor_from_mask(


Epoch 1:
Training Loss: 0.1609
Training Accuracy: 93.53%
Validation Accuracy: 97.51%
--------------------------------------------------
Epoch 2:
Training Loss: 0.0553
Training Accuracy: 98.26%
Validation Accuracy: 98.12%
--------------------------------------------------
Epoch 3:
Training Loss: 0.0306
Training Accuracy: 99.12%
Validation Accuracy: 98.18%
--------------------------------------------------
Epoch 4:
Training Loss: 0.0204
Training Accuracy: 99.44%
Validation Accuracy: 97.86%
--------------------------------------------------
Epoch 5:
Training Loss: 0.0132
Training Accuracy: 99.63%
Validation Accuracy: 97.92%
--------------------------------------------------
Epoch 6:
Training Loss: 0.0144
Training Accuracy: 99.66%
Validation Accuracy: 98.24%
--------------------------------------------------
Epoch 7:
Training Loss: 0.0071
Training Accuracy: 99.79%
Validation Accuracy: 98.04%
--------------------------------------------------
Epoch 8:
Training Loss: 0.0100
Training Accuracy

In [17]:
evaluate_model(model, test_loader, criterion)


Test Set Evaluation:
Average Loss: 0.1158
Accuracy: 0.9792
Precision: 0.9774
Recall: 0.9816
F1 Score: 0.9795

Confusion Matrix:
[[2443   58]
 [  47 2510]]


{'accuracy': 0.9792408066429419,
 'precision': 0.9774143302180686,
 'recall': 0.9816190848650762,
 'f1': 0.9795121951219512,
 'confusion_matrix': array([[2443,   58],
        [  47, 2510]])}

In [18]:
model = GaussianSpamTransformer(tokenizer.vocab_size, D_MODEL, N_HEADS, N_LAYERS).to(DEVICE)

# Initialize loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [19]:
train_model(model, train_loader, val_loader, criterion, optimizer)

Epoch 1:
Training Loss: 0.4337
Training Accuracy: 88.33%
Validation Accuracy: 95.45%
--------------------------------------------------
Epoch 2:
Training Loss: 0.1554
Training Accuracy: 96.93%
Validation Accuracy: 96.95%
--------------------------------------------------
Epoch 3:
Training Loss: 0.0916
Training Accuracy: 98.02%
Validation Accuracy: 97.65%
--------------------------------------------------
Epoch 4:
Training Loss: 0.0651
Training Accuracy: 98.51%
Validation Accuracy: 97.88%
--------------------------------------------------
Epoch 5:
Training Loss: 0.0497
Training Accuracy: 98.86%
Validation Accuracy: 98.14%
--------------------------------------------------
Epoch 6:
Training Loss: 0.0390
Training Accuracy: 99.18%
Validation Accuracy: 98.34%
--------------------------------------------------
Epoch 7:
Training Loss: 0.0314
Training Accuracy: 99.34%
Validation Accuracy: 98.40%
--------------------------------------------------
Epoch 8:
Training Loss: 0.0256
Training Accuracy

In [20]:
evaluate_model(model, test_loader, criterion)


Test Set Evaluation:
Average Loss: 0.0402
Accuracy: 0.9885
Precision: 0.9830
Recall: 0.9945
F1 Score: 0.9887

Confusion Matrix:
[[2457   44]
 [  14 2543]]


{'accuracy': 0.9885330170027679,
 'precision': 0.98299188248937,
 'recall': 0.9945248337895972,
 'f1': 0.9887247278382582,
 'confusion_matrix': array([[2457,   44],
        [  14, 2543]])}