In [101]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import os
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [102]:
# Hyperparameters
MAX_LENGTH = 128
BATCH_SIZE = 32
EPOCHS = 15
LEARNING_RATE = 3e-4
D_MODEL = 256  # Embedding dimension
N_HEADS = 8    # Number of attention heads
N_LAYERS = 2   # Number of transformer layers


import os
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# Device configuration
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

DEVICE = torch.device("cpu")

print(f"Using device: {DEVICE}")

Using device: cpu


In [103]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=MAX_LENGTH,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [104]:
class SpamTransformer(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, D_MODEL)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=D_MODEL,
            nhead=N_HEADS,
            dim_feedforward=D_MODEL * 4,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N_LAYERS)
        self.classifier = nn.Linear(D_MODEL, 2)  # Binary classification
        
    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        
        # Create padding mask for transformer
        mask = attention_mask.bool()
        
        # Pass through transformer
        x = self.transformer(x, src_key_padding_mask=~mask)
        
        # Pool the output (use [CLS] token or mean pooling)
        x = torch.mean(x * attention_mask.unsqueeze(-1), dim=1)
        
        # Classify
        return self.classifier(x)

In [105]:
class CustomMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, batch_first=True):
        super().__init__()
        
        # Basic checks
        if embed_dim <= 0 or num_heads <= 0:
            raise ValueError(f"embed_dim ({embed_dim}) and num_heads ({num_heads}) must be greater than 0")
        if embed_dim % num_heads != 0:
            raise ValueError(f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})")
            
        # Save parameters
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.batch_first = batch_first
        self.scaling = float(self.head_dim) ** -0.5
        
        # Projection layers
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        
        # Initialize weights
        nn.init.xavier_uniform_(self.q_proj.weight)
        nn.init.xavier_uniform_(self.k_proj.weight)
        nn.init.xavier_uniform_(self.v_proj.weight)
        nn.init.xavier_uniform_(self.out_proj.weight)
        if bias:
            nn.init.constant_(self.q_proj.bias, 0.)
            nn.init.constant_(self.k_proj.bias, 0.)
            nn.init.constant_(self.v_proj.bias, 0.)
            nn.init.constant_(self.out_proj.bias, 0.)

    def forward(self, query, key, value, key_padding_mask=None, attn_mask=None, 
               need_weights=True, average_attn_weights=True, is_causal=False):
        # Check if inputs are batched and handle batch_first
        is_batched = query.dim() == 3
        if self.batch_first and is_batched:
            query, key, value = [x.transpose(1, 0) for x in (query, key, value)]
        
        # Get sizes
        tgt_len, bsz, embed_dim = query.shape
        src_len = key.shape[0]
        
        # 1. Project inputs
        q = self.q_proj(query)
        k = self.k_proj(key)
        v = self.v_proj(value)
        
        # 2. Reshape for multi-head attention
        q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        k = k.contiguous().view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        v = v.contiguous().view(src_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
        
        # 3. Calculate attention scores
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scaling
        
        # 4. Apply masks if provided
        if attn_mask is not None:
            if attn_mask.dim() == 2:
                attn_mask = attn_mask.unsqueeze(0)
            if attn_mask.dtype == torch.bool:
                attn_weights.masked_fill_(attn_mask, float('-inf'))
            else:
                attn_weights += attn_mask
            
        if key_padding_mask is not None:
            key_padding_mask = key_padding_mask.to(torch.bool)  # Convert to boolean
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf'),
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if is_causal:
            causal_mask = torch.triu(
                torch.ones(tgt_len, src_len, dtype=torch.bool, device=query.device), 
                diagonal=1
            )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                causal_mask.unsqueeze(0).unsqueeze(0),
                float('-inf')
            )
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
        
        # 5. Apply softmax and dropout
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)
        
        # 6. Apply attention to values
        attn_output = torch.matmul(attn_weights, v)
        
        # 7. Reshape and apply output projection
        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn_output = self.out_proj(attn_output)
        
        # Handle batch_first for output
        if self.batch_first and is_batched:
            attn_output = attn_output.transpose(1, 0)
        
        if need_weights:
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            if average_attn_weights:
                attn_weights = attn_weights.mean(dim=1)
            return attn_output, attn_weights
        else:
            return attn_output, None

In [106]:
class CustomTransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, batch_first=True):
        super().__init__()
        
        # Multi-head self-attention
        # self.self_attn = nn.MultiheadAttention(
        #     embed_dim=d_model,
        #     num_heads=nhead,
        #     dropout=dropout,
        #     batch_first=batch_first
        # )

        self.self_attn = CustomMultiheadAttention(
            embed_dim=d_model,
            num_heads=nhead,
            dropout=dropout,
            batch_first=batch_first
        )
        
        # Feed-forward network
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, d_model)
        )
        
        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        self.batch_first = batch_first

    def forward(self, src, src_mask=None, is_causal=None, src_key_padding_mask=None):  # Added is_causal parameter
        # Self-attention block
        attn_output, _ = self.self_attn(src, src, src,
                                      attn_mask=src_mask,
                                      key_padding_mask=src_key_padding_mask,
                                      is_causal=is_causal)  # Added is_causal parameter
        
        src = src + self.dropout(attn_output)  # Residual connection
        src = self.norm1(src)  # Layer normalization
        
        # Feedforward block
        ff_output = self.feedforward(src)
        src = src + self.dropout(ff_output)  # Residual connection
        src = self.norm2(src)  # Layer normalization
        
        return src

In [107]:
class CustomSpamTransformer(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, D_MODEL)
        encoder_layer = CustomTransformerEncoderLayer(
            d_model=D_MODEL,
            nhead=N_HEADS,
            dim_feedforward=D_MODEL * 4,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N_LAYERS)
        self.classifier = nn.Linear(D_MODEL, 2)  # Binary classification
        
    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        
        # Create padding mask for transformer
        mask = attention_mask.bool()
        
        # Pass through transformer
        x = self.transformer(x, src_key_padding_mask=~mask)
        
        # Pool the output (use [CLS] token or mean pooling)
        x = torch.mean(x * attention_mask.unsqueeze(-1), dim=1)
        
        # Classify
        return self.classifier(x)

In [108]:
class GaussianSpamTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        # Create GaussianBlock
        norm_axes = [1] * n_layers  # 1 for sequence dimension
        num_heads = [n_heads] * n_layers
        num_gaussians = [8] * n_layers  # You can experiment with this
        
        self.gaussian_block = GaussianBlock(
            norm_axes=norm_axes,
            num_heads=num_heads,
            num_gaussians=num_gaussians,
            num_layers=n_layers,
            padding_value=0  # Your padding token value
        )
        
        self.classifier = nn.Linear(d_model, 2)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        x = self.gaussian_block(x)  # Just use the block directly
        x = torch.mean(x * attention_mask.unsqueeze(-1), dim=1)
        return self.classifier(x)

In [129]:
def train_model(model, train_loader, val_loader, criterion, optimizer, patience=5):
    model.train()
    
    best_val_accuracy = 0.0
    best_model_state = None
    wait = 0

    for epoch in range(EPOCHS):
        total_loss = 0
        correct = 0
        total = 0
        
        for batch in train_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['label'].to(DEVICE)
                
                outputs = model(input_ids, attention_mask)
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        train_accuracy = 100 * correct / total
        val_accuracy = 100 * val_correct / val_total
        print(f'Epoch {epoch+1}:')
        print(f'Training Loss: {total_loss/len(train_loader):.4f}')
        print(f'Training Accuracy: {100 * correct/total:.2f}%')
        print(f'Validation Accuracy: {100 * val_correct/val_total:.2f}%')
        print('-' * 50)

        # Early Stopping Logic
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_state = model.state_dict()  # Save the best model
            wait = 0  # Reset patience counter
        else:
            wait += 1  # Increment patience counter
        
        if wait >= patience:
            print(f"Early stopping triggered at epoch {epoch+1}. Best Validation Accuracy: {best_val_accuracy:.2f}%")
            break

    # Load the best model state
    if best_model_state:
        model.load_state_dict(best_model_state)


In [110]:

def load_enron_data(spam_path, ham_path):
    """Load and preprocess the Enron spam/ham dataset."""
    spam_emails = []
    ham_emails = []
    
    # Load spam emails
    for filename in os.listdir(spam_path):
        with open(os.path.join(spam_path, filename), 'r', encoding='latin1') as f:
            try:
                content = f.read()
                spam_emails.append(content)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    
    # Load ham emails
    for filename in os.listdir(ham_path):
        with open(os.path.join(ham_path, filename), 'r', encoding='latin1') as f:
            try:
                content = f.read()
                ham_emails.append(content)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    
    # Create labels (1 for spam, 0 for ham)
    emails = spam_emails + ham_emails
    labels = [1] * len(spam_emails) + [0] * len(ham_emails)
    
    # Clean the text
    cleaned_emails = []
    for email in emails:
        # Remove email headers
        try:
            content = email.split('\n\n', 1)[1]
        except IndexError:
            content = email
            
        # Remove special characters and extra whitespace
        content = re.sub(r'[^\w\s]', ' ', content)
        content = ' '.join(content.split())
        cleaned_emails.append(content)
    
    return cleaned_emails, labels

In [111]:
for i in range(1, 7):
    # !wget --no-check-certificate -P data https://www2.aueb.gr/users/ion/data/enron-spam/preprocessed/enron{i}.tar.gz
    !tar -xzf data/enron{i}.tar.gz -C data

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [112]:
import numpy as np

def load_data_from_directories(max_items=None):
    spam_emails = []
    ham_emails = []

    for i in range(1, 2):
        spam_path = f'data/enron{i}/spam'
        ham_path = f'data/enron{i}/ham'
        
        # Load spam and ham emails
        spam, ham = load_enron_data(spam_path, ham_path)
        spam_emails.extend(spam)
        ham_emails.extend(ham)

    # Combine corresponding spam and ham emails into pairs
    email_pairs = list(zip(spam_emails, ham_emails))

    # Shuffle email pairs
    np.random.seed(42)
    np.random.shuffle(email_pairs)

    if max_items is not None:
        email_pairs = email_pairs[:max_items]

    # Separate spam and ham emails after shuffling
    spam_emails, ham_emails = zip(*email_pairs)

    return list(spam_emails), list(ham_emails)


In [113]:
print("Loading Enron dataset...")
texts, labels = load_data_from_directories()



Loading Enron dataset...


In [114]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.3, random_state=42  # 70% train, 30% for val+test
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42  # Split temp 50-50
)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_dataset = SpamDataset(train_texts, train_labels, tokenizer)
val_dataset = SpamDataset(val_texts, val_labels, tokenizer)
test_dataset = SpamDataset(test_texts, test_labels, tokenizer)

In [115]:
import torch
import torch.nn as nn

class GaussianAdaptiveAttention(nn.Module):
    def __init__(self, norm_axis, num_heads, num_gaussians, padding_value, mean_offset_init=0, eps=1e-8):
        super().__init__()
        if not isinstance(norm_axis, int):
            raise ValueError("norm_axis must be an integer.")
        if num_heads <= 0 or not isinstance(num_heads, int):
            raise ValueError("num_heads must be a positive integer.")
        if num_gaussians <= 0 or not isinstance(num_gaussians, int):
            raise ValueError("num_gaussians must be a positive integer.")

        self.norm_axis = norm_axis
        self.eps = eps
        self.num_heads = num_heads
        self.padding_value = padding_value
        self.num_gaussians = num_gaussians

        self.mean_offsets = nn.Parameter(torch.zeros(num_gaussians, dtype=torch.float))
        self.c = nn.Parameter(torch.exp(torch.randn(num_gaussians, dtype=torch.float)))

    def forward(self, x, return_attention_details=False):
        if x.dim() < 2:
            raise ValueError(f"Input tensor must have at least 2 dimensions, got {x.dim()}.")
        if self.norm_axis >= x.dim() or self.norm_axis < -x.dim():
            raise ValueError(f"norm_axis {self.norm_axis} is out of bounds for input tensor with {x.dim()} dimensions.")

        mask = x != self.padding_value if self.padding_value is not None else None
        x_masked = torch.where(mask, x, torch.zeros_like(x)) if mask is not None else x

        mean = x_masked.mean(dim=self.norm_axis, keepdim=True)
        var = x_masked.var(dim=self.norm_axis, keepdim=True) + self.eps

        mixture = 1
        for i in range(self.num_gaussians):
            adjusted_mean = mean + self.mean_offsets[i]
            y_norm = (x - adjusted_mean) / torch.sqrt(var)
            gaussian = torch.exp(-((y_norm ** 2) / (2.0 * (self.c[i] ** 2)))) / torch.sqrt(2 * torch.pi * (self.c[i] ** 2))
            mixture *= gaussian

        mixture /= mixture.sum(dim=self.norm_axis, keepdim=True).clamp(min=self.eps)

        if return_attention_details:
            return torch.where(mask, x * mixture, x) if mask is not None else x * mixture, mixture.detach()
        else:
            return torch.where(mask, x * mixture, x) if mask is not None else x * mixture
            
            
class MultiHeadGaussianAdaptiveAttention(nn.Module):
    def __init__(self, norm_axis, num_heads, num_gaussians, padding_value=None, eps=1e-8):
        super().__init__()
        self.norm_axis = norm_axis
        self.num_heads = num_heads
        self.attention_heads = nn.ModuleList([
            GaussianAdaptiveAttention(norm_axis, num_heads, num_gaussians, padding_value, eps)
            for _ in range(num_heads)
        ])

    def forward(self, x, return_attention_details=False):
        chunk_size = x.shape[self.norm_axis] // self.num_heads
        if chunk_size == 0:
            raise ValueError(f"Input tensor size along norm_axis ({self.norm_axis}) must be larger than the number of heads ({self.num_heads}).")

        outputs, attention_details_ = [], []
        for i in range(self.num_heads):
            start_index = i * chunk_size
            end_index = start_index + chunk_size if i < self.num_heads - 1 else x.shape[self.norm_axis]
            chunk = x.narrow(self.norm_axis, start_index, end_index - start_index)
            if return_attention_details:
                out, mixture = self.attention_heads[i](chunk, return_attention_details=True)
                outputs.append(out)
                attention_details_.append(mixture)
            else:
                outputs.append(self.attention_heads[i](chunk))

        if return_attention_details:
            return torch.cat(outputs, dim=self.norm_axis), torch.cat(attention_details_, dim=self.norm_axis)
        else:
            return torch.cat(outputs, dim=self.norm_axis)
            
            

class GaussianBlock(nn.Module):
    def __init__(self, norm_axes, num_heads, num_gaussians, num_layers, padding_value=None, eps=1e-8):
        super().__init__()
        if len(norm_axes) != num_layers or len(num_heads) != num_layers or len(num_gaussians) != num_layers:
            raise ValueError("Lengths of norm_axes, num_heads, and num_gaussians must match num_layers.")

        self.layers = nn.ModuleList([
            MultiHeadGaussianAdaptiveAttention(norm_axes[i], num_heads[i], num_gaussians[i], padding_value, eps)
            for i in range(num_layers)
        ])

    def forward(self, x, return_attention_details=False):
        attention_details_ = {}
        for idx, layer in enumerate(self.layers):
            if return_attention_details:
                x_, attention_details = layer(x, return_attention_details=True)
                attention_details_['layer_'+str(idx)] = attention_details
                x = x_ + x
            else:
                x = layer(x) + x

        if return_attention_details:
            return x, attention_details_
        return x

In [127]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Initialize model
model = SpamTransformer(tokenizer.vocab_size).to(DEVICE)
# model = CustomSpamTransformer(tokenizer.vocab_size).to(DEVICE)

# Initialize loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [130]:
train_model(model, train_loader, val_loader, criterion, optimizer)


Epoch 1:
Training Loss: 0.1056
Training Accuracy: 96.66%
Validation Accuracy: 94.97%
--------------------------------------------------
Epoch 2:
Training Loss: 0.0639
Training Accuracy: 98.26%
Validation Accuracy: 94.97%
--------------------------------------------------
Epoch 3:
Training Loss: 0.0441
Training Accuracy: 98.84%
Validation Accuracy: 96.13%
--------------------------------------------------
Epoch 4:
Training Loss: 0.0304
Training Accuracy: 99.23%
Validation Accuracy: 96.01%
--------------------------------------------------
Epoch 5:
Training Loss: 0.0172
Training Accuracy: 99.70%
Validation Accuracy: 95.23%
--------------------------------------------------
Epoch 6:
Training Loss: 0.0134
Training Accuracy: 99.83%
Validation Accuracy: 96.26%
--------------------------------------------------
Epoch 7:
Training Loss: 0.0228
Training Accuracy: 99.45%
Validation Accuracy: 96.01%
--------------------------------------------------
Epoch 8:
Training Loss: 0.0264
Training Accuracy

In [118]:
def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    conf_matrix = confusion_matrix(true_labels, predictions)
    
    print("\nTest Set Evaluation:")
    print(f"Average Loss: {total_loss/len(test_loader):.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

In [119]:
evaluate_model(model, test_loader, criterion)


Test Set Evaluation:
Average Loss: 0.2596
Accuracy: 0.9652
Precision: 0.9425
Recall: 0.9383
F1 Score: 0.9404

Confusion Matrix:
[[536  13]
 [ 14 213]]


{'accuracy': 0.9652061855670103,
 'precision': 0.9424778761061947,
 'recall': 0.9383259911894273,
 'f1': 0.9403973509933775,
 'confusion_matrix': array([[536,  13],
        [ 14, 213]])}

In [123]:
model = GaussianSpamTransformer(tokenizer.vocab_size, D_MODEL, N_HEADS, N_LAYERS).to(DEVICE)

# Initialize loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [124]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer)

Epoch 1:
Training Loss: 0.6050
Training Accuracy: 72.54%
Validation Accuracy: 66.62%
--------------------------------------------------
Epoch 2:
Training Loss: 0.4878
Training Accuracy: 73.29%
Validation Accuracy: 69.07%
--------------------------------------------------
Epoch 3:
Training Loss: 0.4024
Training Accuracy: 78.65%
Validation Accuracy: 77.06%
--------------------------------------------------
Epoch 4:
Training Loss: 0.3292
Training Accuracy: 86.41%
Validation Accuracy: 86.98%
--------------------------------------------------
Epoch 5:
Training Loss: 0.2669
Training Accuracy: 92.71%
Validation Accuracy: 91.11%
--------------------------------------------------
Epoch 6:
Training Loss: 0.2192
Training Accuracy: 95.06%
Validation Accuracy: 92.91%
--------------------------------------------------
Epoch 7:
Training Loss: 0.1844
Training Accuracy: 96.27%
Validation Accuracy: 94.07%
--------------------------------------------------
Epoch 8:
Training Loss: 0.1556
Training Accuracy

In [125]:
evaluate_model(model, test_loader, criterion)


Test Set Evaluation:
Average Loss: 0.1094
Accuracy: 0.9729
Precision: 0.9682
Recall: 0.9383
F1 Score: 0.9530

Confusion Matrix:
[[542   7]
 [ 14 213]]


{'accuracy': 0.9729381443298969,
 'precision': 0.9681818181818181,
 'recall': 0.9383259911894273,
 'f1': 0.9530201342281879,
 'confusion_matrix': array([[542,   7],
        [ 14, 213]])}