In [172]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import pandas as pd
import numpy as np
import os
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [173]:
# Hyperparameters
MAX_LENGTH = 128
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 3e-4
D_MODEL = 256  # Embedding dimension
N_HEADS = 8    # Number of attention heads
N_LAYERS = 2   # Number of transformer layers


import os
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# Device configuration
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

DEVICE = torch.device("cpu")

print(f"Using device: {DEVICE}")

Using device: cpu


In [174]:
class SpamDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=MAX_LENGTH,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [175]:
class SpamTransformer(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, D_MODEL)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=D_MODEL,
            nhead=N_HEADS,
            dim_feedforward=D_MODEL * 4,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N_LAYERS)
        self.classifier = nn.Linear(D_MODEL, 2)  # Binary classification
        
    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)
        
        # Create padding mask for transformer
        mask = attention_mask.bool()
        
        # Pass through transformer
        x = self.transformer(x, src_key_padding_mask=~mask)
        
        # Pool the output (use [CLS] token or mean pooling)
        x = torch.mean(x * attention_mask.unsqueeze(-1), dim=1)
        
        # Classify
        return self.classifier(x)

In [176]:
def train_model(model, train_loader, val_loader, criterion, optimizer):
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        correct = 0
        total = 0
        
        for batch in train_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)
            
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        
        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(DEVICE)
                attention_mask = batch['attention_mask'].to(DEVICE)
                labels = batch['label'].to(DEVICE)
                
                outputs = model(input_ids, attention_mask)
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
        
        print(f'Epoch {epoch+1}:')
        print(f'Training Loss: {total_loss/len(train_loader):.4f}')
        print(f'Training Accuracy: {100 * correct/total:.2f}%')
        print(f'Validation Accuracy: {100 * val_correct/val_total:.2f}%')
        print('-' * 50)

    


In [177]:

def load_enron_data(spam_path, ham_path):
    """Load and preprocess the Enron spam/ham dataset."""
    spam_emails = []
    ham_emails = []
    
    # Load spam emails
    for filename in os.listdir(spam_path):
        with open(os.path.join(spam_path, filename), 'r', encoding='latin1') as f:
            try:
                content = f.read()
                spam_emails.append(content)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    
    # Load ham emails
    for filename in os.listdir(ham_path):
        with open(os.path.join(ham_path, filename), 'r', encoding='latin1') as f:
            try:
                content = f.read()
                ham_emails.append(content)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
    
    # Create labels (1 for spam, 0 for ham)
    emails = spam_emails + ham_emails
    labels = [1] * len(spam_emails) + [0] * len(ham_emails)
    
    # Clean the text
    cleaned_emails = []
    for email in emails:
        # Remove email headers
        try:
            content = email.split('\n\n', 1)[1]
        except IndexError:
            content = email
            
        # Remove special characters and extra whitespace
        content = re.sub(r'[^\w\s]', ' ', content)
        content = ' '.join(content.split())
        cleaned_emails.append(content)
    
    return cleaned_emails, labels

In [178]:
for i in range(1, 7):
    # !wget --no-check-certificate -P data https://www2.aueb.gr/users/ion/data/enron-spam/preprocessed/enron{i}.tar.gz
    !tar -xzf data/enron{i}.tar.gz -C data

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [179]:
import numpy as np

def load_data_from_directories(max_items=None):
    spam_emails = []
    ham_emails = []

    for i in range(1, 2):
        spam_path = f'data/enron{i}/spam'
        ham_path = f'data/enron{i}/ham'
        
        # Load spam and ham emails
        spam, ham = load_enron_data(spam_path, ham_path)
        spam_emails.extend(spam)
        ham_emails.extend(ham)

    # Combine corresponding spam and ham emails into pairs
    email_pairs = list(zip(spam_emails, ham_emails))

    # Shuffle email pairs
    np.random.seed(42)
    np.random.shuffle(email_pairs)

    if max_items is not None:
        email_pairs = email_pairs[:max_items]

    # Separate spam and ham emails after shuffling
    spam_emails, ham_emails = zip(*email_pairs)

    return list(spam_emails), list(ham_emails)


In [180]:
print("Loading Enron dataset...")
texts, labels = load_data_from_directories()



Loading Enron dataset...


In [181]:
print(len(texts))
print(len(labels))
print(texts[:2])
print(labels[:2])

5172
5172
['Subject 8 th noms forwarded by ami chokshi corp enron on 02 07 2000 10 30 am troy _ a _ benoit reliantenergy com on 02 07 2000 10 19 10 am to ami chokshi cc subject 8 th noms see attached file hpl feb xls hpl feb xls', 'Subject ces desk move assuming the test results are good the move from the ces desks to the appropriate ena desks will take place either wednesday or thursday evening ces im east to ena im east ces im central ena im central ces im texas ena im texas this move will be done systematically for january forward business after this move is complete there should be no active deals remaining on the ces desks the following is a list of deals that i wanted to bring to your attention and review one of the base assumptions expressed by all was that there were no transport usage tickets booked on the ces desk this is not true however transport usage tickets will be included in the change over therefore this will not be a problem on the east desk three desk to desk deals 

In [182]:
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.3, random_state=42  # 70% train, 30% for val+test
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42  # Split temp 50-50
)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

train_dataset = SpamDataset(train_texts, train_labels, tokenizer)
val_dataset = SpamDataset(val_texts, val_labels, tokenizer)
test_dataset = SpamDataset(test_texts, test_labels, tokenizer)

In [183]:
# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Initialize model
model = SpamTransformer(tokenizer.vocab_size).to(DEVICE)

# Initialize loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [184]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer)


Epoch 1:
Training Loss: 0.2895
Training Accuracy: 88.23%
Validation Accuracy: 90.34%
--------------------------------------------------
Epoch 2:
Training Loss: 0.1255
Training Accuracy: 95.69%
Validation Accuracy: 95.49%
--------------------------------------------------
Epoch 3:
Training Loss: 0.0733
Training Accuracy: 98.07%
Validation Accuracy: 95.75%
--------------------------------------------------
Epoch 4:
Training Loss: 0.0388
Training Accuracy: 99.01%
Validation Accuracy: 95.49%
--------------------------------------------------


KeyboardInterrupt: 

In [185]:
def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(DEVICE)
            attention_mask = batch['attention_mask'].to(DEVICE)
            labels = batch['label'].to(DEVICE)
            
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    # Calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    conf_matrix = confusion_matrix(true_labels, predictions)
    
    print("\nTest Set Evaluation:")
    print(f"Average Loss: {total_loss/len(test_loader):.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': conf_matrix
    }

In [186]:
print(model)

SpamTransformer(
  (embedding): Embedding(30522, 256)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features=256, out_features=2, bias=True)
)


In [189]:
import torch
import torch.nn as nn
import math

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()
        self.d_k = d_k  # Dimension of keys/queries
        
    def forward(self, Q, K, V, mask=None):
        # Calculate attention scores
        # (batch_size, n_heads, seq_len, seq_len)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        # Apply mask if provided
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        # Apply softmax to get attention weights
        attention_weights = torch.softmax(scores, dim=-1)
        
        # Calculate output
        output = torch.matmul(attention_weights, V)
        
        return output, attention_weights

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        
        # Linear layers for Q, K, V projections
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        
        self.attention = ScaledDotProductAttention(self.d_k)
        
        # Output linear layer
        self.W_o = nn.Linear(d_model, d_model)
        
    def forward(self, x, mask=None):
        batch_size = x.size(0)
        
        # Linear projections and reshape for multiple heads
        Q = self.W_q(x).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        
        # Apply attention
        output, attention_weights = self.attention(Q, K, V, mask)
        
        # Reshape and apply output linear layer
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.W_o(output)
        
        return output, attention_weights

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

class CustomTransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model, n_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        # Self attention
        attn_output, attention_weights = self.self_attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        
        # Feed forward
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        
        return x, attention_weights

class CustomSpamTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=256, n_heads=8, n_layers=2, d_ff=1024):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        # Stack of transformer encoder layers
        self.encoder_layers = nn.ModuleList([
            CustomTransformerEncoderLayer(d_model, n_heads, d_ff)
            for _ in range(n_layers)
        ])
        
        self.classifier = nn.Linear(d_model, 2)
        
    def forward(self, input_ids, attention_mask):
        # Embed input tokens
        x = self.embedding(input_ids)
        
        # Create attention mask
        # Convert from [batch_size, seq_len] to [batch_size, 1, 1, seq_len]
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        
        # Store attention weights from all layers
        attention_weights_list = []
        
        # Pass through encoder layers
        for encoder_layer in self.encoder_layers:
            x, attention_weights = encoder_layer(x, attention_mask)
            attention_weights_list.append(attention_weights)
        
        # Pool the output (mean pooling)
        x = torch.mean(x * attention_mask.squeeze(1).unsqueeze(-1), dim=1)
        
        # Classify
        return self.classifier(x), attention_weights_list

# Usage example:
model = CustomSpamTransformer(vocab_size=30000)  # or whatever your vocab size is

In [188]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Trainable Parameters: {count_parameters(model):,}")


Trainable Parameters: 9,393,666


In [191]:
print(model)
print(f"Trainable Parameters: {count_parameters(model):,}")


CustomSpamTransformer(
  (embedding): Embedding(30000, 256)
  (encoder_layers): ModuleList(
    (0-1): 2 x CustomTransformerEncoderLayer(
      (self_attention): MultiHeadAttention(
        (W_q): Linear(in_features=256, out_features=256, bias=True)
        (W_k): Linear(in_features=256, out_features=256, bias=True)
        (W_v): Linear(in_features=256, out_features=256, bias=True)
        (attention): ScaledDotProductAttention()
        (W_o): Linear(in_features=256, out_features=256, bias=True)
      )
      (feed_forward): FeedForward(
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (classifier): Linear(in_features=256, out_features=2, bias=True)
)
Trainable Par

In [244]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# PyTorch Scaled Dot-Product Attention Logic
def pytorch_scaled_dot_product_attention(query, key, value, mask=None):
    scores = torch.matmul(query, key.transpose(-2, -1)) / (key.size(-1) ** 0.5)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, value)
    return output

# Custom Self-Attention
class CustomSelfAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.head_dim = d_model // n_heads

        assert self.head_dim * n_heads == d_model, "d_model must be divisible by n_heads"

        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.size()

        # Project inputs to Q, K, V
        q = self.q_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(batch_size, seq_len, self.n_heads, self.head_dim).transpose(1, 2)

        # Apply scaled dot-product attention
        attn_output = pytorch_scaled_dot_product_attention(q, k, v, mask)

        # Concatenate heads and project output
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        return self.out_proj(attn_output)

# Custom Transformer Encoder Layer
class CustomTransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, dim_feedforward, dropout=0.1):
        super().__init__()
        self.self_attn = CustomSelfAttention(d_model, n_heads)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.self_attn(src, src_key_padding_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)

        src2 = self.linear2(self.dropout(F.relu(self.linear1(src))))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

# SpamTransformer Model
class SpamTransformer(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_layers, dim_feedforward, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Custom Transformer Encoder
        encoder_layer = CustomTransformerEncoderLayer(
            d_model=d_model, n_heads=n_heads, dim_feedforward=dim_feedforward
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, attention_mask):
        x = self.embedding(input_ids)

        # Create padding mask for transformer
        mask = attention_mask.bool()

        # Pass through transformer
        x = self.transformer(x, src_key_padding_mask=~mask)

        # Pool the output (mean pooling)
        x = torch.mean(x * attention_mask.unsqueeze(-1), dim=1)

        # Classify
        return self.classifier(x)


In [246]:
model = SpamTransformer(
    vocab_size=tokenizer.vocab_size,
    d_model=256,
    n_heads=8,
    n_layers=2,
    dim_feedforward=1024,
    num_classes=2
)



In [247]:
print(model)

SpamTransformer(
  (embedding): Embedding(30522, 256)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x CustomTransformerEncoderLayer(
        (self_attn): CustomSelfAttention(
          (q_proj): Linear(in_features=256, out_features=256, bias=True)
          (k_proj): Linear(in_features=256, out_features=256, bias=True)
          (v_proj): Linear(in_features=256, out_features=256, bias=True)
          (out_proj): Linear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=1024, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Linear(in_features

In [248]:
train_model(model, train_loader, val_loader, criterion, optimizer)


AttributeError: 'CustomSelfAttention' object has no attribute 'batch_first'

In [187]:
evaluate_model(model, test_loader, criterion)


Test Set Evaluation:
Average Loss: 0.1923
Accuracy: 0.9549
Precision: 0.8750
Recall: 0.9868
F1 Score: 0.9275

Confusion Matrix:
[[517  32]
 [  3 224]]


{'accuracy': 0.9548969072164949,
 'precision': 0.875,
 'recall': 0.986784140969163,
 'f1': 0.927536231884058,
 'confusion_matrix': array([[517,  32],
        [  3, 224]])}