In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from collections import defaultdict
import time

In [8]:
class BaseEmbeddingModel(nn.Module):
    """Mean-pooled embedding classifier.

    Token ids are embedded, averaged over dimension 1, passed through a
    two-layer ReLU MLP and projected to ``vocab_size`` logits.

    Args:
        vocab_size: Number of embedding rows and number of output logits.
        embedding_dim: Width of each embedding vector; the hidden layer is
            twice this wide.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        hidden_dim = embedding_dim * 2
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # A single Dropout module (p=0.2) is reused at every dropout site.
        self.dropout = nn.Dropout(0.2)
        self.layer1 = nn.Linear(embedding_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Return ``vocab_size`` logits for each row of token ids in ``x``.

        ``x`` is averaged over dimension 1 after embedding, so it is expected
        to be laid out as (batch, sequence).
        """
        token_vectors = self.dropout(self.embedding(x))
        hidden = token_vectors.mean(dim=1)
        # Both MLP layers follow the same pattern: linear -> ReLU -> dropout.
        for layer in (self.layer1, self.layer2):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.fc(hidden)

    def get_similarity(self, x1, x2):
        """Cosine similarity between the mean embeddings of ``x1`` and ``x2``.

        No dropout is applied here; only the embedding table is used.
        """
        mean1 = self.embedding(x1).mean(1)
        mean2 = self.embedding(x2).mean(1)
        return F.cosine_similarity(mean1, mean2)

class CrossAttentionModel(nn.Module):
    """Embedding classifier with a multi-head attention layer before pooling.

    Token ids are embedded, passed through self-attention with a residual
    connection and layer norm, averaged over dimension 1, then fed through
    a two-layer ReLU MLP and projected to ``vocab_size`` logits.

    Args:
        vocab_size: Number of embedding rows and number of output logits.
        embedding_dim: Width of each embedding vector. ``nn.MultiheadAttention``
            requires it to be divisible by ``num_heads``.
        num_heads: Number of attention heads (default 4).
    """

    def __init__(self, vocab_size, embedding_dim, num_heads=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(0.2)
        self.attention = nn.MultiheadAttention(
            embedding_dim, num_heads=num_heads, batch_first=True, dropout=0.1
        )
        self.layer1 = nn.Linear(embedding_dim, embedding_dim * 2)
        self.layer2 = nn.Linear(embedding_dim * 2, embedding_dim)
        self.fc = nn.Linear(embedding_dim, vocab_size)
        self.layer_norm = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        """Return ``vocab_size`` logits for each row of token ids in ``x``.

        ``x`` is expected to be laid out as (batch, sequence): the attention
        layer is built with ``batch_first=True`` and pooling averages dim 1.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        # Self-attention: the sequence attends to itself.
        attended, _ = self.attention(embedded, embedded, embedded)
        attended = self.layer_norm(attended + embedded)  # Residual connection
        pooled = torch.mean(attended, dim=1)
        x = F.relu(self.layer1(pooled))
        x = self.dropout(x)
        x = F.relu(self.layer2(x))
        x = self.dropout(x)
        return self.fc(x)

    def get_similarity(self, x1, x2):
        """Scaled dot product between ``x1`` embeddings and their attention over ``x2``.

        ``x1`` embeddings act as queries and ``x2`` embeddings as keys and
        values. The attention output is multiplied elementwise with the
        ``x1`` embeddings, summed over the last (embedding) dimension only,
        and divided by ``sqrt(embedding_dim)``. The result therefore keeps one
        score per position of ``x1``; it is not pooled over the sequence.

        The attention layer has dropout 0.1, so scores are only deterministic
        in eval mode.
        """
        e1, e2 = self.embedding(x1), self.embedding(x2)
        attn_output, _ = self.attention(e1, e2, e2)
        # A plain Python float avoids building an int64 tensor on every call
        # and relying on torch.sqrt's integer-to-float promotion.
        scale = self.embedding.embedding_dim ** 0.5
        return torch.sum(attn_output * e1, dim=-1) / scale

def _make_synthetic_dataset(vocab_size, dataset_size):
    """Generate the two-pattern synthetic (sequences, labels) tensors.

    Each sample is a length-10 token sequence drawn with equal probability
    from one of two patterns; tokens start in ``[0, vocab_size // 4)``.

    Returns:
        ``(data, labels)``: an int64 tensor of shape (dataset_size, 10) and
        an int64 tensor of shape (dataset_size,).

    Raises:
        ValueError: if ``vocab_size < 4`` (the token range would be empty) or
            ``dataset_size < 1``.
    """
    if vocab_size < 4:
        raise ValueError(f"vocab_size must be at least 4, got {vocab_size}")
    if dataset_size < 1:
        raise ValueError(f"dataset_size must be at least 1, got {dataset_size}")

    token_range = vocab_size // 4
    sequences = []
    labels = []
    for _ in range(dataset_size):
        if np.random.random() < 0.5:
            # Pattern 1: sum of first and last token maps to label
            seq = torch.randint(0, token_range, (10,))
            label = (seq[0] + seq[-1]) % vocab_size
        else:
            # Pattern 2: noisy copies of one base token; label is base + 3.
            # The modulo wraps negative values (base - 2 < 0) back into range.
            base_token = torch.randint(0, token_range, (1,))
            noise = torch.randint(-2, 3, (10,))
            seq = (base_token + noise) % vocab_size
            label = (base_token + 3) % vocab_size
        sequences.append(seq)
        # Pattern 1 yields a 0-dim tensor and pattern 2 a shape-(1,) tensor;
        # convert to a plain int so the label list is homogeneous.
        labels.append(int(label))

    return torch.stack(sequences), torch.tensor(labels, dtype=torch.long)


def run_comparison(vocab_size=1000, embedding_dim=64, dataset_size=10000,
                   epochs=20, batch_size=32, train_fraction=0.8, seed=None):
    """Train both models on the same synthetic data and collect their results.

    Args:
        vocab_size: Vocabulary size for the data and both models (>= 4).
        embedding_dim: Embedding width passed to both models.
        dataset_size: Number of synthetic sequences to generate.
        epochs: Epoch count forwarded to ``train_and_measure``.
        batch_size: Batch size for the train and test loaders.
        train_fraction: Share of samples used for training, strictly between
            0 and 1; the remainder is the test split.
        seed: If given, seeds the global NumPy and PyTorch RNGs before any
            data or model is created, so the run can be repeated. ``None``
            leaves the RNG state untouched.

    Returns:
        Dict with keys ``'base_model'`` and ``'cross_attention'`` holding
        whatever ``train_and_measure`` returns for each model.

    Raises:
        ValueError: on an invalid ``vocab_size``, ``dataset_size`` or
            ``train_fraction``.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(f"train_fraction must be in (0, 1), got {train_fraction}")

    if seed is not None:
        # Data generation uses both NumPy (pattern choice) and torch (tokens).
        np.random.seed(seed)
        torch.manual_seed(seed)

    data, labels = _make_synthetic_dataset(vocab_size, dataset_size)

    # Create dataloaders
    train_size = int(train_fraction * len(data))
    train_data = DataLoader(list(zip(data[:train_size], labels[:train_size])),
                            batch_size=batch_size,
                            shuffle=True)
    test_data = DataLoader(list(zip(data[train_size:], labels[train_size:])),
                           batch_size=batch_size)

    # Train both models on the same loaders
    base_model = BaseEmbeddingModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
    cross_attn_model = CrossAttentionModel(vocab_size=vocab_size, embedding_dim=embedding_dim)

    base_results = train_and_measure(base_model, train_data, test_data, epochs=epochs)
    cross_attn_results = train_and_measure(cross_attn_model, train_data, test_data, epochs=epochs)

    return {
        'base_model': base_results,
        'cross_attention': cross_attn_results
    }

In [9]:
# Smallest experiment: 50-token vocabulary, 32-dimensional embeddings and
# 2,000 synthetic sequences.
# NOTE(review): print_results is only defined in cells further down this
# notebook, so on a fresh kernel this cell depends on one of them having
# been run first.
results = run_comparison(vocab_size=50, embedding_dim=32, dataset_size=2000)

print_results(results=results, configuration="Improved Configuration")

Epoch 1: Train Loss: 3.6935, Test Loss: 3.2273, Accuracy: 0.0850
Epoch 2: Train Loss: 2.8991, Test Loss: 2.6915, Accuracy: 0.2600
Epoch 3: Train Loss: 2.5839, Test Loss: 2.4154, Accuracy: 0.3400
Epoch 4: Train Loss: 2.3549, Test Loss: 2.2454, Accuracy: 0.3500
Epoch 5: Train Loss: 2.2396, Test Loss: 2.1354, Accuracy: 0.3775
Epoch 6: Train Loss: 2.1410, Test Loss: 2.0781, Accuracy: 0.4150
Epoch 7: Train Loss: 2.1149, Test Loss: 2.0210, Accuracy: 0.4525
Epoch 8: Train Loss: 2.0615, Test Loss: 1.9917, Accuracy: 0.4375
Epoch 9: Train Loss: 1.9952, Test Loss: 1.9608, Accuracy: 0.4475
Epoch 10: Train Loss: 1.9930, Test Loss: 1.9396, Accuracy: 0.4525
Epoch 11: Train Loss: 1.9729, Test Loss: 1.9087, Accuracy: 0.4600
Epoch 12: Train Loss: 1.9281, Test Loss: 1.8795, Accuracy: 0.4700
Epoch 13: Train Loss: 1.8945, Test Loss: 1.8616, Accuracy: 0.4625
Epoch 14: Train Loss: 1.8736, Test Loss: 1.8357, Accuracy: 0.4700
Epoch 15: Train Loss: 1.8564, Test Loss: 1.8241, Accuracy: 0.4775
Epoch 16: Train Los

In [7]:

# Run the comparison
def print_results(results, configuration):
    """Print training time and final-epoch metrics for both compared models.

    Args:
        results: Mapping with 'base_model' and 'cross_attention' entries.
            Each entry holds 'training_time' and a 'metrics' mapping whose
            'train_loss', 'test_loss' and 'accuracy' values are sequences;
            only their last element is reported.
        configuration: Label inserted into the report header.
    """
    # NOTE(review): this notebook defines print_results in two cells with the
    # same body; whichever cell ran last is the one in effect.
    print(f"\n=== Results for {configuration} ===")

    sections = (
        ("\nBase Model:", 'base_model'),
        ("\nCross-Attention Model:", 'cross_attention'),
    )
    for heading, key in sections:
        print(heading)
        print(f"Training Time: {results[key]['training_time']:.2f} seconds")
        history = results[key]['metrics']
        print(f"Final Training Loss: {history['train_loss'][-1]:.4f}")
        print(f"Final Test Loss: {history['test_loss'][-1]:.4f}")
        print(f"Final Accuracy: {history['accuracy'][-1]:.4f}")

# Reduced experiment: 100-token vocabulary, 32-dimensional embeddings and
# 5,000 sequences (run_comparison's own defaults are 1000 / 64 / 10000).
results = run_comparison(
    vocab_size=100,
    embedding_dim=32,
    dataset_size=5000,
)

# The label states the settings actually used, so this report cannot be
# mistaken for the default-size run.
print_results(results, "Reduced Configuration (vocab=100, 32d, n=5000)")

Epoch 1: Train Loss: 4.2718, Test Loss: 3.9932, Accuracy: 0.0540
Epoch 2: Train Loss: 3.8338, Test Loss: 3.7558, Accuracy: 0.0950
Epoch 3: Train Loss: 3.6597, Test Loss: 3.6358, Accuracy: 0.1200
Epoch 4: Train Loss: 3.5441, Test Loss: 3.5437, Accuracy: 0.1310
Epoch 5: Train Loss: 3.4451, Test Loss: 3.4627, Accuracy: 0.1400
Epoch 6: Train Loss: 3.3560, Test Loss: 3.3921, Accuracy: 0.1460
Epoch 7: Train Loss: 3.2764, Test Loss: 3.3274, Accuracy: 0.1440
Epoch 8: Train Loss: 3.2037, Test Loss: 3.2745, Accuracy: 0.1440
Epoch 9: Train Loss: 3.1387, Test Loss: 3.2294, Accuracy: 0.1520
Epoch 10: Train Loss: 3.0813, Test Loss: 3.1873, Accuracy: 0.1510
Epoch 1: Train Loss: 4.0981, Test Loss: 3.8277, Accuracy: 0.0460
Epoch 2: Train Loss: 3.7429, Test Loss: 3.6741, Accuracy: 0.1000
Epoch 3: Train Loss: 3.5522, Test Loss: 3.5727, Accuracy: 0.1120
Epoch 4: Train Loss: 3.4042, Test Loss: 3.5079, Accuracy: 0.1240
Epoch 5: Train Loss: 3.2831, Test Loss: 3.4540, Accuracy: 0.1150
Epoch 6: Train Loss: 3.1

In [5]:
# Baseline experiment, with run_comparison's default sizes spelled out.
print("Running comparison with default settings...")
results_default = run_comparison(vocab_size=1000, embedding_dim=64, dataset_size=10000)

# Same vocabulary and dataset size, with the embedding width reduced to 32.
print("\nRunning comparison with smaller embedding dimension...")
results_small = run_comparison(vocab_size=1000, embedding_dim=32, dataset_size=10000)

# Print comparative results
def print_results(results, configuration):
    """Print training time and final-epoch metrics for both compared models.

    Args:
        results: Mapping with 'base_model' and 'cross_attention' entries.
            Each entry holds 'training_time' and a 'metrics' mapping whose
            'train_loss', 'test_loss' and 'accuracy' values are sequences;
            only their last element is reported.
        configuration: Label inserted into the report header.
    """
    # NOTE(review): this notebook defines print_results in two cells with the
    # same body; whichever cell ran last is the one in effect.

    def _final(model_key, metric_name):
        # Last recorded value of one metric for one model.
        return results[model_key]['metrics'][metric_name][-1]

    def _report(heading, model_key):
        print(heading)
        print(f"Training Time: {results[model_key]['training_time']:.2f} seconds")
        print(f"Final Training Loss: {_final(model_key, 'train_loss'):.4f}")
        print(f"Final Test Loss: {_final(model_key, 'test_loss'):.4f}")
        print(f"Final Accuracy: {_final(model_key, 'accuracy'):.4f}")

    print(f"\n=== Results for {configuration} ===")
    _report("\nBase Model:", 'base_model')
    _report("\nCross-Attention Model:", 'cross_attention')

# Report the 64-d run first, then the 32-d run, each under its own header.
print_results(results=results_default, configuration="Default Configuration (64d)")
print_results(results=results_small, configuration="Small Embedding Configuration (32d)")

Running comparison with default settings...

Running comparison with smaller embedding dimension...

=== Results for Default Configuration (64d) ===

Base Model:
Training Time: 0.60 seconds
Final Training Loss: 6.4715
Final Test Loss: 6.9586
Final Accuracy: 0.0010

Cross-Attention Model:
Training Time: 2.78 seconds
Final Training Loss: 5.5849
Final Test Loss: 7.9746
Final Accuracy: 0.0005

=== Results for Small Embedding Configuration (32d) ===

Base Model:
Training Time: 0.48 seconds
Final Training Loss: 6.6879
Final Test Loss: 6.9316
Final Accuracy: 0.0000

Cross-Attention Model:
Training Time: 2.35 seconds
Final Training Loss: 6.3628
Final Test Loss: 7.3269
Final Accuracy: 0.0020
