In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [45]:
torch.manual_seed(43)


<torch._C.Generator at 0x7f41521e59d0>

In [46]:
#samples = observations
#num_of_news = news for a given point
def create_fake_data(samples = 5, num_of_news = 5, embedding_dim = 16):
    labels = torch.randint(0, 2, (samples,))
    data = torch.randn(samples, num_of_news, embedding_dim)
    return data, labels
    
    

In [47]:
class DotProductAttention(nn.Module):
    def __init__(self, embed_dim):
        super(DotProductAttention, self).__init__()
        self.query = nn.Parameter(torch.randn(embed_dim)) #Query vector
        
    def forward(self, embeddings):
        """
        embeddings: shape (batch_size, N, d)
                    - batch_size: how many samples in one batch (b)
                    - N: number of headlines per sample
                    - d: embedding dimension
        Returns: shape (batch_size, d)
                 A single aggregated vector for each sample.
        """
        batch_size, N, d = embeddings.shape
        
        ''' In attention mechanisms, we want to measure how relevant or similar each input (e.g., a headline embedding) is to a query vector 𝑞
        q. The dot product is a natural way to compute this similarity. '''
        dot_products = torch.einsum('bnd,d->bn', embeddings, self.query) / d**0.5
        
        attention_weights = F.softmax(dot_products, dim=1) #gives me a vector of (b, n)
        aggregated = torch.einsum('bn,bnd->bd', attention_weights, embeddings) #weight sum across all n
        
        return aggregated

In [48]:
class HeadlineClassifier(nn.Module):
    def __init__(self, embed_dim, hidden_dim, num_classes=2):
        super(HeadlineClassifier, self).__init__()
        
        self.attention = DotProductAttention(embed_dim)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        
    def forward(self, embeddings): #embeddings = input data
        
        agg_vector = self.attention(embeddings) #(b, d)
        
        x = F.relu(self.fc1(agg_vector))
        logits = self.fc2(x)
        
        ''' The reason we don’t apply an activation function (like softmax) in the
        final layer is that PyTorch’s loss functions (like nn.CrossEntropyLoss) expect raw logits as input. '''
        return logits

In [49]:
num_samples = 100
num_headlines = 5
embed_dim = 16

embeddings, labels = create_fake_data(num_samples, num_headlines, embed_dim)

In [50]:
dataset = torch.utils.data.TensorDataset(embeddings, labels)

In [51]:
batch_size = 16
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [52]:
hidden_dim = 32
num_classes = 2  # binary classification
model = HeadlineClassifier(embed_dim, hidden_dim, num_classes)

In [53]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [54]:
num_epochs = 5
for epoch in range(num_epochs):
    epoch_loss = 0.0
    correct = 0
    total = 0
    
    for batch_embeddings, batch_labels in dataloader:
        
        logits = model(batch_embeddings)
        loss = criterion(logits, batch_labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        _, predicted = torch.max(logits, dim=1)
        correct += (predicted == batch_labels).sum().item()
        total += batch_embeddings.size(0)
        
        
    epoch_loss /= total
    accuracy = 100.0 * correct / total
    print(f"Epoch [{epoch+1}/{num_epochs}], "
          f"Loss: {epoch_loss:.4f}, "
          f"Accuracy: {accuracy:.2f}%")

Epoch [1/5], Loss: 0.0484, Accuracy: 52.00%
Epoch [2/5], Loss: 0.0493, Accuracy: 51.00%
Epoch [3/5], Loss: 0.0483, Accuracy: 51.00%
Epoch [4/5], Loss: 0.0485, Accuracy: 51.00%
Epoch [5/5], Loss: 0.0480, Accuracy: 52.00%
