In [46]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np

"""
Setting some hyperparameters
"""
# size of the query, key, value and z vectors 
# in the attention layer (64 was used in paper)
ATTENTION_OUTPUT_DIM = 64
WORD_EMBEDDING_DIM = 128


class SelfAttentionLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        """
        input_dim : dim of the input
        output_dim : dim of the query, key, value and z vectors
        """
        super().__init__()
        self.W_q = nn.Linear(input_dim, output_dim, bias=False)
        self.W_k = nn.Linear(input_dim, output_dim, bias=False)
        self.W_v = nn.Linear(input_dim, output_dim, bias=False)
        self.output_sqrt = torch.Tensor([np.sqrt(output_dim)])
    
    def forward(self, x):
        queries = self.W_q(x)
        keys = self.W_k(x)
        values = self.W_v(x)
        scores = queries @ keys.T
        softmax_scores = F.softmax(scores / self.output_sqrt, dim=1)
        # softmax_scores_shape is (input_dim, input_dim), so for each word a score with
        # all other words is calculated
        # scores shape is (input_dim, output_dim)
        # weighted_values shape is (input_dim, input_dim, output_dim)
        # weighted_values = softmax_scores * values
        weighted_values = torch.mul(values, softmax_scores.unsqueeze(-1).expand_as(values))
        z = torch.sum(weighted_values, dim=1)
        return z
    
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, input_dim, self_attention_output_dim, output_dim, num_attentions):
        """
        input_dim : dim of the input
        self_attention_output_dim : dim of the query, key ,value and z vectors of the self attention layers
        output_dim : size of the transformed z
        num_attentions : number of attentions, which will be concatonated and then transformed
        """
        super().__init__()
        self.self_attention_layers = nn.ModuleList(
            [SelfAttentionLayer(input_dim, self_attention_output_dim) for i in range(num_attentions)]
        )
        self.W_o = nn.Linear(self_attention_output_dim * num_attentions, output_dim)
    
    def forward(self, x):
        heads = [self_attention_layer(x) for self_attention_layer in self.self_attention_layers]
        heads_concat = torch.cat(heads, dim=1)
        z = self.W_o(heads_concat)
        return z

In [47]:
test = torch.ones((30, WORD_EMBEDDING_DIM)).float()
multi_headed_attention = MultiHeadAttentionLayer(WORD_EMBEDDING_DIM, ATTENTION_OUTPUT_DIM, ATTENTION_OUTPUT_DIM, 8)

print(multi_headed_attention(test))

RuntimeError: expand(torch.FloatTensor{[30, 30, 1]}, size=[30, 64]): the number of sizes provided (2) must be greater or equal to the number of dimensions in the tensor (3)

In [45]:
num_words = 30 
hidden_dim = 64

softmax_scores = torch.ones((30, 30))
z_scores = torch.ones((30, 64))

print((softmax_scores @ z_scores).shape)

RuntimeError: Expected tensor to have size 30 at dimension 1, but got size 1 for argument #2 'batch2' (while checking arguments for bmm)