# Studying the mechanism of self-attention

[GitHub link](https://github.com/foobar167/junkyard/blob/master/fine_tuning/Attention_study.ipynb) and
[Colab link](https://colab.research.google.com/drive/1912LC9Tn7lyBzIwlZd2_QuNJqBtZzZ2D) to this Python script.

* Video [Understanding the Self-Attention Mechanism in **8 min**](https://youtu.be/W28LfOld44Y) - theory
* Video [Multi-head Attention Mechanism Explained](https://youtu.be/W6s9i02EiR0) in **4 min** - theory
* Video [Implementing the Self-Attention Mechanism from Scratch in PyTorch](https://youtu.be/ZPLym9rJtM8) - **4 min** to implement + **11 min** testing and playing with code

![Self-Attention scheme](./data/Self-Attention_Layer.jpg)

## Attention implementation

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    """ Attention mechanism implementation """
    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.Q = nn.Linear(d_in, d_out)  # initialize fully connected layer with random values
        self.K = nn.Linear(d_in, d_out)
        self.V = nn.Linear(d_in, d_out)

    def forward(self, x):
        queries = self.Q(x)  # function performs the math operation y=xA^{T}+b
        keys = self.K(x)
        values = self.V(x)
        # torch.bmm  - Batch Matrix-Matrix Product
        # Use torch.matmul for Multi-Head Attention
        scores = torch.bmm(queries, keys.transpose(1, 2))
        # Normalize. Scores should be independent from the number of elements
        scores = scores * (self.d_out ** -0.5)
        # Apply the softmax so that the sum of the values in a row equals 1.0
        attention = F.softmax(scores, dim=-1)  # normalize in the last dimension
        hidden_states = torch.bmm(attention, values)
        return hidden_states

## Implementation of a very simple tokenizer

In [2]:
BOS_token = 0  # BOS (Beginning of Sequence) the same as SOS (Start of Sequence)
EOS_token = 1  # EOS (End of Sequence)

index2word = {
    BOS_token: "BOS",
    EOS_token: "EOS",
}

text = "How are you doing ? I am good and you ?"
vocabulary = set(text.lower().split(" "))  # set of unique words

for word in vocabulary:
    index2word[len(index2word)] = word

print(*list(index2word.items()), sep="\n")

word2index = {w: i for i, w in index2word.items()}
print("\n", *list(word2index.items()), sep="\n")

(0, 'BOS')
(1, 'EOS')
(2, 'doing')
(3, 'are')
(4, 'good')
(5, 'you')
(6, 'and')
(7, 'i')
(8, 'how')
(9, '?')
(10, 'am')


('BOS', 0)
('EOS', 1)
('doing', 2)
('are', 3)
('good', 4)
('you', 5)
('and', 6)
('i', 7)
('how', 8)
('?', 9)
('am', 10)


In [3]:
def convert2tensor(sentence):
    """ Convert sentence to tensor """
    words_list = sentence.lower().split(" ")
    indices = [word2index[word] for word in words_list]
    # Add new dimension for batches using view(1,-1) at the beginning of the tensor
    return torch.tensor(indices, dtype=torch.long).view(1, -1)


def convert2sentence(tensor):
    """ Convert tensor with tokens to sentence """
    indices = tensor.tolist()[0]
    words_list = [index2word[index] for index in indices]
    return " ".join(words_list).capitalize()


sentence = "How are you doing ?"

input_tensor = convert2tensor(sentence)
print(f"Input tensor: {input_tensor}")
print(f"Tensor size: {input_tensor.size()}")

output_sentence = convert2sentence(input_tensor)
print(f"Output sentence: {output_sentence}")

Input tensor: tensor([[8, 3, 5, 2, 9]])
Tensor size: torch.Size([1, 5])
Output sentence: How are you doing ?


## Create a very small neural network with attention layer

In [4]:
HIDDEN_SIZE = 10
VOCAB_SIZE = len(word2index)

embedding = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)
attention = Attention(HIDDEN_SIZE, HIDDEN_SIZE)
sentence = "How are you doing ?"
input_tensor = convert2tensor(sentence)
embedded = embedding(input_tensor)

print(embedded)
# size: [batch_size, sentence_length, hidden_size]
print(f"Tensor size: {embedded.size()}")

tensor([[[-0.6878,  0.5455, -0.5097,  0.4990, -0.4259, -0.6922, -1.0510,
          -0.1118, -0.8900,  1.8558],
         [ 0.5680, -0.0979,  1.8615,  1.1312, -1.8843, -1.5381, -2.0612,
           1.2283,  0.7979, -0.0315],
         [-1.8679, -0.8784,  0.1699,  1.8107,  1.7111,  0.1360, -1.0298,
          -0.7877,  0.6809,  0.5554],
         [-0.0272, -0.3456, -0.9135,  0.9014, -1.0063,  1.0562, -0.9728,
          -0.6862, -1.9597, -0.6360],
         [ 0.9884,  1.0415,  0.5122,  0.5511,  0.0531,  0.4898, -0.0100,
          -0.7635, -0.0098, -0.3157]]], grad_fn=<EmbeddingBackward0>)
Tensor size: torch.Size([1, 5, 10])


In [5]:
hidden_states = attention(embedded)
print(hidden_states)
print(hidden_states.size())

tensor([[[-0.3834, -0.4076, -0.5102,  0.2269,  0.0488,  0.1977, -0.3881,
           0.1375, -0.1205,  0.3651],
         [-0.3657, -0.4375, -0.1336,  0.3992,  0.3447,  0.5735, -0.4873,
          -0.0157, -0.2402, -0.0249],
         [-0.3765, -0.4694, -0.3771,  0.2756,  0.2334,  0.2106, -0.3236,
           0.0290, -0.1926,  0.2362],
         [-0.3883, -0.5096, -0.2760,  0.3214,  0.3184,  0.3379, -0.3530,
          -0.0234, -0.2576,  0.1241],
         [-0.3913, -0.5547, -0.2644,  0.3192,  0.3872,  0.2695, -0.2815,
          -0.0705, -0.2917,  0.1177]]], grad_fn=<BmmBackward0>)
torch.Size([1, 5, 10])


In [6]:
d_in = HIDDEN_SIZE
d_out = HIDDEN_SIZE

Q = nn.Linear(d_in, d_out)  # initialize fully connected layer with random values
K = nn.Linear(d_in, d_out)
V = nn.Linear(d_in, d_out)

queries = Q(embedded)  # function performs the math operation y=xA^{T}+b
keys = K(embedded)
values = V(embedded)

print(queries.size(), keys.size(), values.size(), sep="\n")

torch.Size([1, 5, 10])
torch.Size([1, 5, 10])
torch.Size([1, 5, 10])


In [7]:
scores = torch.bmm(queries, keys.transpose(1, 2))
print(scores.size())

scores = scores * (d_out ** -0.5)
attention_tensor = F.softmax(scores, dim=-1)
print("", attention_tensor.size(), sep="\n")
print(f"Sum of the rows of the last dimension:\n\t{attention_tensor.sum(dim=-1)}")
print(attention_tensor)

hidden_states = torch.bmm(attention_tensor, values)
print("", hidden_states.size(), sep="\n")
print(hidden_states)

torch.Size([1, 5, 5])

torch.Size([1, 5, 5])
Sum of the rows of the last dimension:
	tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SumBackward1>)
tensor([[[0.1781, 0.3114, 0.1765, 0.1437, 0.1903],
         [0.1685, 0.1874, 0.2166, 0.3007, 0.1268],
         [0.2280, 0.1492, 0.1531, 0.2761, 0.1937],
         [0.1903, 0.1494, 0.1475, 0.3015, 0.2113],
         [0.1966, 0.1333, 0.2375, 0.2498, 0.1828]]],
       grad_fn=<SoftmaxBackward0>)

torch.Size([1, 5, 10])
tensor([[[ 0.0824, -0.0145,  0.0324, -0.1796, -0.4023,  0.3836, -0.6102,
          -0.2606, -0.7194,  0.0121],
         [-0.1716, -0.0481,  0.0365,  0.1145, -0.3638,  0.2093, -0.8715,
          -0.1609, -0.5933, -0.0490],
         [-0.2490, -0.0671,  0.0547,  0.1299, -0.3752,  0.2864, -0.8101,
          -0.2255, -0.5545, -0.0812],
         [-0.3095, -0.0868,  0.0293,  0.1527, -0.3813,  0.2707, -0.7959,
          -0.2332, -0.5277, -0.0496],
         [-0.1420, -0.0465,  0.0840,  0.1591, -0.3431,  0.2532, -0.8829,
       

## Multi-head Attention Implementaion

![Multi-head Attention scheme](./data/Multihead_Attention.jpg)

In [8]:
# TODO. Not implemented yet