# Studying the mechanism of self-attention

[GitHub link](https://github.com/foobar167/junkyard/blob/master/fine_tuning/Attention_study.ipynb) and
[Colab link](https://colab.research.google.com/drive/1912LC9Tn7lyBzIwlZd2_QuNJqBtZzZ2D) to this Python script.

* Video [Understanding the Self-Attention Mechanism in **8 min**](https://youtu.be/W28LfOld44Y) - theory
* Video [Multi-head Attention Mechanism Explained](https://youtu.be/W6s9i02EiR0) in **4 min** - theory
* Video [Implementing the Self-Attention Mechanism from Scratch in PyTorch](https://youtu.be/ZPLym9rJtM8) - **4 min** to implement + **11 min** testing and playing with code

![Self-Attention scheme](./data/Self-Attention_Layer.jpg)

## Attention implementation

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    """ Attention mechanism implementation """
    def __init__(self, d_in, d_out):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.Q = nn.Linear(d_in, d_out)  # initialize fully connected layer with random values
        self.K = nn.Linear(d_in, d_out)
        self.V = nn.Linear(d_in, d_out)

    def forward(self, x):
        queries = self.Q(x)  # function performs the math operation y=xA^{T}+b
        keys = self.K(x)
        values = self.V(x)
        # torch.bmm  - Batch Matrix-Matrix Product
        # Use torch.matmul for Multi-Head Attention
        scores = torch.bmm(queries, keys.transpose(1, 2))
        # Normalize. Scores should be independent from the number of elements
        scores = scores * (self.d_out ** -0.5)
        # Apply the softmax so that the sum of the values in a row equals 1.0
        attention = F.softmax(scores, dim=-1)  # normalize in the last dimension
        hidden_states = torch.bmm(attention, values)
        return hidden_states

## Implementation of a very simple tokenizer

In [2]:
BOS_token = 0  # BOS (Beginning of Sequence) the same as SOS (Start of Sequence)
EOS_token = 1  # EOS (End of Sequence)

index2word = {
    BOS_token: "BOS",
    EOS_token: "EOS",
}

text = "How are you doing ? I am good and you ?"
vocabulary = set(text.lower().split(" "))  # set of unique words

for word in vocabulary:
    index2word[len(index2word)] = word

print(*list(index2word.items()), sep="\n")

word2index = {w: i for i, w in index2word.items()}
print("\n", *list(word2index.items()), sep="\n")

(0, 'BOS')
(1, 'EOS')
(2, 'doing')
(3, '?')
(4, 'you')
(5, 'am')
(6, 'i')
(7, 'good')
(8, 'are')
(9, 'how')
(10, 'and')


('BOS', 0)
('EOS', 1)
('doing', 2)
('?', 3)
('you', 4)
('am', 5)
('i', 6)
('good', 7)
('are', 8)
('how', 9)
('and', 10)


In [3]:
def convert2tensor(sentence):
    """ Convert sentence to tensor """
    words_list = sentence.lower().split(" ")
    indices = [word2index[word] for word in words_list]
    # Add new dimension for batches using view(1,-1) at the beginning of the tensor
    return torch.tensor(indices, dtype=torch.long).view(1, -1)


def convert2sentence(tensor):
    """ Convert tensor with tokens to sentence """
    indices = tensor.tolist()[0]
    words_list = [index2word[index] for index in indices]
    return " ".join(words_list).capitalize()


sentence = "How are you doing ?"

input_tensor = convert2tensor(sentence)
print(f"Input tensor: {input_tensor}")
print(f"Tensor size: {input_tensor.size()}")

output_sentence = convert2sentence(input_tensor)
print(f"Output sentence: {output_sentence}")

Input tensor: tensor([[9, 8, 4, 2, 3]])
Tensor size: torch.Size([1, 5])
Output sentence: How are you doing ?


## Create a very small neural network with attention layer

In [4]:
HIDDEN_SIZE = 10
VOCAB_SIZE = len(word2index)

embedding = nn.Embedding(VOCAB_SIZE, HIDDEN_SIZE)
attention = Attention(HIDDEN_SIZE, HIDDEN_SIZE)
sentence = "How are you doing ?"
input_tensor = convert2tensor(sentence)
embedded = embedding(input_tensor)

print(embedded)
# size: [batch_size, sentence_length, hidden_size]
print(f"Tensor size: {embedded.size()}")

tensor([[[-0.2835, -0.1142,  0.7625,  1.8695,  0.7673,  0.8024, -1.4880,
           0.3132,  0.6777, -1.2837],
         [ 1.4617,  1.5022,  0.7845, -0.5096, -1.3534, -2.0501, -0.5282,
           0.7364, -1.1479,  0.9985],
         [-0.5725,  1.4598,  1.5003,  1.2356,  0.6263,  0.9765, -2.4509,
           0.3613,  0.2789, -0.9566],
         [ 0.4424, -0.8964,  0.8295, -0.1787, -0.2392,  0.2189, -0.1659,
          -0.4627,  1.8268,  0.1060],
         [ 0.2091,  1.3789, -0.0426,  1.7296,  0.2907, -2.0111, -0.9097,
          -0.8147,  1.0213,  1.2564]]], grad_fn=<EmbeddingBackward0>)
Tensor size: torch.Size([1, 5, 10])


In [5]:
hidden_states = attention(embedded)
print(hidden_states)
print(hidden_states.size())

tensor([[[ 0.5218, -0.4102,  0.7409, -0.1872,  0.2991, -0.8574, -0.2982,
           0.2765,  0.7004,  0.0800],
         [ 0.3374, -0.1027,  0.5648, -0.0186,  0.0531, -0.3391, -0.1287,
           0.0431,  0.2931, -0.1135],
         [ 0.4792, -0.3542,  0.7240, -0.1440,  0.2760, -0.7879, -0.2573,
           0.2385,  0.6349,  0.0734],
         [ 0.6686, -0.6763,  0.8214, -0.4642,  0.1728, -0.8602, -0.4216,
           0.4950,  0.7693, -0.0709],
         [ 0.3343, -0.0983,  0.6696,  0.0642,  0.2517, -0.6062, -0.1253,
           0.0789,  0.4570,  0.0975]]], grad_fn=<BmmBackward0>)
torch.Size([1, 5, 10])


In [6]:
d_in = HIDDEN_SIZE
d_out = HIDDEN_SIZE

Q = nn.Linear(d_in, d_out)  # initialize fully connected layer with random values
K = nn.Linear(d_in, d_out)
V = nn.Linear(d_in, d_out)

queries = Q(embedded)  # function performs the math operation y=xA^{T}+b
keys = K(embedded)
values = V(embedded)

print(queries.size(), keys.size(), values.size(), sep="\n")

torch.Size([1, 5, 10])
torch.Size([1, 5, 10])
torch.Size([1, 5, 10])


In [7]:
scores = torch.bmm(queries, keys.transpose(1, 2))
print(scores.size())

scores = scores * (d_out ** -0.5)
attention_tensor = F.softmax(scores, dim=-1)
print("", attention_tensor.size(), sep="\n")
print(f"Sum of the rows of the last dimension:\n\t{attention_tensor.sum(dim=-1)}")
print(attention_tensor)

hidden_states = torch.bmm(attention_tensor, values)
print("", hidden_states.size(), sep="\n")
print(hidden_states)

torch.Size([1, 5, 5])

torch.Size([1, 5, 5])
Sum of the rows of the last dimension:
	tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SumBackward1>)
tensor([[[0.1774, 0.2592, 0.1660, 0.1918, 0.2056],
         [0.1930, 0.1687, 0.1423, 0.2866, 0.2094],
         [0.1885, 0.1985, 0.2020, 0.1938, 0.2172],
         [0.1798, 0.2606, 0.1428, 0.1899, 0.2269],
         [0.2162, 0.1744, 0.1309, 0.2362, 0.2423]]],
       grad_fn=<SoftmaxBackward0>)

torch.Size([1, 5, 10])
tensor([[[-0.0965, -0.2134, -0.0656,  0.3100,  0.1607, -0.1591,  0.1684,
           0.1657, -0.1143,  0.0348],
         [-0.1321, -0.1620, -0.0675,  0.1672,  0.1138, -0.1034,  0.1976,
           0.2082, -0.2740,  0.1266],
         [-0.2073, -0.2190, -0.0708,  0.2720,  0.0842, -0.1246,  0.1825,
           0.2124, -0.2098,  0.0960],
         [-0.0623, -0.2234, -0.0489,  0.3278,  0.1846, -0.1751,  0.1581,
           0.1520, -0.1109,  0.0309],
         [-0.1480, -0.2043, -0.0438,  0.2239,  0.1240, -0.1276,  0.1825,
       

## Multi-head Attention Implementaion

Multi-head Attention scheme

![Multi-head Attention scheme](./data/Multihead_Attention.jpg)

Attention head scheme

![Attention head scheme](./data/Attention_head.jpg)

In [8]:
# TODO. Not implemented yet