# **Multi-Head Attention**
***by Đắt Ngô***

# Introduction

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image01.png?raw=true)

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image02.png?raw=true)

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image03.png?raw=true)

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image04.png?raw=true)

# Mục mới

# Implementation

In [None]:
import torch
import torch.nn as nn

import math

In [None]:
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with an output projection.

    The input is projected to queries, keys and values, split into
    ``num_of_attention_heads`` heads, attended per head, merged back and passed
    through a final linear layer (``dense``).

    Args:
        config: Mapping with two integer entries:
            ``"hidden_size"`` - width of the input/output features, and
            ``"num_of_attention_heads"`` - number of heads; it must be
            positive and divide ``hidden_size`` evenly.

    Raises:
        ValueError: If the number of heads is not positive or does not divide
            the hidden size.
    """

    def __init__(self, config):
        super().__init__()
        hidden_size = config["hidden_size"]
        num_heads = config["num_of_attention_heads"]

        # Raise explicitly instead of using ``assert``: assertions are removed
        # under ``python -O`` and a bad config would then only surface later as
        # an obscure shape error inside ``forward``.
        if num_heads <= 0:
            raise ValueError("The number of attention heads must be a positive integer")
        if hidden_size % num_heads != 0:
            raise ValueError("The hidden size is not a multiple of the number of attention heads")

        self.num_attention_heads = num_heads
        self.attention_head_size = hidden_size // num_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Creation order (query, key, value, dense) is kept so that seeded
        # initialisation yields the same weights as before.
        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        # The output projection consumes the merged heads (all_head_size wide,
        # which equals hidden_size once the divisibility check has passed).
        self.dense = nn.Linear(self.all_head_size, hidden_size)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and move the head axis forward.

        Args:
            x: Tensor of shape ``[batch, seq_len, all_head_size]``.

        Returns:
            Tensor of shape ``[batch, num_heads, seq_len, head_size]``.
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        """Apply multi-head self-attention to ``hidden_states``.

        Args:
            hidden_states: Tensor of shape ``[batch, seq_len, hidden_size]``.
            attention_mask: Optional additive mask that is broadcastable to
                ``[batch, num_heads, seq_len, seq_len]``. Use ``0`` for
                positions that may be attended to and a large negative value
                (e.g. ``float("-inf")``) for positions to ignore. A query row
                whose keys are all masked with ``-inf`` produces NaNs, so at
                least one key per row should stay visible. ``None`` (the
                default) attends to every position.

        Returns:
            Tensor of shape ``[batch, seq_len, hidden_size]``.
        """
        mixed_query_layer = self.query(hidden_states)                             # [Batch_size x Seq_length x Hidden_size]
        mixed_key_layer = self.key(hidden_states)                                 # [Batch_size x Seq_length x Hidden_size]
        mixed_value_layer = self.value(hidden_states)                             # [Batch_size x Seq_length x Hidden_size]

        query_layer = self.transpose_for_scores(mixed_query_layer)                # [Batch_size x Num_of_heads x Seq_length x Head_size]
        key_layer = self.transpose_for_scores(mixed_key_layer)                    # [Batch_size x Num_of_heads x Seq_length x Head_size]
        value_layer = self.transpose_for_scores(mixed_value_layer)                # [Batch_size x Num_of_heads x Seq_length x Head_size]

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # [Batch_size x Num_of_heads x Seq_length x Seq_length]
        # Scale by sqrt(head_size) so the logits do not grow with the head width.
        attention_scores = attention_scores / math.sqrt(self.attention_head_size) # [Batch_size x Num_of_heads x Seq_length x Seq_length]
        if attention_mask is not None:
            # Added before the softmax so masked keys receive ~zero probability.
            attention_scores = attention_scores + attention_mask
        attention_probs = nn.Softmax(dim=-1)(attention_scores)                    # [Batch_size x Num_of_heads x Seq_length x Seq_length]
        context_layer = torch.matmul(attention_probs, value_layer)                # [Batch_size x Num_of_heads x Seq_length x Head_size]

        # Merge the heads back; ``contiguous`` is required before ``view``
        # because ``permute`` only changes strides.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()            # [Batch_size x Seq_length x Num_of_heads x Head_size]
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) # [Batch_size x Seq_length x Hidden_size]
        context_layer = context_layer.view(*new_context_layer_shape)              # [Batch_size x Seq_length x Hidden_size]

        output = self.dense(context_layer)

        return output

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image05.png?raw=true)

        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)  

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image06.png?raw=true)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image07.png?raw=true)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        context_layer = torch.matmul(attention_probs, value_layer)

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image08.png?raw=true)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

![Image Error](https://github.com/datnnt1997/multi-head_self-attention/blob/master/images/image09.png?raw=true)
 
        output = self.dense(context_layer)

Khởi tạo model Self-Attention

In [None]:
# Toy hyper-parameters: 2 heads over a hidden size of 4, i.e. 2 features per head.
config = dict(
    num_of_attention_heads=2,
    hidden_size=4,
)

In [None]:
# Build the attention module from the toy configuration and list its sub-layers.
selfattn = BertSelfAttention(config=config)
print(selfattn)

BertSelfAttention(
  (query): Linear(in_features=4, out_features=4, bias=True)
  (key): Linear(in_features=4, out_features=4, bias=True)
  (value): Linear(in_features=4, out_features=4, bias=True)
  (dense): Linear(in_features=4, out_features=4, bias=True)
)


Khởi tạo ngẫu nhiên embedding đầu vào

In [None]:
# Random stand-in for input embeddings: 1 sequence of 3 tokens with 4 features
# each, drawn uniformly from [0, 1).
embed_rand = torch.rand(1, 3, 4)
print(f"Embed Shape: {embed_rand.shape}")
print(f"Embed Values:\n{embed_rand}")

Embed Shape: torch.Size([1, 3, 4])
Embed Values:
tensor([[[0.0552, 0.1801, 0.1834, 0.2938],
         [0.3367, 0.1240, 0.7171, 0.6137],
         [0.7021, 0.6584, 0.2185, 0.7412]]])


# Forward input embedding với SelfAttention

In [None]:
# Run the random embeddings through the attention module and inspect the result.
output = selfattn(hidden_states=embed_rand)
print(f"Output Shape: {output.shape}")
print(f"Output Values:\n{output}")

Output Shape: torch.Size([1, 3, 4])
Output Values:
tensor([[[ 0.1697,  0.4565,  0.4477, -0.0096],
         [ 0.1689,  0.4558,  0.4477, -0.0084],
         [ 0.1706,  0.4573,  0.4455, -0.0104]]], grad_fn=<AddBackward0>)
