### Coding Attention Mechanisms

In [2]:
# A compact self-attention class. It uses nn.Parameter to define the weight matrices W_key, W_query and W_value.

import torch.nn as nn
import torch

class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    The key, query and value projections are stored as ``nn.Parameter``
    tensors of shape ``(d_in, d_out)`` and initialised with ``torch.rand``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the output.
    """

    def __init__(self, d_in: int, d_out: int) -> None:
        super().__init__()
        # Creation order (key, query, value) determines which random numbers
        # each matrix receives under a fixed seed, so it is kept as-is.
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention over the second-to-last axis of ``x``.

        Args:
            x: Tensor of shape ``(n, d_in)`` or, with any number of leading
                batch dimensions, ``(..., n, d_in)``.

        Returns:
            Tensor of shape ``(..., n, d_out)``: for each of the ``n`` rows,
            the softmax-weighted sum of the value projections.
        """
        K = x @ self.W_key
        Q = x @ self.W_query
        V = x @ self.W_value

        # Swap only the last two axes. `K.T` reverses *every* axis, which is
        # correct for 2-D input only and breaks on batched input.
        attention_scores = Q @ K.transpose(-2, -1)
        # Scale by sqrt(d_out) before normalising each row of scores.
        attention_weights = torch.softmax(
            attention_scores / K.shape[-1] ** 0.5, dim=-1
        )
        return attention_weights @ V

  device: torch.device = torch.device(torch._C._get_default_device()),  # torch.device('cpu'),


In [3]:
# Seed the RNG first so the torch.rand-initialised weights are reproducible.
torch.manual_seed(123)

# Example input for trying out the self-attention layer: 5 rows, 3 features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])

d_in = 3   # matches the number of features per input row
d_out = 2  # size of each output row
sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

# Drop the scratch names so they do not leak into later cells; sa_v1 is kept.
del inputs, d_in, d_out

tensor([[0.2685, 0.7413],
        [0.2738, 0.7564],
        [0.2668, 0.7366],
        [0.2618, 0.7218],
        [0.2712, 0.7495]], grad_fn=<MmBackward0>)


In [None]:
# Self-attention V2. It uses nn.Linear to define the weight matrices. nn.Linear is preferable to the raw
# nn.Parameter used in the previous class: with the bias disabled it performs the same matrix multiplication,
# and it comes with a better default weight initialization.

def SelfAttention_v2(nn.Module):

    def __init__(self, in_dim, out_dim):
        self.in_dim = in_dim
        self.out_dim = out_dim

    def forward(self, x):
