In [7]:
import torch
import torch.nn as nn


In [10]:
class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention with trainable projections.

    The query, key and value projections are plain ``nn.Parameter`` matrices
    of shape ``(d_in, d_out)`` initialised from ``torch.randn``.

    Args:
        d_in: Size of the last dimension of the input vectors.
        d_out: Size of the last dimension of the projected queries, keys,
            values and of the returned context vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Keep the creation order query -> key -> value: each call consumes
        # the global RNG, so reordering would change the initial weights
        # obtained under a fixed seed.
        self.W_query = nn.Parameter(torch.randn(d_in, d_out))
        self.W_key = nn.Parameter(torch.randn(d_in, d_out))
        self.W_value = nn.Parameter(torch.randn(d_in, d_out))

    def forward(self, x):
        """Compute context vectors for every position in ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, d_in)``. A plain 2-D
                ``(seq_len, d_in)`` input works as before; any leading
                batch dimensions are handled independently.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.
        """
        queries = x @ self.W_query
        keys = x @ self.W_key
        values = x @ self.W_value

        # Swap only the last two axes. For 2-D input this equals ``keys.T``;
        # for batched input ``.T`` would reverse *all* axes (and is
        # deprecated for ndim != 2), breaking the matmul below.
        attn_scores = queries @ keys.transpose(-2, -1)

        # Scale by sqrt(d_out) before normalising each row of scores.
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1)
        context_vec = attn_weights @ values
        return context_vec

In [12]:
# Width of each token's embedding vector.
embedding_dim = 3

# The example sentence, already split into tokens.
tokenized_input = ["Your", "journey", "starts", "with", "one", "step"]

# Token -> integer id lookup; ids index rows of the embedding table.
vocab = {
    "Your": 0,
    "journey": 1,
    "starts": 2,
    "with": 3,
    "one": 4,
    "step": 5,
}

In [13]:
# Build a random embedding table with one row per vocabulary entry.
# The seed is fixed first so the same table is drawn on every run.
torch.manual_seed(123)
embeddings = torch.randn((len(vocab), embedding_dim))

In [14]:
# Map each token to its vocabulary id, then stack the matching embedding
# rows into a single (num_tokens, embedding_dim) tensor.
token_ids = [vocab[token] for token in tokenized_input]
inputs = torch.stack([embeddings[token_id] for token_id in token_ids])

In [15]:
# Initialize and apply self-attention.
# d_in comes from embedding_dim rather than a repeated literal, so the
# projection matrices always match the width of the rows in `inputs`.
# The layer's weights are drawn from the global RNG via torch.randn, so the
# printed values depend on the RNG state left by the seeded embedding cell.
sa_v1 = SelfAttention_v1(d_in=embedding_dim, d_out=2)
print(sa_v1(inputs))

tensor([[ 0.3322,  1.5411],
        [ 0.9328,  3.1144],
        [ 0.9211,  3.0893],
        [-0.8639, -4.1216],
        [-1.0136, -5.0366],
        [-1.0167, -5.0538]], grad_fn=<MmBackward0>)
