In [2]:
sentence = 'Life is short, eat dessert first'

# Build the vocabulary: strip commas, split on whitespace, and give every
# distinct word an integer id following sorted() order (case-sensitive, so
# capitalised words come first).
# Deduplicating with set() before sorting keeps the ids contiguous
# (0 .. len(dc) - 1) even when a word occurs more than once; enumerating the
# raw sorted list would skip one id for every repeated word.
dc = {s: i for i, s in enumerate(sorted(set(sentence.replace(',', '').split())))}

print(dc)


{'Life': 0, 'dessert': 1, 'eat': 2, 'first': 3, 'is': 4, 'short': 5}


In [3]:
import torch

# Encode the sentence as a 1-D tensor of vocabulary ids, in word order.
# The tokenisation (drop commas, split on whitespace) mirrors the one used
# to build `dc`, so every word is guaranteed to have an entry.
sentence_int = torch.tensor(
    list(map(dc.__getitem__, sentence.replace(',', '').split()))
)
print(sentence_int)

tensor([0, 4, 5, 2, 1, 3])


In [4]:
import torch
import torch.nn as nn

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values with three learnable
    weight matrices; each position's output is a softmax-weighted sum of the
    value vectors of all positions.

    Args:
        d_in: Size of each input embedding vector.
        d_k: Size of the query and key projections.
        d_v: Size of the value projection (and of each output vector).
    """

    def __init__(self, d_in, d_k, d_v):
        super().__init__()
        # Query, Key, Value weight matrices, initialised with torch.rand.
        self.W_q = nn.Parameter(torch.rand(d_in, d_k))
        self.W_k = nn.Parameter(torch.rand(d_in, d_k))
        self.W_v = nn.Parameter(torch.rand(d_in, d_v))

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(seq_len, d_in)``, optionally preceded by
                batch dimensions, e.g. ``(batch, seq_len, d_in)``.

        Returns:
            Tensor of shape ``(..., seq_len, d_v)`` with the same leading
            dimensions as ``x``.
        """
        queries = x @ self.W_q         # (..., seq_len, d_k)
        keys    = x @ self.W_k         # (..., seq_len, d_k)
        values  = x @ self.W_v         # (..., seq_len, d_v)

        # Scaled dot-product attention.
        # Swap only the last two axes: `.T` reverses *every* axis, which is
        # wrong (and deprecated) as soon as x carries a batch dimension.
        # For plain 2-D input this is exactly keys.T.
        scores  = queries @ keys.transpose(-2, -1)   # (..., seq_len, seq_len)
        # Divide by sqrt(d_k) before the softmax.
        scores  = scores / (keys.shape[-1] ** 0.5)
        # Normalise over the key axis so each query's weights sum to 1.
        weights = torch.softmax(scores, dim=-1)

        # Output is the attention-weighted combination of the values.
        out     = weights @ values     # (..., seq_len, d_v)
        return out

# Demo run of the attention layer.
# Seed first: both the random input and the layer's torch.rand-initialised
# weights are drawn after this call, so the printed numbers are repeatable.
torch.manual_seed(123)

# Toy input: 6 rows, each a 3-dimensional embedding.
dummy_input = torch.rand((6, 3))

# 3-dim inputs, 2-dim queries/keys, 4-dim values (hence 4-dim outputs).
attn = SelfAttention(d_in=3, d_k=2, d_v=4)

output = attn(dummy_input)
print(output)
print("Output shape:", output.size())


tensor([[0.7016, 0.7310, 0.5688, 0.5260],
        [0.7317, 0.7599, 0.5930, 0.5488],
        [0.6644, 0.6888, 0.5320, 0.4946],
        [0.7354, 0.7654, 0.5981, 0.5526],
        [0.6769, 0.7027, 0.5441, 0.5050],
        [0.7201, 0.7482, 0.5830, 0.5397]], grad_fn=<MmBackward0>)
Output shape: torch.Size([6, 4])
