In [1]:
import torch

inputs = torch.tensor(
[[0.43, 0.15, 0.89], # Your (x^1)
[0.55, 0.87, 0.66], # journey (x^2)
[0.57, 0.85, 0.64], # starts (x^3)
[0.22, 0.58, 0.33], # with (x^4)
[0.77, 0.25, 0.10], # one (x^5)
[0.05, 0.80, 0.55]] # step (x^6)
)

Computing Attention Scores



In [3]:
query = inputs[1]
attn_scores = torch.empty(inputs.shape[0])
for i in range(inputs.shape[0]):
    attn_scores[i] = torch.dot(query, inputs[i])

print(attn_scores)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


Attention Scores Normalization

In [4]:
attn_scores = torch.softmax(attn_scores, dim=0)
print(attn_scores)

tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])


Computing Context Vector


In [22]:
query=inputs[1]
context_vector = torch.zeros(query.shape)

for i, x_i in enumerate(inputs):
    context_vector += attn_scores[i]*x_i
    print(context_vector)

print(context_vector)


tensor([0.0596, 0.0208, 0.1233])
tensor([0.1904, 0.2277, 0.2803])
tensor([0.3234, 0.4260, 0.4296])
tensor([0.3507, 0.4979, 0.4705])
tensor([0.4340, 0.5250, 0.4813])
tensor([0.4419, 0.6515, 0.5683])
tensor([0.4419, 0.6515, 0.5683])


Computing All Context Vectors

In [24]:
attention_scores = torch.empty(6, 6)

for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attention_scores[i ,j] = torch.dot(x_i, x_j)

print(attention_scores)

# or attention_scores = inputs @ inputs.T

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [25]:
attention_scores = torch.softmax(attention_scores, dim=1)
print(attention_scores)

tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [28]:
all_context_vectors = attention_scores @ inputs
print(all_context_vectors)

tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])


Self-attention with trainable weights

In [29]:
x_2 = inputs[1]
d_in = 3
d_out = 2

In [33]:
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [35]:
query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value

print(query_2)

tensor([0.5764, 0.9590])


In [39]:
keys = inputs @ W_query
value = inputs @ W_value

In [42]:
keys_2 = keys[1]
attn_score_22 = query_2.dot(keys_2)
print(attn_score_22)

tensor(1.2519)


In [44]:
attn_scores_2 = query_2 @ keys.T
print(attn_scores_2)

tensor([0.7268, 1.2519, 1.2534, 0.6667, 0.9272, 0.6986])


Normalizing Attention Scores

In [48]:
d_k = keys.shape[-1]
attn_scores_2 = torch.softmax(attn_scores_2 / d_k ** 0.5, dim=-1)
print(attn_scores_2)

tensor([0.1430, 0.2073, 0.2076, 0.1371, 0.1648, 0.1402])


In [50]:
context_vector_2 = attn_scores_2 @ value
print(context_vector_2)

tensor([0.9003, 0.2890])
