Input sequence: Dream big and work for it

In [1]:
import torch

inputs = torch.tensor(
    [
        [0.72, 0.45, 0.31],# 'Dream' (x^1)
        [0.75, 0.20, 0.55],# 'big'   (x^2)
        [0.30, 0.80, 0.40],# 'and'   (x^3)
        [0.85, 0.35, 0.60],# 'work'  (x^4)
        [0.55, 0.15, 0.75],# 'for'   (x^5)
        [0.25, 0.20, 0.85],# 'it'    (x^6)
    ]
)

# Corresponding words
words = ['Dream', 'big', 'and', 'work', 'for', 'it']

We want to generate the context vector for second token.

In [2]:
x_2 = inputs[1]  # Embedding for 'big'
print(f"Input embedding for 'big': {x_2}")
d_in = inputs.shape[1]  # Input dimension
print(f"Input dimension: {d_in}")
d_out = 2  # Output dimension
print(f"Output dimension: {d_out}")

Input embedding for 'big': tensor([0.7500, 0.2000, 0.5500])
Input dimension: 3
Output dimension: 2


Randomly initialising the Wk, Wq, Wv matrices

In [3]:
torch.manual_seed(123)  # For reproducibility
W_query = torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)
print(f"W_query:\n{W_query}\n")
W_key = torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)
print(f"W_key:\n{W_key}\n")
W_value = torch.nn.Parameter(torch.randn(d_in, d_out), requires_grad=False)
print(f"W_value:\n{W_value}\n")

W_query:
Parameter containing:
tensor([[-0.1115,  0.1204],
        [-0.3696, -0.2404],
        [-1.1969,  0.2093]])

W_key:
Parameter containing:
tensor([[-0.9724, -0.7550],
        [ 0.3239, -0.1085],
        [ 0.2103, -0.3908]])

W_value:
Parameter containing:
tensor([[ 0.2350,  0.6653],
        [ 0.3528,  0.9728],
        [-0.0386, -0.8861]])



In [4]:
query_2 = x_2 @ W_query
print(f"Query vector for 'big': {query_2}\n")
key_2 = x_2 @ W_key
print(f"Key vector for 'big': {key_2}\n")
value_2 = x_2 @ W_value
print(f"Value vector for 'big': {value_2}\n")

Query vector for 'big': tensor([-0.8158,  0.1573])

Key vector for 'big': tensor([-0.5488, -0.8030])

Value vector for 'big': tensor([0.2256, 0.2062])



Calculating Q, K & V using inputs(X), Wq, Wk & Wv

In [5]:
keys = inputs @ W_key
print(f"Keys:\n{keys}\n")
values = inputs @ W_value
print(f"Values:\n{values}\n")
queries = inputs @ W_query
print(f"Queries:\n{queries}\n")

Keys:
tensor([[-4.8914e-01, -7.1363e-01],
        [-5.4880e-01, -8.0295e-01],
        [ 5.1548e-02, -4.6967e-01],
        [-5.8694e-01, -9.1428e-01],
        [-3.2846e-01, -7.2469e-01],
        [ 4.7331e-04, -5.4268e-01]])

Values:
tensor([[ 0.3160,  0.6421],
        [ 0.2256,  0.2062],
        [ 0.3373,  0.6234],
        [ 0.3000,  0.3743],
        [ 0.1532, -0.1528],
        [ 0.0965, -0.3923]])

Queries:
tensor([[-0.6176,  0.0433],
        [-0.8158,  0.1573],
        [-0.8079, -0.0725],
        [-0.9423,  0.1437],
        [-1.0144,  0.1871],
        [-1.1192,  0.1599]])



Keys corresponding to the second token and the attention of second token to itself

In [6]:
keys_2 = keys[1]
print(f"Key vector for second token ('big'): {keys_2}\n")
attention_score_22 = query_2.dot(keys_2)
print(f"Attention score between 'big' and 'big': {attention_score_22}\n")

Key vector for second token ('big'): tensor([-0.5488, -0.8030])

Attention score between 'big' and 'big': 0.3214397430419922



All attenstion scores for query number 2

In [7]:
attention_scores_2 = queries[1] @ keys.T
print(f"All attention scores for query number 2:\n{attention_scores_2}\n")

All attention scores for query number 2:
tensor([ 0.2868,  0.3214, -0.1159,  0.3350,  0.1540, -0.0857])



Attention scores (not weights) matrix

In [8]:
attention_scores = queries @ keys.T # omega matrix
print(f"Attention scores (not weights) matrix:\n{attention_scores}\n")

Attention scores (not weights) matrix:
tensor([[ 0.2712,  0.3042, -0.0522,  0.3229,  0.1715, -0.0238],
        [ 0.2868,  0.3214, -0.1159,  0.3350,  0.1540, -0.0857],
        [ 0.4469,  0.5016, -0.0076,  0.5405,  0.3179,  0.0390],
        [ 0.3583,  0.4017, -0.1161,  0.4217,  0.2053, -0.0784],
        [ 0.3627,  0.4065, -0.1402,  0.4244,  0.1976, -0.1020],
        [ 0.4333,  0.4858, -0.1328,  0.5107,  0.2517, -0.0873]])



Scale by 1/sqrt(d) and then take softmax

In [9]:
d_k = keys.shape[-1]
attention_weigths_2 = torch.nn.functional.softmax(attention_scores_2 / torch.sqrt(torch.tensor(d_k)), dim=-1)
print(f"All attention scores for query number 2:\n{attention_scores_2}\n")
print(f"All attention weights for query number 2:\n{attention_weigths_2}\n")

All attention scores for query number 2:
tensor([ 0.2868,  0.3214, -0.1159,  0.3350,  0.1540, -0.0857])

All attention weights for query number 2:
tensor([0.1821, 0.1867, 0.1370, 0.1885, 0.1658, 0.1400])



NOTE 1: Softmax peaks when numbers are scaled

In [10]:
tensor = torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])
softmax = torch.nn.functional.softmax(tensor, dim=-1)
print(f"Softmax of {tensor} without scaling is {softmax}")
scaled_tensor = tensor * 8
scaled_softmax = torch.nn.functional.softmax(scaled_tensor, dim=-1)
print(f"Softmax of {tensor} with scaling is {scaled_softmax}")

Softmax of tensor([ 0.1000, -0.2000,  0.3000, -0.2000,  0.5000]) without scaling is tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
Softmax of tensor([ 0.1000, -0.2000,  0.3000, -0.2000,  0.5000]) with scaling is tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])


NOTE 2: Scaling has to be such that the varince of QK.T is close to 1.

In [11]:
# Function to compute variance before and after scaling
def compute_variance_scaling(d_k, num_samples=10000):
    q = torch.randn(num_samples, d_k)
    k = torch.randn(num_samples, d_k)
    scores = q @ k.T
    var_before = torch.var(scores).item()
    scaled_scores = scores / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    var_after = torch.var(scaled_scores).item()
    return var_before, var_after

d_k_values = [8, 16, 32, 64, 128, 256]
for d_k in d_k_values:
    var_before, var_after = compute_variance_scaling(d_k)
    print(f"d_k: {d_k}, Variance before scaling: {var_before:.4f}, Variance after scaling: {var_after:.4f}")


d_k: 8, Variance before scaling: 8.1428, Variance after scaling: 1.0179
d_k: 16, Variance before scaling: 15.9418, Variance after scaling: 0.9964
d_k: 32, Variance before scaling: 32.1296, Variance after scaling: 1.0040
d_k: 64, Variance before scaling: 63.9599, Variance after scaling: 0.9994
d_k: 128, Variance before scaling: 128.0868, Variance after scaling: 1.0007
d_k: 256, Variance before scaling: 256.2036, Variance after scaling: 1.0008


Context vector corresponding to second input token

In [12]:
context_vector_2 = attention_weigths_2 @ values
print(f"Context vector corresponding to second input token:\n{context_vector_2}\n")
# The context vector for the second token 'big' is computed by taking the weighted sum of the value vectors using the attention weights.

Context vector corresponding to second input token:
tensor([0.2413, 0.2311])



In [13]:
print(f"So, the input vector {inputs[1]} for the token '{words[1]}' is transformed to the context vector {context_vector_2} using self-attention mechanism.\n")

So, the input vector tensor([0.7500, 0.2000, 0.5500]) for the token 'big' is transformed to the context vector tensor([0.2413, 0.2311]) using self-attention mechanism.



In [14]:
# Attention weights for all queries
attention_weights = torch.nn.functional.softmax(attention_scores / torch.sqrt(torch.tensor(d_k)), dim=-1)
print(f"Attention weights matrix:\n{attention_weights}\n")
# Context vectors for all input tokens
context_vectors = attention_weights @ values
print(f"Context vectors for all input tokens:\n{context_vectors}\n")

Attention weights matrix:
tensor([[0.1678, 0.1681, 0.1644, 0.1683, 0.1667, 0.1647],
        [0.1681, 0.1685, 0.1639, 0.1686, 0.1667, 0.1642],
        [0.1681, 0.1687, 0.1634, 0.1691, 0.1668, 0.1639],
        [0.1683, 0.1688, 0.1634, 0.1690, 0.1667, 0.1638],
        [0.1684, 0.1689, 0.1632, 0.1691, 0.1667, 0.1636],
        [0.1686, 0.1692, 0.1628, 0.1694, 0.1667, 0.1632]])

Context vectors for all input tokens:
tensor([[0.2383, 0.2178],
        [0.2384, 0.2181],
        [0.2384, 0.2181],
        [0.2384, 0.2183],
        [0.2384, 0.2184],
        [0.2385, 0.2185]])



In [15]:
import torch.nn as nn
# Python class for generating context vectors using self-attention
class selfAttention(nn.Module):
    def __init__(self, d_in, d_out, qkv_bias = False):
        super().__init__()
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
    
    def forward(self, X):
        keys = self.W_k(X)
        queries = self.W_q(X)
        values = self.W_v(X)

        attention_scores = torch.nn.functional.softmax(queries @ keys.T / torch.sqrt(torch.tensor(keys.shape[-1])), dim=-1)
        context_vectors = attention_scores @ values
        return context_vectors

In [16]:
torch.manual_seed(123)  # For reproducibility
SA = selfAttention(d_in=3, d_out=2)
context_vectors = SA(inputs)
print(f"Context vectors from selfAttention class:\n{context_vectors}\n")

Context vectors from selfAttention class:
tensor([[-0.5282, -0.0051],
        [-0.5288, -0.0036],
        [-0.5276, -0.0066],
        [-0.5289, -0.0040],
        [-0.5289, -0.0032],
        [-0.5287, -0.0033]], grad_fn=<MmBackward0>)

