In [1]:
import torch


# Toy embeddings: one 3-dimensional row per token of the example sentence
# "Your journey starts with one step".
inputs = torch.tensor([
    [0.43, 0.15, 0.89],  # Your     (x^1)
    [0.55, 0.87, 0.66],  # journey  (x^2)
    [0.57, 0.85, 0.64],  # starts   (x^3)
    [0.22, 0.58, 0.33],  # with     (x^4)
    [0.77, 0.25, 0.10],  # one      (x^5)
    [0.05, 0.80, 0.55],  # step     (x^6)
])


In [3]:
x_2 = inputs[1]          # second row of `inputs`, used as the example query token
d_in = inputs.shape[1]   # width of each input row
d_out = 2                # width of the projected query/key/value vectors

torch.manual_seed(123)
# Three frozen (requires_grad=False) projection matrices of shape (d_in, d_out).
# They are drawn in the order query, key, value; that order fixes which random
# numbers each matrix receives under the seed above.
W_query, W_key, W_value = (
    torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    for _ in range(3)
)

print(W_query, W_key, W_value, sep="\n")


Parameter containing:
tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])
Parameter containing:
tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])
Parameter containing:
tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])


In [5]:
# Project the single token x_2 into its query, key and value vectors.
query_2 = torch.matmul(x_2, W_query)
key_2 = torch.matmul(x_2, W_key)
value_2 = torch.matmul(x_2, W_value)

print(query_2, key_2, value_2, sep="\n")

tensor([0.4306, 1.4551])
tensor([0.4433, 1.1419])
tensor([0.3951, 1.0037])


In [6]:
# Project every row of `inputs` at once; each result has one row per input row.
keys = torch.matmul(inputs, W_key)
values = torch.matmul(inputs, W_value)
queries = torch.matmul(inputs, W_query)

# For each projection, show its shape followed by its contents.
print(keys.shape, keys, sep="\n")
print(values.shape, values, sep="\n")
print(queries.shape, queries, sep="\n")

torch.Size([6, 2])
tensor([[0.3669, 0.7646],
        [0.4433, 1.1419],
        [0.4361, 1.1156],
        [0.2408, 0.6706],
        [0.1827, 0.3292],
        [0.3275, 0.9642]])
torch.Size([6, 2])
tensor([[0.1855, 0.8812],
        [0.3951, 1.0037],
        [0.3879, 0.9831],
        [0.2393, 0.5493],
        [0.1492, 0.3346],
        [0.3221, 0.7863]])
torch.Size([6, 2])
tensor([[0.2309, 1.0966],
        [0.4306, 1.4551],
        [0.4300, 1.4343],
        [0.2355, 0.7990],
        [0.2983, 0.6565],
        [0.2568, 1.0533]])


In [None]:
# Attention score between the 2nd query and the 2nd key (a single dot product).
attn_score_22 = torch.dot(query_2, key_2)
print(attn_score_22)

# Attention scores of the 2nd query against every key.
attn_score_2 = torch.matmul(query_2, keys.T)
print(attn_score_2)

# Attention scores for all queries against all keys.
attn_scores = torch.matmul(queries, keys.T)
print(attn_scores)

# Turn scores into attention weights.
# The scores are divided by sqrt(d_k), where d_k is the key dimension, before
# softmax. Softmax is sensitive to the magnitude of its inputs: large scores
# make the output very peaky, which hurts training stability. A dot product
# over d_k components grows in variance roughly in proportion to d_k, so
# dividing by sqrt(d_k) keeps the variance of the scores close to 1.

# Weights for the 2nd query.
d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_score_2 / d_k ** 0.5, dim=-1)
print(attn_weights_2, d_k, sep="\n")



tensor(1.8524)
tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])
tensor([[0.9231, 1.3545, 1.3241, 0.7910, 0.4032, 1.1330],
        [1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440],
        [1.2544, 1.8284, 1.7877, 1.0654, 0.5508, 1.5238],
        [0.6973, 1.0167, 0.9941, 0.5925, 0.3061, 0.8475],
        [0.6114, 0.8819, 0.8626, 0.5121, 0.2707, 0.7307],
        [0.8995, 1.3165, 1.2871, 0.7682, 0.3937, 1.0996]])
tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
2


In [None]:
# Context vector for the 2nd query: the rows of `values` combined using the
# attention weights computed for that query.
context_vec_2 = torch.matmul(attn_weights_2, values)
print(context_vec_2)




tensor([0.3061, 0.8210])


In [10]:
import torch.nn as nn

class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    The query, key and value projections are ``nn.Parameter`` matrices of
    shape ``(d_in, d_out)`` filled by ``torch.rand``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the returned context vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        # Creation order (query, key, value) decides which random draws each
        # matrix gets under a fixed seed, so keep it stable.
        self.W_query = nn.Parameter(torch.rand(d_in, d_out))
        self.W_key = nn.Parameter(torch.rand(d_in, d_out))
        self.W_value = nn.Parameter(torch.rand(d_in, d_out))

    def forward(self, x):
        """Compute context vectors for ``x``.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)`` or, with any number of
                leading batch dimensions, ``(..., num_tokens, d_in)``.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``d_out``.
        """
        keys = x @ self.W_key
        queries = x @ self.W_query
        values = x @ self.W_value

        # Swap only the last two axes. `.T` reverses *all* axes, which is
        # correct only for 2-D input and breaks batched (3-D+) input.
        attn_scores = queries @ keys.transpose(-2, -1)  # omega
        # Scale by sqrt(d_k) so softmax does not saturate as d_k grows.
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1] ** 0.5, dim=-1
        )

        context_vec = attn_weights @ values
        return context_vec

In [11]:
# Seed before construction so the module's torch.rand-initialised weights are
# reproducible across runs.
torch.manual_seed(123)

sa_v1 = SelfAttention_v1(d_in, d_out)
print(sa_v1(inputs))

tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],
        [0.2948, 0.7939],
        [0.2927, 0.7891],
        [0.2990, 0.8040]], grad_fn=<MmBackward0>)


In [12]:
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention built on ``nn.Linear``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the returned context vectors.
        qkv_bias: Whether the query/key/value ``nn.Linear`` layers carry a
            bias term. Defaults to ``False``.
    """

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        # Creation order (query, key, value) decides which random draws each
        # layer gets under a fixed seed, so keep it stable.
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, x):
        """Compute context vectors for ``x``.

        Args:
            x: Tensor of shape ``(num_tokens, d_in)`` or, with any number of
                leading batch dimensions, ``(..., num_tokens, d_in)``.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``d_out``.
        """
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # Swap only the last two axes. `.T` reverses *all* axes, which is
        # correct only for 2-D input and breaks batched (3-D+) input.
        attn_scores = queries @ keys.transpose(-2, -1)
        # Scale by sqrt(d_k) so softmax does not saturate as d_k grows.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1] ** 0.5, dim=-1)

        context_vec = attn_weights @ values
        return context_vec

In [13]:
# Seed before construction so the nn.Linear weight initialisation is
# reproducible across runs.
torch.manual_seed(123)

# Bind the v2 module to its own name. Reusing `sa_v1` here would silently
# replace the SelfAttention_v1 instance and mislabel this object.
sa_v2 = SelfAttention_v2(d_in, d_out)
print(sa_v2(inputs))

tensor([[-0.5337, -0.1051],
        [-0.5323, -0.1080],
        [-0.5323, -0.1079],
        [-0.5297, -0.1076],
        [-0.5311, -0.1066],
        [-0.5299, -0.1081]], grad_fn=<MmBackward0>)
