In [2]:
import torch
import torch.nn as nn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Implementing self-attention with trainable weights
This self-attention mechanism is also called scaled dot product attention. We want to compute context vectors
as weighted sums over the input vectors specific to a certain input element.<br> 
In the first step of the self-attention mechanism with trainable weight matrices, we
compute query (q), key (k), and value (v) vectors for input elements x<br>
The query vector q
(2)
is obtained
via matrix multiplication between the input x
(2) and the weight matrix Wq.
<br>Similarly, we obtain
the key and value vectors via matrix multiplication involving the weight matrices Wk and Wv

In [3]:
input_text = 'Your journey starts with one step'

# sample embeddings of input text
inputs_emb = torch.tensor(
[[0.43, 0.15, 0.89], # Your (x^1)
[0.55, 0.87, 0.66], # journey (x^2)
[0.57, 0.85, 0.64], # starts (x^3)
[0.22, 0.58, 0.33], # with (x^4)
[0.77, 0.25, 0.10], # one (x^5)
[0.05, 0.80, 0.55]] # step (x^6)
)

### Step 1: initialize the query, key and value weight vectors
<br>Weight parameters are the fundamental, learned coefficients that define the network's connections, while attention weights are dynamic, context-specific values.
Even though our temporary goal is to only compute the one context vector, z (2) , we still require the key and value vectors for all input elements as they are involved in computing the attention weights with respect to the query q (2)

A "query" is analogous to a search query in a database. It represents the
current item (e.g., a word or token in a sentence) the model focuses on or
tries to understand. The query is used to probe the other parts of the input
sequence to determine how much attention to pay to them.
<br>The "key" is like a database key used for indexing and searching. In the
attention mechanism, each item in the input sequence (e.g., each word in a
sentence) has an associated key. These keys are used to match with the query.
<br>The "value" in this context is similar to the value in a key-value pair in a
database. It represents the actual content or representation of the input items.
Once the model determines which keys (and thus which parts of the input)
are most relevant to the query (the current focus item), it retrieves the
corresponding values.

In [16]:
torch.manual_seed(2345)
x_2=inputs_emb[1]  # querry input
d_in = inputs_emb.shape[1]
d_out=2



# Step1: manually setting querry, key and value weights matrices
W_query = torch.nn.Parameter(torch.rand(d_in, d_out),requires_grad=False) # although these are trainable parameters, for convinience we have set the requires_grad=False
W_key =  torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value=torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)


query_2 = x_2@W_query
query_2

tensor([0.8585, 0.5272])

### Step2: Compute the attention scores of input tokens w.r.t query
The attention score computation is a dot-product between  the query and
key obtained by transforming the inputs via the respective weight matrices.

In [19]:
# 

keys = inputs_emb@W_key
attn_score21 = query_2@keys[0] # attention score for input 1 w.r.t query
attn_score22 = query_2@keys[0] # attention score of input 2 w.r.t qury. Here input2 is query itself
# and so on....

# all attention score with respect to query 2
attn_scores_2 = query_2@keys.T

### Step3: Compute the attention weights from attention scores
First scale the attention scores by deviding with square root of embeded dimension of keys and then normalize through softmax function.

In [29]:
d_k = keys.shape[-1]
attn_scores_2_scalled = attn_scores_2/np.sqrt(d_k)
attn_weights_2 = torch.softmax(attn_scores_2_scalled, dim=-1)
print('attention weights: ', attn_weights_2)
print('\nsum of weights: ', attn_weights_2.sum())

attention weights:  tensor([0.1270, 0.2193, 0.2178, 0.1390, 0.1422, 0.1546])

sum of weights:  tensor(1.)


### Step4: Context Vector
Obtain the context vector by multiplying each value vector with its respective attention weight and then sum up

In [64]:
values=inputs_emb@W_value
values

tensor([[0.4204, 0.7237],
        [0.8305, 1.0672],
        [0.8125, 1.0579],
        [0.5063, 0.5819],
        [0.2597, 0.5928],
        [0.7082, 0.7108]])

In [66]:
context_vect_2 = attn_weights_2@values
context_vect_2

tensor([0.6293, 0.8315])

### Implementing a compact self-attention Python class

In [72]:
class SelfAttention_v1(nn.Module):
    def __init__(self, d_in, d_out):
        super().__init__()
        # weights for query, key and values
        self.W_query = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        self.W_key = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
        self.W_value = nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
    def forward(self, x):
        # x= input embedding matrix
        # query matri
        query = x@self.W_query
        # key matrix
        key = x@self.W_key
        # Value matrix
        value = x@self.W_value
        # attention scores (keys w.r.t queries)
        all_attention_scores=q@k.T
        all_attention_scores_scalled=all_attention_scores/np.sqrt(k.shape[-1])
        all_attention_weights=torch.softmax(all_attention_scores_scalled, dim=-1)
        context_vectors = all_attention_weights@values
        return context_vectors

In [73]:
torch.manual_seed(345)
self_attn = SelfAttention_v1(3,2)
context_vectors=  self_attn(inputs_emb)
context_vectors

tensor([[0.6204, 0.8173],
        [0.6491, 0.8449],
        [0.6486, 0.8444],
        [0.6251, 0.8217],
        [0.6244, 0.8211],
        [0.6319, 0.8281]])

In [74]:
import crewai

ModuleNotFoundError: No module named 'crewai'