# Self Attention Single vs Multi Head Exploration

* Main goal: implement single-head and multi-head self-attention from scratch with NumPy.

## Setup

In [91]:
import numpy as np

In [117]:
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    The maximum along the last axis is subtracted before exponentiating so
    that large entries do not overflow; this shift does not change the result.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    normalizer = np.sum(exps, axis=-1, keepdims=True)
    return exps / normalizer

In [118]:
# embedding dim
d = 60

# Fixed seed for NumPy's global RNG so the random embeddings and projection
# matrices drawn in later cells are the same on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

In [119]:
# Toy input sentence; each space-separated word is used as one token.
sentence = "i like to eat apples"

In [120]:
# Split on single spaces to get the list of tokens.
words = sentence.split(" ")

In [121]:
# Show the token list.
print(words)

['i', 'like', 'to', 'eat', 'apples']


In [122]:
def generate_embedding(d):
    """Return a random 1-D vector of length ``d`` to stand in for a token embedding.

    Values are drawn with ``np.random.rand`` from NumPy's global random state.
    """
    embedding = np.random.rand(d)
    return embedding

In [123]:
# One random d-dimensional embedding per word, stacked as rows of a 2-D array.
# NOTE(review): the name `input` shadows Python's builtin `input`; it is kept
# because the following cells refer to it.
input = np.array([generate_embedding(d) for _ in words])

In [124]:
# Expect (number of tokens, d).
input.shape

(5, 60)

## Single Head Self Attention

In [125]:
# Free-form notes kept in a throwaway string.
# NOTE: in the code cells of this section, `W` is assigned only the softmax term
# (the attention weights) and the layer output is assigned to `Y`; the `*` in
# the formula below stands for matrix multiplication (`@` in the code).
_ = """

ok so to implement this, i just need a sequence of tokens, create "embeddings" for them, then do self-attention with those embeddings

formula: W = softmax((Q * K.T)/sqrt(d)) * V

"""

In [126]:
# Query / key / value projection matrices, each (d, d), drawn in that order
# from NumPy's global random state.
W_q, W_k, W_v = (np.random.rand(d, d) for _ in range(3))

In [127]:
# Project the token embeddings with each of the three weight matrices to get
# the queries, keys and values.
Q, K, V = (input @ proj for proj in (W_q, W_k, W_v))

In [128]:
# Q has one row per token and d columns.
print(Q.shape)

(5, 60)


In [129]:
# Attention weights: pairwise query-key dot products, scaled by sqrt(d),
# then normalised with softmax along each row.
W = softmax(np.matmul(Q, K.T) / np.sqrt(d))

In [130]:
# One row of attention weights per token, with one column per token.
W.shape

(5, 5)

In [131]:
# Self-attention output: each row is the attention-weighted sum of the value
# vectors. The weights must multiply the projected values V, not the raw
# embeddings, so that the value projection W_v actually takes part in the result.
Y = W @ V

In [132]:
# One output vector of width d per token.
Y.shape

(5, 60)

In [133]:
# Free-form note kept in a throwaway string; the value is not read by later cells shown here.
_ = """ Y represents the output of the self-attention layer """

## Multi-Head Attention

In [134]:
# number of attention heads
h = 4

In [135]:
# Free-form note: each of the h heads gets its own query, key and value
# projection (see the per-head weight lists created in a later cell).
_ = """ ok so for mha, we have h: q, k, v """

In [136]:
# Per-head projection width. The head outputs are concatenated afterwards, so
# d must split evenly across the heads; otherwise floor division would silently
# drop the remainder and the combined width would no longer equal d.
assert d % h == 0, f"embedding dim d={d} must be divisible by number of heads h={h}"
d_k = d // h

In [137]:
# h separate (d, d_k) projection matrices for each of Q, K and V.
# Matrices are drawn in the order: all query heads, then all key heads,
# then all value heads.
W_q_heads, W_k_heads, W_v_heads = (
    [np.random.rand(d, d_k) for _ in range(h)] for _ in range(3)
)

In [138]:
# Run scaled dot-product attention once per head and collect the results.
heads_output = []

for i in range(h):
    # Project the embeddings with this head's own Q / K / V matrices.
    Q_i, K_i, V_i = (
        input @ head_weights[i]
        for head_weights in (W_q_heads, W_k_heads, W_v_heads)
    )

    # Scale by sqrt(d_k), the per-head width, before the softmax.
    scores = np.matmul(Q_i, K_i.T) / np.sqrt(d_k)
    A_i = softmax(scores)

    # Attention-weighted sum of this head's value vectors.
    head_output = np.matmul(A_i, V_i)
    heads_output.append(head_output)

In [139]:
# Join the per-head outputs side by side (column-wise) into one matrix.
multihead_output = np.hstack(heads_output)

In [140]:
# Expect (number of tokens, h * d_k).
multihead_output.shape

(5, 60)