In [1]:
import numpy as np
import math

# Self-Attention From Scratch in NumPy

In [54]:
# Values used throughout this exploration of self-attention.
# Width of each token embedding.
embed_dim = 512

# Length of the sequence (number of tokens).
l = 2000

# Queries, keys and values all use the embedding dimension.
d_k = d_v = d_q = embed_dim

In [7]:
# Random stand-ins for the query, key and value matrices; think of these as
# our weights. Realistically these values would be adjusted during training,
# since they are learned parameters.
# A seeded generator is used so the cells below give the same numbers after a
# kernel restart.
rng = np.random.default_rng(0)
query = rng.standard_normal((l, d_q))
key = rng.standard_normal((l, d_k))
value = rng.standard_normal((l, d_v))

In [8]:
# Each matrix should be (sequence length, embedding dimension).
tuple(matrix.shape for matrix in (query, key, value))

((2000, 512), (2000, 512), (2000, 512))

In [17]:
# helper for checking statistics (e.g. mean and variance) of our matrices
def stats(name, x, fn):
    """Print the result of ``fn(x)``, labelled with ``name`` and ``fn.__name__``."""
    label = fn.__name__
    result = fn(x)
    print(f"{name} {label}: {result}")

In [24]:
# Samples from a standard normal should have mean close to 0 and variance close to 1.
stats(name="query", x=query, fn=np.mean)
stats(name="query", x=query, fn=np.var)

query mean: 0.001069653035030377
query var: 1.0010328363909236


In [30]:
# Now let's actually calculate self-attention.

# First, matrix-multiply the queries with the transposed keys - think of this
# as a lookup of every query against every key.
attn = query @ key.T

# This gives a (sequence_length, sequence_length) matrix, i.e. (l, l).
attn.shape

(2000, 2000)

In [31]:
# Next we need to scale the values down.
# The stats of this matrix no longer match the initial stats (the variance is
# now roughly d_k instead of 1), which could throw off the NN.
stats(name="attention-pre", x=attn, fn=np.mean)
stats(name="attention-pre", x=attn, fn=np.var)

attention-pre mean: 0.0019910089655932645
attention-pre var: 513.2093028039534


In [36]:
# Scale the scores down by the square root of the key dimension we set earlier.
attn_normalized = np.divide(attn, math.sqrt(d_k))

In [37]:
# After scaling, the variance should be back close to 1.
stats(name="attention-post", x=attn_normalized, fn=np.mean)
stats(name="attention-post", x=attn_normalized, fn=np.var)

attention-post mean: 8.799099631088812e-05
attention-post var: 1.0023619195389712


In [56]:
# Masking
# This ensures words don't get context from words generated in the future.
# It is very important in the decoder step, where looking ahead would be
# cheating for the neural net; it isn't very important for the encoder step.


# Create the mask: ones on and below the diagonal, zeros above it.
mask = np.tri(l, l)
mask.shape

(2000, 2000)

In [57]:
# Show what the mask looks like: row i has 1s for positions 0..i and 0s for
# every later ("future") word/token, which is what will be blocked out.
mask

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 1., 0., 0.],
       [1., 1., 1., ..., 1., 1., 0.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [58]:
# Turn the 0/1 pattern into an additive mask: the 0s (future positions) become
# -inf and the 1s (allowed positions) become 0.
# The mask is rebuilt from the lower-triangular pattern rather than mutated in
# place, so re-running this cell always produces the same result. `np.inf` is
# used because the `np.infty` alias was removed in NumPy 2.0.
mask = np.where(np.tril(np.ones((l, l), dtype=bool)), 0.0, -np.inf)

mask

array([[  0., -inf, -inf, ..., -inf, -inf, -inf],
       [  0.,   0., -inf, ..., -inf, -inf, -inf],
       [  0.,   0.,   0., ..., -inf, -inf, -inf],
       ...,
       [  0.,   0.,   0., ...,   0., -inf, -inf],
       [  0.,   0.,   0., ...,   0.,   0., -inf],
       [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [63]:
# Why use 0 / -inf instead of 1 / 0?
# The mask is added to the attention scores. Adding 0 leaves the allowed
# scores untouched; if we had kept the 1s, every allowed score would be
# shifted by one, which would skew the data. Future positions become -inf.
np.add(attn_normalized, mask)

array([[-0.36844932,        -inf,        -inf, ...,        -inf,
               -inf,        -inf],
       [-0.32793882, -0.32327196,        -inf, ...,        -inf,
               -inf,        -inf],
       [ 1.31093826, -0.67981886,  0.96435411, ...,        -inf,
               -inf,        -inf],
       ...,
       [-0.13647658,  1.28365034, -0.00917514, ...,  1.31312584,
               -inf,        -inf],
       [ 1.11890654, -0.68432322,  0.07081183, ..., -0.27636055,
         0.71917982,        -inf],
       [ 2.43944293,  0.52733486,  1.09641017, ..., -0.61578768,
         0.51607829,  1.05108062]])

In [66]:
# next we will be applying a softmax to get our attention matrix
def softmax(x):
    """Return the softmax of ``x`` taken along its last axis.

    Parameters
    ----------
    x : array_like
        Scores. Each slice along the last axis is normalised independently.
        Entries equal to ``-inf`` (masked positions) get a weight of exactly 0.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose values along the last axis
        are non-negative and sum to 1. A slice consisting only of ``-inf``
        comes out as NaN because there is nothing to normalise.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty axis.
        return np.exp(x)
    # Subtracting the per-row maximum does not change the result mathematically
    # but keeps np.exp from overflowing to inf (which would turn the ratio
    # into NaN) when the scores are large.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    # keepdims makes the row sums broadcast for inputs of any rank.
    return exps / np.sum(exps, axis=-1, keepdims=True)

In [69]:
# Softmax over the masked scores: each row becomes a set of attention weights.
attention_matrix = softmax(np.add(attn_normalized, mask))
attention_matrix

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.98833286e-01, 5.01166714e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.42390154e-01, 7.40861396e-02, 3.83523707e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.43633168e-04, 1.00806770e-03, 2.76708644e-04, ...,
        1.03822324e-03, 0.00000000e+00, 0.00000000e+00],
       [9.04626776e-04, 1.49051621e-04, 3.17166775e-04, ...,
        2.24136529e-04, 6.06555199e-04, 0.00000000e+00],
       [3.74031637e-03, 5.52701134e-04, 9.76419895e-04, ...,
        1.76213242e-04, 5.46514501e-04, 9.33147392e-04]])

In [76]:
# let's now write a simple function that calculates self-attention
# remember, we will only want the mask in the decoder and not the encoder
def self_attention(query, key, value, mask=None):
    """Compute scaled dot-product attention.

    Parameters
    ----------
    query : numpy.ndarray
        2-D array of queries.
    key : numpy.ndarray
        2-D array of keys; its last dimension must match that of ``query``.
    value : numpy.ndarray
        2-D array of values with one row per key.
    mask : numpy.ndarray, optional
        Additive mask that is added to the scaled scores before the softmax
        (0 keeps a position, ``-inf`` blocks it). ``None`` (the default)
        applies no masking.

    Returns
    -------
    tuple
        ``(attention_matrix, out)``: the softmax of the (optionally masked)
        scaled scores, and that matrix multiplied by ``value``.
    """
    scaled = np.matmul(query, key.T) / math.sqrt(query.shape[-1])
    # Compare against None explicitly: `if mask:` raises ValueError for an
    # array with more than one element because its truth value is ambiguous.
    if mask is not None:
        scaled = scaled + mask
    attention_matrix = softmax(scaled)
    out = np.matmul(attention_matrix, value)
    return attention_matrix, out