In [150]:
pip install tiktoken -q

In [151]:
# converting text to tokens

import tiktoken
import numpy as np

tokenizer=tiktoken.encoding_for_model('gpt-4o')

In [152]:
text='The cat sat on a mat'

tokens=tokenizer.encode(text)

In [153]:
embedding_dimensions=400

In [154]:
# generating our own embedding matrix


word_embeddings=np.random.randn(50000,embedding_dimensions)*0.2

print(f"matrix size => {word_embeddings.shape}")
print(f"total parameters => {word_embeddings.shape[0]*word_embeddings.shape[1]:,}")

matrix size => (50000, 400)
total parameters => 20,000,000


In [155]:
# finding embedding for our text

embeddings=[]

for i in range(len(tokens)):
    embeddings.append(word_embeddings[tokens[i]])


In [156]:
# positional encoding calculation function

def positional_embeddings_cal(tokens_len, embedding_dimensions):
    positional_embeddings=np.random.randn(tokens_len,embedding_dimensions)
    for pos in range(tokens_len):
        for i in range(embedding_dimensions):
            if(i%2==0):
                positional_embeddings[pos][i]=np.sin(pos/10000**(i/embedding_dimensions))
            else:
                positional_embeddings[pos][i]=np.cos(pos/10000**((i-1)/embedding_dimensions))
    return positional_embeddings


In [157]:
# calculating final embeddings=embeddings+positional_embeddings

positional_embeddings=positional_embeddings_cal(len(tokens),embedding_dimensions)

final_embeddings=embeddings+positional_embeddings

print("Final embeddings shape:", final_embeddings.shape)
print("First token vector:", final_embeddings[0][:5])


Final embeddings shape: (6, 400)
First token vector: [0.15985043 0.9803748  0.0248639  1.39522085 0.06549021]


In [158]:
# number of heads and number of tokens in sentence

heads=8
tokens_size=len(tokens)
head_dim=int(embedding_dimensions/heads)

In [159]:
# will divide the final_embeddings for each head



# final_embeddings=final_embeddings.reshape(tokens_size,heads,head_dim)

# print(f"dimensions of embeddings for multi-heads => {final_embeddings.shape}")

In [160]:
d_k=400
d_v=400

In [161]:
# initializing w_q, w_k, w_v for multihead

w_q=np.random.randn(embedding_dimensions,d_k)*0.2
w_k=np.random.randn(embedding_dimensions,d_k)*0.2
w_v=np.random.randn(embedding_dimensions,d_v)*0.2

In [162]:
# converting embeddings for matmul

# final_embeddings=np.swapaxes(final_embeddings,0,1)

# print(f"dimensions of embeddings after swapping => {final_embeddings.shape}")

In [163]:
# calculating Q, K, V for each head

Q=final_embeddings@w_q
K=final_embeddings@w_k
V=final_embeddings@w_v

print(f'dimesions of Q, K, V : {Q.shape, K.shape, V.shape}')

dimesions of Q, K, V : ((6, 400), (6, 400), (6, 400))


In [211]:
# concating Q,K,V for attention calculation

Q=Q.reshape(tokens_size,heads,head_dim)
K=K.reshape(tokens_size,heads,head_dim)
V=V.reshape(tokens_size,heads,head_dim)

Q=Q.transpose(1,0,2)
K=K.transpose(1,0,2)
V=V.transpose(1,0,2)


print(f'dimesions of Q, K, V : {Q.shape, K.shape, V.shape}')


dimesions of Q, K, V : ((8, 6, 50), (8, 6, 50), (8, 6, 50))


In [212]:
# defining softmax function

def softmax(x):
    for head in range(x.shape[0]):
        for i in range(x.shape[1]):
            s=0
            for j in range(x.shape[2]):
                s+=np.exp(x[head][i][j])
            for j in range(x.shape[2]):
                x[head][i][j]=np.exp(x[head][i][j])/s
    return x

In [213]:
# calculaitng attention

K=K.transpose(0,2,1)

similarity_score=Q@K

scaled_dot_product=(similarity_score/np.sqrt(d_k))

attention_weights=softmax(scaled_dot_product)

print(f"Attention weights Shape: {attention_weights.shape}\n")

# attention_weights=attention_weights.transpose(1,0,2)
# attention_weights=attention_weights.reshape(attention_weights.shape[0],-1)/8

contextual_emdeddings=attention_weights@V


contextual_emdeddings=contextual_emdeddings.transpose(1,0,2)
contextual_emdeddings=contextual_emdeddings.reshape(contextual_emdeddings.shape[0],-1)


W_o = np.random.randn(heads * head_dim, embedding_dimensions) * 0.2
final_output = contextual_emdeddings @ W_o

# print("Attention weights (after multi-head concat and softmax) for every token in sentence:")
# print(f"Shape: {attention_weights.shape}\n")

# # Display as a table
# print("           ", end="")
# for token in tokens:
#     print(f"{f"'{tokenizer.decode([token])}'":>8}", end="")
# print("    | Sum")
# print("-" * 70)

# for i, token in enumerate(tokens):
#     print(f"{f"'{tokenizer.decode([token])}'":>8}   ", end="")
#     for j in range(len(tokens)):
#         print(f"{attention_weights[i,j]:>8.3f}", end="")
#     print(f"  | {attention_weights[i].sum():.3f}")

print("="*90)
print()
print('Contextual embeddings after multi-head attention calculation :\n')
print(f"Final output Shape: {final_output.shape} as d_v=400 before spliting")
print()
for i, token in enumerate(tokens):
    print(f"Token {token} ('{tokenizer.decode([token])}')")
    print(f"   -> First 10 values : {final_output[i][:10].round(3)}")
    print()

Attention weights Shape: (8, 6, 6)


Contextual embeddings after multi-head attention calculation :

Final output Shape: (6, 400) as d_v=400 before spliting

Token 976 ('The')
   -> First 10 values : [ 12.905 -16.903  10.272  -2.295  -1.195  -3.487  -6.019   4.227 -10.514
  -9.059]

Token 9059 (' cat')
   -> First 10 values : [  8.097 -18.872   8.876   5.371  -0.063   3.554 -10.938  13.415  -6.129
 -12.051]

Token 10139 (' sat')
   -> First 10 values : [ 14.671 -13.537   9.13   -3.22   -3.885  -1.065  -7.857   5.308 -10.129
  -9.751]

Token 402 (' on')
   -> First 10 values : [  2.86  -18.651   6.334   6.741  -0.623   4.745 -16.825  13.486  -2.456
  -8.696]

Token 261 (' a')
   -> First 10 values : [ 14.243 -17.972  11.785  -3.772  -3.609   1.147 -10.608   6.465 -10.13
  -7.749]

Token 2450 (' mat')
   -> First 10 values : [  3.637 -11.634   8.981   9.056  -4.331   3.431 -20.526  18.158  -4.744
  -5.725]

