In [1]:
pip install tiktoken -q

In [43]:
# converting text to tokens

import tiktoken
import numpy as np

tokenizer=tiktoken.encoding_for_model('gpt-4o')

In [126]:
text='The cat sat on a mat'

tokens=tokenizer.encode(text)

print(f"Tokens - {tokens}")
for i in range(len(tokens)):
    print(f"{tokens[i]} => '{tokenizer.decode([tokens[i]])}' ")

Tokens - [976, 9059, 10139, 402, 261, 2450]
976 => 'The' 
9059 => ' cat' 
10139 => ' sat' 
402 => ' on' 
261 => ' a' 
2450 => ' mat' 


In [67]:
embedding_dimensions=100

In [151]:
# generating our own embedding matrix


word_embeddings=np.random.randn(50000,embedding_dimensions)*0.2

print(f"matrix size => {word_embed.shape}")
print(f"total parameters => {word_embed.shape[0]*word_embed.shape[1]:,}")

matrix size => (50000, 768)
total parameters => 38,400,000


In [37]:
pip install gensim -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h

In [152]:
# getting embedding for generated tokens

import gensim.downloader as api

word_embeds_glove=api.load('glove-wiki-gigaword-100')

In [156]:
print('Embedding for first token - ')
print()
print(f"'{tokenizer.decode([tokens[0]])}' ===> {word_embeds_glove[tokens[0]]}")

Embedding for first token - 

'The' ===> [-0.24727    0.78307    0.44702   -0.084627   0.71603    0.25502
  0.27146    0.31966   -1.0295    -0.47782    0.90986    0.0031653
  0.32648    0.38458    0.61674    0.049444   0.11239    0.46098
 -0.5804     0.77743    0.95716    0.80546    0.30824   -0.031789
  0.49038    0.17329   -0.45901    0.0175    -0.22298    0.060455
 -0.53695    1.2004    -0.068347   0.11316   -0.30841    0.40903
 -0.68957   -0.35439   -0.37194    0.8153     0.47602   -0.068007
  0.070446  -0.44186    0.25154   -0.46211    0.3426    -0.93956
  0.4344     0.60238   -0.55132    0.16768    0.073117   0.72065
 -0.18476   -2.3391    -0.49756    0.31342    1.4492     0.88318
 -0.72643   -0.4188     0.55796    0.23447   -0.1621     0.75175
  0.76326   -0.50907    0.25204    0.57319   -0.2553     0.13437
 -0.30616   -0.86615    0.87236    0.0071972 -0.68176   -0.12665
 -0.36754    0.093175   0.12012    0.78489   -0.63707    0.024234
 -0.57392    0.24726   -0.11193   -0.23344 

In [159]:
# finding embedding for our text

embeddings=[]

for i in range(len(tokens)):
    embeddings.append(word_embeddings[tokens[i]])


In [184]:
# positional encoding calculation function

def positional_embeddings_cal(tokens_len, embedding_dimensions):
    positional_embeddings=np.random.randn(tokens_len,embedding_dimensions)
    for pos in range(tokens_len):
        for i in range(embedding_dimensions):
            if(i%2==0):
                positional_embeddings[pos][i]=np.sin(pos/10000**(i/embedding_dimensions))
            else:
                positional_embeddings[pos][i]=np.cos(pos/10000**((i-1)/embedding_dimensions))
    return positional_embeddings


In [190]:
# calculating final embeddings=embeddings+positional_embeddings

positional_embeddings=positional_embeddings_cal(len(tokens),embedding_dimensions)

final_embeddings=embeddings+positional_embeddings

print("Final embeddings shape:", final_embeddings.shape)
print("First token vector:", final_embeddings[0][:5])


Final embeddings shape: (6, 100)
First token vector: [-0.19237136  1.67402152 -0.07199606  1.28888887 -0.24032851]


In [186]:
# Entering attention part
# defining w_q, w_k, w_v - for now taking as random matrix as currently not having learned matrix

d_k=70
d_v=50

w_q=np.random.randn(embedding_dimensions,d_k)*0.2
w_k=np.random.randn(embedding_dimensions,d_k)*0.2
w_v=np.random.randn(embedding_dimensions,d_v)*0.2

In [187]:
# finding Q, K, V

Q=final_embeddings@w_q
K=final_embeddings@w_k
V=final_embeddings@w_v

In [188]:
# defining softmax function

def softmax(x):
    for i in range(x.shape[0]):
        s=0
        for j in range(x.shape[1]):
            s+=np.exp(x[i][j])
        for j in range(x.shape[1]):
            x[i][j]=np.exp(x[i][j])/s
    return x

In [189]:
# calculating attention

similarity_score=Q@K.T
scaled_dot_product=similarity_score/np.sqrt(d_k)
attention_weights=softmax(scaled_dot_product)

contextual_emdeddings=attention_weights@V

print("Attention weights (after softmax) for every token in sentence:")
print(f"Shape: {attention_weights.shape}\n")

# Display as a table
print("           ", end="")
for token in tokens:
    print(f"{f"'{tokenizer.decode([token])}'":>8}", end="")
print("    | Sum")
print("-" * 70)

for i, token in enumerate(tokens):
    print(f"{f"'{tokenizer.decode([token])}'":>8}   ", end="")
    for j in range(len(tokens)):
        print(f"{attention_weights[i,j]:>8.3f}", end="")
    print(f"  | {attention_weights[i].sum():.3f}")

print()
print()
print("="*70)
print('Contextual embeddings after attention calculation :')
print(f"Shape: {contextual_emdeddings.shape} as d_v=50")
print()
for i, token in enumerate(tokens):
    print(f"Token {token} ('{tokenizer.decode([token])}')")
    print(f"   -> First 10 values : {contextual_emdeddings[i][:10].round(3)}")
    print()

Attention weights (after softmax) for every token in sentence:
Shape: (6, 6)

              'The'  ' cat'  ' sat'   ' on'    ' a'  ' mat'    | Sum
----------------------------------------------------------------------
   'The'      0.034   0.021   0.069   0.088   0.248   0.539  | 1.000
  ' cat'      0.049   0.017   0.084   0.121   0.230   0.499  | 1.000
  ' sat'      0.162   0.031   0.063   0.085   0.162   0.497  | 1.000
   ' on'      0.177   0.068   0.091   0.139   0.225   0.300  | 1.000
    ' a'      0.091   0.060   0.091   0.317   0.287   0.154  | 1.000
  ' mat'      0.058   0.047   0.081   0.296   0.405   0.112  | 1.000


Contextual embeddings after attention calculation :
Shape: (6, 50) as d_v=50

Token 976 ('The')
   -> First 10 values : [ 0.867 -0.536  1.151 -1.726  0.066 -0.299 -0.733  0.816  0.068 -1.924]

Token 9059 (' cat')
   -> First 10 values : [ 0.925 -0.578  1.106 -1.637  0.06  -0.314 -0.711  0.762  0.128 -1.978]

Token 10139 (' sat')
   -> First 10 values : [ 1.17  -0.