<a href="https://colab.research.google.com/github/erioluwa01/llm_preprocessing/blob/main/positional_vector_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Install Gensim
!pip install gensim

# Step 2: Download the model using Gensim API
import gensim.downloader as api
model = api.load("word2vec-google-news-300")  # ⏳ Will take time


print(model['king'])
print(model.most_similar('computer'))


[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [2]:
word_vectors = model
#so lets see how the vector embeddings of a word looks like
print(word_vectors['computers']) #This example is accessing the vector for the word 'computer'


[ 0.32421875 -0.24316406  0.11523438  0.25976562 -0.18847656  0.10595703
 -0.10205078  0.10693359  0.28710938  0.01428223  0.0100708  -0.20214844
  0.19238281  0.07714844 -0.03686523  0.06933594 -0.0013504   0.26757812
  0.12011719  0.02746582 -0.0072937  -0.04443359  0.15625     0.10693359
  0.1640625  -0.07177734  0.02355957 -0.03930664 -0.05004883 -0.17480469
 -0.06054688 -0.10839844 -0.17382812  0.01843262  0.14160156 -0.4140625
 -0.43554688 -0.12792969  0.1484375  -0.04882812 -0.11914062  0.23046875
  0.265625    0.10400391  0.27929688  0.06933594 -0.03881836  0.31640625
 -0.40625     0.05712891 -0.01324463 -0.09960938  0.05737305 -0.18945312
 -0.15039062  0.23632812 -0.05102539 -0.17871094 -0.21972656  0.14746094
  0.16308594  0.04736328 -0.13183594  0.22070312 -0.04003906  0.05517578
 -0.2734375   0.42773438 -0.25585938  0.06591797  0.05419922 -0.25
  0.14453125 -0.00531006 -0.08984375 -0.01312256  0.08349609 -0.203125
 -0.0022583  -0.25390625  0.08935547  0.08447266  0.27539062

In [3]:
print(word_vectors['cat'].shape)

(300,)


In [5]:
print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10))

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593831062317), ('monarchy', 0.5087411999702454)]


In [6]:
#Checking the similarities between a pair of words too
print(word_vectors.similarity('woman', 'man'))
print(word_vectors.similarity('king', 'queen'))
print(word_vectors.similarity('uncle', 'aunt'))
print(word_vectors.similarity('boy', 'girl'))
print(word_vectors.similarity('nephew', 'niece'))
print(word_vectors.similarity('paper', 'water'))


0.76640123
0.6510956
0.7643474
0.8543272
0.7594368
0.114080824


In [7]:
#I can also find words similar to a given word
print(word_vectors.most_similar("tower", topn=5))


[('towers', 0.8531750440597534), ('skyscraper', 0.6417425870895386), ('Tower', 0.639177143573761), ('spire', 0.594687819480896), ('responded_Understood_Atlasjet', 0.5931612253189087)]


In [8]:
#I can also check for similarities
import numpy as np
#words to compare

word_1 = 'man'
word_2 = 'woman'
word_3 = 'semiconductor'
word_4 = 'earthworm'
word_5 = 'nephew'
word_6 = 'niece'


#Calculating the vector difference
vector_difference1 = model[word_1] - model[word_2]
vector_difference2 = model[word_3] - model[word_4]
vector_difference3 = model[word_5] - model[word_6]

#Calculating the magnitude of the vector difference
magnitude_of_difference1 = np.linalg.norm(vector_difference1)
magnitude_of_difference2 = np.linalg.norm(vector_difference2)
magnitude_of_difference3 = np.linalg.norm(vector_difference3)

#I want to print the magnitude of the difference
print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word_1, word_2, magnitude_of_difference1))
print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word_3, word_4, magnitude_of_difference2))
print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(word_5, word_6, magnitude_of_difference3))



The magnitude of the difference between 'man' and 'woman' is 1.73
The magnitude of the difference between 'semiconductor' and 'earthworm' is 5.67
The magnitude of the difference between 'nephew' and 'niece' is 1.96


In [9]:
## Creating token embeddings
#SO i want to illustrate how converting token ids to vector embedding works by using the example of the token ids [2,3,5,1]

In [12]:
#!pip install torch
import torch

input_ids = torch.tensor([2,3,5,1])

In [14]:
vocab_size = 6
output_dim = 3
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [15]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [16]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [17]:
print(embedding_layer(input_ids))


tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [18]:
### A more realistic example .. exploring positional embeddings
vocab_size=50252
output_dim=256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)


In [51]:
raw_text = """
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
"""


In [55]:
#import torch
from torch.utils.data import Dataset, DataLoader

# Step 1: Embedding layer parameters
vocab_size = 50252
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# Step 2: Simple hash-based tokenizer (simulating token IDs)
def simple_tokenizer(text):
    tokens = text.lower().split()
    token_ids = [hash(word) % vocab_size for word in tokens]
    return token_ids

# Step 3: Dataset class for text chunks (input/target pairs)
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, text, max_length, stride):
        self.token_ids = simple_tokenizer(text)
        self.samples = []
        for i in range(0, len(self.token_ids) - max_length, stride):
            input_seq = self.token_ids[i:i+max_length]
            target_seq = self.token_ids[i+1:i+max_length+1]
            self.samples.append((torch.tensor(input_seq), torch.tensor(target_seq)))

    def __getitem__(self, idx):
        return self.samples[idx]  # (inputs, targets)

    def __len__(self):
        return len(self.samples)

# Step 4: Function to create dataloader
def create_dataloader_v1(text, batch_size, max_length, stride, shuffle):
    dataset = TextDataset(text, max_length, stride)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Step 5: Define your raw input text
raw_text = """
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)
"""

# Step 6: Create the dataloader
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)

# Step 7: Fetch a batch from the dataloader
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

# Step 8: Print shapes and content
print(" Input IDs:\n", inputs)
print("Shape:", inputs.shape)  # Expected: [8, 4]

print("\n Target IDs:\n", targets)
print("Shape:", targets.shape)  # Expected: [8, 4]

# Step 9: Apply embedding
embedded_inputs = token_embedding_layer(inputs)
print("\n Embedded shape:", embedded_inputs.shape)  # Expected: [8, 4, 256]


 Input IDs:
 tensor([[ 1027, 49411, 48580, 29002],
        [13290, 43054, 37923, 34031],
        [17575, 38639, 34031, 24096],
        [ 6164, 17643, 21333, 48938],
        [ 9076,  6141,  8825, 11290],
        [39917, 11290, 38430, 34087],
        [35518, 19793, 44362, 41725],
        [41805, 36983, 46409, 49411]])
Shape: torch.Size([8, 4])

 Target IDs:
 tensor([[49411, 48580, 29002, 13290],
        [43054, 37923, 34031, 17575],
        [38639, 34031, 24096,  6164],
        [17643, 21333, 48938,  9076],
        [ 6141,  8825, 11290, 39917],
        [11290, 38430, 34087, 35518],
        [19793, 44362, 41725, 41805],
        [36983, 46409, 49411, 15826]])
Shape: torch.Size([8, 4])

 Embedded shape: torch.Size([8, 4, 256])


In [57]:
## now the token ID tensor is 8x4-dimensional, meaning the data batch consist of 8 text samples with 4 tokens
## Now I want to use the embedding layer toe embed these token IDs into 256-dimensional vectors
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [58]:
##Now I have to add a positional embedding vector to each of those vectors
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [59]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [60]:
#Note that i want to add one postion vector to each of the 4 tokens ..
#The same positional embeddings are applied to each input of 4 tokens (because there are only 4 positions)

## As shown in the preceding code example, the input to the pos_embeddings is usually a
## Pladehold of sequence of numbers 0,1,.., up to the maximum input length - 1
## The context_length is a variable that represents the supported input size of the llm
## Here, I chose it simila to the maximum length of the input text
## In practice, the input text can be longer than the supported context length in which case we have to truncate

# The positonal embedding tensor consis of four 256-dimensional vectors. We can now add these directly to the token embedding
