<a href="https://colab.research.google.com/github/elliemci/building-LLM/blob/main/token_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Token ID Embedding

Conversion of words to continuous vector representation

In [None]:
# Mount Google Drive at /content/drive so that the files stored there can be
# read by the notebook (the google.colab module is only available on Colab).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive so that
# relative paths such as "the-verdict.txt" resolve; adjust to your own Drive layout.
%cd /content/drive/MyDrive/Colab\ Notebooks/LLM

/content/drive/MyDrive/Colab Notebooks/LLM


In [None]:
# List the folder contents to confirm that the-verdict.txt is present
%ls

pytorch_wormup.ipynb           the-verdict.txt        tokenizing_text.ipynb
sliding_window_sampling.ipynb  token_embedding.ipynb


In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [None]:
import importlib
import tiktoken

import torch
from torch.utils.data import Dataset, DataLoader

### Tokenize

In [None]:
# Instantiate the GPT-2 byte pair encoding (BPE) tokenizer from tiktoken. BPE builds its
# vocabulary by iteratively merging frequent characters into subwords and frequent
# subwords into words.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Read the whole text file into a single string
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

# Encode the raw text into BPE token IDs
enc_text = tokenizer.encode(raw_text)

# Total number of tokens in the encoded text
print(len(enc_text))

5146


### Dataset and Data Loader

In [None]:
class Dataset(torch.utils.data.Dataset):
  """Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

  The text is tokenized once and cut into windows of ``max_length`` token IDs
  whose start positions are ``stride`` tokens apart. The target of each window
  is the same window shifted one token to the right, so every sample draws on
  ``max_length + 1`` consecutive token IDs.

  The base class is referenced by its fully qualified name because this class
  reuses the name ``Dataset``; that way re-running the cell does not make the
  class inherit from its own previous definition.

  Args:
    txt: Raw text to tokenize.
    tokenizer: Object with an ``encode(str)`` method returning a list of token IDs.
    max_length: Number of token IDs per input (and per target) window; must be >= 1.
    stride: Distance in tokens between the starts of consecutive windows; must be >= 1.

  Raises:
    ValueError: If ``max_length`` or ``stride`` is smaller than 1.

  Note:
    A text with ``max_length`` or fewer tokens yields an empty dataset, because
    no window has a complete shifted target.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    if max_length < 1:
      raise ValueError(f"max_length must be >= 1, got {max_length}")
    if stride < 1:
      raise ValueError(f"stride must be >= 1, got {stride}")

    self.tokenizer = tokenizer
    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt)

    # Stop early enough that the target window, shifted right by one token,
    # still lies completely inside token_ids.
    for start in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[start:start + max_length]
      target_chunk = token_ids[start + 1:start + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
      self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensor pair stored at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

In [None]:
def dataloader(text, batch_size=4, max_length=256, stride=128, shuffle=True):
  """Build a PyTorch DataLoader that serves batches of sliding-window samples.

  The text is tokenized with the GPT-2 BPE encoding from tiktoken and wrapped
  in the Dataset class defined in this notebook before being batched.

  Args:
    text: Raw text to tokenize and split into windows.
    batch_size: Number of (input, target) pairs per batch.
    max_length: Number of token IDs per window.
    stride: Distance in tokens between the starts of consecutive windows.
    shuffle: Whether the DataLoader shuffles the samples.

  Returns:
    A DataLoader over the windowed dataset.
  """

  bpe_tokenizer = tiktoken.get_encoding("gpt2")
  windowed_samples = Dataset(text, bpe_tokenizer, max_length, stride)

  return DataLoader(windowed_samples, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Bind the loader to its own name instead of overwriting the dataloader()
# factory function, so that this cell still works when it is re-run.
# With stride=5 and max_length=4 the input windows do not overlap.
data_loader = dataloader(raw_text, batch_size=8, max_length=4, stride=5, shuffle=False)

# Pull the first two batches; each batch is an [inputs, targets] pair
data_iter = iter(data_loader)
first_batch = next(data_iter)
second_batch = next(data_iter)

print(f"first batch:\n{first_batch}")
print(f"\nsecond batch:\n{second_batch}")

first batch:
[tensor([[   40,   367,  2885,  1464],
        [ 3619,   402,   271, 10899],
        [  257,  7026, 15632,   438],
        [  257,   922,  5891,  1576],
        [  568,   340,   373,   645],
        [ 5975,   284,   502,   284],
        [  326,    11,   287,   262],
        [  286,   465, 13476,    11]]), tensor([[  367,  2885,  1464,  1807],
        [  402,   271, 10899,  2138],
        [ 7026, 15632,   438,  2016],
        [  922,  5891,  1576,   438],
        [  340,   373,   645,  1049],
        [  284,   502,   284,  3285],
        [   11,   287,   262,  6001],
        [  465, 13476,    11,   339]])]

second batch:
[tensor([[  550,  5710,   465, 12036],
        [ 6405,   257,  5527, 27075],
        [  290,  4920,  2241,   287],
        [ 4489,    64,   319,   262],
        [41976,    13,   357, 10915],
        [ 2138,  1807,   340,   561],
        [  587, 10598,   393, 28537],
        [  198,   198,     1,   464]]), tensor([[ 5710,   465, 12036,    11],
        [  257

## Position independent embedding

An embedding layer converts each token ID into a continuous vector representation. The same token ID is always mapped to the same vector, regardless of where the token ID is positioned in the input sequence. <br>
**Note:** The self-attention mechanism is position-agnostic as well. <br><br>

Create an embedding of size 256 for each of the 50257 tokens in the BPE tokenizer vocabulary. This is much smaller than the embedding size of 12288 used in GPT-3.

In [None]:
# number of token IDs in the BPE tokenizer vocabulary
vocab_size = 50257
# size of each embedding vector
output_dim = 256
# max length of input text
max_length = 4
# supported input size of the LLM
context_length = max_length

# the input text is truncated if max_length > context_length (here the two are equal)

In [None]:
# Seed PyTorch's random number generator so that the randomly initialized
# weight matrix of the embedding layer is reproducible
torch.manual_seed(123)

# The embedding layer maps each token ID to an output_dim-dimensional vector
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(f"randomly initialized weight matrix:\n{embedding_layer.weight}")

randomly initialized weight matrix:
Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ...,  1.3337,  0.0771, -0.0522],
        [ 0.2386,  0.1411, -1.3354,  ..., -0.0315, -1.0640,  0.9417],
        [-1.3152, -0.0677, -0.1350,  ..., -0.3181, -1.3936,  0.5226],
        ...,
        [ 0.5871, -0.0572, -1.1628,  ..., -0.6887, -0.7364,  0.4479],
        [ 0.4438,  0.7411,  1.1263,  ...,  1.2091,  0.6781,  0.3331],
        [-0.2537,  0.1446,  0.7203,  ..., -0.2134,  0.2144,  0.3006]],
       requires_grad=True)


The weight matrix has 50257 rows and 256 columns. There is one row for each of the 50257 possible tokens in the vocabulary, and one column for each of the 256 embedding dimensions.

The values of the embedding vectors are optimized during LLM training as part of the LLM optimization.

In [None]:
# Example: look up the 256-dimensional embedding vector of the single token ID 1,
# i.e. the row at index 1 of the weight matrix
print(embedding_layer(torch.tensor([1])))

tensor([[ 0.2386,  0.1411, -1.3354, -2.9340,  0.1141, -1.2072, -0.3008,  0.1427,
         -1.3027, -0.4919, -2.1429,  0.9488, -0.5684, -0.0646,  0.6647, -2.7836,
          1.1366,  0.9089,  0.9494,  0.0266, -0.9221,  0.7034, -0.3659, -0.1965,
         -0.9207,  0.3154, -0.0217,  0.3441,  0.2271, -0.4597, -0.6183,  0.2461,
         -0.4055, -0.8368,  1.2277, -0.4297, -2.2121, -0.3780,  0.9838, -1.0895,
          0.2017,  0.0221, -1.7753, -0.7490,  0.2781, -0.9621, -0.4223, -1.1036,
          0.2473,  1.4549, -0.2835, -0.3767, -0.0306, -0.0894, -0.1965, -0.9713,
          0.9005, -0.2523,  1.0669, -0.2985,  0.8558,  1.6098, -1.1893,  1.1677,
          0.3277, -0.8331, -1.6179,  0.2265, -0.4382,  0.3265, -1.5786, -1.3995,
          0.5446, -0.0830, -1.1753,  1.7825,  1.7524, -0.2135,  0.4095,  0.0465,
          0.6367, -0.1943, -0.8614,  0.5338,  0.9376, -0.9225,  0.7047, -0.2722,
          0.0144, -0.6411,  2.3902, -1.4256, -0.4619, -1.5539, -0.3338,  0.2405,
          2.1065,  0.5509, -

In [None]:
# Apply the embedding layer to the inputs of the first batch (8 rows of 4 token IDs)
# to obtain one output_dim-dimensional embedding vector per token ID
token_embeddings = embedding_layer(first_batch[0])
print(token_embeddings)

tensor([[[-6.3964e-02,  3.3174e-01,  1.0698e-01,  ...,  5.3491e-01,
          -8.0244e-01, -2.3238e+00],
         [-3.5248e-01,  3.5087e-01,  9.8728e-01,  ..., -1.8466e+00,
          -1.7034e+00,  3.2226e-01],
         [ 1.0017e+00,  9.2986e-01, -1.2633e+00,  ..., -1.2256e+00,
           1.1179e+00,  1.3427e-01],
         [ 7.9961e-01,  2.2837e+00, -6.5249e-01,  ..., -1.1217e+00,
           4.7057e-01,  1.5314e-01]],

        [[-2.4273e-01,  9.1447e-01,  1.0885e+00,  ..., -8.6509e-01,
           3.5269e+00,  7.2247e-01],
         [-5.4342e-01,  1.6203e+00,  1.2222e+00,  ...,  6.8139e-01,
          -1.4032e+00,  1.4922e-01],
         [-3.5035e-01, -9.3247e-01, -1.2900e+00,  ..., -1.4980e+00,
           1.3996e-01,  3.7301e-01],
         [ 6.7677e-02, -1.0757e+00,  1.3706e-01,  ..., -3.9116e-01,
          -1.0974e+00,  7.5906e-01]],

        [[ 1.1902e-01,  7.6004e-01, -9.3059e-01,  ...,  6.4696e-04,
           8.6058e-01, -1.3698e+00],
         [ 9.6366e-03,  5.7844e-01,  3.1312e-01,  .

## Positional Encoding

Positional embeddings give the model information about the order of the tokens and the relationships between them, enabling context-aware predictions.<br>

Position information is achieved through relative and absolute positional embeddings.<br><br>
**Absolute positional embeddings:** for each position in the input sequence, a unique embedding is added to the token's embedding to convey its exact location.<br>
*OpenAI's GPT models* use absolute positional embeddings that are optimized during the training process.


### Absolute Embedding

Create another embedding layer with the same embedding dimension as the token `embedding_layer`. Its input is a placeholder vector that contains the sequence of position indices 0, 1, ..., up to the maximum input length - 1.

In [None]:
# One embedding vector per position 0 .. context_length - 1, with the same
# dimension (output_dim) as the token embeddings
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# Look up the embedding vector of every position index
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

print(f"Size of posisiton embedindg tensor: {pos_embeddings.shape}")
print(f"Posisiton embedindg tensor: {pos_embeddings}")

Size of posisiton embedindg tensor: torch.Size([4, 256])
Posisiton embedindg tensor: tensor([[-0.9694,  0.4167,  0.5280,  ..., -0.0023,  1.1440,  0.8301],
        [-1.4067, -0.8280, -0.3587,  ..., -0.9408,  1.5647, -0.6394],
        [ 0.3999, -0.3997, -1.9166,  ...,  0.1630,  0.2393, -0.1784],
        [-0.5676,  1.7856, -0.0915,  ..., -0.7356,  1.2118, -1.1895]],
       grad_fn=<EmbeddingBackward0>)


Add the positional embedding tensor to the token embeddings.

In [None]:
# Broadcasting adds the same positional embeddings to every sequence in the batch
input_embeddings = token_embeddings + pos_embeddings

# NOTE: the label below says "first row", but the shape printed is that of the whole first batch
print(f"Size of input embedding tensor for the first row of the dataset:\n{input_embeddings.shape}")
print(f" Input embeddings:\n{input_embeddings}")

Size of input embedding tensor for the first row of the dataset:
torch.Size([8, 4, 256])
 Input embeddings:
tensor([[[-1.0334e+00,  7.4843e-01,  6.3500e-01,  ...,  5.3259e-01,
           3.4160e-01, -1.4937e+00],
         [-1.7592e+00, -4.7717e-01,  6.2860e-01,  ..., -2.7873e+00,
          -1.3879e-01, -3.1713e-01],
         [ 1.4016e+00,  5.3016e-01, -3.1799e+00,  ..., -1.0627e+00,
           1.3572e+00, -4.4152e-02],
         [ 2.3202e-01,  4.0693e+00, -7.4397e-01,  ..., -1.8572e+00,
           1.6823e+00, -1.0363e+00]],

        [[-1.2121e+00,  1.3312e+00,  1.6165e+00,  ..., -8.6741e-01,
           4.6709e+00,  1.5526e+00],
         [-1.9501e+00,  7.9228e-01,  8.6353e-01,  ..., -2.5938e-01,
           1.6145e-01, -4.9017e-01],
         [ 4.9515e-02, -1.3322e+00, -3.2066e+00,  ..., -1.3350e+00,
           3.7927e-01,  1.9458e-01],
         [-4.9991e-01,  7.0983e-01,  4.5585e-02,  ..., -1.1267e+00,
           1.1434e-01, -4.3041e-01]],

        [[-8.5037e-01,  1.1767e+00, -4.0257e-01,

### Relative Positional embeddings

The model learns the relationships between tokens in terms of the distance between them. Such a model can generalize better to sequences of varying lengths, even if it hasn't seen such lengths during training.