<a href="https://colab.research.google.com/github/dev-nileshpawar/python-aiml/blob/main/text_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import re

In [13]:
res = re.split(r'([,.?_!"()\']|--|\s)', "Hello, world. This is a test.")
result = [item.strip() for item in res if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', 'is', 'a', 'test', '.']


In [14]:
res1 = re.split(r'\s', "Name name is nilesh")

In [15]:
res1

['Name', 'name', 'is', 'nilesh']

In [16]:
res

['Hello',
 ',',
 '',
 ' ',
 'world',
 '.',
 '',
 ' ',
 'This',
 ' ',
 'is',
 ' ',
 'a',
 ' ',
 'test',
 '.',
 '']

In [17]:
with open("./sample_data/the-verdict.txt", encoding="utf-8") as f:
  raw_text = f.read()

In [18]:
print(raw_text[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [19]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

print(preprocessed[:30])
print(len(preprocessed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
4649


In [20]:
all_words = sorted(set(preprocessed))

In [21]:
vocab = {token:integer for integer, token in enumerate(all_words)}

In [22]:
for i, item in enumerate(vocab.items()):
  print(item)
  if(i==20):
    break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)


In [23]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        # Mapping from string → integer token ID
        self.str_to_int = vocab

        # Reverse mapping from integer → string token
        self.int_to_str = {}
        for token_str, token_id in vocab.items():
            self.int_to_str[token_id] = token_str

    def encode(self, text):
        # Split text into words + punctuation
        parts = re.split(r'([,.?_!"()\']|--|\s)', text)

        # Clean parts (remove empty and whitespace-only items)
        preprocessed = []
        for item in parts:
            stripped = item.strip()
            if stripped:
                preprocessed.append(stripped)

        # Convert tokens to IDs
        ids = []
        for token in preprocessed:
            ids.append(self.str_to_int[token])

        return ids

    def decode(self, ids):
        # Convert IDs back to tokens
        tokens = []
        for token_id in ids:
            tokens.append(self.int_to_str[token_id])

        # Join tokens with spaces
        text = " ".join(tokens)

        # Remove unwanted space before punctuation
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)

        return text


In [24]:
token_encode = SimpleTokenizerV1(vocab=vocab)
encoded_text = token_encode.encode("My is")
print(token_encode.decode(encoded_text))

My is


In [25]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

all_tokens = sorted(list(set(preprocessed)))

all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [26]:
for i, item in enumerate(list(vocab.items())[-10:]):
  print(item)

('year', 1151)
('years', 1152)
('yellow', 1153)
('yet', 1154)
('you', 1155)
('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|endoftext|>', 1159)
('<|unk|>', 1160)


# **Encoding**

In [27]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        # Mapping from string → integer token ID
        self.str_to_int = vocab

        # Reverse mapping from integer → string token
        self.int_to_str = {}
        for token_str, token_id in vocab.items():
            self.int_to_str[token_id] = token_str

    def encode(self, text):
        # Split text into words + punctuation
        parts = re.split(r'([,.?_!"()\']|--|\s)', text)

        # Clean parts (remove empty and whitespace-only items)
        preprocessed = []
        for item in parts:
            stripped = item.strip()
            if stripped:
                preprocessed.append(stripped)

        # Convert tokens to IDs
        ids = []
        for token in preprocessed:
          if token in self.str_to_int:
            ids.append(self.str_to_int[token])
          else:
            ids.append("<|unk|>")

        return ids

    def decode(self, ids):
        # Convert IDs back to tokens
        tokens = []
        for token_id in ids:
          if token_id in self.int_to_str:
            tokens.append(self.int_to_str[token_id])
          else:
            tokens.append("<|unk|>")

        # Join tokens with spaces
        text = " ".join(tokens)

        # Remove unwanted space before punctuation
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)

        return text


In [28]:
tokenizerv2 = SimpleTokenizerV2(vocab=vocab)
encoded_text = tokenizerv2.encode("My name is")
print(tokenizerv2.decode(encoded_text))
# print(encoded_text)

My <|unk|> is


In [29]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = "<|endoftext|>".join((text1, text2))

In [30]:
print(text)

Hello, do you like tea?<|endoftext|>In the sunlit terraces of the palace.


In [31]:
text_encoded = tokenizerv2.encode(text)
print(tokenizerv2.decode(text_encoded))

<|unk|>, do you like tea? <|unk|> the sunlit terraces of the <|unk|>.


# **Byte Pair Encoding**

In [32]:
# no code for Byte Pair encoding

# **Use TikToken Library to create tokens**

In [33]:
import tiktoken

In [34]:
tokenizer=tiktoken.get_encoding("gpt2");
tokenizer.n_vocab

50257

In [35]:
tiktoken.list_encoding_names()

['gpt2',
 'r50k_base',
 'p50k_base',
 'p50k_edit',
 'cl100k_base',
 'o200k_base',
 'o200k_harmony']

In [36]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownword."
)

integer = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integer)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 4775, 13]


In [37]:
strings = tokenizer.decode(integer)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownword.


In [38]:
str="LLM learn to predict one word at a time."

for i, item in enumerate(str.split(' ')):
  print(" ".join(str.split(" ")[0:i+1]))

LLM
LLM learn
LLM learn to
LLM learn to predict
LLM learn to predict one
LLM learn to predict one word
LLM learn to predict one word at
LLM learn to predict one word at a
LLM learn to predict one word at a time.


In [39]:
import torch

In [40]:
from torch.utils.data import Dataset, DataLoader

class MyDataset (Dataset):
  def __init__(self, text, tokenizer, max_length, stride):
    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

    for i in range(0, len(token_ids)-max_length, stride):
      input_chunk = token_ids[i:i+max_length]
      target_chunk = token_ids[i+1:i+1+max_length]

      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.target_ids[idx]

In [41]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = MyDataset(txt, tokenizer, max_length, stride)

  dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)

  return dataloader

In [42]:
with open("./sample_data/the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

In [43]:
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=True
)

In [44]:
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[8104,  866, 1973,  262]]), tensor([[  866,  1973,   262, 37918]])]


In [45]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 938, 4842, 1650,  353]]), tensor([[4842, 1650,  353,  438]])]


In [46]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
inputs,targets = next(data_iter)
print("input: ", inputs)
print("targets: ", targets)

input:  tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
targets:  tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [47]:
input_ids = torch.tensor([2,3,5,1])

In [48]:
vocab_size=6
output_dim=3
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [49]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [50]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [51]:
input_token_ids = torch.tensor([2,3,5,1])
print(embedding_layer(input_token_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


# **Encoding word positions**

In [52]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [53]:
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("tokenids", inputs)
print("input shape", inputs.shape)

tokenids tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
input shape torch.Size([8, 4])


In [54]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [55]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)


In [56]:
pos_embeddings=pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [57]:
input_embeddings = token_embeddings + pos_embeddings

In [58]:
input_embeddings

tensor([[[ 2.2288,  0.5619,  0.8286,  ..., -0.6272, -0.2987,  0.8900],
         [ 2.0903, -0.4664, -0.0593,  ...,  0.9115, -1.0493, -1.6473],
         [-0.7158, -0.8304,  1.2494,  ...,  2.3952,  1.8773,  0.8051],
         [ 0.2703,  0.4029,  3.0514,  ...,  0.3595, -1.4548,  0.8310]],

        [[ 3.2835,  1.1749, -1.4150,  ..., -0.3281,  2.4332,  0.6924],
         [-0.2199, -0.9114, -0.1750,  ...,  1.5337, -0.1998,  0.1462],
         [ 1.5197, -1.4240,  0.4391,  ...,  1.0494, -1.4318,  2.3057],
         [ 0.2893,  0.8346, -0.1884,  ...,  1.9602,  0.8709,  0.8796]],

        [[ 0.9662,  0.0952, -0.4640,  ..., -1.0320,  1.6290,  1.7771],
         [ 2.4468, -0.2154,  1.4984,  ...,  1.8766,  0.5595, -0.1423],
         [-0.3856, -2.5393,  1.1556,  ...,  3.6157,  1.3267,  0.4944],
         [-0.2487, -0.5275,  2.0009,  ...,  0.2930,  0.5977,  1.3300]],

        ...,

        [[ 0.1219,  0.3991, -3.2740,  ..., -1.1921,  2.6637,  2.6728],
         [ 1.2438, -1.6436, -1.1101,  ..., -0.7464, -0.98