# Lecture 7: Tokenizer

## First tokenize the entire short story (The Verdict)

In [23]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

print("Total characters", len(raw_text))
raw_text[:99]

Total characters 20479


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

In [24]:
# Use regular expressions with re

import re
text = "Hello, world. This is a test!"
result = re.split(r'(\s)', text) # split where white spaces

print(result)

['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test!']


In [25]:
# also split commas and periods
result = re.split(r'([,.!]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', '!', '']


In [26]:
result = [item for item in result if item.strip()] # gets rid of whitespace
result

['Hello', ',', 'world', '.', 'This', 'is', 'a', 'test', '!']

Removing whitespace reduces memory use, but it can cause problems where whitespace carries meaning (e.g. indentation in Python code).

When building an LLM, consider whether removing whitespace makes sense for the intended application.

In [27]:
# This is the final simple tokenization scheme
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item for item in result if item.strip()]
result

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']

In [28]:
preproccessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preproccessed = [item for item in preproccessed if item.strip()]
print(preproccessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [29]:
print(len(preproccessed))

4690


## Now we must convert tokens into Token IDs (step 2)

*Tokens now have to have numerical representations*

In [30]:
all_words = sorted(set(preproccessed))
vocab_size = len(all_words)
vocab_size

1130

In [31]:
# Map the sorted vocab to a number (order)
vocab = {token:integer for integer,token in enumerate(all_words)}

In [32]:
# Preview the first 50 (token, id) entries; no index is needed, so skip enumerate
for item in list(vocab.items())[:50]:
  print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)


**Alphabetical order in the vocab** (see above)
- Individual tokens
- Individual integers

We also need a decoder... map ID back to token (decode)

Define two functions, _encode_ and __decode__

In [33]:
class SimpleTokenizer:
  """Word-level tokenizer backed by a fixed token -> ID vocabulary.

  Text is split on whitespace, on double dashes and on common punctuation
  characters; every resulting piece must be present in the vocabulary.
  """

  def __init__(self, vocab):
    """Store the vocabulary and build its inverse mapping.

    Args:
      vocab: dict mapping each token string to its integer ID.
    """
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self, input_text):
    """Convert text into a list of token IDs.

    Args:
      input_text: the string to tokenize.

    Returns:
      List of integer IDs, one per token, in order of appearance.

    Raises:
      KeyError: if a token is not in the vocabulary.
    """
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', input_text)
    # re.split keeps the captured separators; drop empty and whitespace-only pieces
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [self.str_to_int[token] for token in tokens]

  def decode(self, ids):
    """Convert a sequence of token IDs back into text.

    Tokens are joined with single spaces, then the whitespace in front of
    punctuation is removed.

    Args:
      ids: iterable of integer token IDs.

    Returns:
      The reconstructed string.

    Raises:
      KeyError: if an ID is not in the vocabulary.
    """
    text = " ".join(self.int_to_str[i] for i in ids)
    # encode() splits ':' and ';' into their own tokens, so the space that the
    # join puts in front of them has to be removed as well
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

Take some text, encode it and decode it back! This is a simple sanity check.

In [34]:
tokenizer = SimpleTokenizer(vocab)
text = """"It's the last he painted, you know"
          Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [35]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know" Mrs. Gisburn said with pardonable pride.'

Now encode something not in our vocab

In [36]:
text = "Where is my five iron?!!"
ids = tokenizer.encode(text)
print(ids)

KeyError: 'Where'

This shows an error message because this word is not in our vocab. This motivates __special context tokens...__
- The tokenizer will handle unknown words
- Unknown text token <|unk|>
- Also add end of text token <|endoftext|>

Add these two!

In [None]:
# add two more tokens to our vocab
import enum


all_tokens = sorted(list(set(preproccessed)))
all_tokens.extend(["<|endoftext|>","<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}
len(vocab) # verify we added two tokens bc 1130 before

1132

In [None]:
# show these two tokens were actually added
# The last five entries should end with the two special tokens; no index needed
for item in list(vocab.items())[-5:]:
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [None]:
class SimpleTokenizer2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

  Text is split on whitespace, on double dashes and on common punctuation
  characters. Pieces missing from the vocabulary are encoded with the ID of
  the ``<|unk|>`` entry instead of raising.
  """

  def __init__(self, vocab):
    """Store the vocabulary and build its inverse mapping.

    Args:
      vocab: dict mapping each token string to its integer ID.
    """
    self.str_to_int = vocab
    self.int_to_str = {i:s for s,i in vocab.items()}

  def encode(self, input_text):
    """Convert text into a list of token IDs.

    Args:
      input_text: the string to tokenize.

    Returns:
      List of integer IDs, one per token; unknown tokens get the ID of
      ``<|unk|>``.

    Raises:
      KeyError: if an unknown token is met and the vocabulary has no
        ``<|unk|>`` entry.
    """
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', input_text)
    # re.split keeps the captured separators; drop empty and whitespace-only pieces
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [
      self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
      for token in tokens
    ]

  def decode(self, ids):
    """Convert a sequence of token IDs back into text.

    Tokens are joined with single spaces, then the whitespace in front of
    punctuation is removed.

    Args:
      ids: iterable of integer token IDs.

    Returns:
      The reconstructed string.

    Raises:
      KeyError: if an ID is not in the vocabulary.
    """
    text = " ".join(self.int_to_str[i] for i in ids)
    # encode() splits ':' and ';' into their own tokens, so the space that the
    # join puts in front of them has to be removed as well
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

In [None]:
tokenizer = SimpleTokenizer2(vocab)

text1 = "Where is my five iron?!!"
text2 = "Hey there, do you like tea?"

text = " <|endoftext|> ".join((text1, text2))
print(text)

Where is my five iron?!! <|endoftext|> Hey there, do you like tea?


In [None]:
tokenizer.encode(text)

[1131,
 584,
 697,
 445,
 1131,
 10,
 0,
 0,
 1130,
 1131,
 992,
 5,
 355,
 1126,
 628,
 975,
 10]

In [None]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|> is my five <|unk|>?!! <|endoftext|> <|unk|> there, do you like tea?'

_We can restore where the unknown tokens are without failure_

The original text does not include 'Where', 'Hey', or 'iron', so they are mapped to `<|unk|>` and no error is raised.

__Some more tokens:__
- BOS (beginning of sequence)
- EOS (end of sequence)
- PAD (padding)

GPT will ONLY use <|endoftext|>

GPT also does byte-pair encoding (no <|unk|>) so it breaks words down into sub-units (so worst case just the individual characters)

# Lecture 8: Byte-pair encoding

Last time, we did a very simple tokenizer, but GPT actually does byte-pair which we will start now!

tiktoken provides OpenAI's GPT-2 BPE tokenizer, whose vocabulary was built from a large corpus, so we can use it to tokenize any text. _I understand how BPE works (I've done it by hand), but it makes sense to use the pre-developed tokenizer_.

In [38]:
# first install the byte-pair tokenizer

! pip install tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [39]:
import importlib
from importlib import metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
text = (
  "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
  "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


This tokenizer __works__ with unknown words because BPE breaks them down into sub-word units (in the worst case, individual characters)!

In [None]:
integers = tokenizer.encode("aekerwon ier dast huden")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[64, 28233, 26502, 220, 959, 288, 459, 289, 44452]
aekerwon ier dast huden


# Lecture 9: Input-Target Pairs

In [None]:
enc_text = tokenizer.encode(raw_text)
len(enc_text) # number of BPE tokens in the encoded story (5145) -- the sequence length, not the vocabulary size

5145

Let's demo with the first 50 tokens

In [None]:
sample = enc_text[:50]
sample

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11]

In [None]:
def create_pairs(text):
  """Build every (growing context -> next token) pair from a token sequence.

  The inputs are all elements but the last and the targets are all elements
  but the first. Both sequences and their lengths are printed, then a dict is
  returned whose keys are the prefixes of the inputs (as tuples) and whose
  values are the element that follows each prefix.

  Args:
    text: sliceable sequence of tokens (e.g. a list of token IDs).

  Returns:
    dict mapping tuple(prefix) -> next token.
  """
  inputs, targets = text[:-1], text[1:]
  for shown in (inputs, targets, len(inputs), len(targets)):
    print(shown)

  # the prefix of length `end` is followed by targets[end - 1]
  return {
    tuple(inputs[:end]): next_token
    for end, next_token in enumerate(targets, start=1)
  }

In [None]:
input_target_pairs = create_pairs(sample)

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075]
[367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11]
49
49


In [None]:
input_target_pairs

{(40,): 367,
 (40, 367): 2885,
 (40, 367, 2885): 1464,
 (40, 367, 2885, 1464): 1807,
 (40, 367, 2885, 1464, 1807): 3619,
 (40, 367, 2885, 1464, 1807, 3619): 402,
 (40, 367, 2885, 1464, 1807, 3619, 402): 271,
 (40, 367, 2885, 1464, 1807, 3619, 402, 271): 10899,
 (40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899): 2138,
 (40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138): 257,
 (40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257): 7026,
 (40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026): 15632,
 (40,
  367,
  2885,
  1464,
  1807,
  3619,
  402,
  271,
  10899,
  2138,
  257,
  7026,
  15632): 438,
 (40,
  367,
  2885,
  1464,
  1807,
  3619,
  402,
  271,
  10899,
  2138,
  257,
  7026,
  15632,
  438): 2016,
 (40,
  367,
  2885,
  1464,
  1807,
  3619,
  402,
  271,
  10899,
  2138,
  257,
  7026,
  15632,
  438,
  2016): 257,
 (40,
  367,
  2885,
  1464,
  1807,
  3619,
  402,
  271,
  10899,
  2138,
  257,
  7026,
  15632,
  438,
  2016,
  257): 922,


In [None]:
context_size = 4
for i in range(context_size + 1):
  context = sample[:i]
  target = sample[i]
  print(tokenizer.decode(context), "--->", tokenizer.decode([target]))

 ---> I
I --->  H
I H ---> AD
I HAD --->  always
I HAD always --->  thought


## We must build our own DataLoader

We build x and y tensors to represent the input-target pairs above

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

class GPTDataset(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is tokenized once. Each sample is a window of ``max_length`` token
  IDs, and its target is the same window shifted one token to the right.
  Consecutive windows start ``stride`` tokens apart.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    """Tokenize ``txt`` and slice it into input/target windows.

    Args:
      txt: the raw text to tokenize.
      tokenizer: object whose ``encode(txt, allowed_special=...)`` returns a
        list of token IDs.
      max_length: number of tokens per window.
      stride: step between the starts of consecutive windows.
    """
    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
    # stop early enough that the shifted target window still has max_length tokens
    window_starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
      torch.tensor(token_ids[start:start + max_length])
      for start in window_starts
    ]
    self.target_ids = [
      torch.tensor(token_ids[start + 1:start + max_length + 1])
      for start in window_starts
    ]

  def __len__(self):
    """Return the number of windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input, target)`` tensor pair at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader(txt, batch_size=4, max_length=256,
                      stride=128, shuffle=True, drop_last=True,
                      num_workers=0):
  """Wrap ``txt`` in a GPTDataset (GPT-2 BPE tokens) and return a DataLoader.

  Args:
    txt: raw text to tokenize.
    batch_size: number of items per batch the model processes at a time
      BEFORE updating its parameters.
    max_length: number of tokens per input/target window.
    stride: step between the starts of consecutive windows.
    shuffle: forwarded to DataLoader.
    drop_last: forwarded to DataLoader.
    num_workers: forwarded to DataLoader; used for parallel processing.

  Returns:
    A DataLoader yielding (input, target) batches.
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  return DataLoader(
    GPTDataset(txt, gpt2_encoding, max_length, stride),
    batch_size=batch_size,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )

In [52]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()

print("PyTorch:", torch.__version__)
# one 4-token window per batch, windows one token apart, no shuffling
dataloader = create_dataloader(
  raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# iter() turns the loader into an iterator; each next() pulls the following batch
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

PyTorch: 2.5.1
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


max_length is 4, so each input sequence (and its target) has 4 tokens; with a batch size of 1, each batch holds one such sequence

stride is 1, so this sliding window moves by 1 each time

> small batch sizes use less memory during training (fewer activations are stored at once) but take longer to train

> large batch sizes use more memory but require fewer parameter updates (faster, because one update is performed per aggregated batch)

Recall we do
$$
\nabla \mathcal{L}(\theta) = \frac{1}{B} \sum_{i=1}^B \nabla\ell(x_i, \theta)
$$

__Changing stride to 2__

In [55]:
# same as before, but the windows now start two tokens apart
dataloader = create_dataloader(
  raw_text, batch_size=1, max_length=4, stride=2, shuffle=False
)

# iter() turns the loader into an iterator; each next() pulls the following batch
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[2885, 1464, 1807, 3619]]), tensor([[1464, 1807, 3619,  402]])]


__Demonstration adjusting batch size below!__

We can see each tensor for the batch has a collection of 8 individual sequences and targets!

In [54]:
# eight 4-token windows per batch; stride equals max_length, so the windows do not overlap
dataloader = create_dataloader(
  raw_text, batch_size=8, max_length=4, stride=4, shuffle=False
)

# iter() turns the loader into an iterator; each next() pulls the following batch
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]
[tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]]), tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
    

# Lecture 10: Token Embeddings

In [56]:
input_ids = torch.tensor([5,2,1,3]) # example token IDs

vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim) # a lookup table that stores fixed-size vector representations (embeddings) of a dictionary or vocabulary

In [59]:
embedding_layer.weight # we optimize these weights during training

# each row is a vector embedding of each token

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [63]:
# access the embedding for token with ID 3... a LOOKUP TABLE
# this syntax tells our embedding layer/lookup table to look at row 3
embedding_layer(torch.tensor([3]))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [65]:
embedding_layer(input_ids)  # get these specified tokens

tensor([[-2.8400, -0.7849, -1.4096],
        [ 1.2753, -0.2010, -0.1606],
        [ 0.9178,  1.5810,  1.3010],
        [-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

__The dimensions work now and this is correct. We will actually train these parameters later.__