## 2.2 Tokenizing text

In [33]:
import os
import urllib.request

# Raw text of the short story used as the training corpus in this chapter.
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
    "the-verdict.txt"
)

file_path = "./data/the-verdict.txt"
# urlretrieve opens the destination for writing but does not create missing
# parent directories, so make sure ./data exists before downloading.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
urllib.request.urlretrieve(url, file_path)

('./data/the-verdict.txt', <http.client.HTTPMessage at 0x7fe3dd63a810>)

In [34]:
with open("./data/the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

print(f"Total number of characters: {len(raw_text)}")
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [35]:
import re

text = "Hello, world. Is this-- a test?"
# The capturing group makes the split keep its delimiters (punctuation, "--",
# single whitespace characters) as separate items in the resulting list.
result = re.compile(r"([,.:;?_!\"()']|--|\s)").split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', 'a', ' ', 'test', '?', '']


In [36]:
# Should you keep or remove whitespaces?
# Depends on the application & its requirements.
# Removing them reduces memory & computational requirements.
# But they might be important for some applications, like Python code, which is whitespace-sensitive.
# We remove it here, but will later switch to a method that keeps whitespaces.

result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [37]:
preprocessed = re.split(r"([,.:;?_!\"()']|--|\s)", raw_text)
preprocessed = [item for item in preprocessed if item.strip()]
assert len(preprocessed) == 4690, "Amount of tokens should be 4690"

In [38]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## 2.3 Converting tokens into token IDs

In [39]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(f"Vocabulary size: {vocab_size}")
assert vocab_size == 1130, "Vocabulary size should be 1130"

Vocabulary size: 1130


In [40]:
vocab = {token: i for i, token in enumerate(all_words)}

In [41]:
from typing import Dict, List


class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-ID vocabulary.

    Text is split on punctuation, double dashes and whitespace. Every
    resulting token has to be present in the vocabulary.
    """

    def __init__(self, vocab: Dict[str, int]):
        """Store the token-to-ID mapping and derive the reverse mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text: str) -> List[int]:
        """Split ``text`` into tokens and return their IDs in order.

        Raises:
            KeyError: If a token is not part of the vocabulary.
        """
        ids = []
        for piece in re.split(r"([,.:;?_!\"()']|--|\s)", text):
            # The split leaves empty strings and bare whitespace behind; skip them.
            if not piece.strip():
                continue
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, tokens: List[int]) -> str:
        """Map token IDs back to their strings and join them with spaces."""
        words = (self.int_to_str[token_id] for token_id in tokens)
        joined = " ".join(words)
        # Drop the single space that the join placed in front of punctuation.
        return re.sub(r" ([,.:;?_!\"()'])", r"\1", joined)

In [42]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [43]:
assert ids == [
    1,
    56,
    2,
    850,
    988,
    602,
    533,
    746,
    5,
    1126,
    596,
    5,
    1,
    67,
    7,
    38,
    851,
    1108,
    754,
    793,
    7,
], "IDs should be as expected"

In [44]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [45]:
# "Hello" is not in the vocabulary, so it will raise a KeyError
# We can deal with this kind of error by adding special tokens to the vocabulary.
try:
    text = "Hello, do you like tea?"
    print(tokenizer.encode(text))
except KeyError as e:
    print(f"KeyError: {e}")

KeyError: 'Hello'


## 2.4 Adding special context tokens

In [46]:
# The two special tokens are appended after the sorted corpus tokens, so they
# receive the two highest IDs in the vocabulary.
all_tokens = [*sorted(set(preprocessed)), "<|unk|>", "<|endoftext|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))

print(len(vocab))

1132


In [47]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary is expected to contain the ``<|unk|>`` entry; it is only
    looked up when an unknown token is actually encountered.
    """

    def __init__(self, vocab: Dict[str, int]):
        """Store the token-to-ID mapping and derive the reverse mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text: str) -> List[int]:
        """Split ``text`` into tokens and return their IDs in order.

        Tokens missing from the vocabulary are encoded as ``<|unk|>``.
        """
        ids = []
        for piece in re.split(r"([,.:;?_!\"()']|--|\s)", text):
            # The split leaves empty strings and bare whitespace behind; skip them.
            if not piece.strip():
                continue
            known = piece if piece in self.str_to_int else "<|unk|>"
            ids.append(self.str_to_int[known])
        return ids

    def decode(self, tokens: List[int]) -> str:
        """Map token IDs back to their strings and join them with spaces."""
        words = (self.int_to_str[token_id] for token_id in tokens)
        joined = " ".join(words)
        # Drop the single space that the join placed in front of punctuation.
        return re.sub(r" ([,.:;?_!\"()'])", r"\1", joined)

In [48]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [49]:
tokenizer = SimpleTokenizerV2(vocab)
ids = tokenizer.encode(text)
print(ids)

[1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]


In [50]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## 2.5 Byte pair encoding

In [51]:
from importlib.metadata import version
import tiktoken

print(f"tiktoken version: {version('tiktoken')}")

tiktoken version: 0.8.0


In [52]:
tokenizer = tiktoken.get_encoding("gpt2")

In [53]:
# NOTE(review): "terraces" and "of" are not separated by a space (possibly
# unintended). The printed token IDs and the decoded text reflect the string
# exactly as written here.
text = (
    "Hello, do you like tea? <|endoftext|> "
    "In the sunlit terracesof someunknownPlace"
)
# The end-of-text marker has to be allowed explicitly to be encoded as a special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271]


In [54]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace


## 2.6 Data sampling with a sliding window


In [55]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [56]:
enc_sample = enc_text[50:]

In [57]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1 : context_size + 1]
print(f"x: {x}")
print(f"y:      {y}")


x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [58]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(context, "-->", target)

[290] --> 4920
[290, 4920] --> 2241
[290, 4920, 2241] --> 287
[290, 4920, 2241, 287] --> 257


In [59]:
for i in range(1, context_size + 1):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(tokenizer.decode(context), "-->", tokenizer.decode([target]))

 and -->  established
 and established -->  himself
 and established himself -->  in
 and established himself in -->  a


In [60]:
import torch
from torch.utils.data import DataLoader, Dataset


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Each sample pairs a window of ``max_length``
    token IDs with the same window shifted one position to the right, so the
    target at every position is the next token of the input.
    """

    def __init__(
        self, txt: str, tokenizer: tiktoken.Encoding, max_length: int, stride: int
    ):
        """Tokenize ``txt`` and pre-compute all input/target windows.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Tokenizer whose ``encode`` accepts ``allowed_special``.
            max_length: Number of token IDs in every input/target window.
            stride: Offset between the start positions of consecutive windows.
        """
        self.input_ids = []
        self.target_ids = []

        # Allow the end-of-text marker, consistent with the encode call used in
        # the byte pair encoding section, so that texts containing
        # "<|endoftext|>" (e.g. several documents joined together) can be
        # tokenized instead of being rejected by the tokenizer.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Chunk text into overlapping sequences of max_length using the sliding window.
        # The range stops at len - max_length so every target window is complete.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Total number of samples in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [61]:
def create_dataloader_v1(
    txt: str,
    batch_size: int = 4,
    max_length: int = 256,
    stride: int = 128,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0,
):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with the "gpt2" tiktoken encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Raw text to turn into training samples.
        batch_size: Number of samples per batch.
        max_length: Number of token IDs per sample.
        stride: Step size of the sliding window.
        shuffle: Forwarded to the DataLoader's ``shuffle`` option.
        drop_last: Forwarded to the DataLoader's ``drop_last`` option.
        num_workers: Forwarded to the DataLoader's ``num_workers`` option.

    Returns:
        The configured DataLoader.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(
        GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride),
        **loader_options,
    )


`batch_size` is the number of samples per batch. Small batch sizes require less memory, but can lead to more noisy model updates.

`drop_last` drops the last batch if it's shorter than the specified `batch_size`.
This prevents loss spikes during training.

`stride` is the step size for the sliding window.

In [62]:
with open("./data/the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)


[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


## 2.7 Creating token embeddings

In [63]:
input_ids = torch.tensor([2, 3, 5, 1])

vocab_size = 6
output_dim = 3  # create embeddings of size 3

torch.manual_seed(42)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)


The weights above have been randomly initialized.
These values get optimized as part of LLM training.
The weight matrix has 6 rows and 3 columns: one row for each of the six possible tokens in the vocabulary, and one column for each of the three embedding dimensions.

In [64]:
print(embedding_layer(torch.tensor([3])))  # applying embedding layer to token id 3

tensor([[-0.6866,  0.6105,  1.3347]], grad_fn=<EmbeddingBackward0>)


You can see that the output is identical to the row at index 3 of the weight matrix.
This is because the embedding layer is essentially a lookup into its weight matrix, indexed by the token ID.

The embedding layer here is like a more efficient way to implement one-hot encoding, followed by matrix multiplication in a fully connected layer.
And that's also why we can view it as a neural network layer that can be optimized via backprop.

Sebastian provided a great notebook that explains this relationship [here](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch02/03_bonus_embedding-vs-matmul/embeddings-and-linear-layers.ipynb).
In it, he explains that embedding layers in PyTorch do the same as linear layers that perform matrix multiplications. We use embedding layers for computational efficiency.

Above we've discussed how the embedding is basically like a lookup, and that this is comparable to one-hot and a matmul for a linear layer. So say we have the `nn.Linear` layer on a one-hot encoded representation.
So the categories are the various token ids we have available, and we've one-hot encoded those to be binary attributes. Therefore, we have as many one-hot features as tokens in our vocabulary.
Given a token ID, we'd encode it as a vector with a binary 1 (hot) at the position of that token ID and 0 everywhere else.
Performing a matrix multiplication of that vector with our linear layer's weights gives us the embeddings for that exact token, equivalent to the lookup.

Mathematically, we can represent this as:

$\mathbf{e} = \mathbf{x}^T \mathbf{W}$

Where:
- $\mathbf{e}$ is the resulting embedding vector
- $\mathbf{x}$ is the one-hot encoded input vector
- $\mathbf{W}$ is the weight matrix of the linear layer (or embedding matrix)

For example, if we have a vocabulary size of 6 and an embedding dimension of 3:

$\begin{bmatrix}0 & 0 & 1 & 0 & 0 & 0\end{bmatrix} \begin{bmatrix}w_{11} & w_{12} & w_{13} \\ w_{21} & w_{22} & w_{23} \\ w_{31} & w_{32} & w_{33} \\ w_{41} & w_{42} & w_{43} \\ w_{51} & w_{52} & w_{53} \\ w_{61} & w_{62} & w_{63}\end{bmatrix} = \begin{bmatrix}w_{31} & w_{32} & w_{33}\end{bmatrix}$

This operation effectively selects the third row of the weight matrix, which is equivalent to looking up the embedding for the third token in our vocabulary (token ID 2, since token IDs are zero-based).

The embedding layer can also be thought of as a hashtable lookup. In this case, we can represent it as:

```python
embedding = hashtable[token_id]
```

Where:
- embedding is the resulting embedding vector
- hashtable is a dictionary-like structure containing the embeddings
- token_id is the ID of the token we want to look up

For our example with a vocabulary size of 6 and an embedding dimension of 3, we could represent this as:

```python
hashtable = {
    0: [w11, w12, w13],
    1: [w21, w22, w23],
    2: [w31, w32, w33],
    3: [w41, w42, w43],
    4: [w51, w52, w53],
    5: [w61, w62, w63]
}
```

Then, to get the embedding for token ID 2, we would simply do:

```python
embedding = hashtable[2]  # This would return [w31, w32, w33]
```

This hashtable lookup approach is conceptually similar to the embedding layer and provides
another way to understand how embeddings work. However, the actual implementation in
PyTorch uses more optimized methods for efficiency and to enable gradient flow for training.


In [65]:
print(embedding_layer(input_ids))

tensor([[ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [ 0.8599, -0.3097, -0.3957],
        [ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)


## 2.8 Encoding word positions

In [66]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [67]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape) # 8 text samples, 4 tokens each

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [68]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


We embedded each of the tokens into a 256 dimensional vector.
The batch contains 8 text samples, with 4 tokens per sample and 256 embedding dimensions for each token.

In [69]:
# A GPT model's absolute embedding approach:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


The input is usually a placeholder vector containing a sequence of numbers 0, 1, ..., n-1, where n is the maximum input length.

`context_length` represents the supported input size for the LLM.
We set it to `max_length` here.
In practice, the input text can be longer than the supported context length--then we'd have to truncate the text.

In [70]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
