In [1]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
"LLMs-from-scratch/main/ch02/01_main-chapter-code/"
"the-verdict.txt")

file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x103f27650>)

In [2]:
with  open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of character: ", len(raw_text))
print(raw_text[:99])

Total number of character:  20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
import re
text = "Hello, world. This, os a test."
result = re.split(r"(\s)", text)
result

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'os', ' ', 'a', ' ', 'test.']

In [4]:
# let's modify the tokenization
result = re.split(r"([,.]|\s)", text)
result

['Hello',
 ',',
 '',
 ' ',
 'world',
 '.',
 '',
 ' ',
 'This',
 ',',
 '',
 ' ',
 'os',
 ' ',
 'a',
 ' ',
 'test',
 '.',
 '']

In [5]:
# we want to strip the whitespace
result = [item for item in result if item.strip()]
result

['Hello', ',', 'world', '.', 'This', ',', 'os', 'a', 'test', '.']

In [7]:
text = "Hello, world. Is this -- a test?"
result =  re.split(r"([,. :;?_!'()\"]-- |\s)", text)
result = [item for item in result if item.strip()]
result

['Hello,', 'world.', 'Is', 'this', ' -- ', 'a', 'test?']

In [10]:
# let's apply this tokenizer to our raw text
preprocessed = re.split(r"([,. :;?_!'()\"]--|\s)", raw_text)
preprocessed = [item for item in preprocessed if item.strip()]
print(f"The length of the vocabulary is : {len(preprocessed)}")

The length of the vocabulary is : 3646


In [11]:
# first  30 tokens
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius--though', 'a', 'good', 'fellow', 'enough--so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that,', 'in', 'the', 'height', 'of', 'his', 'glory,']


### create  a token id for each token

In [12]:
# create a vocabulary for the preprocessed text
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(f"The length of the vocabulary is: {vocab_size}")

The length of the vocabulary is: 1486


In [14]:
#  now  create the  vocabulary to have each word mapped to a unique id
vocab = {token: integer for integer, token in enumerate(all_words)}
# print a few output of tokens 
for i, item in enumerate(vocab.items()):
    print(item)
    if i  >= 20:
        break

('!--', 0)
('"--', 1)
('"Ah,', 2)
('"Ah--I', 3)
('"Be', 4)
('"By', 5)
('"Come', 6)
('"Destroyed', 7)
('"Don\'t', 8)
('"Gisburns"', 9)
('"Grindles."', 10)
('"Hang', 11)
('"Has', 12)
('"How', 13)
('"I', 14)
('"I\'d', 15)
('"If', 16)
('"It', 17)
('"It\'s', 18)
('"Jack', 19)
('"Money\'s', 20)


In [20]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}
    
    def encode(self, text):
        # preprocessed  = re.split(r'([,.?_!"()\']|--\s)', text)
        preprocessed = re.split(r"([,. :;?_!'()\"]--|\s)", text)
        preprocessed = [
            item.strip()  for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = "  ".join([self.int_to_str[i] for i in ids])

        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return  text

In [24]:
new_text  =  """
    It's the last he painted, you know, Mrs. Gisborn said that with pardonable pride.
    """
preprocessed_2 = re.split(r"([,. :;?_!'()\"]--|\s)", new_text)
# sort it and add it to all words
new_vocab =  sorted(set(preprocessed_2))
all_words = all_words + new_vocab
# assign new ids to new words in the vocab
vocab = {token: integer for integer, token in enumerate(all_words)}


tokenizer =  SimpleTokenizerV1(vocab)

# convert the  text to tokens and ids
ids = tokenizer.encode(new_text)
print(ids)

[1490, 1500, 1494, 1492, 1495, 1502, 1493, 1491, 1489, 1498, 1499, 1501, 1496, 1497]


In [25]:
len(all_words)

1503

In [26]:
# Let's try decoding  the ids
print(tokenizer.decode(ids))

It's  the  last  he  painted,  you  know,  Mrs.  Gisborn  said  that  with  pardonable  pride.


we had an issue initially when trying to pass a new text that contains unknown words.

We ran into the issue unknown words. 

Next we're going to deal with this issue by extending our vocabulary

In [27]:
# extend the vocabulary to contain new tokens
all_tokens  = sorted(set(preprocessed))
all_tokens.extend(['<|endoftext|>', '<|unk|>'])
vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

1488


In [28]:
# check the last five entries in our vocab
for idx, toks in enumerate(list(vocab.items())[-5:]):
    print(toks) 

('younger', 1483)
('your', 1484)
('yourself', 1485)
('<|endoftext|>', 1486)
('<|unk|>', 1487)


In [29]:
# let's update our tokenizer class
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i: s for s, i in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r"([,.:;?_!'()\"]|--|\s)", text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        preprocessed = [
            item if item in self.str_to_int else "<|unk|>" for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r"\s+([,.:;?!'()\"])", r'\1', text)
        return text

In [30]:
text_1 = "Hello, do you like tea?"
text_2 = "In the sunlit terraces of the palace"
text = " <|endoftext|> ".join((text_1, text_2))
text
 

'Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace'

In [31]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1487, 1487, 489, 1478, 843, 1277, 1487, 1486, 117, 1292, 1257, 1286, 965, 1292, 1487]


In [32]:
# let's see if we can get the detokenize form of the text in the decoded form
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|> <|unk|> do you like tea <|unk|> <|endoftext|> In the sunlit terraces of the <|unk|>


# Byte pair encoding

In [34]:
from importlib.metadata import version
import tiktoken
print("tiktoken version: ", version("tiktoken"))

tiktoken version:  0.8.0


In [35]:
# now use the new tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

text  = (
    "Hello, do you like tea? <|endoftext|> in the sunlit terraces of someunknownPlace"
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers) 

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 287, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271]


In [36]:
# let's try to decode the text from the integers
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> in the sunlit terraces of someunknownPlace


In [37]:
# bpe for uknown token
word = "Alwirw ier"
unk_ids = tokenizer.encode(word)
print(unk_ids)

[2348, 86, 343, 86, 220, 959]


In [38]:
# decode the token from the ids
unk_decoded =  tokenizer.decode(unk_ids)
print(unk_decoded)

Alwirw ier


This is one major advantage of BPE, it can encode and decode unknown words well without losing the meaning.  It does this with a specialized algortithm, which at the moment i don't know how it works.
It works by breaking down words which are not in it's vocabulary into subwords. This is a big plus because it simply does not replace unknown words with the <|unk|> which would cause information loss at the decoding stage.

In [39]:
with open("the-verdict.txt", 'r', encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [40]:
# let's show how the next word prediction works
# first we will take a sample of text from our text
enc_sample = enc_text[:50]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y: {y}")

x: [40, 367, 2885, 1464]
y: [367, 2885, 1464, 1807]


In [None]:
# By processing the inputs along with the targets, we can create next-word prediction task
for  i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)
    
# everything to the left of the arrow shows the input that needs to be fed.
# the ones on the right show the target, which is the predicted word.



[40] ----> 367
[40, 367] ----> 2885
[40, 367, 2885] ----> 1464
[40, 367, 2885, 1464] ----> 1807


In [43]:
# let's do it with the decoded text
for i in range(1, context_size+1):
    context = enc_sample[:i]
    target  = enc_sample[i]
    print(tokenizer.decode(context), "----->", tokenizer.decode([target]))

I ----->  H
I H -----> AD
I HAD ----->  always
I HAD always ----->  thought


In [50]:
import torch
from torch.utils.data  import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
            return len(self.input_ids)
    def __getitem__(self, idx):
            return self.input_ids[idx], self.target_ids[idx]

In [51]:
def create_dataloader_v1(txt, batch_size=4,
                         max_length=256, stride=128, shuffle=True,
                         drop_last=True, num_workers=0):
    tokenizer =  tiktoken.get_encoding("gpt2")
    dataset  = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset, 
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader


In [52]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(
    raw_text, batch_size=1,
    max_length=4, 
    stride=1, shuffle=False
)
data_iter = iter(dataloader)
first_batch =  next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


The first tensor above is the input ids for the text, the second tensor is the target token ids. Since the max_length=4, each tensor contains 4 token ids

- The stride shows how many context windows should in sampling the next input text for each batch . 
- When creating multiple batches from the input dataset, we slide an input window across the text. If the stride is set to 1, we shift the input window by one position when creating the next batch.

In [53]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


#### Exercise: Dataloaders with different strides and context length


In [None]:
# use a stride of 4 and max length of 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8,
    max_length=4, stride=4, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter) 
print("Inputs: \n", inputs)
print("targets: \n", targets)

Inputs: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
targets: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### Creating Token Embeddings

So far what we have done in processing our raw text is to tokenize it and assign ids to each token. The BPE tokenizer does this for us well. But before we feed the input into the llm model to work with, we still have another step to do. The token embedding. Here we convert the token ids into embedding vectors.


As a preliminary step, we must initialize these embedding weights with random values. This initialization serves as the starting point for the LLM's learning process. The weights of this embeddings will be optimized during the training process.

So far we have covered the following:
- tokenization of texts
- convert tokens to token ids
- convert token ids to embedding

In [57]:
input_ids = torch.tensor([2,3,5,1])
vocab_size = 6
output_dim =  3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [58]:
# apply it to the input token ids
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


### Encoding word position
We have two broad approaches here:
- relative positional embeddings
- absolute positional embeddings


In [59]:
# Let's now be more realistic with our embeddings
vocab_size = 50257
output_dim  =  256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
max_length =  4
dataloader  =  create_dataloader_v1(
    raw_text,  batch_size=8, max_length=4, stride=max_length, shuffle=False
)

data_iter = iter(dataloader)
inputs, targets =  next(data_iter)
print("Token id: \n", inputs)
print("\n inputs shape: \n", inputs.shape)

Token id: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

 inputs shape: 
 torch.Size([8, 4])


In [60]:
# use the embedding layers to embedd the token ids into 256-dimensional vectors
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


### GPT model embedding

In [61]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embedding = pos_embedding_layer(torch.arange(context_length))
print(pos_embedding.shape)

torch.Size([4, 256])


In [62]:
input_embedding =  token_embeddings  +  pos_embedding
print(input_embedding.shape)

torch.Size([8, 4, 256])
