In [1]:
import os

# Print the current working directory to find out which folder the notebook is running in.
# Relative paths passed to open() are resolved against this directory.
print("Current working directory:", os.getcwd())


Current working directory: /workspaces/LLMs-from-scratch/ch02/01_main-chapter-code


In [3]:
import tiktoken

# Report the installed tiktoken version so the results below can be reproduced
print(tiktoken.__version__)

0.12.0


## Exercise 2.1 Byte-pair encoding

In [4]:
# Load tiktoken's "gpt2" encoding; it is used for the encode/decode calls in the following cells
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
# Made-up text ("uk" presumably stands for "unknown") used to see how the tokenizer splits it
uk_word = "Awirw ier"

# Convert the text into its sequence of token IDs
integers = tokenizer.encode(uk_word)

In [6]:
# Decode every ID on its own to see which piece of the text it stands for
for token_id in integers:
    token_text = tokenizer.decode([token_id])
    print(f"{token_id} --> {token_text}")



23155 --> Aw
343 --> ir
86 --> w
220 -->  
959 --> ier


## Exercise 2.2 Data loaders with different strides and context sizes

To develop more intuition for how the data loader works, try running it with different settings, such as `max_length=2` with `stride=2`, and `max_length=8` with `stride=2`.


Torch in namespace: False
Torch in sys.modules: True
Torch importable: True


In [7]:
# Read the whole text file into a single string; the path is relative to
# the current working directory
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Each window of ``max_length`` consecutive
    token IDs becomes an input chunk, and the same window shifted one
    position to the right becomes its target chunk.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` method
            returns a sequence of integer token IDs.
        max_length: Number of token IDs in every input/target chunk.
        stride: Distance between the start positions of consecutive windows.

    Raises:
        ValueError: If ``stride`` is zero or negative.
        AssertionError: If the text does not contain more than ``max_length``
            tokens (only when assertions are enabled).
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # A zero step makes range() fail with a message that never mentions
        # `stride`, and a negative step silently produces an empty dataset,
        # so reject both up front with a clear error.
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the text into sequences of max_length.
        # The last usable start index leaves room for the one-token shift of
        # the target chunk.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [9]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over sliding-window chunks of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to turn into training samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Forwarded to ``GPTDatasetV1``.
        stride: Forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    windowed_dataset = GPTDatasetV1(
        txt,
        tokenizer=gpt2_encoding,
        max_length=max_length,
        stride=stride,
    )

    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [19]:
# First exercise setting: max_length=2 with stride=2, one sample per batch,
# kept in text order
dataloader_2_2 = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=2,
    stride=2,
    shuffle=False,
)
iter_2_2 = iter(dataloader_2_2)

# Show the first four [inputs, targets] batches
for batch_number in range(4):
    print(next(iter_2_2))



[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]
[tensor([[2885, 1464]]), tensor([[1464, 1807]])]
[tensor([[1807, 3619]]), tensor([[3619,  402]])]
[tensor([[402, 271]]), tensor([[  271, 10899]])]


In [25]:
# Second exercise setting: max_length=8 with stride=2, one sample per batch,
# kept in text order
dataloader_2_8 = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=8,
    stride=2,
    shuffle=False,
)
iter_2_8 = iter(dataloader_2_8)

# Show the first four [inputs, targets] batches
for batch_number in range(4):
    print(next(iter_2_8))

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]
[tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]
[tensor([[ 1807,  3619,   402,   271, 10899,  2138,   257,  7026]]), tensor([[ 3619,   402,   271, 10899,  2138,   257,  7026, 15632]])]
[tensor([[  402,   271, 10899,  2138,   257,  7026, 15632,   438]]), tensor([[  271, 10899,  2138,   257,  7026, 15632,   438,  2016]])]
