## Import libraries

In [2]:
!pip show torch

Name: torch
Version: 2.5.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /home/fberanek/Desktop/learning/books/Build_a_LLM_from_scratch/venv/lib/python3.10/site-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvjitlink-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: torchaudio, torchvision


In [3]:
# NBVAL_SKIP
from importlib.metadata import version

# Report the installed version of each package this notebook relies on
for pkg in ("torch", "tiktoken"):
    print(f"{pkg} version:", version(pkg))

torch version: 2.5.1
tiktoken version: 0.8.0


In [4]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

## Create dataset

In [7]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once and then cut into windows of ``max_length``
    token IDs. Every sample is an ``(input, target)`` pair of 1-D tensors in
    which the target is the input window moved one token further along the
    text, i.e. ``target[k]`` is the token that follows ``input[k]``.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``; the
            result is used as a sliceable sequence of integer token IDs.
        max_length: Number of tokens per window; must be >= 1.
        stride: Distance between the starts of consecutive windows; must be
            >= 1. A stride smaller than ``max_length`` gives overlapping
            windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise silently produce an empty
        # dataset or meaningless (empty / wrapped-around) windows
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        # Parallel lists: input_ids[k] and target_ids[k] form sample k
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text once
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop at len - max_length so the target window, which reaches one
        # token further than the input window, never runs past the end
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [8]:
# Read the whole training text into memory
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Encode the text with the GPT-2 byte-pair encoding
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

# Window length and step between windows; equal values give non-overlapping windows
max_length, stride = 4, 4

In [9]:
# Chunk the text into (input, target) windows of max_length tokens
dataset = GPTDatasetV1(
    txt=raw_text,
    tokenizer=tokenizer,
    max_length=max_length,
    stride=stride,
)

In [12]:
# Index the dataset instead of calling __getitem__ directly, and avoid the
# name `input`, which would shadow the builtin for the rest of the session
input_ids, target_ids = dataset[0]
print(f"This is an input {input_ids}")
print(f"This is an output {target_ids}")

This is an input tensor([  40,  367, 2885, 1464])
This is an inpoutputut tensor([ 367, 2885, 1464, 1807])


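You can see that the output is the input window shifted by one token. The network is trained to predict the next token from the tokens it has already seen: at every position of the input, the target is the token that follows it, so only the last target token (1807 here) does not already appear in the input.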

## Create Dataloader

In [13]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) token batches.

    Args:
        txt: Raw text; it is tokenized with the "gpt2" tiktoken encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Distance between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [14]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

# Seed PyTorch's global RNG so the randomly initialised embedding weights and
# the shuffled batch order are the same on every Restart & Run All
torch.manual_seed(123)

vocab_size = 50257  # number of rows in the token embedding table (GPT-2 vocabulary size)
output_dim = 256  # width of every embedding vector
context_length = 1024  # number of positions the positional table can address


# One learnable vector per token ID and one per position
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# stride == max_length, so consecutive windows do not overlap
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)

In [15]:
# Only one batch is needed here; next(iter(...)) fetches it directly and
# raises StopIteration on the spot if the dataloader yields nothing
batch = next(iter(dataloader))
x, y = batch

token_embeddings = token_embedding_layer(x)
# Take the sequence length from the batch itself rather than from a
# notebook-level variable that another cell may have changed
pos_embeddings = pos_embedding_layer(torch.arange(x.shape[1]))

# The (sequence, dim) positional embeddings broadcast over the batch dimension
input_embeddings = token_embeddings + pos_embeddings

In [17]:
# Token + positional embeddings for one batch: (batch_size, max_length, output_dim)
input_embeddings.shape

torch.Size([8, 4, 256])

In [19]:
# One embedding vector per position in the window: (max_length, output_dim)
pos_embeddings.shape

torch.Size([4, 256])

We can see that the inputs and targets are organized in batches of size 8: each batch holds 8 input token sequences together with the 8 corresponding target sequences.

In [26]:
# Input token IDs of the sampled batch, one row per sequence
print(x)

tensor([[  326,  6774,   502,  6609],
        [  319,   616,   835,   284],
        [  257,  2726,  6227,   284],
        [  389,   262, 33204,   345],
        [  263,    11,   345,  1833],
        [   11, 12704,   257,  1310],
        [ 2993,   526,   198,   198],
        [  319,   326,   966,   314]])


In [27]:
# Target token IDs: each row is the matching input row moved one token ahead in the text
print(y)

tensor([[ 6774,   502,  6609,  1474],
        [  616,   835,   284, 22489],
        [ 2726,  6227,   284,  1833],
        [  262, 33204,   345,   588],
        [   11,   345,  1833,    11],
        [12704,   257,  1310,  2952],
        [  526,   198,   198,     1],
        [  326,   966,   314,   714]])
