In [1]:
from importlib.metadata import version

# Report the installed version of each package this notebook depends on
for pkg in ("torch", "tiktoken"):
    print(f"{pkg} version:", version(pkg))

torch version: 2.3.0
tiktoken version: 0.7.0


In [2]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. A window of ``max_length`` token IDs is then
    taken every ``stride`` tokens; the matching target window is the same
    span shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of token IDs.
        max_length: Number of token IDs in each input/target window.
        stride: Offset between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the full text in one pass, passing "<|endoftext|>" as an
        # allowed special token
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start offsets of every window; stopping before len(ids) - max_length
        # keeps the one-token-shifted target window inside the sequence
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of windows held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``idx``."""
        inputs = self.input_ids[idx]
        targets = self.target_ids[idx]
        return inputs, targets


In [3]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    Args:
        txt: Text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token IDs in each window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [5]:
# Sizes of the embedding tables built further down:
#   vocab_size     -> rows of the token-embedding table
#   output_dim     -> width of every embedding vector
#   context_length -> rows of the positional-embedding table
# NOTE(review): 50257 presumably matches the "gpt2" encoding's vocabulary
# size -- confirm against the tokenizer.
vocab_size = 50257
output_dim = 256
context_length = 1024

# Read the raw text from disk
with open("the_verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Tokenize the complete text with the "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

In [6]:
# Lookup tables: one row per token ID and one row per position index.
# NOTE(review): the weights are randomly initialised and no seed is set in the
# visible cells, so the values displayed below will differ from run to run.
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=output_dim
)
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length, embedding_dim=output_dim
)

# stride == max_length, so consecutive windows start max_length tokens apart
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
)

In [7]:
# The positional embeddings depend only on the position indices
# 0..max_length-1, not on the batch contents, so look them up once instead of
# on every loop iteration.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

for batch in dataloader:
    # Each batch is an (inputs, targets) pair of token-ID tensors
    x, y = batch

    token_embeddings = token_embedding_layer(x)

    # pos_embeddings has one row per position and is broadcast across the
    # batch dimension of token_embeddings
    input_embeddings = token_embeddings + pos_embeddings

    # Only one batch's embeddings are inspected in the cells that follow, so
    # stop here rather than iterating over the whole dataset and overwriting
    # the result on every pass.
    break

In [8]:
# Sanity check: expect (batch_size, max_length, output_dim)
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [9]:
# Display the combined (token + positional) embeddings of the most recently processed batch
input_embeddings

tensor([[[ 2.3382, -1.7522, -2.0773,  ..., -0.1440, -0.2070, -0.4838],
         [ 0.6653,  0.4494, -0.3510,  ...,  1.2541,  1.2214, -0.5419],
         [-3.3838,  0.2860, -0.4439,  ..., -0.0320,  0.7809,  1.5377],
         [ 1.9994,  0.8666, -1.2923,  ..., -0.1789, -0.3764, -1.9076]],

        [[ 2.1198, -2.1118,  1.7682,  ..., -1.5383, -1.4287,  0.0290],
         [-2.0133, -0.3005,  0.6647,  ..., -0.4801,  1.1836,  1.5033],
         [-1.6962,  1.5344, -0.3799,  ...,  2.0998,  0.0510,  0.6014],
         [-0.3120,  0.5061, -0.8563,  ...,  0.5484,  0.4307, -2.1614]],

        [[ 0.8409, -1.7826,  0.2284,  ..., -3.2577, -0.8239,  4.3431],
         [-0.1485, -0.5909, -0.8105,  ...,  1.1793,  0.8376,  0.5975],
         [-4.4060,  0.0050, -0.1375,  ...,  0.2086,  1.9397,  1.8661],
         [ 1.6852, -0.2966,  0.0622,  ...,  0.9011,  0.0909,  0.7951]],

        ...,

        [[ 0.7858, -1.2190, -0.3808,  ..., -3.3698, -1.4500,  0.2198],
         [-1.9619, -2.7606,  0.5168,  ..., -1.3483, -0.12

In [10]:
# Display the positional embeddings: one row per position index 0..max_length-1
pos_embeddings

tensor([[ 0.8839, -0.8150,  0.1588,  ..., -1.9092, -0.0515,  0.4311],
        [-0.6916, -0.8457, -0.1641,  ..., -0.0773,  0.2458,  0.7202],
        [-1.9277,  0.4331, -0.8021,  ...,  0.7815,  0.9387,  1.6611],
        [ 0.0754,  0.3200, -0.7103,  ...,  0.2162, -0.2053, -1.4257]],
       grad_fn=<EmbeddingBackward0>)