In [2]:
from importlib.metadata import version
import tiktoken
# Print the installed tiktoken package version
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.11.0


In [3]:
# Load the "gpt2" encoding from tiktoken; this tokenizer is reused by the cells below
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Sample text that contains the <|endoftext|> marker.
# NOTE: the two adjacent string literals are concatenated without a separating
# space, so the resulting text reads "...terracesof someunknownPlace.".
text = (
 "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
 "of someunknownPlace."
)
# Explicitly allow the <|endoftext|> special token to appear in the input
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [5]:
# Convert the token IDs back into text
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [6]:
# Unusual input string that the next cells encode and then decode again
weird_text = "Akwirw ier"

In [7]:
# Encode the unusual string into token IDs
tokens = tokenizer.encode(weird_text)
print(tokens)

[33901, 86, 343, 86, 220, 959]


In [8]:
# Decode the token IDs back into text (this reassigns `strings`)
strings = tokenizer.decode(tokens)
print(strings)

Akwirw ier


2.6 Data sampling with a sliding window


In [10]:
# Read the whole text file into memory as a single string
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# Tokenize the full text and print how many token IDs it produced
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [11]:
# Skip the first 50 token IDs and work with the remainder
enc_sample = enc_text[50:]

In [12]:
# Peek at the first 20 token IDs of the sample
print(enc_sample[:20])

[290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423]


In [13]:
# Number of tokens that make up one input window
context_size = 4
# x is the input window; y is the same window shifted one position to the right,
# so y[k] is the token that follows x[k].
x, y = enc_sample[:context_size], enc_sample[1:context_size+1]
print(f"x: {x}", f"y: {y}", sep="\n")

x: [290, 4920, 2241, 287]
y: [4920, 2241, 287, 257]


In [19]:
# Build the next-word prediction examples used as input–target pairs for LLM
# training: for each prefix length i, the first i tokens form the input and
# the token right after them is the target.
for i in range(1, context_size+1):
    context, desired = enc_sample[:i], enc_sample[i]
    # Show the pair as raw token IDs, then decoded back to text
    print(context, "---->", desired)
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

[290] ----> 4920
 and ---->  established
[290, 4920] ----> 2241
 and established ---->  himself
[290, 4920, 2241] ----> 287
 and established himself ---->  in
[290, 4920, 2241, 287] ----> 257
 and established himself in ---->  a


Listing 2.5 A dataset for batched inputs and targets

In [21]:
import torch
from torch.utils.data import Dataset, DataLoader

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [26]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then
    moved across the token sequence in steps of ``stride``. For every window
    position the input is the window itself and the target is the same
    window shifted one token to the right.

    Args:
        txt: Text handed to ``tokenizer.encode``.
        tokenizer: Object whose ``encode(txt)`` returns a sliceable sequence
            of token IDs.
        max_length: Number of tokens in each input/target chunk.
        stride: Distance, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)  # tokenize the entire text once

        # Window start offsets. Stopping before len(token_ids) - max_length
        # leaves room for the one-token-shifted target of the last window.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) rows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the input tensor and target tensor stored at row ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

Listing 2.6 A data loader to generate batches with input-target pairs

In [23]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
    stride=128, shuffle=True, drop_last=True,
    num_workers=0):
    """Build a DataLoader over sliding-window chunks of ``txt``.

    The text is wrapped in a ``GPTDatasetV1`` that uses tiktoken's "gpt2"
    encoding, and that dataset is handed to a ``DataLoader``.

    Args:
        txt: Raw text to tokenize and chunk.
        batch_size: Passed through to ``DataLoader``.
        max_length: Number of tokens per chunk (forwarded to ``GPTDatasetV1``).
        stride: Step between consecutive chunk starts (forwarded to
            ``GPTDatasetV1``).
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    return DataLoader(
        GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [24]:
# Load the raw text (the same file that was read earlier in the notebook)
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [None]:
# The stride setting dictates how far the window shifts at each step
dataloader = create_dataloader_v1(
 raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
# Step through the loader manually so individual batches can be inspected
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
# With stride=1, the next batch starts one token after the first batch
second_batch = next(data_iter)
print(second_batch) #shift by one token

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
