Text Tokenization

In [1]:
# Read the whole of the-verdict.txt into one string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Sanity check: character count and the first 50 characters.
print("Total nb of char: ", len(raw_text))
print(raw_text[0:50])

Total nb of char:  8927
I HAD always thought Jack Gisburn rather a cheap g


In [2]:
import re
# Split on punctuation, double dashes and whitespace. The capturing group makes
# re.split return the delimiters as well, so punctuation survives as tokens;
# pieces that are empty after stripping (whitespace, split artefacts) are dropped.
processed = [
    token
    for token in (
        piece.strip()
        for piece in re.split(r'([,.?_!"()\']|--|\s)', raw_text)
    )
    if token
]

print(processed[0:10])
print("Number of tokens: ", len(processed))

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius']
Number of tokens:  1932


Create Dictionary


In [3]:
# Unique tokens in sorted order, followed by the two special tokens, which
# therefore receive the two highest positions in the list.
all_tokens = sorted(set(processed)) + ["<|endoftext|>", "<|unk|>"]
print("Dictionnary size: ", len(all_tokens))

Dictionnary size:  688


In [4]:
# Map every token to its position in all_tokens.
# NOTE(review): the name `dict` shadows the builtin type; it is kept because
# the cell that builds the tokenizer passes it as `Tokenizer(dict=dict)`.
dict = {token: index for index, token in enumerate(all_tokens)}

# Preview the first four (token, id) pairs.
for position, entry in enumerate(dict.items()):
    if position > 3:
        break
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)


In [5]:
class Tokenizer:
    """Word/punctuation tokenizer backed by a fixed token -> id mapping.

    Text is split on punctuation, double dashes and whitespace. Tokens that are
    missing from the mapping are encoded as the id of ``"<|unk|>"``.
    """

    def __init__(self, dict):
        """Store the token -> id mapping and build its inverse.

        Args:
            dict: mapping from token string to integer id. (The parameter name
                shadows the builtin inside this method; it is kept because
                callers pass it by keyword.)
        """
        self.str_to_int = dict
        # Build the id -> token lookup used by decode().
        id_to_token = {}
        for token, token_id in dict.items():
            id_to_token[token_id] = token
        self.int_to_str = id_to_token

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: string to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: if a token is unknown and ``"<|unk|>"`` itself is not in
                the mapping.
        """
        token_ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            piece = piece.strip()
            if not piece:
                # Whitespace delimiters and empty split artefacts carry no token.
                continue
            lookup_key = piece if piece in self.str_to_int else "<|unk|>"
            token_ids.append(self.str_to_int[lookup_key])
        return token_ids

    def decode(self, encoded):
        """Convert a sequence of token ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation characters is removed.

        Args:
            encoded: iterable of integer token ids.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: if an id is not present in the mapping.
        """
        spaced = " ".join(self.int_to_str[token_id] for token_id in encoded)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [6]:
tokenizer = Tokenizer(dict=dict)

# Keep the query in one variable so the printed label always matches the text
# that is actually encoded (the label used to be a hard-coded, unrelated string).
query_text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of"
encoded = tokenizer.encode(query_text)

print(f"Query text: \"{query_text}\"")
print("Encoded representation: ", encoded)

# Round-trip back to text; tokens missing from the vocabulary come back as <|unk|>.
print("Decoded representation: ", tokenizer.decode(encoded=encoded))

Query text: "the genius"
Encoded representation:  [687, 5, 687, 683, 381, 591, 9, 686, 40, 599, 581, 597, 444]
Decoded representation:  <|unk|>, <|unk|> you like tea? <|endoftext|> In the sunlit terraces of


Data Sampling


In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensor pairs.

    The text is encoded once with ``tokenizer.encode``. Each input is a window
    of ``max_length`` consecutive token ids; its target is the same window
    shifted one position to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Encode ``txt`` and cut it into overlapping windows.

        Args:
            txt: text to encode.
            tokenizer: object exposing ``encode(txt)`` that returns a sequence
                of token ids.
            max_length: number of token ids per window.
            stride: step between the start positions of consecutive windows.
        """
        self.tokenizer = tokenizer
        token_ids = tokenizer.encode(txt)

        # Start offsets stop before len(token_ids) - max_length, so the target
        # window, which reaches one id further than the input, stays in range.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128,
                         shuffle=True, drop_last=True, num_workers=0,
                         tokenizer=None):
    """Build a DataLoader over sliding-window samples of ``txt``.

    Args:
        txt: text to encode and window.
        batch_size: number of samples per batch.
        max_length: number of token ids per window.
        stride: step between the start positions of consecutive windows.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.
        tokenizer: object exposing ``encode(txt)``. When omitted, the
            module-level ``tokenizer`` variable is used.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.

    Raises:
        ValueError: if no tokenizer is passed and no module-level
            ``tokenizer`` is defined.
    """
    if tokenizer is None:
        # The previous `tokenizer = tokenizer` made the name local to the
        # function, so every call raised UnboundLocalError. Look the notebook's
        # global tokenizer up explicitly instead.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise ValueError(
                "No tokenizer available: pass tokenizer=... or define a "
                "module-level `tokenizer` first."
            )

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )