In [17]:
""" 
https://github.com/rasbt/LLM-workshop-2024/blob/main/02_data/02.ipynb
"""

' \nhttps://github.com/rasbt/LLM-workshop-2024/blob/main/02_data/02.ipynb\n'

In [1]:
from importlib.metadata import version

# Report the installed version of each package this notebook queries.
for package in ("torch", "tiktoken"):
    print(f'{package} version: {version(package)}')

torch version: 2.6.0
tiktoken version: 0.9.0


In [3]:
# Read the entire text file into one string.
with open('the-verdict.txt', mode='r', encoding='utf-8') as story_file:
    raw_text = story_file.read()

# Show the overall size and a short preview of the beginning.
print(
    f'total number of characters: {len(raw_text)}',
    raw_text[:10],
    sep='\n',
)

total number of characters: 20479
I HAD alwa


In [4]:
import re

# Split on punctuation, double dashes and whitespace. Because the pattern is
# a capturing group, re.split keeps the separators as items of their own.
token_pattern = re.compile(r'([,.:;?_!"()\']|--|\s)')

# filter(None, ...) discards the empty strings that the split produces
# between adjacent separators; whitespace items are kept.
preprocessed = list(filter(None, token_pattern.split(raw_text)))
print(preprocessed[:38])


['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ']


In [5]:
# Number of items produced by the split; whitespace items are included in
# this count because only empty strings were filtered out.
print(f'number of processed tokens: {len(preprocessed)}')

number of processed tokens: 8405


In [6]:
# Unique tokens, in sorted order.
all_words = list(set(preprocessed))
all_words.sort()

vocab_size = len(all_words)

print(vocab_size)


1132


In [7]:
# Map every unique token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))


In [9]:
# Preview the first 11 (token, id) pairs of the vocabulary.
for item in list(vocab.items())[:11]:
    print(item)

('\n', 0)
(' ', 1)
('!', 2)
('"', 3)
("'", 4)
('(', 5)
(')', 6)
(',', 7)
('--', 8)
('.', 9)
(':', 10)


In [10]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Each resulting token is looked up in the
    vocabulary; there is no fallback for unknown tokens.
    """

    # Must list the same punctuation as the pattern used to build the
    # vocabulary; otherwise tokens such as ':' and ';' exist in the vocabulary
    # but can never be produced by encode() (e.g. "know;" would stay one piece
    # and fail the lookup).
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'

    # Whitespace in front of these marks is removed again when decoding.
    _DETACH_PATTERN = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Whitespace-only pieces are dropped, so spacing and line breaks in
        ``text`` do not produce tokens.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per token, in text order.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = (piece.strip() for piece in re.split(self._SPLIT_PATTERN, text))
        return [self.str_to_int[piece] for piece in pieces if piece]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed. Original spacing is not restored.

        Args:
            ids: iterable of integer ids known to the vocabulary.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Re-attach punctuation to the preceding token.
        return re.sub(self._DETACH_PATTERN, r'\1', text)
    

In [11]:
# Build the word-level tokenizer from the vocabulary.
tokenizer = SimpleTokenizerV1(vocab)

# Sample passage: the first character of the string is a double quote, and
# the string contains a line break followed by indentation.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
# encode() drops whitespace-only pieces, so the line break and indentation
# inside `text` do not produce ids.
ids = tokenizer.encode(text)
print(ids)


[3, 58, 4, 852, 990, 604, 535, 748, 7, 1128, 598, 7, 3, 69, 9, 40, 853, 1110, 756, 795, 9]


In [12]:
# decode() joins tokens with single spaces and only removes the space in
# front of punctuation, so the round trip is not exact (see `" It' s` in the
# output).
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

# BytePair encoding

- GPT-2 used BytePair encoding (BPE) as its tokenizer
  
- It allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words

- For instance, if GPT-2's vocabulary doesn't have the word "unfamiliarword," it might tokenize it as ["unfam", "iliar", "word"] or some other subword breakdown, depending on its trained BPE merges

- The original BPE tokenizer can be found here: https://github.com/openai/gpt-2/blob/master/src/encoder.py

- In this lecture, we are using the BPE tokenizer from OpenAI's open-source tiktoken library, which implements its core algorithms in Rust to improve computational performance

- (Based on an analysis here, I found that tiktoken is approx. 3x faster than the original tokenizer and 6x faster than an equivalent tokenizer in Hugging Face)


In [13]:
# Import the submodule explicitly: a plain `import importlib` does not
# guarantee that `importlib.metadata` is loaded, so the attribute access below
# would otherwise depend on some earlier cell having imported it.
import importlib.metadata

import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))


tiktoken version: 0.9.0


In [14]:
# Load tiktoken's "gpt2" encoding.
# NOTE: this rebinds `tokenizer`, which held the SimpleTokenizerV1 instance in
# the earlier cells; from here on `tokenizer` is the tiktoken encoding.
tokenizer = tiktoken.get_encoding("gpt2")


In [15]:
# NOTE(review): the two adjacent string literals are concatenated without a
# separating space, so the text contains "terracesof" (visible in the decoded
# output) — confirm this is intended.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# allowed_special permits the literal "<|endoftext|>" in the input; in the
# printed ids it appears as the single id 50256.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)


[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [16]:
# Map the token ids back to text; the printed result matches the input string.
strings = tokenizer.decode(integers)

print(strings)


Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [18]:
# Made-up words can still be encoded: the output shows the input broken into
# several sub-word ids rather than raising an error.
tokenizer.encode("Akwirw ier", allowed_special={"<|endoftext|>"})


[33901, 86, 343, 86, 220, 959]

# Data sampling with a sliding window

- Above, we took care of the tokenization (converting text into word tokens represented as token ID numbers)

- Now, let's talk about how we create the data loading for LLMs

- We train LLMs to generate one word at a time, so we prepare the training data accordingly: for each input sequence, the next word in the text is the target to predict
