In [1]:
# Load the complete short story into a single string. The path is relative,
# so it is resolved against the current working directory.
with open("verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

# Quick sanity check: total size, then a preview of the first 99 characters.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20780
THE VERDICT
June 1908
I had always thought Jack Gisburn rather a cheap genius--though a

good fello


In [2]:
import re

# Split on single whitespace characters. Because the pattern is wrapped in a
# capturing group, every separator is kept as its own item in the result.
text = "Hello, world. This, is a test."
result = re.compile(r'(\s)').split(text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [3]:
# Split on commas and periods, keeping each delimiter as its own list item
# (the pattern is a capturing group).
# NOTE(review): the second alternative `/\s` matches a literal "/" followed by a
# whitespace character, so plain whitespace is never a split point here -- the
# result still contains ' world' and ' is a test'. `\s` was probably intended;
# confirm before changing, since the recorded outputs depend on it.
result = re.split(r'([,.]|/\s)', text)

print(result)

['Hello', ',', ' world', '.', ' This', ',', ' is a test', '.', '']


A list comprehension has the general form:

[new_item for item in iterable if condition]

It is shorthand for the following loop:

new_list = []
for item in iterable:
    if condition:
        new_list.append(new_item)


In [4]:
# Keep only the items that still contain something after stripping whitespace;
# str.strip returns an empty (falsy) string for blank entries.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', ' world', '.', ' This', ',', ' is a test', '.']


In [5]:
# Wider delimiter set: common punctuation marks, the double dash, and whitespace.
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
print(result)
# Trim every piece and throw away the ones that end up empty.
result = [piece for piece in map(str.strip, result) if piece]
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', 'a', ' ', 'test', '?', '']
['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


### Step 1: Tokenize the short story

In [6]:
# Tokenize the whole story: split on punctuation, "--" and whitespace (keeping the
# delimiters as tokens), then strip each piece and drop the empty ones.
# NOTE(review): this character class has no ';' although the encode() patterns of
# the tokenizer classes in this notebook do, so tokens such as 'Carlo;' end up in
# the vocabulary. Confirm whether ';' should be added here; doing so changes the
# vocabulary size and every token id shown in the recorded outputs.
preprocessed = re.split(r'([,.:?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['THE', 'VERDICT', 'June', '1908', 'I', 'had', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to']


### Step 2: Convert to token id

In [7]:
# Collapse the token list to its unique entries and put them in sorted order.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

1161


The code below is equivalent to:

vocab = {}

for integer, token in enumerate(all_words):
    vocab[token] = integer

In [8]:
# Build the vocabulary: a dictionary that maps every token to an integer id.
# enumerate() pairs each entry of all_words with its position in the list,
# and that position is stored as the token's id (token = key, id = value).

vocab = {word: position for position, word in enumerate(all_words)}

# A for...in... inside {} is a dictionary comprehension: a compact way of writing
# a loop that fills a dictionary.
# `for position, word in enumerate(...)` unpacks each pair, exactly like
# `for (position, word) in enumerate(...)`.

In [9]:
# Preview the first 51 (token, id) pairs of the vocabulary.
for i, item in enumerate(list(vocab.items())[:51]):
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
('1908', 8)
(':', 9)
(';', 10)
('?', 11)
('A', 12)
('AM', 13)
('Ah', 14)
('Among', 15)
('And', 16)
('Are', 17)
('Arrt', 18)
('As', 19)
('At', 20)
('Be', 21)
('Begin', 22)
('Burlington', 23)
('But', 24)
('By', 25)
('Carlo', 26)
('Carlo;', 27)
('Chicago', 28)
('Claude', 29)
('Come', 30)
('Croft', 31)
('Destroyed', 32)
('Devonshire', 33)
('Don', 34)
('Dubarry', 35)
('Emperors', 36)
('End', 37)
('FELT', 38)
('Florence', 39)
('For', 40)
('Gallery', 41)
('Gideon', 42)
('Gisburn', 43)
('Gisburns', 44)
('Grafton', 45)
('Greek', 46)
('Grindle', 47)
('Grindles', 48)
('HAD', 49)
('HAS', 50)


In [26]:
class SimpleTokenizerV1:
    """Regex-based tokenizer backed by a fixed token -> id vocabulary.

    Tokens that are missing from the vocabulary are not handled: ``encode``
    raises ``KeyError`` for them.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        Raises:
            KeyError: if a token is not a key of the vocabulary.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Strip every piece and skip the ones that are empty afterwards.
        tokens = []
        for piece in pieces:
            stripped = piece.strip()
            if stripped:
                tokens.append(stripped)

        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Turn a sequence of ids back into a single string."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Joining with spaces leaves a gap in front of punctuation; close it
        # for the listed punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [22]:
# simpler version of the above code
class SimplerTokenizerV1:
    """Step-by-step (explicit loop) version of SimpleTokenizerV1.

    Maps text to token ids and back using a fixed token -> id vocabulary.
    Tokens that are missing from the vocabulary make ``encode`` raise
    ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the reverse id -> token map."""
        self.str_to_int = vocab

        # build reverse dictionary step by step
        self.int_to_str = {}
        for s, i in vocab.items():
            self.int_to_str[i] = s

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        Raises:
            KeyError: if a token is not a key of the vocabulary.
        """
        # split text into tokens (punctuation, "--" and whitespace are delimiters
        # and are kept because the pattern is a capturing group)
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # clean tokens: strip whitespace and skip pieces that end up empty
        new_list = []
        for item in preprocessed:
            stripped = item.strip()
            if stripped:
                new_list.append(stripped)

        preprocessed = new_list

        # convert tokens to ids
        ids = []
        for s in preprocessed:
            ids.append(self.str_to_int[s])

        return ids

    def decode(self, ids):
        """Turn a sequence of ids back into a single string."""
        # convert ids back to tokens
        text_parts = []
        for i in ids:
            text_parts.append(self.int_to_str[i])

        # Tokens must be joined with a single space (as SimpleTokenizerV1 does);
        # joining with "" glued all words together.
        text = " ".join(text_parts)

        # remove spaces before punctuation
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)

        return text

In [28]:
# Encode a sample passage with the vocabulary-based tokenizer.
tokenizer = SimpleTokenizerV1(vocab)

# The passage spans two lines: the line break and the indentation of the second
# line are part of the string (they are whitespace, so the tokenizer drops them).
text = """"It's the last he painted, you know,"
            Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 63, 2, 879, 1017, 627, 555, 772, 5, 1157, 621, 5, 1, 77, 7, 43, 880, 1139, 782, 822, 7]


In [29]:
# Round trip: map the ids from the previous cell back to text.
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [33]:
# "Hello" is not present in the vocabulary, so SimpleTokenizerV1.encode fails
# with a KeyError -- we need a larger dataset or a fallback for unknown words.
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as err:
    # The failure is the point of this cell: report it instead of letting the
    # exception stop a "Restart & Run All".
    print("KeyError:", err)

KeyError: 'Hello'

### Adding special context tokens

In [34]:
# Sorted unique tokens first, then the two special tokens, so the special
# tokens receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = {token: integer for integer, token in enumerate(all_tokens)}

In [35]:
# Number of entries in the extended vocabulary.
len(vocab)

1163

In [36]:
# Print the last five (token, id) pairs to check that the two special tokens
# were appended at the end of the vocabulary. The index `i` is not used.
for i, item in enumerate (list(vocab.items())[-5:]):
    print(item)

('younger', 1158)
('your', 1159)
('yourself', 1160)
('<|endoftext|>', 1161)
('<|unk|>', 1162)


In [39]:
class SimpleTokenizerV2:
    """Regex-based tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    The vocabulary (token -> id) is expected to contain an "<|unk|>" entry;
    without it, encoding an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        Tokens that are not in the vocabulary are replaced by "<|unk|>" before
        the id lookup.
        """
        tokens = []
        for chunk in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = chunk.strip()
            if not token:
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            tokens.append(token)

        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Turn a sequence of ids back into a single string."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Close the gap that joining with spaces leaves before the listed
        # punctuation marks.
        return re.sub(r'\s+([,.;:?!"()\'])', r'\1', joined)

In [40]:
# Switch to the tokenizer that falls back to "<|unk|>" for unknown words.
tokenizer = SimpleTokenizerV2(vocab)

# Two unrelated snippets, glued together with the end-of-text marker in between.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join([text1, text2])

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [41]:
# Encode the joined sample; words missing from the vocabulary map to the "<|unk|>" id.
tokenizer.encode(text)

[1162,
 5,
 377,
 1157,
 653,
 1004,
 11,
 1161,
 62,
 1017,
 985,
 1013,
 748,
 1017,
 1162,
 7]

In [42]:
# Round trip: unknown words come back as "<|unk|>" rather than the original text.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

Other special tokens:
- BOS (beginning of sequence): marks the start of a text.
- EOS (end of sequence): positioned at the end of a text.
- PAD (padding): when training an LLM with a batch size larger than one, the batch may contain texts of varying lengths. To ensure all texts have the same length, the shorter texts are extended ("padded") with the PAD token up to the length of the longest text in the batch.

In [43]:
!pip3 install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp313-cp313-win_amd64.whl.metadata (6.9 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Downloading tiktoken-0.12.0-cp313-cp313-win_amd64.whl (879 kB)
   ---------------------------------------- 0.0/879.1 kB ? eta -:--:--
   --------------------------------------- 879.1/879.1 kB 10.0 MB/s eta 0:00:00
Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl (277 kB)
Installing collected packages: regex, tiktoken

   ---------------------------------------- 2/2 [tiktoken]

Successfully installed regex-2025.11.3 tiktoken-0.12.0


In [44]:
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that `importlib.metadata` has been loaded.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.12.0


In [45]:
# Rebind `tokenizer` to tiktoken's "gpt2" encoding; from here on it no longer
# refers to the SimpleTokenizerV2 instance created earlier.
tokenizer = tiktoken.get_encoding("gpt2")

In [49]:
# Encode a sample that contains the special token and a made-up word.
# NOTE(review): the two adjacent string literals are concatenated with no
# separator, so the text reads "...terracesof someunknownPlace." -- confirm this
# is intended ("sublit" may also be a typo for "sunlit").
text = (
    "Hello, do you like tea? <|endoftext|> In the sublit terraces"
    "of someunknownPlace."
)
# allowed_special is passed so that "<|endoftext|>" is accepted in the input text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 850, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [50]:
# Decode the ids back to text to check the round trip.
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sublit terracesof someunknownPlace.


### Random example for Byte Pair Encoding

In [51]:
# Encode a made-up string; the result is several ids rather than a single one.
integers = tokenizer.encode("Akwirw ier")
print(integers)

[33901, 86, 343, 86, 220, 959]


In [52]:
# Decoding those ids reproduces the original made-up string.
strings = tokenizer.decode(integers)
print(strings)

Akwirw ier
