In [12]:
import re
with open("data/The_Verdict.txt", "r") as f:
    raw_text = f.read()

raw_text = raw_text[98::]
raw_text = re.sub(r"(\s+)"," ", raw_text)
print("Number of characters {}", len(raw_text))
print(raw_text[:100])

Number of characters {} 21523
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [13]:
# Naive splitting
text = "Hello, world. This, is a--test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a--test.']


In [14]:
text="Hello, world. Is this-- a test?"
result = re.split(r'([\(\),.\s;:>_!]|--)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test?']


In [15]:
def basic_tokenizer(input: str)->list[str]:
    result = re.split(r'([,.;:?_!""()\']|--|\s)', input)
    result = [item.strip() for item in result if item.strip()]
    return result

In [16]:
basic_tokenizer("this is a simple test?")

['this', 'is', 'a', 'simple', 'test', '?']

In [17]:
len(basic_tokenizer(raw_text))


4903

In [18]:
basic_tokenizer(raw_text)[0:30]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in']

In [106]:
preprocessed = basic_tokenizer(raw_text)
all_words = sorted(set(preprocessed))
len(all_words)

1209

In [107]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [112]:
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 '*': 5,
 ',': 6,
 '--': 7,
 '.': 8,
 '0': 9,
 '1': 10,
 '1930': 11,
 '4': 12,
 ':': 13,
 ';': 14,
 '?': 15,
 'A': 16,
 'About': 17,
 'Ah': 18,
 'Among': 19,
 'And': 20,
 'Are': 21,
 'Arrt': 22,
 'As': 23,
 'At': 24,
 'Attribution-ShareAlike': 25,
 'Be': 26,
 'Begin': 27,
 'Burlington': 28,
 'But': 29,
 'By': 30,
 'Carlo': 31,
 'Chicago': 32,
 'Claude': 33,
 'Come': 34,
 'Commons': 35,
 'Creative': 36,
 'Croft': 37,
 'Destroyed': 38,
 'Devonshire': 39,
 'Don': 40,
 'Dubarry': 41,
 'During': 42,
 'Emperors': 43,
 'FDL': 44,
 'Florence': 45,
 'For': 46,
 'GNU': 47,
 'Gallery': 48,
 'Gideon': 49,
 'Gisburn': 50,
 'Gisburns': 51,
 'Grafton': 52,
 'Greek': 53,
 'Grindle': 54,
 'Grindles': 55,
 'HAD': 56,
 'Had': 57,
 'Hang': 58,
 'Has': 59,
 'He': 60,
 'Her': 61,
 'Hermia': 62,
 'His': 63,
 'How': 64,
 'I': 65,
 'If': 66,
 'In': 67,
 'It': 68,
 'Jack': 69,
 'January': 70,
 'Jove': 71,
 'Just': 72,
 'Lord': 73,
 'Made': 74,
 'Miss': 75,
 'Money': 

In [127]:


class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {value:key for key,value in vocab.items()}

    def encode(self, text):
        result = re.split(r'([,.;:?_!""()\']|--|\s)', text)
        result = [item.strip() for item in result if item.strip()]
        tokens = [self.str_to_int[word] for word in result]
        return tokens

    def decode(self, ids):
        text = " ".join(self.int_to_str[token] for token in ids)
        text = re.sub(r'\s+([,.?!"()\'])'  , r'\1', text)
        return text
      

        

In [137]:
test = "a cheap genius is he !"
tokenizer_v1 = SimpleTokenizerV1(vocab=vocab)
tokenizer_v1.encode(test)

[134, 281, 533, 632, 580, 0]

In [138]:
test = "a cheap genius is he!"
result = tokenizer_v1.decode(tokenizer_v1.encode(test))
assert test == result

In [139]:
test = "It' s the last he painted, you know"
result = tokenizer_v1.decode(tokenizer_v1.encode(test))
assert test == result

In [2]:
!pip3 install tiktoken

Defaulting to user installation because normal site-packages is not writeable
Collecting tiktoken
  Downloading tiktoken-0.12.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2025.10.23-cp39-cp39-macosx_11_0_arm64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.26.0 (from tiktoken)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests>=2.26.0->tiktoken)
  Downloading charset_normalizer-3.4.4-cp39-cp39-macosx_10_9_universal2.whl.metadata (37 kB)
Collecting idna<4,>=2.5 (from requests>=2.26.0->tiktoken)
  Downloading idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.26.0->tiktoken)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.26.0->ti

In [7]:
!uv pip install tiktoken
from importlib.metadata import version
import tiktoken
version("tiktoken")

zsh:1: command not found: uv


'0.12.0'

In [8]:
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:
text = "Helllo - do you like tea ? <|endoftext|> randomUnknownWord yeah"

In [10]:
integer = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
integer

[28254,
 5439,
 532,
 466,
 345,
 588,
 8887,
 5633,
 220,
 50256,
 4738,
 20035,
 26449,
 10194]

In [19]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.) "The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"? Well!--even throug

In [20]:
enc_text = tokenizer.encode(raw_text)

In [21]:
len(enc_text)

5207

In [22]:
enc_sample = enc_text[50:]

In [None]:
context_size = 4 # How many tokens in the input
x = enc_sample[:context_size]