In [54]:
from importlib.metadata import version

print('tiktoken version:', version('tiktoken'))
print('torch version:', version('torch'))

tiktoken version: 0.7.0
torch version: 2.2.2


## Tokenizing Text

In [55]:
import os
import urllib.request

if not os.path.exists('data/t8.shakespeare.txt'):
    url = ("https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt")
    file_path = 'data/t8.shakespeare.txt'
    urllib.request.urlretrieve(url, file_path)

In [56]:
with open('data/t8.shakespeare.txt', 'r') as file:
    text = file.read()
    

print('Total number of characters:', len(text))
print('First 100 characters:', text[:100])

Total number of characters: 5458199
First 100 characters: This is the 100th Etext file presented by Project Gutenberg, and
is presented in cooperation with Wo


- The goal of tokenization is to split a text into words, phrases, symbols, or other meaningful elements, which are called tokens.

In [57]:
import re

# remove , . \n and convert to lowercase
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])



['This', 'is', 'the', '100th', 'Etext', 'file', 'presented', 'by', 'Project', 'Gutenberg', ',', 'and', 'is', 'presented', 'in', 'cooperation', 'with', 'World', 'Library', ',', 'Inc', '.', ',', 'from', 'their', 'Library', 'of', 'the', 'Future', 'and']


In [58]:
print('Total number of words:', len(preprocessed))


Total number of words: 1158046


## Converting token into token IDs

In [59]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print('Vocabulary size:', vocab_size)

Vocabulary size: 34510


In [60]:
vocab = {token: idx for idx, token in enumerate(all_words)}

In [61]:
for i, token in enumerate(all_words):
    if i >= 100:
        break
    print(f'{i}: {token}')

0: !
1: "
2: #100]
3: &
4: &C
5: &c
6: '
7: (
8: )
9: *
10: ****
11: *****
12: ******
13: ******This
14: **Etexts
15: **Information
16: **Welcome
17: *ANY*
18: *EITHER*
19: *Project
20: *These
21: *This
22: *WANT*
23: *not*
24: ,
25: -
26: --
27: -And
28: -Break
29: -But
30: -Cheerly
31: -Come
32: -Give
33: -Go
34: -Here
35: -Ho
36: -How
37: -I
38: -Let
39: -Luce
40: -My
41: -Nay
42: -Now
43: -Out
44: -Please
45: -Prithee
46: -Spare
47: -Still
48: -THE
49: -That
50: -There
51: -Thick
52: -What
53: -Where
54: -Why
55: -You
56: -[Aside
57: -a
58: -and
59: -even
60: -from
61: -give
62: -gold
63: -have
64: -here
65: -hey
66: -hissing
67: -if
68: -in
69: -is
70: -marry
71: -matter
72: -me
73: -nay
74: -no
75: -odours
76: -of
77: -on
78: -out
79: -perhaps-
80: -say
81: -take
82: -the
83: -there
84: -they
85: -thou
86: -to
87: -we
88: -well
89: -whose
90: -why
91: -yet
92: .
93: /
94: 000
95: 000=Trillion]
96: 08
97: 0INDEX
98: 1
99: 1-800-443-0238


In [62]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {idx: token for token, idx in vocab.items()}
        
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        return [self.str_to_int[token] for token in preprocessed]
    
    def decode(self, tokens):
        return ''.join([self.int_to_str[idx] for idx in tokens])
    

In [68]:
tokenizer = SimpleTokenizerV1(vocab)

text = "To be or not to be"

ids = tokenizer.encode(text)
print(ids)

[8413, 11319, 24479, 24121, 31544, 11319]


In [69]:


tokenizer.decode(ids)

'Tobeornottobe'