In this notebook, we are going to use the book "The Verdict" to demonstrate the tokenization process.

# Naive Approach:

## Step 1: Creating tokens:

In [1]:
# Read the whole text file into memory as a single string.
# NOTE(review): the absolute "/content/..." path presumably points at a Colab
# upload location -- adjust it when running elsewhere.
with open("/content/the-verdict.txt", "r", encoding = "utf-8") as file:
    raw_text = file.read()
# Show the total number of characters, then preview the first 99 of them.
print(len(raw_text))
raw_text[:99]

20479


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no '

To split the text into words, we are going to use regular expressions.

In [2]:
import re

# Example: split on whitespace only. Because the pattern is wrapped in a
# capturing group, re.split also returns the matched whitespace characters
# as separate items; punctuation stays attached to the neighbouring word.

text = "hello, (world!) --How 'are' you?"
res = re.split(r'(\s)', text)

res

['hello,', ' ', '(world!)', ' ', '--How', ' ', "'are'", ' ', 'you?']

In [3]:
# Split on single punctuation characters, the double dash "--", and
# whitespace. When two delimiters are adjacent, re.split yields an empty
# string between them.
res = re.split(r'([,./?!\'"_():]|--|\s)', text)
res

['hello',
 ',',
 '',
 ' ',
 '',
 '(',
 'world',
 '!',
 '',
 ')',
 '',
 ' ',
 '',
 '--',
 'How',
 ' ',
 '',
 "'",
 'are',
 "'",
 '',
 ' ',
 'you',
 '?',
 '']

In [4]:
# Keep only the items that are non-empty after stripping whitespace, i.e.
# drop both the empty strings and the whitespace-only items.
final_res = [item for item in res if item.strip()]
final_res

['hello',
 ',',
 '(',
 'world',
 '!',
 ')',
 '--',
 'How',
 "'",
 'are',
 "'",
 'you',
 '?']

This is a simple tokenizer. It consists of 2 steps:
  - Use a regular expression to split whitespace and punctuation marks away from the words.
  - Strip out the whitespace (done in this case; not all tokenizers do so).

In [5]:
# Apply the same split-then-filter steps to the full text.
# Note: unlike the small example, this pattern also splits on ';'.
preprocessed_text = re.split(r'([,./?!\'"_():;]|--|\s)', raw_text)
preprocessed_text = [item for item in preprocessed_text if item.strip()]
# Preview the first 100 tokens.
preprocessed_text[:100]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in',
 'the',
 'height',
 'of',
 'his',
 'glory',
 ',',
 'he',
 'had',
 'dropped',
 'his',
 'painting',
 ',',
 'married',
 'a',
 'rich',
 'widow',
 ',',
 'and',
 'established',
 'himself',
 'in',
 'a',
 'villa',
 'on',
 'the',
 'Riviera',
 '.',
 '(',
 'Though',
 'I',
 'rather',
 'thought',
 'it',
 'would',
 'have',
 'been',
 'Rome',
 'or',
 'Florence',
 '.',
 ')',
 '"',
 'The',
 'height',
 'of',
 'his',
 'glory',
 '"',
 '--',
 'that',
 'was',
 'what',
 'the',
 'women',
 'called',
 'it',
 '.',
 'I',
 'can',
 'hear',
 'Mrs',
 '.',
 'Gideon',
 'Thwing',
 '--',
 'his',
 'last',
 'Chicago',
 'sitter',
 '--']

In [6]:
# Total number of tokens in the text (repeated tokens are counted each time).
len(preprocessed_text)

4690

## Step 2: Creating token IDs:

In [7]:
# Deduplicate the tokens and sort them. Despite its name, `set_of_words` is a
# sorted list, which gives every unique token a stable position.
set_of_words = sorted(set(preprocessed_text))
set_of_words

['!',
 '"',
 "'",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 ';',
 '?',
 'A',
 'Ah',
 'Among',
 'And',
 'Are',
 'Arrt',
 'As',
 'At',
 'Be',
 'Begin',
 'Burlington',
 'But',
 'By',
 'Carlo',
 'Chicago',
 'Claude',
 'Come',
 'Croft',
 'Destroyed',
 'Devonshire',
 'Don',
 'Dubarry',
 'Emperors',
 'Florence',
 'For',
 'Gallery',
 'Gideon',
 'Gisburn',
 'Gisburns',
 'Grafton',
 'Greek',
 'Grindle',
 'Grindles',
 'HAD',
 'Had',
 'Hang',
 'Has',
 'He',
 'Her',
 'Hermia',
 'His',
 'How',
 'I',
 'If',
 'In',
 'It',
 'Jack',
 'Jove',
 'Just',
 'Lord',
 'Made',
 'Miss',
 'Money',
 'Monte',
 'Moon-dancers',
 'Mr',
 'Mrs',
 'My',
 'Never',
 'No',
 'Now',
 'Nutley',
 'Of',
 'Oh',
 'On',
 'Once',
 'Only',
 'Or',
 'Perhaps',
 'Poor',
 'Professional',
 'Renaissance',
 'Rickham',
 'Riviera',
 'Rome',
 'Russian',
 'Sevres',
 'She',
 'Stroud',
 'Strouds',
 'Suddenly',
 'That',
 'The',
 'Then',
 'There',
 'They',
 'This',
 'Those',
 'Though',
 'Thwing',
 'Thwings',
 'To',
 'Usually',
 'Venetian',
 'Victor',
 '

In [8]:
# Build the vocabulary: each unique token maps to its position in the sorted
# list, which serves as its integer token id.
vocab = {word:index for index, word in enumerate(set_of_words)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [9]:
# Vocabulary size, i.e. the number of unique tokens.
len(vocab)

1130

After we have a vocabulary, we need to create a Tokenizer class. This class needs two methods: **encode** (convert text to integers) and **decode** (convert integers back to text).

In [10]:
class SimpleTokenizerV1:
  """Word-level tokenizer backed by a fixed ``token -> id`` vocabulary.

  Text is split on whitespace, on the double dash ``--`` and on a fixed set
  of punctuation characters; every resulting token is looked up in the
  vocabulary. Tokens that are not in the vocabulary are not handled: they
  make ``encode`` raise ``KeyError``.

  Args:
    vocab: dict mapping token strings to integer ids.
  """

  # The punctuation set includes ';' so that encoding splits text the same
  # way the vocabulary was built; otherwise a token such as "glory;" stays
  # glued together and the vocabulary lookup fails.
  _SPLIT_PATTERN = re.compile(r'([,./?!\'"_():;]|--|\s)')
  # Whitespace that " ".join() inserted in front of a punctuation character.
  _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,./?!\'"_():;])')

  def __init__(self, vocab):
    self.str_to_int = vocab
    # Inverse mapping, used by decode().
    self.int_to_str = {index: word for word, index in vocab.items()}

  def encode(self, text):
    """Convert ``text`` into a list of token ids.

    Raises:
      KeyError: if a token of ``text`` is not in the vocabulary.
    """
    pieces = self._SPLIT_PATTERN.split(text)
    # Drop empty strings and whitespace-only pieces produced by the split.
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [self.str_to_int[token] for token in tokens]

  def decode(self, ids):
    """Convert a sequence of token ids back into a string.

    Tokens are joined with single spaces, then the whitespace in front of
    punctuation characters is removed. "--" is not in that character set,
    so it stays surrounded by spaces.

    Raises:
      KeyError: if an id is not in the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [11]:
# Build a tokenizer from the vocabulary and encode one sample sentence
# into its list of token ids.
tokenizer = SimpleTokenizerV1(vocab)

text = """Of course, if she had not dragged him down, she had equally, as Miss Croft contended, failed to "lift him up"--she had not led him back to the easel."""

ids = tokenizer.encode(text)

ids

[73,
 297,
 5,
 566,
 876,
 514,
 711,
 364,
 546,
 362,
 5,
 876,
 514,
 394,
 5,
 177,
 62,
 28,
 288,
 5,
 422,
 1016,
 1,
 624,
 546,
 1051,
 1,
 6,
 876,
 514,
 711,
 615,
 546,
 191,
 1016,
 988,
 374,
 7]

In [12]:
# Turn the token ids back into text to compare with the original sentence.
converted_text = tokenizer.decode(ids)
converted_text

'Of course, if she had not dragged him down, she had equally, as Miss Croft contended, failed to" lift him up" -- she had not led him back to the easel.'

## Adding special context tokens:

If our vocabulary is too small, it may not cover all of the words that may occur. So we add an `<|unk|>` token for unknown words. Moreover, when we deal with multiple independent documents, we need a token to separate the documents: `<|endoftext|>` (doc 1 -> end-of-text token -> doc 2 -> ...)

In [13]:
# Take the existing vocabulary tokens in sorted order and append the two
# special tokens at the end, so they receive the two highest ids.
all_tokens = sorted(vocab) + ["<|unk|>", "<|endoftext|>"]
# Pair each token with its position in the list to obtain its id.
full_vocab = dict(zip(all_tokens, range(len(all_tokens))))
full_vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [14]:
# Size of the extended vocabulary (the previous tokens plus the two special ones).
len(full_vocab)

1132

In [15]:
# Inspect the (token, id) pairs of the extended vocabulary.
full_vocab.items()


dict_items([('!', 0), ('"', 1), ("'", 2), ('(', 3), (')', 4), (',', 5), ('--', 6), ('.', 7), (':', 8), (';', 9), ('?', 10), ('A', 11), ('Ah', 12), ('Among', 13), ('And', 14), ('Are', 15), ('Arrt', 16), ('As', 17), ('At', 18), ('Be', 19), ('Begin', 20), ('Burlington', 21), ('But', 22), ('By', 23), ('Carlo', 24), ('Chicago', 25), ('Claude', 26), ('Come', 27), ('Croft', 28), ('Destroyed', 29), ('Devonshire', 30), ('Don', 31), ('Dubarry', 32), ('Emperors', 33), ('Florence', 34), ('For', 35), ('Gallery', 36), ('Gideon', 37), ('Gisburn', 38), ('Gisburns', 39), ('Grafton', 40), ('Greek', 41), ('Grindle', 42), ('Grindles', 43), ('HAD', 44), ('Had', 45), ('Hang', 46), ('Has', 47), ('He', 48), ('Her', 49), ('Hermia', 50), ('His', 51), ('How', 52), ('I', 53), ('If', 54), ('In', 55), ('It', 56), ('Jack', 57), ('Jove', 58), ('Just', 59), ('Lord', 60), ('Made', 61), ('Miss', 62), ('Money', 63), ('Monte', 64), ('Moon-dancers', 65), ('Mr', 66), ('Mrs', 67), ('My', 68), ('Never', 69), ('No', 70), ('Now

In [16]:
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

  Text is split on whitespace, on the double dash ``--`` and on a fixed set
  of punctuation characters. Tokens missing from the vocabulary are replaced
  by ``<|unk|>`` before the id lookup.

  Args:
    vocab: dict mapping token strings to integer ids. It should contain the
      ``"<|unk|>"`` entry; without it, encoding an unknown token raises
      ``KeyError``.
  """

  # encode() and decode() share one punctuation set. It includes ':' so that
  # "said:" is split into "said" and ":" instead of collapsing to <|unk|>,
  # and ';' and '/' so that decode() re-attaches every mark encode() splits.
  _SPLIT_PATTERN = re.compile(r'([,./?!\'"_():;]|--|\s)')
  # Whitespace that " ".join() inserted in front of a punctuation character.
  _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,./?!\'"_():;])')
  _UNKNOWN_TOKEN = "<|unk|>"

  def __init__(self, vocab):
    self.str_to_int = vocab
    # Inverse mapping, used by decode().
    self.int_to_str = {index: word for word, index in vocab.items()}

  def encode(self, text):
    """Convert ``text`` into a list of token ids.

    Tokens that are not in the vocabulary are encoded as ``<|unk|>``.

    Raises:
      KeyError: if an unknown token is met and the vocabulary has no
        ``"<|unk|>"`` entry.
    """
    pieces = self._SPLIT_PATTERN.split(text)
    # Drop empty strings and whitespace-only pieces produced by the split.
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [
        self.str_to_int[token if token in self.str_to_int else self._UNKNOWN_TOKEN]
        for token in tokens
    ]

  def decode(self, ids):
    """Convert a sequence of token ids back into a string.

    Tokens are joined with single spaces, then the whitespace in front of
    punctuation characters is removed.

    Raises:
      KeyError: if an id is not in the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [17]:
# Build the tokenizer from the extended vocabulary and encode a sentence;
# tokens that are missing from the vocabulary are encoded as <|unk|>.
tokenizerV2 = SimpleTokenizerV2(full_vocab)

test_text = "Hello, how are 'you' doing?"

ids = tokenizerV2.encode(test_text)
ids

[1130, 5, 560, 169, 2, 1126, 2, 357, 10]

In [18]:
# Decode the ids back to text; unknown tokens come back as "<|unk|>".
convted = tokenizerV2.decode(ids)
convted

"<|unk|>, how are' you' doing?"

In [19]:
# Case with multiple independent text sources.
text1 = "I do. You are."
text2 = "Hello. 'how' are u?"

# Notice the spaces around the end-of-text marker: the tokenizer splits on
# whitespace, so the spaces keep "<|endoftext|>" a token of its own.
text = " <|endoftext|> ".join((text1, text2))
#print(text1 + text2)

ids_multiple = tokenizerV2.encode(text)
ids_multiple

[53, 355, 7, 113, 169, 7, 1131, 1130, 7, 2, 560, 2, 169, 1130, 10]

In [20]:
# Decode the combined ids; the two sources stay separated by "<|endoftext|>".
new_text = tokenizerV2.decode(ids_multiple)
new_text

"I do. You are. <|endoftext|> <|unk|>.' how' are <|unk|>?"

# Byte Pair Encoding:

BPE is too hard to implement from scratch =)

So we are going to use a library named tiktoken.

In [21]:
pip install tiktoken



In [23]:
import tiktoken

In [24]:
# Load tiktoken's "gpt2" encoding. Note that this rebinds `tokenizer`, which
# previously referred to the SimpleTokenizerV1 instance.
tokenizer = tiktoken.get_encoding("gpt2")

Note: the tiktoken tokenizer is used the same way as the SimpleTokenizerV2 we created above (`encode` and `decode`).

In [25]:
# Sample text containing the end-of-text marker and a made-up word.
text = """"Why _has_ he chucked painting?" I asked abruptly. <|endoftext|> iloveass999."""

# allowed_special lists the special tokens that may appear in the input text;
# here "<|endoftext|>" is permitted.
ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

ids

[1,
 5195,
 4808,
 10134,
 62,
 339,
 442,
 17758,
 12036,
 1701,
 314,
 1965,
 25891,
 13,
 220,
 50256,
 4229,
 659,
 562,
 17032,
 13]

In [26]:
# Decode the ids back to text to compare with the original input string.
converted = tokenizer.decode(ids)
converted

'"Why _has_ he chucked painting?" I asked abruptly. <|endoftext|> iloveass999.'