In [1]:
from importlib.metadata import version

# load raw text dataset and split the text into tokens

In [1]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
  raw_text = f.read()

In [72]:
# raw_text 

In [2]:
len(raw_text)  # Display the length of the text to confirm it has been loaded correctly

20479

In [5]:
import re
text = 'Hello, world. This, is a test.'
result = re.split(r'(\s)', text)
result

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']

In [6]:
result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
result = [item.strip() for item in result if item.strip()]
# print(result)
preprocessed = result

In [7]:
len(preprocessed)  # Display the length of the preprocessed text to confirm it has been processed correctly

4690

In [8]:
preprocessed[:10]  # Display the first 10 tokens to verify preprocessing

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

# converting tokens into token IDs

1. first build a vocabulary from the preprocessed text

In [9]:
all_words = set(preprocessed)  # Display the unique tokens in the preprocessed text

In [10]:
vocab_size = len(all_words)
print(vocab_size)  # Display the size of the vocabulary

1130


In [11]:
vocab = {token:integer for integer, token in enumerate(all_words)}

In [12]:
vocab

{'seemed': 0,
 'flowers': 1,
 'No': 2,
 'knew': 3,
 'circulation': 4,
 'murmur': 5,
 'HAD': 6,
 'lair': 7,
 'familiar': 8,
 'desire': 9,
 'fewer': 10,
 'happened': 11,
 'problem': 12,
 'surrounded': 13,
 'husband': 14,
 'thought': 15,
 'Why': 16,
 'garlanded': 17,
 'across': 18,
 '.': 19,
 'sweetly': 20,
 'At': 21,
 'preliminary': 22,
 'timorously': 23,
 'landing': 24,
 'substantial': 25,
 'can': 26,
 'white': 27,
 'want': 28,
 'women': 29,
 'face': 30,
 'Stroud': 31,
 'lay': 32,
 'established': 33,
 'such': 34,
 'Greek': 35,
 'painting': 36,
 '_': 37,
 'those': 38,
 "'": 39,
 'Riviera': 40,
 'rather': 41,
 'have': 42,
 'lies': 43,
 'absorbed': 44,
 'home': 45,
 'an': 46,
 'spaniel': 47,
 'Yes': 48,
 'finality': 49,
 'irony': 50,
 'Victor': 51,
 'chair': 52,
 'inevitable': 53,
 'traps': 54,
 'behind': 55,
 'strongly': 56,
 'elbow': 57,
 'tie': 58,
 'What': 59,
 'foreseen': 60,
 'came': 61,
 'while': 62,
 'because': 63,
 'wander': 64,
 'line': 65,
 'recreated': 66,
 'flashed': 67,
 'bro

In [30]:
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab # {str:int}
    self.int_to_str = {i:s for s, i in vocab.items()} # {int:str}
  
  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    
    preprocessed = [
      item.strip() for item in preprocessed if item.strip()
    ] # Remove empty strings
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids 

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    # replace spaces before the specified punctuations
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text


In [31]:
tokenizer = SimpleTokenizerV1(vocab)

In [32]:
text = """"It's the last he painted, you know,"
    Mrs. Gisburn said with pardonable pride."""

In [33]:
ids = tokenizer.encode(text)

In [34]:
print(ids)

[126, 819, 39, 577, 88, 379, 806, 742, 149, 255, 956, 149, 126, 509, 19, 709, 276, 670, 236, 441, 19]


In [35]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

# adding special context tokens

 the vocabulary created from the small dataset (the-verdict.txt in this case) may be very limited. when using this vocabulary to tokenize some other text, it may very likely come across new words not contained in the vocabulary. As such, we need to make our vocabulary more complex and inclusive. 

In [None]:
# e.g. Hello is not in the vocabulary 
text = "Hello, do you like tea. is this-- a test?"
tokenizer.encode(text)

KeyError: 'Hello'

In [41]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|enoftext|>", "<|unk|>"]) 

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [45]:
len(vocab.items())

1132

In [47]:
for i, item in enumerate(list(vocab.items())[-5:]):
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|enoftext|>', 1130)
('<|unk|>', 1131)


In [60]:
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab # {str:int}
    self.int_to_str = {i:s for s, i in vocab.items()} # {int:str}
  
  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    
    preprocessed = [
      item.strip() for item in preprocessed if item.strip()
    ] # Remove empty strings
    preprocessed = [
      item if item in self.str_to_int
      else "<|unk|>" for item in preprocessed
    ]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids 

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    # replace spaces before the specified punctuations
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text


In [66]:
tokenizer = SimpleTokenizerV2(vocab)
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 7, 584, 999, 6, 115, 1131, 10]

In [69]:
text

'Hello, do you like tea. is this-- a test?'

In [68]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea. is this -- a <|unk|>?'

# byte pair encoding