In [None]:
from importlib.metadata import version

# load raw text dataset and split the text into tokens

In [1]:
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
  raw_text = f.read()

In [None]:
# raw_text 

In [2]:
raw_text[:1000]  # Display the first 1000 characters

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [3]:
len(raw_text)  # Display the length of the text to confirm it has been loaded correctly

20479

Split the text into tokens based on punctuation and whitespace

In [4]:
import re
# text = 'Hello, world. This, is a test.'
# result = re.split(r'(\s)', text)
# result

In [5]:

result = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
result = [item.strip() for item in result if item.strip()]
# print(result)
preprocessed = result

In [6]:
len(preprocessed)  # Display the length of the preprocessed text to confirm it has been processed correctly

4690

In [7]:
preprocessed[:10]  # Display the first 10 tokens to verify preprocessing

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

# converting tokens into token IDs

first build a vocabulary from the preprocessed text

In [8]:
all_words = set(preprocessed)  # Display the unique tokens in the preprocessed text

In [9]:
vocab_size = len(all_words)
print(vocab_size)  # Display the size of the vocabulary

1130


In [10]:
vocab = {token:integer for integer, token in enumerate(all_words)}

In [11]:
for index, item in enumerate(list(vocab.items())[:10]):
  print(item)

('off', 0)
('chucked', 1)
('get', 2)
('Carlo', 3)
('tribute', 4)
('least', 5)
('course', 6)
('an', 7)
('etching', 8)
('from', 9)


In [13]:
class SimpleTokenizerV1:
  def __init__(self, vocab):
    self.str_to_int = vocab # {str:int}
    self.int_to_str = {i:s for s, i in vocab.items()} # {int:str}
  
  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    
    preprocessed = [
      item.strip() for item in preprocessed if item.strip()
    ] # Remove empty strings
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids 

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    # replace spaces before the specified punctuations
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text


In [14]:
tokenizer = SimpleTokenizerV1(vocab)

In [15]:
# example 
text = """"It's the last he painted, you know,"
    Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[960, 208, 412, 11, 680, 35, 59, 750, 622, 875, 438, 622, 960, 774, 744, 13, 994, 536, 853, 1049, 744]


In [16]:
# decode 
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

# adding special context tokens

the vocabulary is limited, e.g. it does not contain 'hello' and cannot tokenize the following simple sentence 

In [18]:
# e.g. Hello is not in the vocabulary 
text = "Hello, do you like tea. is this-- a test?"
tokenizer.decode(tokenizer.encode(text))

KeyError: 'Hello'

expand the vocabulary with special tokens

In [None]:

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|enoftext|>", "<|unk|>"]) 

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [None]:
len(vocab.items()) # it now has two more tokens "<|enoftext|>", "<|unk|>"

1132

In [20]:
for i, item in enumerate(list(vocab.items())[-5:]):
  print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|enoftext|>', 1130)
('<|unk|>', 1131)


In [None]:
class SimpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab # {str:int}
    self.int_to_str = {i:s for s, i in vocab.items()} # {int:str}
  
  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    
    preprocessed = [
      item.strip() for item in preprocessed if item.strip()
    ] # Remove empty strings
    preprocessed = [
      item if item in self.str_to_int
      else "<|unk|>" for item in preprocessed
    ]
    ids = [self.str_to_int[s] for s in preprocessed]
    return ids 

  def decode(self, ids):
    text = " ".join([self.int_to_str[i] for i in ids])
    # replace spaces before the specified punctuations
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text


In [None]:
tokenizer = SimpleTokenizerV2(vocab)
tokenizer.encode(text)

In [None]:
text

In [None]:
tokenizer.decode(tokenizer.encode(text))

# byte pair encoding