In [None]:
# Load the sample text into memory, then report its length and its first 99 characters
with open("the-verdict.txt", "r", encoding="utf-8") as source_file:
    raw_text = source_file.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

In [None]:
# Split on single whitespace characters; the capture group keeps each
# separator in the output list instead of discarding it.

import re
text = "Hello, world. This, is a test."
result = re.compile(r'(\s)').split(text)
print(result)

In [None]:
# Treat commas and periods as separators too, so they become tokens of their own
result = re.compile(r'([,.]|\s)').split(text)
print(result)

In [None]:
# Drop empty and whitespace-only entries. Note: for some models we would keep the
# whitespace because it can be significant, e.g. for Python syntax.
result = list(filter(str.strip, result))
print(result)

In [None]:
# Expand the separator set with additional punctuation and the double dash

text = "Hello, world. Is this-- a test?"
result = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
    if piece.strip()
]
print(result)

In [None]:
# Apply the same splitting rules to the full sample text and count the tokens
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(len(preprocessed))

In [None]:
# Show the first 30 tokens as a quick sanity check of the split
print(preprocessed[:30])

### Building a vocabulary

A vocabulary defines how we map each word and special character to a unique identifier. To build a vocabulary we tokenize the entire dataset, sort it, and remove duplicates.

In [None]:
# Deduplicate the tokens, sort them, and report how many unique tokens remain
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

In [None]:
# Give every unique token its position in the sorted list as its integer id
vocab = dict(zip(all_words, range(len(all_words))))
# Print the first 51 (token, id) pairs as a sanity check
for position, entry in enumerate(vocab.items()):
    if position > 50:
        break
    print(entry)

In [None]:
# NOTE(review): as the last expression of the cell this displays only the lazy
# enumerate object itself, not the (index, token) pairs it would yield.
enumerate(all_words)

Note that, in addition to mapping tokens to integers, we also need the inverse mapping so that the integer output of an LLM can be converted back into tokens.

In [None]:
# Simple tokenizer class
class SimpleTokenizerV1:
    """Minimal regex-based tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and a fixed set of punctuation
    characters; every resulting token is looked up in the vocabulary.

    Args:
        vocab: Mapping from token string to integer id. The inverse mapping
            is derived from it, so ids are expected to be unique.
    """

    # Must match the pattern used to tokenize the corpus when the vocabulary
    # was built (including ':' and ';'); otherwise text such as "word;" is
    # kept as one token that the vocabulary cannot contain.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace that " ".join() places in front of punctuation tokens.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one integer id per non-whitespace token.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        pieces = self._SPLIT_PATTERN.split(text)
        # Whitespace-only and empty pieces carry no token and are dropped.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert token ids back into a single string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        joined = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', joined)

In [None]:
# Build a tokenizer from the vocabulary and encode a two-line sample sentence
tokenizer = SimpleTokenizerV1(vocab)
# The newline and indentation inside the literal are whitespace, which
# encode() splits on and then discards.
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

In [None]:
# Map the ids back to text to check the round trip
print(tokenizer.decode(ids))

In [None]:
# What if the token is not in the vocab? encode() raises a KeyError. Having a large
# dataset can mitigate this somewhat...
# The error is caught and printed so this demonstration does not abort a full
# "Restart & Run All" of the notebook.
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as missing_token:
    print("KeyError - token not in vocabulary:", missing_token)