In [3]:
# Load text file.
# The encoding is given explicitly so the decoded text does not depend on the
# platform's default locale encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as file:
    raw_content = file.read()

In [4]:
import re

# Preprocess text by splitting on whitespace and punctuation, keeping them as tokens.
# This function will return a list of tokens, where each token is a word or punctuation mark.
def preprocess_text(text):
    """Split text into word and punctuation tokens, discarding whitespace.

    The text is split on each of the characters , . : ; ? _ ! " ( ) ' as well
    as on the double dash "--" and on any whitespace character. The pattern is
    wrapped in a capturing group, so re.split keeps the delimiters in the
    result; every piece is then stripped and empty pieces are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings in their original order.
    """
    # `\s` already matches newlines, so no separate newline alternative is needed.
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    # Strip each piece once, then keep only those that still have content.
    stripped_pieces = (piece.strip() for piece in pieces)
    return [token for token in stripped_pieces if token]

In [5]:
# Preprocess the raw content: tokenize the loaded text with preprocess_text.
preprocessed = preprocess_text(raw_content)
preprocessed[:10]  # Display first 10 tokens for verification

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

In [6]:
# Returns a map of unique words to their indices.
def create_vocab(preprocessed):
    """Build a token-to-index vocabulary from a list of tokens.

    Unique tokens are sorted and numbered from 0. The special tokens
    '<|endoftext|>' and '<|unk|>' are always placed last, in that order.

    Args:
        preprocessed: An iterable of token strings (duplicates allowed).

    Returns:
        A dict mapping each token to a unique integer index. The indices are
        contiguous, covering 0 .. len(vocab) - 1.
    """
    special_tokens = ['<|endoftext|>', '<|unk|>']
    # Exclude special tokens that already occur in the input; otherwise they
    # would be listed twice and the resulting indices would have gaps.
    regular_tokens = sorted(set(preprocessed) - set(special_tokens))
    all_tokens = regular_tokens + special_tokens
    return {word: idx for idx, word in enumerate(all_tokens)}

In [41]:
# Create vocabulary from the preprocessed text.
vocab = create_vocab(preprocessed)
# Print the last 10 entries; create_vocab appends the two special tokens after
# the sorted words, so they appear at the very end.
for k,v in list(vocab.items())[-10:]:
    print (k, v)

year 1122
years 1123
yellow 1124
yet 1125
you 1126
younger 1127
your 1128
yourself 1129
<|endoftext|> 1130
<|unk|> 1131


In [33]:
# Define a simple tokenizer.
# The tokenizer supports encode and decode methods.
# It splits text into words and punctuation marks and discards whitespace.
# Tokens that are not in the vocabulary are encoded as "<|unk|>".
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token-to-index vocabulary."""

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse (index -> token) mapping."""
        self.text_to_int = vocab
        self.int_to_text = {index: token for token, index in vocab.items()}

    def encode(self, text):
        """Convert text to a list of token IDs.

        The text is tokenized with preprocess_text; any token that is not in
        the vocabulary is replaced by the ID of '<|unk|>'.
        """
        token_ids = []
        for token in preprocess_text(text):
            if token not in self.text_to_int:
                # Out-of-vocabulary token: fall back to the unknown marker.
                token = '<|unk|>'
            token_ids.append(self.text_to_int[token])
        return token_ids

    def decode(self, tokens):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation marks and '--' is removed.
        """
        words = (self.int_to_text[token_id] for token_id in tokens)
        spaced = " ".join(words)
        return re.sub(r'\s+([,.:;?_!"()\']|--)', r'\1', spaced)

In [None]:
# Init. tokenizer.
# The class defined in this notebook is SimpleTokenizer.
tokenizerV1 = SimpleTokenizer(vocab)

# Sample use of tokenizer functions.
text = "this Hello world <|endoftext|> is!! was--"
print(tokenizerV1.encode(text))  # Example encoding.
print(tokenizerV1.decode(tokenizerV1.encode(text)))  # Example decoding.
del text

[999, 1131, 1131, 1130, 584, 0, 0, 1077, 6]
this <|unk|> <|unk|> <|endoftext|> is!! was--
