In [1]:
# Cell 1: Import necessary libraries
from collections import defaultdict
from tokenizers import Tokenizer
import re

In [2]:
# Cell 2: Define the text to tokenize
# Sample passage to tokenize. It is four lines long, and the trailing space
# before each newline is part of the text.
text = (
    "I must not fear. Fear is the mind-killer. Fear is the little-death that brings total obliteration. \n"
    "I will face my fear. I will permit it to pass over me and through me. \n"
    "And when it has gone past I will turn the inner eye to see its path. \n"
    "Where the fear has gone there will be nothing. Only I will remain."
)

In [3]:
# Cell 3: Lowercase the text
# Fold the passage to lowercase so tokens differing only in case share one
# vocabulary entry; the name `text` is kept because later cells read it.
text = str.lower(text)

In [4]:
# Cell 4: Implement the Word Tokenizer
def word_tokenizer(text):
    """Split ``text`` into word tokens on runs of whitespace.

    Punctuation is not separated: it stays attached to the neighbouring word.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-delimited substrings, in order. Empty or
        whitespace-only input yields an empty list.
    """
    words = text.split()
    return words

In [5]:
# Cell 5: Implement the Character Tokenizer
def character_tokenizer(text):
    """Break ``text`` into its individual characters.

    Spaces, newlines and punctuation are each kept as their own token.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one single-character string per character of ``text``.
    """
    return [char for char in text]

In [6]:
# Cell 6: Implement the Sentence Tokenizer
def sentence_tokenizer(text):
    """Split ``text`` into sentences at ``.``, ``!`` or ``?``.

    The terminating punctuation and any whitespace that follows it are
    removed from the returned sentences. Empty pieces are dropped, so text
    that ends with punctuation, or contains runs such as ``...``, does not
    produce empty-string "sentences".

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty sentence strings in their original order. Empty
        or punctuation-only input yields an empty list.
    """
    pieces = re.split(r'[.!?]\s*', text.strip())
    # re.split emits '' after a trailing delimiter and between adjacent
    # delimiters; those are artefacts of splitting, not sentences.
    return [piece for piece in pieces if piece]

In [7]:
# Cell 7: Create a vocabulary function
def create_vocabulary(tokens):
    """Build a token-to-index mapping from a sequence of tokens.

    Each distinct token receives the next unused integer, starting at 0, in
    order of first appearance. Repeated tokens keep the index assigned the
    first time they were seen.

    Args:
        tokens: An iterable of hashable tokens.

    Returns:
        A plain ``dict`` mapping each distinct token to its index.
    """
    vocab = {}
    for token in tokens:
        # setdefault only inserts when the token is new, and at that moment
        # len(vocab) is exactly the next free index.
        vocab.setdefault(token, len(vocab))
    return vocab

In [8]:
# Cell 8: Tokenize the text using each tokenizer
# Run the word, character and sentence tokenizers over the same text.
word_tokens, char_tokens, sentence_tokens = (
    word_tokenizer(text),
    character_tokenizer(text),
    sentence_tokenizer(text),
)

In [9]:
# Cell 9: Create vocabularies for each tokenizer
# Build one token -> index mapping per tokenization.
word_vocab, char_vocab, sentence_vocab = map(
    create_vocabulary,
    (word_tokens, char_tokens, sentence_tokens),
)

In [10]:
# Cell 10: Convert tokens to indices
# Look every token up in its own vocabulary to obtain the index sequences.
word_indices = list(map(word_vocab.__getitem__, word_tokens))
char_indices = list(map(char_vocab.__getitem__, char_tokens))
sentence_indices = list(map(sentence_vocab.__getitem__, sentence_tokens))

In [11]:
# Cell 11: Encode the sequence using a pretrained WordPiece tokenizer
# Load the tokenizer published under the "bert-base-uncased" identifier.
# NOTE(review): from_pretrained presumably fetches the tokenizer files from the
# Hugging Face Hub on first use, so this cell likely needs network access — confirm.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
# Encode the text and keep only the `.ids` attribute of the resulting encoding.
wordpiece_indices = tokenizer.encode(text).ids

In [12]:
# Cell 12: Print the results
# Show each index sequence on its own labelled line.
print(f"Word Tokenizer Output (Indices): {word_indices}")
print(f"Character Tokenizer Output (Indices): {char_indices}")
print(f"Sentence Tokenizer Output (Indices): {sentence_indices}")
print(f"WordPiece Tokenizer Output (Indices): {wordpiece_indices}")

Word Tokenizer Output (Indices): [0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 8, 9, 10, 11, 12, 0, 13, 14, 15, 3, 0, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 22, 25, 17, 26, 27, 28, 0, 13, 29, 6, 30, 31, 18, 32, 33, 34, 35, 6, 4, 26, 27, 36, 13, 37, 38, 39, 0, 13, 40]
Character Tokenizer Output (Indices): [0, 1, 2, 3, 4, 5, 1, 6, 7, 5, 1, 8, 9, 10, 11, 12, 1, 8, 9, 10, 11, 1, 0, 4, 1, 5, 13, 9, 1, 2, 0, 6, 14, 15, 16, 0, 17, 17, 9, 11, 12, 1, 8, 9, 10, 11, 1, 0, 4, 1, 5, 13, 9, 1, 17, 0, 5, 5, 17, 9, 15, 14, 9, 10, 5, 13, 1, 5, 13, 10, 5, 1, 18, 11, 0, 6, 19, 4, 1, 5, 7, 5, 10, 17, 1, 7, 18, 17, 0, 5, 9, 11, 10, 5, 0, 7, 6, 12, 1, 20, 0, 1, 21, 0, 17, 17, 1, 8, 10, 22, 9, 1, 2, 23, 1, 8, 9, 10, 11, 12, 1, 0, 1, 21, 0, 17, 17, 1, 24, 9, 11, 2, 0, 5, 1, 0, 5, 1, 5, 7, 1, 24, 10, 4, 4, 1, 7, 25, 9, 11, 1, 2, 9, 1, 10, 6, 14, 1, 5, 13, 11, 7, 3, 19, 13, 1, 2, 9, 12, 1, 20, 10, 6, 14, 1, 21, 13, 9, 6, 1, 0, 5, 1, 13, 10, 4, 1, 19, 7, 6, 9, 1, 24, 10, 4, 5, 1, 0, 1, 21, 0, 17, 17, 1, 5, 3, 11, 6, 1, 5, 