In [1]:
import pathlib

# Read in raw text

In [2]:
# Load the whole book into memory as one UTF-8 decoded string
data_dir = pathlib.Path("data")
text = (data_dir / "wizard_of_oz.txt").read_text(encoding="utf-8")
print("Text length: ", len(text))
print(text[:200])

Text length:  232284
DOROTHY AND THE WIZARD IN OZ

BY

L. FRANK BAUM

AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

ILLUSTRATED BY JOHN R. NEILL

BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK


[Illu


# Create tokenizer

In [4]:
# Collect every distinct character of the text, in sorted order
chars = list(set(text))
chars.sort()
print("Number of unique characters: ", len(chars))
print(chars)

Number of unique characters:  80
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
class Tokenizer:
    """
    Tokenizer mapping vocabulary entries to integer ids and back.

    The id of an entry is its position in the vocabulary passed to the
    constructor.

    Attributes
    ----------
        str_to_int: dict[str, int]
            Mapping from string in vocabulary to integer
        int_to_str: dict[int, str]
            Mapping from integer to string in vocabulary

    Methods
    -------
        encode(strs) -> list[int]
        decode(tokens) -> list[str]
    """

    def __init__(self, vocab: list[str]) -> None:
        """
        Build both lookup tables from the vocabulary.

        Args
        ----
            vocab: list[str]
                Vocabulary entries; each entry's id is its position.
                Any iterable works and is consumed exactly once. If an
                entry is repeated, ``str_to_int`` keeps the id of its
                last occurrence.
        """
        # Enumerate only once: a one-shot iterable (e.g. a generator) would
        # otherwise be exhausted after the first table and leave the second
        # one empty.
        self.int_to_str = dict(enumerate(vocab))
        self.str_to_int = {s: i for i, s in self.int_to_str.items()}

    def encode(self, strs: list[str]) -> list[int]:
        """
        Encode a list of strings into a list of tokens.

        Args
        ----
            strs: list[str]
                List of strings. A plain ``str`` also works, because
                iterating over it yields its characters one by one.
        Return
        ------
            list[int]:
                List of encoded strings
        Raises
        ------
            KeyError:
                If an item is not part of the vocabulary
        """
        try:
            return [self.str_to_int[s] for s in strs]
        except KeyError as err:
            # Still a KeyError (callers may catch it), but say what failed.
            raise KeyError(
                f"symbol {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None

    def decode(self, tokens: list[int]) -> list[str]:
        """
        Decode a list of tokens into a list of strings.

        Args
        ----
            tokens: list[int]
                List of tokens
        Return
        ------
            list[str]
                List of decoded tokens
        Raises
        ------
            KeyError:
                If a token id is not part of the vocabulary
        """
        try:
            return [self.int_to_str[t] for t in tokens]
        except KeyError as err:
            # Still a KeyError (callers may catch it), but say what failed.
            raise KeyError(
                f"token id {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None