# Chapter 2: Working with text data

## 2.2: Tokenizing Text

In [None]:
# Load the complete short story into memory as one string. The encoding is
# given explicitly so the read does not depend on the platform default.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# print() already inserts a space between its arguments, so the label must
# not end with one (otherwise the output contains a doubled space).
print("Total number of characters:", len(raw_text))

# Preview the first 99 characters as a quick sanity check.
print(raw_text[:99])

In [None]:
import re
text = "Hello, world. This, is a test."

# Pass 1: split on whitespace only. The capture group makes re.split keep
# each whitespace separator as its own list element.
result = re.split(r'(\s)', text)
print(result)

# Pass 2: also treat commas and periods as separators, so punctuation
# becomes its own token instead of staying glued to the preceding word.
result = re.split(r'([,.]|\s)', text)
print(result)

# Pass 3: drop every element that is empty or whitespace-only
# (str.strip returns a falsy empty string for those).
result = list(filter(str.strip, result))
print(result)

In [None]:
# A more capable tokenizer: besides whitespace it splits on a wider set of
# punctuation characters and on the double dash "--".
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Strip every piece, then keep only the ones that are non-empty afterwards.
result = [stripped for stripped in map(str.strip, result) if stripped]
print(result)

In [None]:
# Apply our more advanced (but still very basic!) tokenizer to Edith Wharton's
# entire short story. The capture group keeps punctuation and "--" as tokens.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Strip each piece and discard those that are empty or whitespace-only.
preprocessed = [item.strip() for item in preprocessed if item.strip()]

## 2.3: Converting tokens into token IDs

In [None]:
# Create a list of all unique tokens (words and punctuation marks).
# set() removes duplicates; sorted() then orders the tokens by Python's
# default string ordering (code point order, not strictly alphabetical).
# This is our vocabulary.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

In [None]:
# Map every token to an integer ID (its position in the sorted token list),
# then print the first 51 (token, ID) pairs as a spot check.
vocab = {word: idx for idx, word in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i == 50:
        break

In [None]:
# A tokenizer that converts text into token IDs and token IDs back into text,
# using a vocabulary that is supplied by the caller.
class SimpleTokenizerV1:
    """Minimal regex-based tokenizer backed by a fixed vocabulary.

    The vocabulary maps each token string to an integer ID. ``encode``
    raises ``KeyError`` for any token that is missing from the vocabulary,
    and ``decode`` raises ``KeyError`` for any ID that is unknown.
    """

    # Splits on punctuation, "--" and whitespace; the capture group makes
    # the separators themselves show up in the split result.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Matches the whitespace sitting directly in front of a punctuation mark.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the ID -> token mapping."""
        # Instance attribute used by encode(); the mapping is stored as-is,
        # not copied.
        self.str_to_int = vocab
        # Inverse lookup used by decode().
        self.int_to_str = {
            token_id: token for token, token_id in vocab.items()
        }

    def encode(self, text):
        """Return the list of token IDs for ``text``."""
        pieces = self._SPLIT_PATTERN.split(text)
        # Strip every piece and drop the ones that end up empty.
        tokens = [
            stripped
            for stripped in (piece.strip() for piece in pieces)
            if stripped
        ]
        # Direct indexing on purpose: an unknown token raises KeyError.
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Return the text obtained by joining the tokens for ``ids``."""
        # Tokens are joined with single spaces ...
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # ... and the space that lands before punctuation is removed again.
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', joined)

In [None]:
# Initialize our tokenizer with the full vocab of the entire story.
tokenizer = SimpleTokenizerV1(vocab)

# Tokenize a passage from Edith Wharton's short story.
# NOTE: the literal begins with a double quote character right after the
# opening triple quote, and the second line's leading spaces are part of the
# string; encode() splits on whitespace and discards it, so they are harmless.
text = """"It's the last he painted, you know,"
        Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

In [None]:
# Can we turn these IDs back into text?
# decode() joins the tokens with single spaces and then removes the
# whitespace in front of , . ? ! " ( ) and ', so the original spacing and
# line breaks are not restored exactly.
print(tokenizer.decode(ids))

In [None]:
# Try to tokenize text not in the training set used to generate our vocabulary.
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

# Results in KeyError: encode() looks every token up directly in the
# vocabulary, so any token that is absent from it raises (presumably "Hello"
# here; the vocabulary contents are not shown, so verify by running the cell).