In [None]:
# Import the Tokenizer and the sequence-padding helper
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Example corpus: six short sentences used below to fit the tokenizer
# and to build the sequences.
sentences = [
    "One of my favorite programming language is Python",
    "do you like computer too?",
    "My dog likes to play!",
    "My favorite car is Bugatti",
    "Bugatti was a French car manufacturer of high-performance automobiles",
    "My car, my machine, and my transport",
]

**Tokenize the words**
The first step in preparing text for use in a machine learning model is to tokenize the text, in other words, to generate numbers for the words.

In [None]:
# num_words optionally caps the number of words to tokenize.
# oov_token is the out-of-vocabulary (OOV) placeholder that represents
# words that are not in the word index.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

# fit_on_texts() generates a unique number for each word in the sentences.
tokenizer.fit_on_texts(sentences)

**View the word index**

After tokenizing the text, the tokenizer has a word index that contains key-value pairs for all the words and their numbers.

The word is the key, and the number is the value.

Notice that the OOV token is the first entry.

In [None]:
# Examine the word index: a dict whose keys are the words and whose
# values are the numbers the tokenizer assigned to them
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'car': 3, 'of': 4, 'favorite': 5, 'is': 6, 'bugatti': 7, 'one': 8, 'programming': 9, 'language': 10, 'python': 11, 'do': 12, 'you': 13, 'like': 14, 'computer': 15, 'too': 16, 'dog': 17, 'likes': 18, 'to': 19, 'play': 20, 'was': 21, 'a': 22, 'french': 23, 'manufacturer': 24, 'high': 25, 'performance': 26, 'automobiles': 27, 'machine': 28, 'and': 29, 'transport': 30}


In [None]:
# Get the number for a given word by using the word as the key
# into the word index
print(word_index['language'])

10


**Create sequences for the sentences**

After tokenizing the words, the word index contains a unique number for each word. However, the numbers in the word index are not ordered. Words in a sentence have an order. So after tokenizing the words, the next step is to generate sequences for the sentences.

In [None]:
# Turn every sentence into a list of word numbers, one number per word,
# in the same order as the words appear in the sentence
sequences = tokenizer.texts_to_sequences(sentences)
print (sequences)

[[8, 4, 2, 5, 9, 10, 6, 11], [12, 13, 14, 15, 16], [2, 17, 18, 19, 20], [2, 5, 3, 6, 7], [7, 21, 22, 23, 3, 24, 4, 25, 26, 27], [2, 3, 2, 28, 29, 2, 30]]


**Sequence sentences that contain words that are not in the word index**

Let's take a look at what happens if the sentence being sequenced contains words that are not in the word index.

The Out of Vocabulary (OOV) token is the first entry in the word index. We will see that it shows up in the sequences in place of any word that is not in the word index.

In [None]:
# New sentences that contain words missing from the word index
new_sentences = [
    "I like sport car",
    "My car and my machine are made by myself but both need improvement",
]

# Words that are not in the word index come out as the OOV number
sequences2 = tokenizer.texts_to_sequences(new_sentences)
print(sequences2)

[[1, 14, 1, 3], [2, 3, 29, 2, 28, 1, 1, 1, 1, 1, 1, 1, 1]]


**Make the sequences all the same length**

Later, when we feed the sequences into a neural network to train a model, the sequences all **need to be uniform in size**. Currently the sequences have varied lengths, so the next step is to make them all the same size by padding them with zeros and/or truncating them.

Use `tf.keras.preprocessing.sequence.pad_sequences` to add zeros to the sequences to make them all be the same length. By default, the padding goes at the start of the sequences, but we can specify to pad at the end.

We can optionally specify the maximum length to pad the sequences to. Sequences that are longer than the specified max length will be truncated. By default, sequences are truncated from the beginning of the sequence, but we can specify to truncate from the end.

If the max length is not provided, then the sequences are padded to match the length of the longest sentence.

For all the options when padding and truncating sequences, see https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences

In [None]:
# pad_sequences is imported in the notebook's first (imports) cell.
# With no maxlen given, every sequence is padded with zeros at the start
# so that it matches the length of the longest sequence.
padded = pad_sequences(sequences)

# Show the word index and the unpadded sequences next to the padded result
print("\nWord Index = ", word_index)
print("\nSequences = ", sequences)
print("\nPadded Sequences:")
print(padded)


Word Index =  {'<OOV>': 1, 'my': 2, 'car': 3, 'of': 4, 'favorite': 5, 'is': 6, 'bugatti': 7, 'one': 8, 'programming': 9, 'language': 10, 'python': 11, 'do': 12, 'you': 13, 'like': 14, 'computer': 15, 'too': 16, 'dog': 17, 'likes': 18, 'to': 19, 'play': 20, 'was': 21, 'a': 22, 'french': 23, 'manufacturer': 24, 'high': 25, 'performance': 26, 'automobiles': 27, 'machine': 28, 'and': 29, 'transport': 30}

Sequences =  [[8, 4, 2, 5, 9, 10, 6, 11], [12, 13, 14, 15, 16], [2, 17, 18, 19, 20], [2, 5, 3, 6, 7], [7, 21, 22, 23, 3, 24, 4, 25, 26, 27], [2, 3, 2, 28, 29, 2, 30]]

Padded Sequences:
[[ 0  0  8  4  2  5  9 10  6 11]
 [ 0  0  0  0  0 12 13 14 15 16]
 [ 0  0  0  0  0  2 17 18 19 20]
 [ 0  0  0  0  0  2  5  3  6  7]
 [ 7 21 22 23  3 24  4 25 26 27]
 [ 0  0  0  2  3  2 28 29  2 30]]


In [None]:
# Specify a max length for the padded sequences: every row is padded
# with zeros (at the start, by default) up to maxlen entries
padded = pad_sequences(sequences, maxlen=15)
print(padded)

[[ 0  0  0  0  0  0  0  8  4  2  5  9 10  6 11]
 [ 0  0  0  0  0  0  0  0  0  0 12 13 14 15 16]
 [ 0  0  0  0  0  0  0  0  0  0  2 17 18 19 20]
 [ 0  0  0  0  0  0  0  0  0  0  2  5  3  6  7]
 [ 0  0  0  0  0  7 21 22 23  3 24  4 25 26 27]
 [ 0  0  0  0  0  0  0  0  2  3  2 28 29  2 30]]


In [None]:
# Put the padding at the end of the sequences instead of at the start
padded = pad_sequences(sequences, maxlen=15, padding="post")
print(padded)

[[ 8  4  2  5  9 10  6 11  0  0  0  0  0  0  0]
 [12 13 14 15 16  0  0  0  0  0  0  0  0  0  0]
 [ 2 17 18 19 20  0  0  0  0  0  0  0  0  0  0]
 [ 2  5  3  6  7  0  0  0  0  0  0  0  0  0  0]
 [ 7 21 22 23  3 24  4 25 26 27  0  0  0  0  0]
 [ 2  3  2 28 29  2 30  0  0  0  0  0  0  0  0]]


In [None]:
# A maxlen shorter than the sequences truncates them. By default the
# truncation happens at the beginning, so only the last 3 numbers of
# each sequence are kept.
padded = pad_sequences(sequences, maxlen=3)
print(padded)

[[10  6 11]
 [14 15 16]
 [18 19 20]
 [ 3  6  7]
 [25 26 27]
 [29  2 30]]


In [None]:
# Sentences that mix known words with words that are missing from the
# word index, to see how they are turned into sequences.
test_data = [
    "my best friend's favorite car is Audi and car manufacturer is at French",
    "my best friend like auto machine",
]
print(test_data)

# Reminder of which number stands for the out-of-vocabulary token
print("<OOV> has the number", word_index["<OOV>"], "in the word index.")

# Turn the test sentences into sequences of word numbers
test_seq = tokenizer.texts_to_sequences(test_data)
print("\nTest Sequence = ", test_seq)

# Pad the new sequences to a common length.
# In the printed result, "1" appears wherever a word is not in the word index.
padded = pad_sequences(test_seq)
print("\nPadded Test Sequence: ")
print(padded)

["my best friend's favorite car is Audi and car manufacturer is at French", 'my best friend like auto machine']
<OOV> has the number 1 in the word index.

Test Sequence =  [[2, 1, 1, 5, 3, 6, 1, 29, 3, 24, 6, 1, 23], [2, 1, 1, 14, 1, 28]]

Padded Test Sequence: 
[[ 2  1  1  5  3  6  1 29  3 24  6  1 23]
 [ 0  0  0  0  0  0  0  2  1  1 14  1 28]]
