# Import Libraries

In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sentences

In [2]:
# Corpus used to fit the tokenizer. Casing and punctuation are mixed, and both
# "ice cream" and "icecream" appear.
sentences = [
    "My favorite food is ice cream",
    "do you like ice cream too?",
    "My dog likes ice cream!",
    "your favorite flavor of icecream is chocolate",
    "chocolate isn't good for dogs",
    "your dog, your cat, and your parrot prefer broccoli",
]

# Tokenize the Words

In [3]:
# Create the tokenizer instance.
# - num_words=100: upper bound on the vocabulary size (by word frequency) that
#   the tokenizer keeps when encoding text.
# - oov_token='<OOV>': placeholder entry added to the word index and substituted
#   for any word that is not in the vocabulary when encoding.
# NOTE(review): this Tokenizer class is marked deprecated in recent TensorFlow
# releases in favor of tf.keras.layers.TextVectorization — confirm against the
# TF version in use.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer

<keras_preprocessing.text.Tokenizer at 0x7f0efe230350>

In [4]:
# Show the settings stored on the tokenizer (vocabulary cap, OOV placeholder).
tokenizer.num_words, tokenizer.oov_token

(100, '<OOV>')

In [6]:
# Build the tokenizer's internal vocabulary from the corpus. Updates the
# tokenizer in place.
tokenizer.fit_on_texts(sentences)

In [12]:
# word_index maps each word to its integer index. The OOV placeholder takes
# index 1; keys are lowercased with most punctuation removed (apostrophes are
# kept, e.g. "isn't").
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'your': 2, 'ice': 3, 'cream': 4, 'my': 5, 'favorite': 6, 'is': 7, 'dog': 8, 'chocolate': 9, 'food': 10, 'do': 11, 'you': 12, 'like': 13, 'too': 14, 'likes': 15, 'flavor': 16, 'of': 17, 'icecream': 18, "isn't": 19, 'good': 20, 'for': 21, 'dogs': 22, 'cat': 23, 'and': 24, 'parrot': 25, 'prefer': 26, 'broccoli': 27}


In [13]:
# Look up the index assigned to a single word.
print(word_index['your'])

2


In [14]:
# Encode every sentence as a list of word indices (one list per sentence).
sent_sequences = tokenizer.texts_to_sequences(sentences)
sent_sequences

[[5, 6, 10, 7, 3, 4],
 [11, 12, 13, 3, 4, 14],
 [5, 8, 15, 3, 4],
 [2, 6, 16, 17, 18, 7, 9],
 [9, 19, 20, 21, 22],
 [2, 8, 2, 23, 24, 2, 25, 26, 27]]

# Check how words not in the vocabulary are handled

In [15]:
# Sentences that include words which do not occur anywhere in `sentences`.
sentences2 = [
    "I like hot chocolate",
    (
        "My dogs and my hedgehog like kibble but my squirrel prefers grapes "
        "and my chickens like ice cream, preferably vanilla"
    ),
]


In [16]:
# Encode with the already-fitted tokenizer. Words missing from the vocabulary
# are replaced by the OOV index (1).
sent2_sequences = tokenizer.texts_to_sequences(sentences2)
sent2_sequences

[[1, 13, 1, 9],
 [5, 22, 24, 5, 1, 13, 1, 1, 5, 1, 1, 1, 24, 5, 1, 13, 3, 4, 1, 1]]

# Make the sequences the same length

In [18]:
# With default arguments, shorter sequences are filled with zeros at the front
# until they match the longest sequence (9 tokens here).
padded = pad_sequences(sent_sequences)
print(padded)

[[ 0  0  0  5  6 10  7  3  4]
 [ 0  0  0 11 12 13  3  4 14]
 [ 0  0  0  0  5  8 15  3  4]
 [ 0  0  2  6 16 17 18  7  9]
 [ 0  0  0  0  9 19 20 21 22]
 [ 2  8  2 23 24  2 25 26 27]]


In [20]:
# Pad to a fixed length of 15 instead of the longest sequence's length.
# Zeros are still added at the front.
padded = pad_sequences(sent_sequences, maxlen=15)
print(padded)

[[ 0  0  0  0  0  0  0  0  0  5  6 10  7  3  4]
 [ 0  0  0  0  0  0  0  0  0 11 12 13  3  4 14]
 [ 0  0  0  0  0  0  0  0  0  0  5  8 15  3  4]
 [ 0  0  0  0  0  0  0  0  2  6 16 17 18  7  9]
 [ 0  0  0  0  0  0  0  0  0  0  9 19 20 21 22]
 [ 0  0  0  0  0  0  2  8  2 23 24  2 25 26 27]]


In [21]:
# Put the padding at the end: padding='post' appends the zeros after the
# tokens instead of placing them in front.
padded = pad_sequences(sent_sequences, maxlen=15, padding='post')
print(padded)

[[ 5  6 10  7  3  4  0  0  0  0  0  0  0  0  0]
 [11 12 13  3  4 14  0  0  0  0  0  0  0  0  0]
 [ 5  8 15  3  4  0  0  0  0  0  0  0  0  0  0]
 [ 2  6 16 17 18  7  9  0  0  0  0  0  0  0  0]
 [ 9 19 20 21 22  0  0  0  0  0  0  0  0  0  0]
 [ 2  8  2 23 24  2 25 26 27  0  0  0  0  0  0]]


In [22]:
# Extra sentences to encode with the fitted tokenizer.
test_data = [
    "my best friend's favorite ice cream flavor is strawberry",
    "my dog's best friend is a manatee",
]
print(test_data)

["my best friend's favorite ice cream flavor is strawberry", "my dog's best friend is a manatee"]


In [23]:
# Encode the test sentences with the existing vocabulary (no re-fitting).
# Any word not in the vocabulary becomes the OOV index (1).
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[5, 1, 1, 6, 3, 4, 16, 7, 1], [5, 1, 1, 1, 7, 1, 1]]


In [24]:
# Pad the encoded test sentences to a fixed length of 10. The default padding
# side is used, so zeros go at the front.
test_padding = pad_sequences(test_seq, maxlen=10)
print(test_padding)

[[ 0  5  1  1  6  3  4 16  7  1]
 [ 0  0  0  5  1  1  1  7  1  1]]
