In [43]:
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer           # creates vectors out of sentences
from tensorflow.keras.preprocessing.sequence import pad_sequences   # adds padding to sequences to make them uniform

In [44]:
# Tiny corpus the tokenizer is fitted on; note the mixed casing and the
# '!' / '?' punctuation in the last two sentences.
sentences = [
    'I love my dog', 'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

In [45]:
# creates instance of tokenizer 
# num_words takes top 100 words by volume and only encodes those (only affects sequences, not word_index)
# might lightly affect accuracy but will hugely affect train time
# oov_token defines what to be used for words that aren't in the trained word_index 
# want to make oov_token something unique that will not be confused with real word
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')

# takes in data and encodes
tokenizer.fit_on_texts(sentences)

# word_index returns diction where key is word and value is token for the word
# auto converts to lowercase and strips out punctuation
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


In [46]:
# Encode every sentence as a list of integer tokens.
# texts_to_sequences() accepts any text, not only the sentences used for fitting;
# it encodes with the vocabulary learned by fit_on_texts().
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


In [50]:
# pad_sequences() turns the ragged token lists into a matrix whose rows all
# have the same length, filling the gaps with 0s.
#
# Options, from the defaults up to the call used below:
#   pad_sequences(sequences)
#       0s are placed before each sentence; rows are as long as the longest one
#   pad_sequences(sequences, padding='post')
#       0s are placed after each sentence instead
#   pad_sequences(sequences, padding='post', maxlen=5)
#       overrides the row length; sentences longer than maxlen lose tokens
#       from their beginning
#   pad_sequences(sequences, padding='post', truncating='post', maxlen=5)
#       same, but over-long sentences lose tokens from their end
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

print(padded)

[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]


In [51]:
# Test data must be encoded with the same tokenizer (same word index) that was
# fitted on the training sentences.
# Words that are not in that word index are not dropped from the output: each
# one is encoded as the oov_token, index 1 (here 'really', 'loves' and 'manatee').
test_data = [
    'i really love my dog',
    'my dog loves my manatee'
]

test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
