# Tokenizer()

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tiny two-sentence corpus used to build the vocabulary.
sentences = ['testing this stuff', 'stuff of testing is good']

# Fit a tokenizer on the corpus; num_words is set far above the six
# distinct words present, so it does not restrict anything in this demo.
t = Tokenizer(num_words=100)
t.fit_on_texts(texts=sentences)

# word_index maps each word to its integer id; ids start at 1 and the
# words occurring twice ('testing', 'stuff') receive the smallest ids.
word_indexes = t.word_index
print(word_indexes)

{'testing': 1, 'stuff': 2, 'this': 3, 'of': 4, 'is': 5, 'good': 6}


In [3]:
# Encode every training sentence as the list of its words' integer ids.
seq = t.texts_to_sequences(texts=sentences)
print(seq)

[[1, 3, 2], [2, 4, 1, 5, 6]]


In [12]:
# `t` was created without an oov_token, so words that were not part of the
# fitted vocabulary ('and', 'time', 'some', 'lost') are silently dropped:
# nine input words yield only five ids.
unseen_texts = ['testing stuff and this time some stuff is lost']
seq2 = t.texts_to_sequences(unseen_texts)
print(seq2)

[[1, 2, 3, 2, 5]]


In [13]:
# Passing oov_token="value" gives out-of-vocabulary words a placeholder id
# instead of dropping them; the token itself is added to word_index.
t2 = Tokenizer(oov_token="<OOV>", num_words=100)
t2.fit_on_texts(texts=sentences)
print(t2.word_index)

# Same sentence as in the previous demo: every unseen word is now encoded
# with the id of the OOV token rather than disappearing.
oov_demo_texts = ['testing stuff and this time some stuff is lost']
print(t2.texts_to_sequences(oov_demo_texts))

{'<OOV>': 1, 'testing': 2, 'stuff': 3, 'this': 4, 'of': 5, 'is': 6, 'good': 7}
[[2, 3, 1, 4, 1, 1, 3, 6, 1]]


# Padding

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Re-encode the corpus with the OOV-aware tokenizer, then pad with the
# default settings: zeros are added at the front so that every row is as
# long as the longest sequence.
seq = t2.texts_to_sequences(texts=sentences)
padded = pad_sequences(seq)
print(padded)

# Show the two source sentences on separate lines for comparison
# (the separator reproduces the spaces around the line break).
print(sentences[0], sentences[1], sep=' \n ')

[[0 0 2 4 3]
 [3 5 2 6 7]]
testing this stuff 
 stuff of testing is good


In [20]:
# padding='post'    -> zeros are appended after the ids instead of before
# maxlen=16         -> every row ends up exactly 16 entries long
#                      (shorter rows are padded, longer rows are cut)
# truncating='post' -> when a row must be cut, entries are removed from its end
padded = pad_sequences(
    seq,
    maxlen=16,
    padding='post',
    truncating='post',
)
print(padded)

[[2 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [3 5 2 6 7 0 0 0 0 0 0 0 0 0 0 0]]
