In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Toy corpus: three short sentences of unequal length (5, 5 and 3 words),
# so the padding/truncation behaviour is easy to see further down.
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate onions.",
]

In [13]:
# Vocabulary cap handed to the Tokenizer via num_words. It is far larger than
# the 10 distinct words in this toy corpus, so no word is filtered out here.
MAX_VOCAB_SIZE = 20000

# Learn the word -> index mapping from the corpus, then encode every sentence
# as a list of those integer indices.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Show the learned index mapping followed by the raw per-word counts, one per line.
print(tokenizer.word_index, tokenizer.word_counts, sep="\n")

{'i': 1, 'and': 2, 'like': 3, 'eggs': 4, 'ham': 5, 'love': 6, 'chocolate': 7, 'bunnies': 8, 'hate': 9, 'onions': 10}
OrderedDict([('i', 3), ('like', 1), ('eggs', 1), ('and', 2), ('ham', 1), ('love', 1), ('chocolate', 1), ('bunnies', 1), ('hate', 1), ('onions', 1)])


In [14]:
# Inspect the encoded corpus: one list of word indices per sentence. The lists
# have different lengths (5, 5 and 3), which is why padding is needed next.
print(sequences)

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


In [15]:
# No options: rows are brought to the length of the longest sequence (5 here)
# by inserting zeros at the FRONT of the shorter row. Word indices start at 1,
# so 0 is unambiguous as the padding value.
pad_sequences(sequences)

array([[ 1,  3,  4,  2,  5],
       [ 1,  6,  7,  2,  8],
       [ 0,  0,  1,  9, 10]], dtype=int32)

In [16]:
# maxlen larger than every sequence: each row is zero-padded at the front
# until it has 6 columns.
pad_sequences(sequences, maxlen=6)

array([[ 0,  1,  3,  4,  2,  5],
       [ 0,  1,  6,  7,  2,  8],
       [ 0,  0,  0,  1,  9, 10]], dtype=int32)

In [17]:
# maxlen smaller than the 5-token sequences: by default tokens are dropped
# from the FRONT (the leading 1 for "i" is lost), while the 3-token row is
# still zero-padded at the front to reach 4 columns.
pad_sequences(sequences, maxlen=4)

array([[ 3,  4,  2,  5],
       [ 6,  7,  2,  8],
       [ 0,  1,  9, 10]], dtype=int32)

In [18]:
# padding='post' puts the zeros at the END of each row instead of the front.
pad_sequences(sequences, maxlen=6, padding='post')

array([[ 1,  3,  4,  2,  5,  0],
       [ 1,  6,  7,  2,  8,  0],
       [ 1,  9, 10,  0,  0,  0]], dtype=int32)

In [19]:
# truncating='post' drops tokens from the END of over-long rows instead of the
# front. It does not affect padding: the short row still gets its zero in front.
pad_sequences(sequences, maxlen=4, truncating='post')

array([[ 1,  3,  4,  2],
       [ 1,  6,  7,  2],
       [ 0,  1,  9, 10]], dtype=int32)

In [20]:
# Both options set to 'post': over-long rows lose their last token and the
# short row is zero-padded at the end.
pad_sequences(sequences, maxlen=4, padding='post', truncating='post')

array([[ 1,  3,  4,  2],
       [ 1,  6,  7,  2],
       [ 1,  9, 10,  0]], dtype=int32)