In [14]:
import tensorflow as tf
from   tensorflow import keras
from   tensorflow.keras.preprocessing.sequence import pad_sequences
from   tensorflow.keras.preprocessing.text import Tokenizer

In [7]:
# Passed as `num_words` to every Tokenizer built in this notebook.
N_WORDS: int = 100
# Passed as `oov_token` to the second Tokenizer, where it stands in for
# words that were not seen during fitting.
OOV: str = '<OOV>'

In [3]:
# Tiny training corpus used to build the vocabulary.
sentences = [
    'Today is a sunny day.',
    'Today is a rainy day.',
    'Is it sunny today?',
]

# First pass: no out-of-vocabulary token (oov_token=None is the default).
tokenizer = Tokenizer(num_words=N_WORDS, oov_token=None)
tokenizer.fit_on_texts(sentences)

# Mapping of word -> integer id; as the output shows, ids start at 1 and the
# words are lower-cased with punctuation removed.
word_index = tokenizer.word_index
word_index

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6, 'it': 7}

In [4]:
# Encode each training sentence as the list of its word ids.
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[1, 2, 3, 4, 5], [1, 2, 3, 6, 5], [2, 7, 4, 1]]

In [5]:
# Held-out sentences that contain words the tokenizer was never fitted on
# (e.g. 'snowy', 'tomorrow').
test_data = [
    'Today is a snowy day',
    'Will it be rainy tomorrow?',
]
test_sequences = tokenizer.texts_to_sequences(texts=test_data)

# Encoding new text does not extend the vocabulary.
word_index  # unchanged

{'today': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5, 'rainy': 6, 'it': 7}

In [6]:
# Without an OOV token, words missing from the vocabulary are simply dropped,
# so the encoded sequences are shorter than the original sentences.
test_sequences

[[1, 2, 3, 5], [7, 6]]

In [8]:
# Second pass: rebuild the tokenizer with a dedicated out-of-vocabulary token
# and re-encode both the training and the test sentences with it.
tokenizer = Tokenizer(oov_token=OOV, num_words=N_WORDS)
tokenizer.fit_on_texts(sentences)
sequences, test_sequences = map(
    tokenizer.texts_to_sequences, (sentences, test_data))

# The OOV token now occupies id 1, shifting every real word up by one.
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'today': 2,
 'is': 3,
 'a': 4,
 'sunny': 5,
 'day': 6,
 'rainy': 7,
 'it': 8}

In [9]:
# With the OOV token in place, unseen words are encoded as id 1 instead of
# being dropped, so each sequence keeps the length of its sentence.
test_sequences

[[2, 3, 4, 1, 6], [1, 8, 1, 7, 1]]

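### Padding

Sentences of different lengths produce sequences of different lengths. `pad_sequences` fills the shorter ones with zeros (and can truncate the longer ones) so that every row has the same length.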

In [10]:
# Sentences of varying length; the last one consists almost entirely of words
# the tokenizer has not seen.
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
    'I really enjoyed walking in the snow today',
]

In [11]:
# Encode with the OOV-aware tokenizer fitted earlier (it is not re-fitted
# here, so the new words map to the OOV id).
sentence_sequences = tokenizer.texts_to_sequences(texts=sentences)
sentence_sequences

[[2, 3, 4, 5, 6], [2, 3, 4, 7, 6], [3, 8, 5, 2], [1, 1, 1, 1, 1, 1, 1, 2]]

In [15]:
# Default behaviour, spelled out: zeros are added at the front ('pre') until
# every row is as long as the longest sequence.
padded = pad_sequences(sentence_sequences, padding='pre')
padded

array([[0, 0, 0, 2, 3, 4, 5, 6],
       [0, 0, 0, 2, 3, 4, 7, 6],
       [0, 0, 0, 0, 3, 8, 5, 2],
       [1, 1, 1, 1, 1, 1, 1, 2]], dtype=int32)

In [16]:
# Same target length, but the zeros are appended after the tokens.
post_padded = pad_sequences(
    sequences=sentence_sequences,
    padding='post',
)
post_padded

array([[2, 3, 4, 5, 6, 0, 0, 0],
       [2, 3, 4, 7, 6, 0, 0, 0],
       [3, 8, 5, 2, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 2]], dtype=int32)

In [18]:
# Cap every row at 6 tokens. Truncation defaults to 'pre', so the sequence
# that is too long loses its leading tokens and keeps its last six.
trim_pad = pad_sequences(
    sentence_sequences,
    maxlen=6,
    padding='post',
    truncating='pre',
)
trim_pad

array([[2, 3, 4, 5, 6, 0],
       [2, 3, 4, 7, 6, 0],
       [3, 8, 5, 2, 0, 0],
       [1, 1, 1, 1, 1, 2]], dtype=int32)

In [19]:
# Cap every row at 6 tokens, this time cutting from the end: the sequence
# that is too long keeps its first six tokens.
trim_pad2 = pad_sequences(
    sentence_sequences,
    maxlen=6,
    truncating='post',
    padding='post',
)
trim_pad2

array([[2, 3, 4, 5, 6, 0],
       [2, 3, 4, 7, 6, 0],
       [3, 8, 5, 2, 0, 0],
       [1, 1, 1, 1, 1, 1]], dtype=int32)