In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [18]:
# Toy training corpus: four short sentences with overlapping vocabulary.
sentences = [
    'I ate an apple',
    'I love apples',
    'apples taste good',
    'I would like to eat an apple during summer',
]

In [19]:
# Fit a word-level vocabulary on the training sentences. num_words caps how
# many of the most frequent words are used when encoding; this toy corpus has
# far fewer than 100 distinct words, so nothing is dropped here.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index maps each lower-cased word to its integer id (ids start at 1).
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'an': 2, 'apple': 3, 'apples': 4, 'ate': 5, 'love': 6, 'taste': 7, 'good': 8, 'would': 9, 'like': 10, 'to': 11, 'eat': 12, 'during': 13, 'summer': 14}


In [20]:
# Encode every training sentence as a list of integer word ids taken from
# the vocabulary fitted above.
sequences = tokenizer.texts_to_sequences(sentences)

In [21]:
# Show the integer encoding produced for each training sentence.
print(sequences)

[[1, 5, 2, 3], [1, 6, 4], [4, 7, 8], [1, 9, 10, 11, 12, 2, 3, 13, 14]]


#How does the tokenizer handle words that were not present in the training data?

In [22]:
# Held-out sentences that contain words the tokenizer was never fitted on.
test = [
    "I ate an orange",
    "I came from Hyderabad",
]

In [23]:
# Encode the held-out sentences with the tokenizer fitted on the training
# sentences; this tokenizer was created without an oov_token.
test_seq = tokenizer.texts_to_sequences(test)

#Here we see that the Tokenizer ignores words that it has not seen.

In [24]:
# Inspect the encoding of the held-out sentences: unknown words are missing,
# so the encoded sequences are shorter than the original sentences.
print(test_seq)

[[1, 5, 2], [1]]


#In order not to lose the length of the sequence, we can use the oov_token parameter (set here to "<OOV>"), which replaces every unseen word with a placeholder token.

In [25]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [26]:
# Same toy training corpus as before, redefined so this section is
# self-contained.
sentences = [
    'I ate an apple',
    'I love apples',
    'apples taste good',
    'I would like to eat an apple during summer',
]

In [27]:
# Passing oov_token reserves a vocabulary entry for out-of-vocabulary words,
# so unseen words are encoded as that entry instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# The printed mapping now starts with the '<OOV>' entry at id 1, shifting
# every real word up by one.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'i': 2, 'an': 3, 'apple': 4, 'apples': 5, 'ate': 6, 'love': 7, 'taste': 8, 'good': 9, 'would': 10, 'like': 11, 'to': 12, 'eat': 13, 'during': 14, 'summer': 15}


In [28]:
# Re-encode the training sentences with the OOV-aware tokenizer.
sequences = tokenizer.texts_to_sequences(sentences)

In [29]:
# Show the encoded training sentences (all words are known, so no OOV id
# appears here).
print(sequences)

[[2, 6, 3, 4], [2, 7, 5], [5, 8, 9], [2, 10, 11, 12, 13, 3, 4, 14, 15]]


In [30]:
# Held-out sentences with unseen words, used to exercise the OOV token.
test = [
    "I ate an orange",
    "I came from Hyderabad",
]

In [31]:
# Encode the held-out sentences with the tokenizer that was created with
# oov_token="<OOV>".
test_seq = tokenizer.texts_to_sequences(test)

In [32]:
# Each unseen word now shows up as the '<OOV>' id, so the encoded sequences
# keep the same length as the original sentences.
print(test_seq)

[[2, 6, 3, 1], [2, 1, 1, 1]]


#Padding brings the encoded sentences, which have different lengths, to the same size.

In [33]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [34]:
# Toy training corpus; the sentence lengths differ on purpose so that
# padding has something to do.
sentences = [
    'I ate an apple',
    'I love apples',
    'apples taste good',
    'I would like to eat an apple during summer',
]

In [35]:
# Build the OOV-aware vocabulary that the padding examples below encode with.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# Keep the word -> id mapping around; it is printed again next to the padded
# output further down.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'i': 2, 'an': 3, 'apple': 4, 'apples': 5, 'ate': 6, 'love': 7, 'taste': 8, 'good': 9, 'would': 10, 'like': 11, 'to': 12, 'eat': 13, 'during': 14, 'summer': 15}


In [36]:
# Encode the training sentences; the resulting lists have different lengths
# and are the input to pad_sequences below.
sequences = tokenizer.texts_to_sequences(sentences)

In [37]:
# Show the ragged (unequal-length) encoded sentences before padding.
print(sequences)

[[2, 6, 3, 4], [2, 7, 5], [5, 8, 9], [2, 10, 11, 12, 13, 3, 4, 14, 15]]


In [38]:
# With no extra arguments, pad_sequences pads every sequence with zeros up to
# the length of the longest one, producing a rectangular integer array.
# Per the Keras documentation the default is padding='pre', i.e. the zeros
# are placed in front of the shorter sequences.
padded = pad_sequences(sequences)

# Display the result here: later cells reassign `padded`, so without this
# the default-padding variant would never be visible to the reader.
padded

In [43]:
# padding='post' places the zeros after each sequence instead of before it.
padded = pad_sequences(sequences, padding='post')

# Display this variant before `padded` is reassigned by a later cell.
padded

In [44]:
# If you do not want to pad up to the length of the longest sentence, fix the
# output length with maxlen. Sequences longer than maxlen are truncated;
# `truncating` is not passed here, so the Keras default ('pre', per the
# documentation) removes values from the front of over-long sequences.
padded = pad_sequences(sequences, padding='post', maxlen=5)

# Display this variant before `padded` is reassigned by the next cell.
padded

In [47]:
# `truncating` selects which end of an over-long sequence is cut: 'pre' or
# 'post'. With 'post' the trailing values are dropped, so the first `maxlen`
# ids of the long sentence are the ones that survive.
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

In [48]:
# Print the vocabulary, the ragged encoded sentences and the final padded
# array one after another, each on its own line(s).
print(word_index, sequences, padded, sep="\n")

{'<OOV>': 1, 'i': 2, 'an': 3, 'apple': 4, 'apples': 5, 'ate': 6, 'love': 7, 'taste': 8, 'good': 9, 'would': 10, 'like': 11, 'to': 12, 'eat': 13, 'during': 14, 'summer': 15}
[[2, 6, 3, 4], [2, 7, 5], [5, 8, 9], [2, 10, 11, 12, 13, 3, 4, 14, 15]]
[[ 2  6  3  4  0]
 [ 2  7  5  0  0]
 [ 5  8  9  0  0]
 [ 2 10 11 12 13]]


In [None]:
# Held-out sentences containing words that are not in the fitted vocabulary.
test = [
    "I ate an orange",
    "I came from Hyderabad",
]

In [None]:
# Encode the held-out sentences with the OOV-aware tokenizer fitted above.
# NOTE(review): this cell has no recorded execution; the result is not padded.
test_seq = tokenizer.texts_to_sequences(test)

In [None]:
# Print the encoded held-out sentences.
print(test_seq)