In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
# Small training corpus used to build the vocabulary.
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?',
]

# Build the vocabulary from the corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index is a dict mapping each unique word to its integer token.
# The tokenizer lower-cases words and strips punctuation, so 'dog!' and
# 'dog' end up sharing a single entry.
word_index = tokenizer.word_index
print(word_index)

# Encode every sentence as the list of tokens for its words.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


In [8]:
# Sentences containing words the tokenizer was never fitted on
# ('really', 'loves', 'boyfriend').
test_data = [
    'i really love my dog',
    'my dog loves my boyfriend',
]

# Encode the test sentences with the vocabulary learned earlier.
test_seq = tokenizer.texts_to_sequences(test_data)

# Without an OOV token, words missing from word_index are silently dropped,
# so the encoded sequences come out shorter than the input sentences.
print(test_seq)
# Rather than dropping them, it is better to emit a special token whenever
# an unknown word is encountered.


[[4, 2, 1, 3], [1, 3, 1]]


In [13]:
# Rebuild the tokenizer with oov_token set: '<OOV>' becomes an entry in
# word_index (index 1 in the printed dictionary) and is substituted for any
# word that was not seen while fitting.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Re-encode the training sentences with the new vocabulary.
sequences = tokenizer.texts_to_sequences(sentences)
print(word_index)
print(sequences)

# A neural network needs inputs of uniform size, so the variable-length
# sequences are padded with zeros into a rectangular matrix. By default the
# zeros are placed in front of each sequence.
padded = pad_sequences(sequences)
print(padded)

# padding='post' places the zeros after each sequence instead.
padded = pad_sequences(sequences, padding='post')
print(padded)

# The matrix width defaults to the length of the longest sequence; maxlen
# overrides it (here: 5 tokens per sentence). Longer sequences are truncated
# from the beginning by default; truncating='post' drops tokens from the end.
padded = pad_sequences(sequences, padding='post', truncating='post', maxlen=5)
print(padded)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]
[[ 5  3  2  4  0  0  0]
 [ 5  3  2  7  0  0  0]
 [ 6  3  2  4  0  0  0]
 [ 8  6  9  2  4 10 11]]


In [10]:
# Sentences containing words the tokenizer was never fitted on
# ('really', 'loves', 'boyfriend').
test_data = [
    'i really love my dog',
    'my dog loves my boyfriend',
]

# Encode the test sentences with the OOV-enabled tokenizer.
test_seq = tokenizer.texts_to_sequences(test_data)

# Unseen words are no longer dropped: each one is replaced by the '<OOV>'
# token (index 1), so every sequence keeps the length of its sentence.
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
