Coursera link: https://www.coursera.org/learn/natural-language-processing-tensorflow/lecture/Sydkf/notebook-for-lesson-2

Keras Text Preprocessing: https://keras.io/preprocessing/text/

The same API in the TensorFlow library: https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer

# Tokenizer

In [2]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy training corpus. Note the mixed case and punctuation ('I,', 'dog!', 'amazing?'):
# the printed dictionary below contains only lower-case, punctuation-free words.
sentences = [
             'i love my dog',
             'I, love my cat',
             'You love my dog!',
             'Do you think my dog is amazing?'
]

# Tokenizing our text into a dictionary of values: one distinct word -> one distinct integer.
# num_words caps the vocabulary by word frequency, not by order of appearance: per the
# Keras Tokenizer docs only the most frequent (num_words - 1) words are kept.
# This corpus has just 10 distinct words, so the cap of 100 is never reached here.
tokenizer = Tokenizer(num_words=100)
# Build the vocabulary from the training sentences.
tokenizer.fit_on_texts(sentences)
# word_index is the word -> integer mapping; in the output the most frequent word ('my') gets index 1.
word_index = tokenizer.word_index
print(word_index)


2.1.0
{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


# Converting sentences to sequences

In [4]:
# Convert the sentences into sequences of integers using the dictionary.
# Relies on `tokenizer` (already fitted) and `sentences` defined in the previous cell.
sequences = tokenizer.texts_to_sequences(sentences)

print(sequences)     # one list of word indices per sentence
print(sequences[0])  # selecting one sentence (the first one)

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
[4, 2, 1, 3]


In [6]:
# Test sentences containing words ('really', 'loves', 'manatee') that were not in the training corpus.
test_data = [
              'i really love my dog',
              'my dog loves my manatee'
]

# Reuses the `tokenizer` fitted earlier (no OOV token configured).
test_seq = tokenizer.texts_to_sequences(test_data)

print(test_seq)

# Result: some words are lost because we are using a dictionary (word_index from above) that didn't have those words:
# 'really', 'loves' and 'manatee' are silently dropped, so the sequences are shorter than the sentences.

# Thus, we need a broad training corpus to build a large dictionary.

[[4, 2, 1, 3], [1, 3, 1]]


Result: some words are lost because we are using a dictionary (`word_index` from above) that does not contain them; 'really', 'loves' and 'manatee' are silently dropped from the sequences.

Thus, we need a broad training corpus to build a large dictionary.

# Indexing unseen words:

In [7]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.preprocessing.text import Tokenizer

# Same toy training corpus as before.
sentences = [
             'i love my dog',
             'I, love my cat',
             'You love my dog!',
             'Do you think my dog is amazing?'
]

# Tokenizing our text into a dictionary of values: one distinct word -> one distinct integer.
# num_words caps the vocabulary by word frequency (per the Keras Tokenizer docs, only the most
# frequent num_words - 1 words are kept); this small corpus never reaches the cap.
# oov_token: words not in the dictionary are tokenized as '<OOV>' (out of vocabulary) instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
# In the output '<OOV>' takes index 1, so every real word is shifted up by one compared to the earlier dictionary.
word_index = tokenizer.word_index
print(word_index)


# Convert the sentences into sequences of integers using the dictionary:
sequences = tokenizer.texts_to_sequences(sentences)

print(sequences)
print(sequences[0])  # selecting one sentence


# Test sentences containing words that were not in the training corpus.
test_data = [
              'i really love my dog',
              'my dog loves my manatee'
]

test_seq = tokenizer.texts_to_sequences(test_data)

print(test_seq)

# Result: words missing from the dictionary are tokenized as OOV (index 1),
# so each test sequence keeps the same length as its sentence.


2.1.0
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[5, 3, 2, 4]
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


# Padding

Padding is used to bring every sentence sequence to the same length.
A neural network expects inputs of a uniform shape, so all training sentences must have the same length before training.

In [24]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Same toy training corpus as before.
sentences = [
             'i love my dog',
             'I, love my cat',
             'You love my dog!',
             'Do you think my dog is amazing?'
]

# Tokenizing our text into a Dictionary of values: one distinct word -> one distinct integer.
# num_words caps the vocabulary by word frequency (per the Keras Tokenizer docs, only the most
# frequent num_words - 1 words are kept); this small corpus never reaches the cap.
# oov_token: words not in the dictionary are tokenized as '<OOV>' (out of vocabulary) instead of being dropped.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print('\nDictionary (word index) =', word_index)


# Convert the sentences into sequences of integers using the dictionary:
sequences = tokenizer.texts_to_sequences(sentences)

print('\nTrain Sequences =', sequences)
print('\nTrain Sequence #1 =', sequences[0])  # selecting one sentence


# Padding consists of converting the sequences all to the same length.
# With the default arguments (see output) zeros are added at the FRONT and every row
# is as long as the longest sequence (7 tokens here).
padded_seqs = pad_sequences(sequences)
# Alternative settings, kept for reference:
# padded_seqs = pad_sequences(sequences, padding='post', maxlen=5)  # padding='post' adds the zeros at the end
#                                                                   # maxlen=5 sets the sentence length to 5 and longer sentences are cut off from the beginning.
# padded_seqs = pad_sequences(sequences, padding='post', truncating='post', maxlen=5)  # truncating='post' cuts sentences longer than 5 from the end.
print('\nPadded Sequences =')
print(padded_seqs)  # the list of sentences has been padded out into a matrix of integers

# shape is (number of sentences, padded length).
print('\nPadded train data Dimensions =', padded_seqs.shape)


# Tokenizing the test data:
test_data = [
              'i really love my dog',
              'my dog loves my manatee'
]

test_seq = tokenizer.texts_to_sequences(test_data)

print('\nTest Sequences =',test_seq)


# Padding the test data but with same row length as the train data -> using maxlen.
# Without maxlen the test rows would only be padded to the longest TEST sequence (5 tokens),
# which would not match the width of the training matrix.
padded_test_seqs = pad_sequences(test_seq, maxlen=padded_seqs.shape[1])

print('\nPadded Test Sequences =')
print(padded_test_seqs)



2.1.0

Dictionary (word index) = {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Train Sequences = [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

Train Sequence #1 = [5, 3, 2, 4]

Padded Sequences =
[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]

Padded train data Dimensions = (4, 7)

Test Sequences = [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

Padded Test Sequences =
[[0 0 5 1 3 2 4]
 [0 0 2 4 1 2 1]]
