# A simple example of using the TensorFlow-Keras `Tokenizer` and `pad_sequences` to convert sentences into padded token sequences

In [1]:
# Imports: Tokenizer builds the word -> integer vocabulary, pad_sequences makes all sequences the same length
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sentences the tokenizer is fitted on. They contain mixed case, punctuation and an apostrophe;
# compare with the word index printed below (lower-cased words, '?', ',' and '!' gone, "isn't" kept whole).
sentences = [
    'This is the first line',
    'This isn\'t the first line',
    'Is this the 3rd line?',
    'This line, which is 4th line is too big!?'
]

# Create the Tokenizer. "<OOV>" is just a placeholder string, so any string that cannot
# collide with a real word can be used instead.
tokenizer = Tokenizer(oov_token="<OOV>")  # reserve a token for words that were not seen during fitting
tokenizer.fit_on_texts(sentences)  # build the vocabulary from the sentences
word_index = tokenizer.word_index  # dict mapping each word to its integer index ('<OOV>' is 1 in the output below)

sequences = tokenizer.texts_to_sequences(sentences) # replace every word of every sentence with its index

# Force every sequence to exactly 6 entries. In the output below the 5-token sentences get one
# leading 0, while the 9-token sentence keeps only its last 6 tokens (it is truncated, not padded).
padded = pad_sequences(sequences, maxlen=6)  # pad with zeros / truncate so that all rows have length 6

print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)
print("\nPadded Sequences:\n", padded)


# Try sentences containing words the tokenizer was not fitted on ('really' and '5th').
test_data = [
    'this is really the first line',
    'is this the 5th line?'
]

test_seq = tokenizer.texts_to_sequences(test_data)   # unseen words come out as 1, the '<OOV>' index (see output)
print("\nTest Sequence = ", test_seq)

# NOTE: `padded` is reassigned here; the padded training sequences from above are no longer available.
padded = pad_sequences(test_seq, maxlen=7)   # both sequences are shorter than 7, so zeros are added in front
print("\nPadded Test Sequence: \n", padded)


Word Index =  {'<OOV>': 1, 'line': 2, 'this': 3, 'is': 4, 'the': 5, 'first': 6, "isn't": 7, '3rd': 8, 'which': 9, '4th': 10, 'too': 11, 'big': 12}

Sequences =  [[3, 4, 5, 6, 2], [3, 7, 5, 6, 2], [4, 3, 5, 8, 2], [3, 2, 9, 4, 10, 2, 4, 11, 12]]

Padded Sequences:
 [[ 0  3  4  5  6  2]
 [ 0  3  7  5  6  2]
 [ 0  4  3  5  8  2]
 [ 4 10  2  4 11 12]]

Test Sequence =  [[3, 4, 1, 5, 6, 2], [4, 3, 5, 1, 2]]

Padded Test Sequence: 
 [[0 3 4 1 5 6 2]
 [0 0 4 3 5 1 2]]


## To add padding at the end, use `padding='post'` (the default is `padding='pre'`)

In [2]:
# Relies on `test_seq` created in the previous code cell; run that cell first.
padded = pad_sequences(test_seq, maxlen=7,padding='post')   # padding='post' puts the zeros after the tokens instead of in front
print("\nPadded Test Sequence: \n", padded)


Padded Test Sequence: 
 [[3 4 1 5 6 2 0]
 [4 3 5 1 2 0 0]]


## Effect of Removing "OOV" (Out of Vocab) token
Notice the "Padded Test Sequence" output here. Because no `oov_token` (such as `"<OOV>"`) was passed to the `Tokenizer`, words that are not in the fitted vocabulary are simply dropped from the sequences instead of being replaced by a placeholder index.

In [4]:
# Imports (repeated from the first cell so that this cell can be run on its own)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Same sentences as in the first example
sentences = [
    'This is the first line',
    'This isn\'t the first line',
    'Is this the 3rd line?',
    'This line, which is 4th line is too big!?'
]

# Create the Tokenizer WITHOUT an oov_token this time
tokenizer = Tokenizer()  # no OOV placeholder: the word index below starts at 'line': 1 and has no '<OOV>' entry
tokenizer.fit_on_texts(sentences)  # build the vocabulary from the sentences
word_index = tokenizer.word_index  # dict mapping each word to its integer index

sequences = tokenizer.texts_to_sequences(sentences) # replace every word of every sentence with its index

# Same length handling as in the first example: 5-token rows get one leading 0,
# the 9-token row keeps only its last 6 tokens.
padded = pad_sequences(sequences, maxlen=6)  # pad with zeros / truncate so that all rows have length 6

print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)
print("\nPadded Sequences:\n", padded)


# Try sentences containing words the tokenizer was not fitted on ('really' and '5th').
test_data = [
    'this is really the first line',
    'is this the 5th line?'
]

# Without an oov_token the unseen words are silently skipped: the sequences printed below
# have 5 and 4 entries although the sentences have 6 and 5 words.
test_seq = tokenizer.texts_to_sequences(test_data)   # unseen words produce no token at all
print("\nTest Sequence = ", test_seq)

padded = pad_sequences(test_seq, maxlen=7)   # both sequences are shorter than 7, so zeros are added in front

print("\nPadded Test Sequence: \n", padded)


Word Index =  {'line': 1, 'this': 2, 'is': 3, 'the': 4, 'first': 5, "isn't": 6, '3rd': 7, 'which': 8, '4th': 9, 'too': 10, 'big': 11}

Sequences =  [[2, 3, 4, 5, 1], [2, 6, 4, 5, 1], [3, 2, 4, 7, 1], [2, 1, 8, 3, 9, 1, 3, 10, 11]]

Padded Sequences:
 [[ 0  2  3  4  5  1]
 [ 0  2  6  4  5  1]
 [ 0  3  2  4  7  1]
 [ 3  9  1  3 10 11]]

Test Sequence =  [[2, 3, 4, 5, 1], [3, 2, 4, 1]]

Padded Test Sequence: 
 [[0 0 2 3 4 5 1]
 [0 0 0 3 2 4 1]]


## Impact of the `truncating` parameter on the token sequence in `pad_sequences`

In [6]:
# Imports (repeated from the first cell so that this cell can be run on its own)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Same sentences as in the first example
sentences = [
    'This is the first line',
    'This isn\'t the first line',
    'Is this the 3rd line?',
    'This line, which is 4th line is too big!?'
]

# Create the Tokenizer. "<OOV>" is just a placeholder string, so any string that cannot
# collide with a real word can be used instead.
tokenizer = Tokenizer(oov_token="<OOV>")  # reserve a token for words that were not seen during fitting
tokenizer.fit_on_texts(sentences)  # build the vocabulary from the sentences
word_index = tokenizer.word_index  # dict mapping each word to its integer index ('<OOV>' is 1 in the output below)

sequences = tokenizer.texts_to_sequences(sentences) # replace every word of every sentence with its index

# 5-token rows get one leading 0, the 9-token row keeps only its last 6 tokens.
padded = pad_sequences(sequences, maxlen=6)  # pad with zeros / truncate so that all rows have length 6

print("\nWord Index = " , word_index)
print("\nSequences = " , sequences)
print("\nPadded Sequences:\n", padded)


# Try sentences containing words the tokenizer was not fitted on ('really' and '5th').
test_data = [
    'this is really the first line',
    'is this the 5th line?'
]

test_seq = tokenizer.texts_to_sequences(test_data)   # unseen words come out as 1, the '<OOV>' index (see output)
print("\nTest Sequence = ", test_seq)

# The test sequences have 6 and 5 tokens, so with maxlen=4 nothing is padded; tokens are dropped instead.
padded = pad_sequences(test_seq, maxlen=4)   # keep 4 tokens per sequence using the default truncating behaviour

# Notice that each token sequence is truncated from the beginning (only the last 4 tokens remain),
# because the default value of the "truncating" parameter is "pre".
print("\nPadded Test Sequence: \n", padded)

padded = pad_sequences(test_seq, maxlen=4, truncating='post')   # keep 4 tokens per sequence, dropping tokens from the end

# Notice that each token sequence is now truncated from the end (only the first 4 tokens remain),
# because we have set the "truncating" parameter to "post".
print("\nPadded Test Sequence: \n", padded)


Word Index =  {'<OOV>': 1, 'line': 2, 'this': 3, 'is': 4, 'the': 5, 'first': 6, "isn't": 7, '3rd': 8, 'which': 9, '4th': 10, 'too': 11, 'big': 12}

Sequences =  [[3, 4, 5, 6, 2], [3, 7, 5, 6, 2], [4, 3, 5, 8, 2], [3, 2, 9, 4, 10, 2, 4, 11, 12]]

Padded Sequences:
 [[ 0  3  4  5  6  2]
 [ 0  3  7  5  6  2]
 [ 0  4  3  5  8  2]
 [ 4 10  2  4 11 12]]

Test Sequence =  [[3, 4, 1, 5, 6, 2], [4, 3, 5, 1, 2]]

Padded Test Sequence: 
 [[1 5 6 2]
 [3 5 1 2]]

Padded Test Sequence: 
 [[3 4 1 5]
 [4 3 5 1]]
