In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: three short sentences, small enough to verify the tokenizer
# and padding results by eye.
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate onions.",
]

In [3]:
# Vocabulary cap. Author's rule of thumb: ~3000 words cover about 95% of most
# text, so 20000 leaves plenty of headroom.
# NOTE: with num_words set, texts_to_sequences() only emits the most frequent
# words; rarer words are silently DROPPED from the output sequences. They are
# not mapped to a shared <RARE>/<OOV> index unless oov_token=... is passed to
# Tokenizer (see the Keras Tokenizer documentation).
MAX_VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
# First, build the vocabulary. `sentences` is an iterable of strings.
tokenizer.fit_on_texts(sentences) # akin to fit() from sklearn
# Then, convert each sentence to a list of integer word indices
# (result: a list of lists, one inner list per sentence).
sequences = tokenizer.texts_to_sequences(sentences) # akin to transform() from sklearn

'\ne.g. sentences now looks like this:\nsentences = [\n  [1,2,3,4,5],\n  [1,6,7,4,8],\n  [1,9,10]\n]\n'

In [4]:
print(sequences) # (don't do this on a huge dataset -- it dumps every sequence)
# Notice that word indices start at 1: 0 is reserved as the padding value
# (it shows up as the filler in the pad_sequences outputs further down).

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


In [5]:
# The tokenizer stores a dict that maps each word to its integer index
# (word -> index, not index -> word). As the output shows, words are
# lowercased and punctuation is stripped, and the most frequent words here
# ('i', 'and') received the lowest indices.
tokenizer.word_index

{'and': 2,
 'bunnies': 8,
 'chocolate': 7,
 'eggs': 4,
 'ham': 5,
 'hate': 9,
 'i': 1,
 'like': 3,
 'love': 6,
 'onions': 10}

In [6]:
# Default behaviour: no maxlen given, so every row is padded to the length of
# the longest sequence (5 here), and the zeros go at the BEGINNING
# (padding='pre' is the default).
data = pad_sequences(sequences)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [7]:
# Pass an explicit max length and put the padding zeros at the END of each
# row instead of the beginning.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post') # 'pre' is default
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 1  9 10  0  0]]


Scenario to use pre-padding: classifying spam vs. not spam. The real tokens end up at the end of the sequence, closest to the RNN's final state, which minimizes the likelihood of the network forgetting them.

Scenario to use post-padding: neural machine translation (if language B has longer sentences than language A, we don't want language B to be at a loss at the beginning of the sentence).

In [8]:
# maxlen larger than the longest sequence (6 > 5): every row, even the
# longest ones, gets extra leading zeros -- more padding than needed.
data = pad_sequences(sequences, maxlen=6)
print(data)

[[ 0  1  3  4  2  5]
 [ 0  1  6  7  2  8]
 [ 0  0  0  1  9 10]]


In [9]:
# Truncation: maxlen shorter than the longest sequence (4 < 5), so the
# 5-token rows lose one token.
data = pad_sequences(sequences, maxlen=4)
print(data)
# By default (truncating='pre') the BEGINNING is cut off, keeping the last
# tokens (makes sense, because an RNN pays more attention to the final values).

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 0  1  9 10]]


In [11]:
# Truncate at the END instead: the last token of each over-long row is
# dropped. Padding is still the default 'pre', so the short third row keeps
# its leading zero.
data = pad_sequences(sequences, maxlen=4, truncating='post') # default is 'pre'
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 0  1  9 10]]
