In [1]:
# import tensorflow
import tensorflow as tf

# notebook to preprocess text to be used in RNNs

this includes: strings > tokens > integers > vectors

this can be accomplished using tensorflow's "Tokenizer" class and "pad_sequences" function



In [2]:
# other imports
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

we'll do a simple test with toy data

In [3]:
# toy corpus: three short sentences of unequal length (5, 5 and 3 words)
sentences = [
    "I like eggs and ham.",
    "I love chocolate and bunnies.",
    "I hate onions.",
]

In [4]:
# cap on the vocabulary size handed to the tokenizer
max_vocab_size = 20_000

# step 1 (strings > tokens): fit the tokenizer so it learns the vocabulary
# of the toy sentences
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(sentences)

# step 2 (tokens > integers): map every sentence to its list of word indices
sequences = tokenizer.texts_to_sequences(sentences)

In [5]:
# each sentence is now a list of integer word indices (lists differ in length)
print(sequences)

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


In [6]:
# how to get the word -> index mapping?
# the fitted tokenizer object stores it in `word_index`: keys are lower-cased
# words without punctuation, indices start at 1, and the most frequent words
# ('i', 'and') receive the smallest indices
tokenizer.word_index

{'i': 1,
 'and': 2,
 'like': 3,
 'eggs': 4,
 'ham': 5,
 'love': 6,
 'chocolate': 7,
 'bunnies': 8,
 'hate': 9,
 'onions': 10}

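now onto padding (pad positions are filled with 0's by default; the tokenizer reserves index 0, which is why the word indices above start at 1)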

In [7]:
# pad first with the defaults: no maxlen is given, so shorter sentences get
# leading 0's until they match the longest sentence
data = pad_sequences(sequences)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


this looks good, it recognized the longest sentence had 5 words.

padding only needed to be added to the last sentence, and the default (`padding='pre'`) puts it at the start of the sequence

In [8]:
# can get the same answer by specifying the max sequence length to be 5,
# i.e. the length of the longest sentence
max_sequence_length = 5
data = pad_sequences(sequences, maxlen=max_sequence_length)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [9]:
# alternatively, the padding can be added to the end of each sentence
# by passing padding='post'
max_sequence_length = 5
data = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 1  9 10  0  0]]


In [10]:
# if maxlen is larger than the longest sentence (6 > 5 here),
# every sentence ends up with at least one leading 0
data = pad_sequences(sequences, maxlen=6)
print(data)

[[ 0  1  3  4  2  5]
 [ 0  1  6  7  2  8]
 [ 0  0  0  1  9 10]]


In [11]:
# alternatively, if you only want to keep parts of the sentence,
# you can truncate with the same function: a maxlen shorter than a sentence
# (4 < 5 here) drops tokens, using the default truncation side
data = pad_sequences(sequences, maxlen=4)
print(data)

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 0  1  9 10]]


notice the truncation happens from the **beginning** of the sentence

this makes sense, as an RNN's final state is influenced most by the tokens at the end of the sequence

In [12]:
# truncation can instead drop tokens from the end of each sentence;
# padding is spelled out as "pre" (its default value) to make clear why the
# 3-word sentence still receives a leading 0
data = pad_sequences(sequences, maxlen=4, padding="pre", truncating="post")
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 0  1  9 10]]
