# Text Preprocessing


In [19]:
from keras.utils import pad_sequences
from keras.layers import TextVectorization

In [28]:
# Simple test corpus: three short English sentences with different word
# counts, so padding/truncation effects are visible once they are vectorized.
sentences = [
    'I like eggs and ham.',
    'I love hot chocolate and bunnies.',
    'I hate onions.',
]

In [29]:
MAX_VOCAB_SIZE = 20_000  # vocabulary cap, handed to TextVectorization as max_tokens

In [30]:
# Layer that turns raw strings into rows of integer token ids.
# Only max_tokens is a non-default choice here; the remaining arguments
# restate the defaults so the preprocessing steps are visible to the reader.
vectorize_layer = TextVectorization(
    standardize='lower_and_strip_punctuation',  # normalise case, drop punctuation
    split='whitespace',                         # tokenise on whitespace
    output_mode='int',                          # emit integer ids
    max_tokens=MAX_VOCAB_SIZE,                  # cap on vocabulary size
)

In [31]:
# "Training vectorization layer"
vectorize_layer.adapt(sentences)  # fit the layer on the example sentences before using it

In [32]:
# Transform the sentences into integer sequences by calling the adapted
# layer on them, then print the resulting tensor.
sequences = vectorize_layer(sentences)
print(sequences)

tf.Tensor(
[[ 2  6 10  3  9  0]
 [ 2  5  7 11  3 12]
 [ 2  8  4  0  0  0]], shape=(3, 6), dtype=int64)


In [33]:
# Show the vocabulary held by the layer. A token's position in this list
# is used as its integer id in the word-to-index mapping built from it.
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'i',
 'and',
 'onions',
 'love',
 'like',
 'hot',
 'hate',
 'ham',
 'eggs',
 'chocolate',
 'bunnies']

In [9]:
# Invert the vocabulary list into a token -> integer id lookup:
# a token's id is simply its position in get_vocabulary().
word2indx = {
    token: position
    for position, token in enumerate(vectorize_layer.get_vocabulary())
}
print(word2indx)

{'': 0, '[UNK]': 1, 'i': 2, 'sandwiches': 3, 'onions': 4, 'love': 5, 'like': 6, 'hot': 7, 'hate': 8, 'chocolate': 9, 'cheese': 10, 'bunnies': 11, 'and': 12}


In [17]:
# Truncation: output_sequence_length=3 fixes the width of every output row
# at 3 token ids (the printed tensor has shape (3, 3)), so longer sentences
# lose their trailing tokens.
vectorize_layer_truncated = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=3,
)

# Fit
vectorize_layer_truncated.adapt(sentences)

# Predict. Stored under its own name so the un-truncated `sequences`
# computed earlier is not overwritten and this cell can be re-run freely.
sequences_truncated = vectorize_layer_truncated(sentences)
print(sequences_truncated)

tf.Tensor(
[[ 2  6 10]
 [ 2  5  7]
 [ 2  8  4]], shape=(3, 3), dtype=int64)


In [None]:
# Ragged output: every row keeps its own length, so no padding is added.
vectorize_layer_ragged = TextVectorization(ragged=True, max_tokens=MAX_VOCAB_SIZE)

# Fit on the example sentences
vectorize_layer_ragged.adapt(sentences)

# Vectorize; the result prints as a tf.RaggedTensor
sequences = vectorize_layer_ragged(sentences)
print(sequences)

<tf.RaggedTensor [[2, 6, 10, 3], [2, 5, 7, 9, 12, 11], [2, 8, 4]]>


In [20]:
# Padding in front instead of back.
#
# The ragged tensor is recomputed from vectorize_layer_ragged here instead
# of being read from `sequences`: that name is rebound by several earlier
# cells and only supports .to_list() if the ragged cell happened to be the
# last one executed.
sequences_ragged = vectorize_layer_ragged(sentences)

# padding='pre' is pad_sequences' default; spelled out so the code matches
# the stated intent of padding at the front.
padded = pad_sequences(sequences_ragged.to_list(), padding='pre')
print(padded)

[[ 0  0  2  6 10  3]
 [ 2  5  7  9 12 11]
 [ 0  0  0  2  8  4]]
