# TensorFlow: Text Pre-Processing

In [None]:
import os
import io
import matplotlib.pyplot as plt

In [None]:
# Set TensorFlow's C++ log-level variable before TensorFlow is imported below.
# NOTE(review): "1" presumably hides INFO-level native log messages - confirm against the TF docs.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

import tensorflow as tf
import tensorflow_datasets as tfds

# Report the TensorFlow version, the execution mode and whether a GPU is visible
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "not available"

print("TF Version: ", tf.__version__)
print("TF Eager mode: ", tf.executing_eagerly())
print("TF GPU is", gpu_status)

# Text Vectorization

In [None]:
# Define corpus sentences (a small toy corpus used to build the demo vocabularies)
sentences = [
    "I love my dog",
    "I love my cat!",
    "Do you think my dog is amazing?"
]

__Padding vectorization__

In [None]:
# Create a TextVectorization layer with its default settings
vectorize_layer0 = tf.keras.layers.TextVectorization()
# Build the layer's vocabulary from the corpus sentences
vectorize_layer0.adapt(sentences)
# Retrieve the resulting vocabulary (printed with its indices in the next cell)
vocabulary0 = vectorize_layer0.get_vocabulary()

In [None]:
# 0 corresponds to padding (used to pad shorter sentences)
# 1 corresponds to unknown word given existing vocabulary
# A plain loop is used because print() is called only for its side effect;
# no throwaway list is built and the IPython name `_` is left untouched.
for index, value in enumerate(vocabulary0):
    print("Index={}, Value={}".format(index, value))

In [None]:
# Extend the corpus with one extra sentence that was not part of the adapted corpus
sentences_with_oov = sentences + ["I really love my dog"]

# Map sentences to token sequences (without padding): each sentence is
# vectorized on its own as a separate dataset element
seq_ds = tf.data.Dataset.from_tensor_slices(sentences_with_oov)

# A plain loop is used because print() is called only for its side effect
for text, sequence in zip(sentences_with_oov, seq_ds.map(vectorize_layer0)):
    print("{} => {}".format(text, sequence))

In [None]:
# Map sentences to token sequences (with padding): the whole list is
# vectorized in a single call
padded_sequences = vectorize_layer0(sentences_with_oov)

# A plain loop is used because print() is called only for its side effect
for text, sequence in zip(sentences_with_oov, padded_sequences):
    print("{} => {}".format(text, sequence))

__Ragged vectorization__

In [None]:
# Create a TextVectorization layer that produces ragged output
vectorize_layer1 = tf.keras.layers.TextVectorization(ragged=True)
# Build the layer's vocabulary from the corpus sentences
vectorize_layer1.adapt(sentences)
# Retrieve the resulting vocabulary
vocabulary1 = vectorize_layer1.get_vocabulary()

In [None]:
# Map sentences to token sequences (without padding)
ragged_sequences = vectorize_layer1(sentences)

# A plain loop is used because print() is called only for its side effect
for text, sequence in zip(sentences, ragged_sequences):
    print("{} => {}".format(text, sequence))

In [None]:
# Convert the ragged tensor to NumPy and pre-pad its sequences
ragged_as_numpy = ragged_sequences.numpy()
pre_padded_sequences = tf.keras.utils.pad_sequences(ragged_as_numpy)
print(pre_padded_sequences)

# Word Embeddings

Create word embeddings by training a neural network on the `imdb_reviews` dataset.
The dataset contains film reviews labelled as positive or negative, so training a classification network on it yields the word embeddings as the weights of its `Embedding` layer.

In [None]:
VOCAB_SIZE = 10_000   # size of the vocabulary
MAX_LENGTH = 120      # max length of a sequence after word vectorization
EMBEDDING_DIM = 16    # length of each word embedding

In [None]:
# Load the train and test splits of the "imdb_reviews" dataset together with
# its metadata; files are stored under the local "data" directory.
(train_ds, test_ds), info = tfds.load(
    "imdb_reviews",
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True,      # also return the dataset metadata (bound to `info`)
    as_supervised=True,  # elements are (review, label) pairs, as unpacked by the later map() calls
    data_dir="data",
    download=True)

In [None]:
# Show the dataset metadata returned by tfds.load
print(info)

In [None]:
# Element specification of the training dataset: (review, label)
print(train_ds.element_spec)

In [None]:
# Training set: separate the string inputs (reviews) from the integer outputs (labels)
train_reviews = train_ds.map(lambda text, _target: text)
train_labels = train_ds.map(lambda _text, target: target)

# Test set: separate the string inputs (reviews) from the integer outputs (labels)
test_reviews = test_ds.map(lambda text, _target: text)
test_labels = test_ds.map(lambda _text, target: target)

In [None]:
# Instantiate the vectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,              # vocabulary size limit
    output_sequence_length=MAX_LENGTH   # length of every produced sequence
)

# Generate the vocabulary based only on the training set
# (the test reviews are not passed to adapt())
vectorize_layer.adapt(train_reviews)

In [None]:
# Apply vectorization (sentences to sequences)
train_seqs = train_reviews.map(lambda text: vectorize_layer(text))
test_seqs = test_reviews.map(lambda text: vectorize_layer(text))

# Zip sequences together with labels.
# The datasets are passed as a single tuple: this form is accepted by both
# older and newer TensorFlow releases, whereas passing them as separate
# positional arguments only works on newer releases.
train_ds_vec = tf.data.Dataset.zip((train_seqs, train_labels))
test_ds_vec = tf.data.Dataset.zip((test_seqs, test_labels))

In [None]:
# Vocabulary learned by the vectorization layer
vocabulary = vectorize_layer.get_vocabulary()

# Take a single integer sequence from the vectorized training reviews
sample_sequence = train_seqs.take(1).get_single_element()

# Translate every token id back to its vocabulary entry and join with spaces
decoded_words = (vocabulary[token_id] for token_id in sample_sequence)
decoded_text = " ".join(decoded_words)

# Print decoded text together with [UNK]
print(decoded_text)

In [None]:
SHUFFLE_BUFFER_SIZE = 1000
PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE
BATCH_SIZE = 32

# Optimize the datasets for training.
# prefetch() is deliberately the last step: it then buffers whole batches while
# the model is busy with the previous one. Placed before batch() it would only
# buffer single examples and the batching itself would not be overlapped.
train_ds_final = (train_ds_vec
    .cache()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(PREFETCH_BUFFER_SIZE)
)

# The test pipeline is not shuffled
test_ds_final = (test_ds_vec
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(PREFETCH_BUFFER_SIZE)
)

In [None]:
# Layer stack: token sequence -> embeddings -> average pooling -> dense classifier
classifier_layers = [
    tf.keras.Input(shape=(MAX_LENGTH,)),
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
]

# Build the model
model = tf.keras.Sequential(classifier_layers)

# Setup the training parameters
compile_options = {
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "metrics": ["accuracy"],
}
model.compile(**compile_options)

# Print the model summary
model.summary()

In [None]:
# Number of passes over the training dataset
NUM_EPOCHS = 10

# Train the model on the batched training pipeline and validate on the
# batched test pipeline; the returned object is kept for plotting below.
history = model.fit(
    train_ds_final,
    epochs=NUM_EPOCHS,
    validation_data=test_ds_final,
    verbose=2)

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart over the epochs.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            sequences of values (here, the return value of ``model.fit``).
        string: Name of the metric to plot; ``"val_" + string`` must also be
            present in ``history.history``.
    """
    series_names = [string, 'val_' + string]

    # One curve per series: the training metric first, then the validation one
    for series_name in series_names:
        plt.plot(history.history[series_name])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()

# Plot the accuracy and loss, one figure per metric
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# The embedding layer is the first entry of the model's layer list
embedding_layer = model.layers[0]

# get_weights() returns a list; its first entry is used as the embedding weights
layer_weights = embedding_layer.get_weights()
embedding_weights = layer_weights[0]

# Print the shape. Expected is (vocab_size, embedding_dim)
print(embedding_weights.shape)

In [None]:
# Export the learned embeddings as two TSV files:
#   vecs.tsv - one tab-separated embedding vector per line
#   meta.tsv - the matching vocabulary word per line
log_dir = "logs/imdb"
os.makedirs(log_dir, exist_ok=True)

vocabulary = vectorize_layer.get_vocabulary()

# The context managers close both files even if a write fails part-way
with open(os.path.join(log_dir, "vecs.tsv"), "w", encoding="utf-8") as out_v, \
        open(os.path.join(log_dir, "meta.tsv"), "w", encoding="utf-8") as out_m:
    # Start at 1 to skip index 0, which corresponds to padding
    for word_num in range(1, len(vocabulary)):
        word_name = vocabulary[word_num]
        word_embedding = embedding_weights[word_num]
        out_m.write(word_name + "\n")
        out_v.write('\t'.join([str(x) for x in word_embedding]) + "\n")

Open the [TensorFlow Embedding Projector](https://projector.tensorflow.org/) and load the two files
`logs/imdb/vecs.tsv` (Step 1) and `logs/imdb/meta.tsv` (Step 2) to see the visualization.

# Subword Embeddings

In [None]:
import keras_nlp

In [None]:
# Load the train and test splits of "imdb_reviews" again for the subword
# section; this rebinds train_ds, test_ds and info with the same arguments
# as the earlier load in the word-embedding section.
(train_ds, test_ds), info = tfds.load(
    "imdb_reviews",
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    with_info=True,      # also return the dataset metadata (bound to `info`)
    as_supervised=True,  # elements are (review, label) pairs, as unpacked by the later map() calls
    data_dir="data",
    download=True)

In [None]:
# Training set: separate the string inputs (reviews) from the integer outputs (labels)
train_reviews = train_ds.map(lambda text, _target: text)
train_labels = train_ds.map(lambda _text, target: target)

# Test set: separate the string inputs (reviews) from the integer outputs (labels)
test_reviews = test_ds.map(lambda text, _target: text)
test_labels = test_ds.map(lambda _text, target: target)

In [None]:
# Compute the subword vocabulary from the training reviews and save it to a file
# NOTE(review): assumes the "data" directory already exists (it is the data_dir
# used by tfds.load earlier) - confirm before running this cell in isolation.
keras_nlp.tokenizers.compute_word_piece_vocabulary(
    train_reviews,
    vocabulary_size=8000,
    # Tokens requested as reserved entries of the vocabulary
    reserved_tokens=["[PAD]", "[UNK]"],
    vocabulary_output_file='data/imdb_vocab_subwords.txt'
)