# TensorFlow: Sentiment classification using word embeddings

In [None]:
import os

In [None]:
# Configure TensorFlow's native (C++) logging level through the environment.
# This assignment comes before `import tensorflow` so the variable is already
# set when the library loads.
# NOTE(review): level "1" presumably hides INFO messages - confirm against the
# TensorFlow documentation.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots
from tensorboard import program

# Report the runtime environment: library version, eager mode and GPU visibility
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_state = "available" if gpu_devices else "not available"

print("TF Version: ", tf.__version__)
print("TF Eager mode: ", tf.executing_eagerly())
print("TF GPU is", gpu_state)

In [None]:
# --- Logging -----------------------------------------------------------------
# Root directory under which each model writes its TensorBoard logs
LOG_ROOT_DIR = os.path.join("logs", "sentiments")

# --- Input pipeline ----------------------------------------------------------
# Number of samples held in the shuffle buffer
BUFFER_SIZE = 10_000
# Number of samples per batch
BATCH_SIZE = 32

# --- Text vectorization / embedding ------------------------------------------
# Upper bound on the vocabulary size
MAX_FEATURES = 10_000
# Every sample sequence is truncated or padded to this length
SEQ_LEN = 250
# Dimensionality of each embedded vector
EMBEDDING_DIM = 64

## Prepare Datasets

### Load datasets

In [None]:
# Fetch the IMDB reviews from the TFDS collection. The "train" split is cut
# 80/20 into training and validation subsets; the "test" split is kept whole.
# `as_supervised=True` makes every sample a (text, label) pair.
imdb_splits, ds_info = tfds.load(
    name="imdb_reviews",
    split=["train[:80%]", "train[80%:]", "test"],
    with_info=True,
    as_supervised=True,
)
raw_train_ds, raw_val_ds, raw_test_ds = imdb_splits

In [None]:
# Print one sample
# Show the raw text and the label of the first training sample
for sample, label in raw_train_ds.take(1):
    print(f"Text: {sample.numpy()}", f"Label: {label.numpy()}", sep="\n")

### Pre-process Datasets

In [None]:
def _batch_and_prefetch(dataset):
    """Group *dataset* into batches of BATCH_SIZE and prefetch them."""
    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


# Training data is shuffled and repeated indefinitely, so the number of
# batches per epoch has to be bounded explicitly when fitting.
train_ds = _batch_and_prefetch(raw_train_ds.shuffle(BUFFER_SIZE).repeat())

# Validation and test data are only batched and prefetched.
val_ds = _batch_and_prefetch(raw_val_ds)
test_ds = _batch_and_prefetch(raw_test_ds)

In [None]:
# Text -> integer token ids: the vocabulary is capped at MAX_FEATURES entries
# and every output sequence has exactly SEQ_LEN ids.
vectorizer_options = dict(max_tokens=MAX_FEATURES, output_sequence_length=SEQ_LEN)
vectorize_layer = tf.keras.layers.TextVectorization(**vectorizer_options)

In [None]:
# Adapt the vectorization layer on the review texts alone (labels are dropped).
# Only the training split must be used here.
train_texts = raw_train_ds.map(lambda text, _label: text)
vectorize_layer.adapt(train_texts)

__Test text vectorization__

In [None]:
def vectorize_text(x, y):
    """Vectorize one (text, label) pair.

    A new trailing axis is appended to both inputs, then the text is passed
    through ``vectorize_layer``.

    Args:
        x: Text tensor to vectorize.
        y: Label tensor belonging to ``x``.

    Returns:
        A ``(vectorized_text, expanded_label)`` tuple.
    """
    text_batch = tf.expand_dims(x, axis=-1)
    label_batch = tf.expand_dims(y, axis=-1)
    return vectorize_layer(text_batch), label_batch

In [None]:
# Pull one raw training sample and show it next to its vectorized form
x, y = next(iter(raw_train_ds.take(1)))
print("Review: ", x.numpy())
print("Label: ", y.numpy())
vectorized_sample = vectorize_text(x, y)
print("Vectorized review: ", vectorized_sample)

In [None]:
# Fetch the vocabulary once and reuse it, instead of asking the layer for the
# full list on every lookup.
vocabulary = vectorize_layer.get_vocabulary()

# Map a couple of token ids back to their words
for token_id in (152, 113):
    print(f"{token_id} ---> ", vocabulary[token_id])
print(f"Vocabulary size: {len(vocabulary)}")

## Build Model

In [None]:
# Number of full batches in one pass over the training split
STEPS_PER_EPOCH = int(raw_train_ds.cardinality()) // BATCH_SIZE

In [None]:
# Learning-rate schedule: start at 1e-4 and apply inverse-time decay in
# discrete steps, once every 10 epochs' worth of training batches.
lr_scheduler = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-4,
    decay_rate=0.5,
    decay_steps=10 * STEPS_PER_EPOCH,
    staircase=True,
)

### Model 0 - CNN

In [None]:
# Directory below LOG_ROOT_DIR that receives model0's TensorBoard logs
MODEL0_LOGS = os.path.join(LOG_ROOT_DIR, "tb-model0")

#### Create Model

In [None]:
# CNN classifier: raw text -> token ids -> embeddings -> two Conv1D layers
# -> global max pooling -> dense head with a single output unit.
cnn_layers = [
    # Raw strings -> integer token ids
    vectorize_layer,
    # One embedded vector per word index:
    # (batch_size, steps) -> (batch_size, steps, features)
    tf.keras.layers.Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_DIM),
    # Regularization: randomly drops dimensions of the feature vectors
    tf.keras.layers.SpatialDropout1D(rate=0.5),
    # Convolutional feature extraction followed by global max pooling
    tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation="relu"),
    tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation="relu"),
    tf.keras.layers.GlobalMaxPool1D(),
    # Dense head with L2 weight regularization and dropout
    tf.keras.layers.Dense(
        64,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(1e-4),
    ),
    tf.keras.layers.Dropout(rate=0.5),
    # Single output unit without an activation function
    tf.keras.layers.Dense(units=1),
]
model0 = tf.keras.Sequential(cnn_layers)

# The loss is configured with from_logits=True, i.e. the model's outputs are
# raw logits rather than probabilities. The "accuracy" string shortcut resolves
# to binary accuracy with a 0.5 threshold, which only suits probabilities; for
# logits the decision boundary is 0.0, so the metric is configured explicitly.
model0.compile(
    optimizer=tf.keras.optimizers.Adam(lr_scheduler),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The metric keeps the name "accuracy" so history keys and logs are unchanged
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)],
)

#### Fit Model

In [None]:
# Callbacks for the model0 run: early stopping with a patience of 3 epochs and
# TensorBoard logging (with per-epoch histograms) into MODEL0_LOGS.
model0_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
    tf.keras.callbacks.TensorBoard(MODEL0_LOGS, histogram_freq=1),
]

# Train for up to 15 epochs of STEPS_PER_EPOCH batches each, validating on val_ds
history0 = model0.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    steps_per_epoch=STEPS_PER_EPOCH,
    callbacks=model0_callbacks,
    verbose=2,
)

#### Evaluate Model

In [None]:
# Score model0 on the test split and report both numbers
test_loss, test_acc = model0.evaluate(test_ds)

for caption, value in (("Test Loss: ", test_loss), ("Test Accuracy: ", test_acc)):
    print(caption, value)

In [None]:
# Start TensorBoard from Python, pointed at model0's log directory
tb0 = program.TensorBoard()
# The list mimics a command line: flags follow the leading placeholder entry.
# NOTE(review): the first element presumably stands in for the program name - confirm.
tb0.configure(argv=[None, '--load_fast', 'false', '--logdir', MODEL0_LOGS])
# launch() hands back the address that is printed below
url = tb0.launch()
print(f"TensorBoard listening on {url}")

### Model 1 - GRU

In [None]:
# Directory below LOG_ROOT_DIR that receives model1's TensorBoard logs
MODEL1_LOGS = os.path.join(LOG_ROOT_DIR, "tb-model1")

#### Create Model

In [None]:
# Recurrent classifier: raw text -> token ids -> masked embeddings
# -> bidirectional GRU -> dense head with a single output unit.
gru_layers = [
    # Raw strings -> integer token ids
    vectorize_layer,
    # One embedded vector per word index:
    # (batch_size, steps) -> (batch_size, steps, features)
    # Index 0 is treated as a mask value (mask_zero=True).
    tf.keras.layers.Embedding(
        input_dim=vectorize_layer.vocabulary_size(),
        output_dim=EMBEDDING_DIM,
        mask_zero=True,
    ),
    # Regularization: randomly drops dimensions of the feature vectors
    tf.keras.layers.SpatialDropout1D(rate=0.5),
    # Bidirectional GRU layer
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    # Dense head with L2 weight regularization and dropout
    tf.keras.layers.Dense(
        32,
        activation="relu",
        kernel_regularizer=tf.keras.regularizers.l2(1e-4),
    ),
    tf.keras.layers.Dropout(rate=0.5),
    # Single output unit without an activation function
    tf.keras.layers.Dense(units=1),
]
model1 = tf.keras.Sequential(gru_layers)

# The loss is configured with from_logits=True, i.e. the model's outputs are
# raw logits rather than probabilities. The "accuracy" string shortcut resolves
# to binary accuracy with a 0.5 threshold, which only suits probabilities; for
# logits the decision boundary is 0.0, so the metric is configured explicitly.
model1.compile(
    optimizer=tf.keras.optimizers.Adam(lr_scheduler),
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    # The metric keeps the name "accuracy" so history keys and logs are unchanged
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)],
)

#### Fit Model

In [None]:
# Callbacks for the model1 run: early stopping with a patience of 3 epochs and
# TensorBoard logging (with per-epoch histograms) into MODEL1_LOGS.
model1_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=3, verbose=1),
    tf.keras.callbacks.TensorBoard(MODEL1_LOGS, histogram_freq=1),
]

# Train for up to 15 epochs of STEPS_PER_EPOCH batches each, validating on val_ds
history1 = model1.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
    steps_per_epoch=STEPS_PER_EPOCH,
    callbacks=model1_callbacks,
    verbose=2,
)

#### Evaluate Model

In [None]:
# Score model1 on the test split and report both numbers
test_loss, test_acc = model1.evaluate(test_ds)

for caption, value in (("Test Loss: ", test_loss), ("Test Accuracy: ", test_acc)):
    print(caption, value)

In [None]:
# Start TensorBoard from Python, pointed at model1's log directory
tb1 = program.TensorBoard()
# The list mimics a command line: flags follow the leading placeholder entry.
# NOTE(review): the first element presumably stands in for the program name - confirm.
tb1.configure(argv=[None, '--load_fast', 'false', '--logdir', MODEL1_LOGS])
# launch() hands back the address that is printed below
url = tb1.launch()
print(f"TensorBoard listening on {url}")

### Outcome

In [None]:
# Draw the loss histories of both training runs with one plotter
histories = {"CNN": history0, "GRU": history1}
plotter = tfdocs.plots.HistoryPlotter(metric="loss", smoothing_std=10)
plotter.plot(histories)

In [None]:
# Try both models on a few hand-written reviews
examples = tf.constant([
  "The movie was great! The animation and the graphics were out of this world.",
  "The movie was amazing! I would recommend this movie.",
  "The movie was terrible. I wouldn't recommend this movie."
])


def _sentiment_signs(model):
    """Return "+" or "-" for every example, based on the sign of *model*'s output."""
    # predict() yields one value per example; flatten so each item is a scalar
    outputs = model.predict(examples).ravel()
    return ["+" if output > 0.0 else "-" for output in outputs]


# Each heading is followed by the predictions of the model it names
print("CNN:")
print(_sentiment_signs(model0))
print()
print("GRU:")
print(_sentiment_signs(model1))