# TensorFlow: Sentiment classification using word embeddings

In [None]:
import os
import matplotlib.pyplot as plt

# TF_CPP_MIN_LOG_LEVEL="1" asks TensorFlow's native logger to hide INFO-level
# messages. It is set here, before the cell that imports tensorflow.
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "1"})

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
import tensorflow_datasets as tfds

# Report the runtime environment: TensorFlow version, eager mode and GPU visibility.
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", gpu_status)

## Prepare Datasets

### Load datasets

In [None]:
# Load the IMDB reviews dataset as (text, label) pairs.
# The official "train" split is divided 60% / 40% into training and
# validation subsets; the official "test" split is kept for final evaluation.
imdb_splits, ds_info = tfds.load(
    "imdb_reviews",
    split=["train[:60%]", "train[60%:]", "test"],
    with_info=True,
    as_supervised=True,
)
raw_train_ds, raw_val_ds, raw_test_ds = imdb_splits

### Pre-process Datasets

In [None]:
# Upper bound on the vocabulary size kept by the vectorization layer.
MAX_FEATURES = 10000

# Every review is padded or truncated to exactly this many tokens.
SEQ_LEN = 250

# Preprocessing layer that turns raw text into fixed-length integer token ids.
vectorize_layer = layers.TextVectorization(
    output_sequence_length=SEQ_LEN,
    output_mode="int",
    max_tokens=MAX_FEATURES,
)

In [None]:
# Keep only the review text (drop the labels) from the training split.
train_x = raw_train_ds.map(lambda text, _label: text)

# Build the vocabulary from the training split only, so that nothing from the
# validation or test data leaks into the preprocessing.
vectorize_layer.adapt(train_x)

In [None]:
def vectorize_text(x, y):
    """Convert a (text, label) pair into (token ids, label).

    A trailing axis of size 1 is appended to both `x` and `y` before the text
    is passed through `vectorize_layer`; for a single scalar example this
    acts as a batch dimension of one.

    Args:
        x: Raw review text tensor.
        y: Label tensor belonging to `x`.

    Returns:
        A tuple `(vectorize_layer(expanded_x), expanded_y)`.
    """
    expanded_text = tf.expand_dims(x, axis=-1)
    expanded_label = tf.expand_dims(y, axis=-1)
    token_ids = vectorize_layer(expanded_text)
    return token_ids, expanded_label

In [None]:
# Pull the first training example and show it raw and vectorized.
first_example = raw_train_ds.take(1)
x, y = next(iter(first_example))
vectorized_example = vectorize_text(x, y)

print("Review: ", x.numpy())
print("Label: ", y.numpy())
print("Vectorized review: ", vectorized_example)

In [None]:
# Show which words stand behind a few token ids.
# The printed id comes from the same variable that is used for the lookup, so
# the label can no longer disagree with the index (previously the labels read
# 3201 / 2194 while indices 1177 / 7819 were actually looked up).
vocabulary = vectorize_layer.get_vocabulary()  # fetched once, reused below

for token_id in (1177, 7819):
    print(f"{token_id} ---> ", vocabulary[token_id])
print('Vocabulary size: {}'.format(len(vocabulary)))

In [None]:
# Number of reviews processed per training / evaluation step.
BATCH_SIZE = 32


def make_vectorized_dataset(raw_ds, batch_size=BATCH_SIZE):
    """Batch, vectorize, cache and prefetch a raw (text, label) dataset.

    The raw dataset yields individual reviews. Without batching, every
    `fit` / `evaluate` step would process a single review, so the reviews are
    grouped into batches *before* `vectorize_text` is mapped over them.

    Args:
        raw_ds: Dataset yielding (text, label) pairs.
        batch_size: Number of reviews per batch.

    Returns:
        A dataset yielding (token ids, labels) batches.
    """
    return (raw_ds
        .batch(batch_size)
        .map(vectorize_text)
        .cache()
        .prefetch(tf.data.AUTOTUNE))


train_ds = make_vectorized_dataset(raw_train_ds)
val_ds = make_vectorized_dataset(raw_val_ds)
test_ds = make_vectorized_dataset(raw_test_ds)

## Build Model

### Create Model

In [None]:
# Size of the vector learned for every vocabulary entry.
EMBEDDING_DIM = 16

model = tf.keras.Sequential()
# Look up a trainable embedding vector for each word index:
# (batch_size, steps) -> (batch_size, steps, features)
model.add(layers.Embedding(MAX_FEATURES, EMBEDDING_DIM))
model.add(layers.Dropout(0.2))
# Average every feature over all steps to get one fixed-size vector per review:
# (batch_size, steps, features) -> (batch_size, features)
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dropout(0.2))
# Single sigmoid unit: the output is the predicted probability of the positive class.
model.add(layers.Dense(1, activation="sigmoid"))

In [None]:
# Print the layer-by-layer overview of the classifier defined above.
model.summary()

In [None]:
# Accuracy metric that counts a prediction above 0.5 as the positive class.
binary_accuracy = tf.metrics.BinaryAccuracy(threshold=0.5)

# Binary cross-entropy matches the single sigmoid output of the model.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[binary_accuracy],
)

### Fit Model

In [None]:
# Stop training early once the monitored quantity has not improved for
# 5 consecutive epochs (EarlyStopping is left at its default monitor).
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)

# Train for at most 10 epochs, validating after each one.
history = model.fit(
    train_ds,
    epochs=10,
    validation_data=val_ds,
    callbacks=[early_stopping],
    verbose=1,
)

### Evaluate Model

In [None]:
# Evaluate on the held-out test split; evaluate() returns the loss followed by
# the compiled metric.
test_results = model.evaluate(test_ds)
loss, accuracy = test_results

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
def _plot_training_curves(epochs, train_values, val_values, train_fmt, metric):
    """Plot one training curve and its validation counterpart over the epochs.

    Args:
        epochs: Epoch numbers used for the x-axis.
        train_values: Per-epoch values measured on the training data.
        val_values: Per-epoch values measured on the validation data.
        train_fmt: Matplotlib format string for the training curve.
        metric: Human-readable metric name used in the labels and the title.
    """
    plt.plot(epochs, train_values, train_fmt, label=f"Training {metric}")
    plt.plot(epochs, val_values, "g", label=f"Validation {metric}")
    plt.title(f"Training and Validation {metric}")
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend()
    plt.show()


# Per-epoch curves recorded by model.fit().
recorded = history.history
acc, val_acc = recorded["binary_accuracy"], recorded["val_binary_accuracy"]
loss, val_loss = recorded["loss"], recorded["val_loss"]

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(acc) + 1)

_plot_training_curves(epochs, acc, val_acc, "b", "Accuracy")
_plot_training_curves(epochs, loss, val_loss, "r", "Loss")

### Export Model

In [None]:
# Create an export model that bundles the text vectorization layer with the
# trained classifier, so that it accepts raw text strings as input.
# `model` already ends in Dense(1, activation='sigmoid') and therefore outputs
# probabilities. No extra sigmoid is appended: applying it a second time would
# squash every score into roughly [0.5, 0.73] and distort loss and predictions.
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
])

In [None]:
# from_logits=False: the loss treats the export model's outputs as
# probabilities rather than raw logits.
export_loss = losses.BinaryCrossentropy(from_logits=False)

export_model.compile(
    loss=export_loss,
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Print the layer-by-layer overview of the export model.
export_model.summary()

In [None]:
# Number of raw reviews per evaluation step of the export model.
EXPORT_BATCH_SIZE = 32

# Group the raw test reviews into batches first (otherwise every evaluation
# step would process a single review), then append a trailing axis to x and y.
export_test_ds = (raw_test_ds
    .batch(EXPORT_BATCH_SIZE)
    .map(lambda x, y: (tf.expand_dims(x, -1), tf.expand_dims(y, -1))))

In [None]:
# Evaluate the export model on the unseen raw-text test data.
# return_dict=True makes evaluate() return the results as a dictionary,
# which is then printed as-is.
metrics = export_model.evaluate(export_test_ds, return_dict=True)
print(metrics)

In [None]:
# Score a few hand-written reviews directly from raw text with the export model.
example_reviews = [
    "The movie was great!",
    "The movie was okay.",
    "The movie was terrible...",
]
examples = tf.constant(example_reviews)

export_model.predict(examples)