# Text Classification using tf.keras

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt

In [None]:
# Report the library versions and runtime capabilities this notebook runs with,
# so that results can be reproduced and environment problems spotted early.
print(f'TensorFlow Version: {tf.__version__}')
print(f'Eager mode: {tf.executing_eagerly()}')
print(f'Hub Version: {hub.__version__}')
# The GPU device list is used as a truth value: an empty list means no GPU.
gpu_status = "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE"
print(f'GPU is {gpu_status}')

In [None]:
# Download the IMDB reviews dataset through TensorFlow Datasets.
# Both the "train" and "test" splits are requested in a single call.
train_data, test_data = tfds.load(
    name="imdb_reviews",
    split=["train", "test"],
    batch_size=-1,
    as_supervised=True,
)

# Convert each split to NumPy and unpack it into its features and labels.
(train_features, train_labels), (test_features, test_labels) = (
    tfds.as_numpy(train_data),
    tfds.as_numpy(test_data),
)

In [None]:
# Explore the data: report the size of each split, then print the first
# ten training examples as "<label>: <review text>".
print(f'Training examples: {len(train_features)}, Test examples: {len(test_features)}')
X = train_features[:10]
y = train_labels[:10]
for label, review in zip(y, X):
    print(f'{label}: {review}')


In building a neural network model, we have to make 3 architectural decisions, namely:
- How to represent the text?
- How many layers to use in the model?
- How many hidden units to use in each layer?

In this particular example, the input data are texts and the labels are either 0 or 1.
We can convert the texts into embedding vectors using pretrained text embeddings. There are 2 advantages associated with using pretrained text embeddings, namely:
- No worries about preprocessing the texts
- Exploit the benefits of transfer learning

For this example, we can use a pretrained embedding model from TensorFlow Hub called google/nnlm-en-dim50/2. There are two other models:
- google/nnlm-en-dim50-with-normalization/2, the same as the model named above but with additional text normalization that removes punctuation, thereby improving in-vocabulary coverage.
- google/nnlm-en-dim128-with-normalization/2, which uses an embedding dimension of 128 instead of 50.


In [None]:
# Create a KerasLayer that uses a TensorFlow Hub model to embed sentences.
# The expected output of the layer is (number of examples, embedding dimension);
# for the three sample reviews passed below that is (3, 50).
#
# The handle is stored under its own name (not `model`) so that `model` only
# ever refers to the Keras model, and it is fetched over HTTPS rather than
# plain HTTP.
# Alternative without text normalization: "https://tfhub.dev/google/nnlm-en-dim50/2"
embedding_url = "https://tfhub.dev/google/nnlm-en-dim50-with-normalization/2"
hub_layer = hub.KerasLayer(embedding_url, input_shape=[], dtype=tf.string, trainable=True)

# Sanity check: embed the first three training reviews and display the result.
hub_layer(train_features[:3])

In [None]:
# Build the full model as a sequential stack of layers:
#   1. The pretrained hub layer maps each sentence to an embedding vector
#      (splits the sentence into tokens, embeds each token, then combines the
#      token embeddings into a single vector of fixed size 50).
#   2. The fixed-length vector is piped through two fully connected ReLU
#      layers with 32 and 16 hidden units respectively.
#   3. The last layer is a single output node with no activation, so the
#      model emits a logit.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1),
])

model.summary()

In [None]:
# Compile the model, specifying the loss function and the optimizer for training.
# This is a binary classification problem and the final Dense(1) layer has no
# activation, so the model outputs a logit rather than a probability. The
# binary cross-entropy loss is therefore built with from_logits=True, and the
# accuracy metric thresholds the raw output at 0.0.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy", threshold=0.0)],
)

In [None]:
# Create the validation set by holding out the first VALIDATION_SIZE training
# examples; the remaining examples are used for training.
#
# Both slices are taken from a fresh conversion of the untouched `train_data`
# split instead of from `train_features` itself. Slicing `train_features` in
# place would make this cell non-idempotent: every re-run would drop another
# VALIDATION_SIZE examples from the training set and change the hold-out set.
VALIDATION_SIZE = 10000
all_train_features, all_train_labels = tfds.as_numpy(train_data)

validation_features = all_train_features[:VALIDATION_SIZE]
validation_labels = all_train_labels[:VALIDATION_SIZE]

train_features = all_train_features[VALIDATION_SIZE:]
train_labels = all_train_labels[VALIDATION_SIZE:]

In [None]:
# Train the model for 20 epochs in mini-batches of 512 examples, passing the
# held-out validation features and labels as validation data. The object
# returned by fit() is kept so its recorded history can be plotted afterwards.
train_history = model.fit(
    x=train_features,
    y=train_labels,
    validation_data=(validation_features, validation_labels),
    batch_size=512,
    epochs=20,
    verbose=1,
)

In [None]:
# Visualization: pull the per-epoch series out of the training history, then
# plot the training loss against the validation loss.
history = train_history.history
loss, validation_loss = history['loss'], history['val_loss']
accuracy, validation_accuracy = history['accuracy'], history['val_accuracy']

# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(accuracy) + 1)

# Red dots for the training loss, a solid blue line for the validation loss.
plt.plot(epochs, loss, 'ro', label='Training Loss')
plt.plot(epochs, validation_loss, 'b', label='Validation Loss')

plt.title('Training and Validation Losses')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
# Visualization: plot the training accuracy against the validation accuracy.
plt.clf()  # start from a cleared figure before drawing the new data

# Blue dots for the training accuracy, a solid blue line for the validation accuracy.
accuracy_series = (
    (accuracy, 'bo', 'Training Accuracy'),
    (validation_accuracy, 'b', 'Validation Accuracy'),
)
for values, line_format, series_label in accuracy_series:
    plt.plot(epochs, values, line_format, label=series_label)

plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()