In [1]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
import numpy as np
import random

#one shared seed for every random number generator the notebook relies on
SEED = 42
random.seed(SEED)         #python stdlib RNG
np.random.seed(SEED)      #numpy legacy global RNG
tf.random.set_seed(SEED)  #tensorflow global seed

In [2]:
#load the IMDB reviews train/test splits as (text, label) pairs in batches of 128;
#ds_info carries the dataset metadata returned because with_info is set
(train_dataset, test_dataset), ds_info = tfds.load(
    'imdb_reviews',
    split = ['train', 'test'],
    as_supervised = True,
    with_info = True,
    shuffle_files = True,
    batch_size = 128,
)

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteHBRPOI/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteHBRPOI/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteHBRPOI/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [3]:
def text_vectorization(dataset,vectorizer,batch_size = 32):
  """
  Uses TextVectorization Layer outside of the model.
  The purpose is to make the NN model suitable to .h5 format.

  The whole dataset is vectorized eagerly and held in memory, then
  re-sliced into a new batched dataset.

  Parameters
  ----------
  dataset: Tensorflow dataset containing data and labels.
  vectorizer: A TextVectorization layer adapted to train_dataset
  batch_size: Batch size of the returned dataset (defaults to 32).

  Returns
  -------
  A tf.data.Dataset yielding (vectorized data, labels) batches of
  `batch_size`, with prefetching enabled.

  Raises
  ------
  ValueError: If `dataset` yields no elements.
  """
  #conduct all the process in CPU to avoid CPU-GPU conflict
  with tf.device("/cpu:0"):
    outputs = []
    label_list = []

    #tokenize and pad the data and store it
    for x, y in dataset:
      outputs.append(vectorizer(x))
      label_list.append(y)

    #fail with a clear message instead of an opaque tf.concat error
    if not outputs:
      raise ValueError("text_vectorization received an empty dataset")

    #concatenate the labels and data
    X_vectorized = tf.concat(outputs, axis=0)
    Y_labels = tf.concat(label_list, axis=0)

    vectorized = (tf.data.Dataset.from_tensor_slices((X_vectorized, Y_labels))
                  .batch(batch_size)
                  .prefetch(tf.data.AUTOTUNE))
    return vectorized

def define_callbacks(model):
  """
  Builds the training callbacks for a model.

  Parameters
  ----------
  model: Keras model; only its `name` is read, to build the output paths.

  Returns
  -------
  Tuple (early stopping, model checkpoint, tensorboard) callbacks.
  The monitored quantity is left at the Keras default for each callback.
  """
  checkpoint_path = f"./ModelCheckpoints/{model.name}.ckpt"
  tensorboard_dir = f"./TensorboardLogs/{model.name}"

  #stop after 5 epochs without improvement and roll back to the best weights
  early_stopping = tf.keras.callbacks.EarlyStopping(
      patience = 5,
      verbose = 1,
      restore_best_weights = True,
  )
  #keep only the weights of the best epoch on disk
  model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
      filepath = checkpoint_path,
      save_best_only = True,
      save_weights_only = True,
  )
  tensorboard = tf.keras.callbacks.TensorBoard(log_dir = tensorboard_dir)

  return early_stopping,model_checkpoint,tensorboard

In [4]:
#measure the full vocabulary: adapt an uncapped vectorizer on the training texts only
vectorizer = layers.TextVectorization(max_tokens = None)
vectorizer.adapt(train_dataset.map(lambda review, _label: review))
total_words = vectorizer.vocabulary_size()

In [5]:
#collect the whitespace-token count of every training review;
#the distribution is used later to pick the output sequence length
train_dataset_unbatched = train_dataset.unbatch() #one review per element
text_lengths = [len(review.numpy().split()) for review, _ in train_dataset_unbatched]

In [6]:
#registries filled by the experiment cells: name -> model, name -> fit history
models = dict()
histories = dict()

In [7]:
#baseline experiment: embedding -> global average pooling -> sigmoid output
MAX_TOKENS = 20000
#pad/truncate reviews to the 95th percentile of the training review lengths
OUTPUT_SEQUENCE_LENGTH = int(tfp.stats.percentile(text_lengths,95).numpy())
EMBEDDING_DIM = 128
MODEL_NAME = "model1"
HISTORY_NAME = 'history1'

#define the actual vectorizer
vectorizer = tf.keras.layers.TextVectorization(max_tokens = MAX_TOKENS,
                                               output_sequence_length = OUTPUT_SEQUENCE_LENGTH)

vectorizer.adapt(train_dataset.map(lambda x,y: x))
#preprocess the data
train_dataset_vectorized1 = text_vectorization(train_dataset,vectorizer)
test_dataset_vectorized1 = text_vectorization(test_dataset,vectorizer)

#the input width must match the vectorizer's output length, so it is taken
#from OUTPUT_SEQUENCE_LENGTH rather than a hard-coded number
inputs = tf.keras.layers.Input(shape = (OUTPUT_SEQUENCE_LENGTH,))

x = layers.Embedding(input_dim = MAX_TOKENS,
                      output_dim = EMBEDDING_DIM)(inputs)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)

#create the base model
models[MODEL_NAME] = tf.keras.Model(inputs,outputs,name = MODEL_NAME)


#compile the model
models[MODEL_NAME].compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#fit the model
#NOTE(review): the test split is used as validation data, so the validation
#metrics are not an independent held-out estimate
histories[HISTORY_NAME] = models[MODEL_NAME].fit(train_dataset_vectorized1,
                                                 validation_data = test_dataset_vectorized1,
                                                 epochs = 5)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Add another Dense layer: did not work; overfitting increased.

In [8]:
#experiment: same as model1 plus one hidden Dense(64, relu) layer;
#reuses the datasets vectorized for model1 (same MAX_TOKENS and sequence length)
MAX_TOKENS = 20000
OUTPUT_SEQUENCE_LENGTH = int(tfp.stats.percentile(text_lengths,95).numpy())
EMBEDDING_DIM = 128
MODEL_NAME = "model2"
HISTORY_NAME = 'history2'


#input width follows the vectorizer's output length instead of a hard-coded number
inputs = tf.keras.layers.Input(shape = (OUTPUT_SEQUENCE_LENGTH,))

x = layers.Embedding(input_dim = MAX_TOKENS,
                      output_dim = EMBEDDING_DIM)(inputs)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64,activation = 'relu')(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)

#create the base model
models[MODEL_NAME] = tf.keras.Model(inputs,outputs,name = MODEL_NAME)

#compile the model
models[MODEL_NAME].compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#fit the model
histories[HISTORY_NAME] = models[MODEL_NAME].fit(train_dataset_vectorized1,
                                                 validation_data = test_dataset_vectorized1,
                                                 epochs = 5)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Decrease the tokens: worked; overfitting decreased.

In [9]:
#experiment: smaller vocabulary (10000 tokens) and smaller embedding (50 dims)
MAX_TOKENS = 10000
OUTPUT_SEQUENCE_LENGTH = int(tfp.stats.percentile(text_lengths,95).numpy())
EMBEDDING_DIM = 50
MODEL_NAME = 'model3'
HISTORY_NAME = 'history3'

#define the actual vectorizer
vectorizer = tf.keras.layers.TextVectorization(max_tokens = MAX_TOKENS,
                                               output_sequence_length = OUTPUT_SEQUENCE_LENGTH)
vectorizer.adapt(train_dataset.map(lambda x,y: x))

#preprocess the data
train_dataset_vectorized2 = text_vectorization(train_dataset,vectorizer)
test_dataset_vectorized2 = text_vectorization(test_dataset,vectorizer)

#input width follows the vectorizer's output length instead of a hard-coded number
inputs = tf.keras.layers.Input(shape = (OUTPUT_SEQUENCE_LENGTH,))
x = layers.Embedding(input_dim = MAX_TOKENS,
                      output_dim = EMBEDDING_DIM)(inputs)
x = layers.GlobalAveragePooling1D()(x)
#x = layers.Dense(64,activation = 'relu')(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)

#create the base model
models[MODEL_NAME] = tf.keras.Model(inputs,outputs,name = MODEL_NAME)


#compile the model
models[MODEL_NAME].compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#fit the model
histories[HISTORY_NAME] = models[MODEL_NAME].fit(train_dataset_vectorized2,
                                                 validation_data = test_dataset_vectorized2,
                                                 epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Try flattening: did not work; overfitting increased a lot.

In [10]:
#experiment: replace global average pooling with Flatten;
#reuses the datasets vectorized for model3 (same MAX_TOKENS and sequence length)
MAX_TOKENS = 10000
OUTPUT_SEQUENCE_LENGTH = int(tfp.stats.percentile(text_lengths,95).numpy())
EMBEDDING_DIM = 50
MODEL_NAME = 'model4'
HISTORY_NAME = 'history4'

#input width follows the vectorizer's output length instead of a hard-coded number
inputs = tf.keras.layers.Input(shape = (OUTPUT_SEQUENCE_LENGTH,))
x = layers.Embedding(input_dim = MAX_TOKENS,
                      output_dim = EMBEDDING_DIM)(inputs)
x = layers.Flatten()(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)

#create the base model
models[MODEL_NAME] = tf.keras.Model(inputs,outputs,name = MODEL_NAME)

#create the callbacks
es,mc,tb = define_callbacks(models[MODEL_NAME])

#compile the model
models[MODEL_NAME].compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#fit the model; the callbacks built above were previously never passed to fit,
#so no checkpoint or TensorBoard log was written for this model
histories[HISTORY_NAME] = models[MODEL_NAME].fit(train_dataset_vectorized2,
                                                 validation_data = test_dataset_vectorized2,
                                                 epochs = 5,
                                                 callbacks = [es,mc,tb])



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Use Convolution Layer:

In [11]:
#experiment: Conv1D(32 filters, kernel 3, swish) before global average pooling
MAX_TOKENS = 10000
OUTPUT_SEQUENCE_LENGTH = int(tfp.stats.percentile(text_lengths,95).numpy())
EMBEDDING_DIM = 50
MODEL_NAME = 'model5'
HISTORY_NAME = 'history5'
ACTIVATION = 'swish'

#define the actual vectorizer
vectorizer = tf.keras.layers.TextVectorization(max_tokens = MAX_TOKENS,
                                               output_sequence_length = OUTPUT_SEQUENCE_LENGTH)
vectorizer.adapt(train_dataset.map(lambda x,y: x))

#preprocess the data
train_dataset_vectorized2 = text_vectorization(train_dataset,vectorizer)
test_dataset_vectorized2 = text_vectorization(test_dataset,vectorizer)

#input width follows the vectorizer's output length instead of a hard-coded number
inputs = tf.keras.layers.Input(shape = (OUTPUT_SEQUENCE_LENGTH,))
x = layers.Embedding(input_dim = MAX_TOKENS,
                      output_dim = EMBEDDING_DIM)(inputs)
x = layers.Conv1D(filters = 32,kernel_size = 3,activation = ACTIVATION)(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)

#create the base model
models[MODEL_NAME] = tf.keras.Model(inputs,outputs,name = MODEL_NAME)

#define callbacks
es,mc,tb = define_callbacks(models[MODEL_NAME])

#compile the model
models[MODEL_NAME].compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#fit the model; epochs is only an upper bound, early stopping ends the run
histories[HISTORY_NAME] = models[MODEL_NAME].fit(train_dataset_vectorized2,
                                                 validation_data = test_dataset_vectorized2,
                                                 epochs = 100,
                                                 callbacks = [es,mc,tb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: early stopping


Train model 3 for longer.

In [12]:
#experiment: model3 architecture trained for up to 100 epochs with callbacks
MAX_TOKENS = 10000
OUTPUT_SEQUENCE_LENGTH = int(tfp.stats.percentile(text_lengths,95).numpy())
EMBEDDING_DIM = 50
MODEL_NAME = 'model6'
HISTORY_NAME = 'history6'

#define the actual vectorizer
vectorizer = tf.keras.layers.TextVectorization(max_tokens = MAX_TOKENS,
                                               output_sequence_length = OUTPUT_SEQUENCE_LENGTH)
vectorizer.adapt(train_dataset.map(lambda x,y: x))

#preprocess the data
train_dataset_vectorized2 = text_vectorization(train_dataset,vectorizer)
test_dataset_vectorized2 = text_vectorization(test_dataset,vectorizer)

#input width follows the vectorizer's output length instead of a hard-coded number
inputs = tf.keras.layers.Input(shape = (OUTPUT_SEQUENCE_LENGTH,))
x = layers.Embedding(input_dim = MAX_TOKENS,
                      output_dim = EMBEDDING_DIM)(inputs)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)

#create the base model
models[MODEL_NAME] = tf.keras.Model(inputs,outputs,name = MODEL_NAME)

#define callbacks
es,mc,tb = define_callbacks(models[MODEL_NAME])

#compile the model
models[MODEL_NAME].compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#fit the model; epochs is only an upper bound, early stopping ends the run
histories[HISTORY_NAME] = models[MODEL_NAME].fit(train_dataset_vectorized2,
                                                 validation_data = test_dataset_vectorized2,
                                                 epochs = 100,
                                                 callbacks = [es,mc,tb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 14: early stopping


Add dropout and Dense layer. Change Activation to swish

In [13]:
#experiment: hidden Dense(64, swish) layer followed by Dropout(0.2)
MAX_TOKENS = 10000
OUTPUT_SEQUENCE_LENGTH = int(tfp.stats.percentile(text_lengths,95).numpy())
EMBEDDING_DIM = 50
MODEL_NAME = 'model7'
HISTORY_NAME = 'history7'
ACTIVATION = 'swish'

#define the actual vectorizer
vectorizer = tf.keras.layers.TextVectorization(max_tokens = MAX_TOKENS,
                                               output_sequence_length = OUTPUT_SEQUENCE_LENGTH)
vectorizer.adapt(train_dataset.map(lambda x,y: x))

#preprocess the data
train_dataset_vectorized2 = text_vectorization(train_dataset,vectorizer)
test_dataset_vectorized2 = text_vectorization(test_dataset,vectorizer)

#input width follows the vectorizer's output length instead of a hard-coded number
inputs = tf.keras.layers.Input(shape = (OUTPUT_SEQUENCE_LENGTH,))
x = layers.Embedding(input_dim = MAX_TOKENS,
                      output_dim = EMBEDDING_DIM)(inputs)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64,activation = ACTIVATION)(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(1,activation = 'sigmoid')(x)

#create the base model
models[MODEL_NAME] = tf.keras.Model(inputs,outputs,name = MODEL_NAME)

#define callbacks
es,mc,tb = define_callbacks(models[MODEL_NAME])

#compile the model
models[MODEL_NAME].compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#fit the model; epochs is only an upper bound, early stopping ends the run
histories[HISTORY_NAME] = models[MODEL_NAME].fit(train_dataset_vectorized2,
                                                 validation_data = test_dataset_vectorized2,
                                                 epochs = 100,
                                                 callbacks = [es,mc,tb])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: early stopping


NOTE: LSTM does not work on the GPU when it is used with relu; Keras only uses the fast cuDNN LSTM kernel with the default tanh activation.

In [14]:
#!tensorboard dev upload --logdir /content/TensorboardLogs

In [15]:
#name of the experiment whose best checkpoint is evaluated
BEST_MODEL_NAME = 'model5'

#clone the model (architecture only; the clone gets freshly initialised weights)
final_model = tf.keras.models.clone_model(models[BEST_MODEL_NAME])

#load the best weights from the same relative location define_callbacks
#writes to, instead of an absolute Colab-only path
final_model.load_weights(f"./ModelCheckpoints/{models[BEST_MODEL_NAME].name}.ckpt")

#compile the model
final_model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#evaluate the test dataset
#NOTE(review): this split was also the validation data that selected the
#checkpoint, so the score is optimistic rather than a true held-out result
final_model.evaluate(test_dataset_vectorized2)



[0.28687766194343567, 0.886680006980896]

In [16]:
#!tensorboard dev list

In [17]:
#!tensorboard dev delete --experiment_id rvOlSmxeQMiBvT4XhMmLdw

In [18]:
#final_model.save('final_model.h5')