In [1]:
#import libraries
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

In [2]:
(train_dataset,test_dataset),ds_info = tfds.load('imdb_reviews',
                                                 with_info = True,
                                         split = ['train','test'],
                                                  as_supervised = True,
                                         shuffle_files = True,
                                         batch_size = 128
                                                 )

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQMTOWO/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQMTOWO/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteQMTOWO/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [3]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2023-10-06 03:47:09--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-10-06 03:47:10--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-10-06 03:47:11--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

## Create Glove Embedding Matrix

Got help from [here.](https://github.com/TheKidPadra/DeepLearning.AI-TensorFlow_Developer-specialization/blob/main/3.%20Natural%20Language%20Processing%20in%20TensorFlow/C3W3_Assignment.ipynb)

In [4]:
#read the glove embeding file
GLOVE_FILE = './glove.6B.100d.txt'

#initialize a dictionary
GLOVE_EMBEDDINGS = {}

#open the glove file and process it.
with open(GLOVE_FILE) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        GLOVE_EMBEDDINGS[word] = coefs

In [5]:
text_lengths = []
reviews = train_dataset.map(lambda x,y: x)

for a in reviews:
  text_lengths.append(len(a.numpy()[0].split()))

OUTPUT_SEQUENCE_LENGTH = int(np.percentile(text_lengths,95))
OUTPUT_SEQUENCE_LENGTH

725

In [6]:
#initialize a text vectorization layer to get the words
vectorizer = layers.TextVectorization(max_tokens = 10000,
                                      output_sequence_length = OUTPUT_SEQUENCE_LENGTH)

#adapt the vectorizer
vectorizer.adapt(train_dataset.map(lambda x,y: x))

words = vectorizer.get_vocabulary()

#control the process
words[0:5]

['', '[UNK]', 'the', 'and', 'a']

In [7]:

def text_vectorization(dataset,vectorizer):
  """
  Uses TextVectorization Layer outside of the model.
  The purpose is to make the NN model suitable to .h5 format.

  Parameters
  ----------
  dataset: Tensorflow dataset containing data and labels.
  vectorizer: A TextVectorization layer adapted to train_dataset
  """
  #conduct all the process in CPU to CPU-GPU conflict
  with tf.device("/cpu:0"):
    outputs = []
    label_list = []

    #tokenize and pad the data and store it
    for x, y in dataset:
        output = vectorizer(x)
        outputs.append(output)
        label_list.append(y)

    #concatenate the labels and data
    X_vectorized = tf.concat(outputs, axis=0)
    Y_labels = tf.concat(label_list, axis=0)

    vectorized = tf.data.Dataset.from_tensor_slices((X_vectorized, Y_labels)).batch(32).prefetch(tf.data.AUTOTUNE)
    return vectorized

def define_callbacks(model):
  es = tf.keras.callbacks.EarlyStopping(patience = 5,verbose = 1, restore_best_weights = True)
  mc = tf.keras.callbacks.ModelCheckpoint(filepath = f"./ModelCheckpoints/{model.name}.ckpt",
                                         save_best_only = True,
                                         save_weights_only = True)
  tb = tf.keras.callbacks.TensorBoard(log_dir = f"./TensorboardLogs/{model.name}")

  return es,mc,tb

In [8]:
train_dataset_vectorized1 = text_vectorization(train_dataset,vectorizer)
test_dataset_vectorized1 = text_vectorization(test_dataset,vectorizer)

In [9]:
train_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
#create a model with glove embeddings
MAX_TOKENS = 10000
EMBEDDING_DIM = 128

inputs = tf.keras.layers.Input(shape = (598,))
x = layers.Embedding(input_dim = MAX_TOKENS,
                     output_dim = EMBEDDING_DIM,
                     embeddings_initializer=tf.keras.initializers.Constant(GLOVE_EMBEDDINGS),
                     trainable = False)(inputs)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1,activation = 'sigmoid')

model = tf.keras.Model(inputs,outputs)

#compile the model
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy']
)

#define callbacks
es,mc,tb = define_callbacks(model)

#fit the model
history = model.fit(train_dataset_vectorized1,
                    validation_data = test_dataset_vectorized1,
                    epochs = 100,
                    callbacks = [es,mc])