In [10]:
import tensorflow_datasets as tfds
import tensorflow as tf

# load data

In [3]:
# fetch the IMDB reviews dataset as (review, label) pairs, together with its metadata
imdb, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.U7KDLS_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.U7KDLS_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.U7KDLS_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [7]:
# restrict the training split to its first element, then materialize it
first_train_slice = imdb['train'].take(1)
single_example = list(first_train_slice)[0]

In [6]:
# single_example is a (review, label) tuple; index 0 holds the review text
single_example[0]

<tf.Tensor: shape=(), dtype=string, numpy=b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.">

In [8]:
# index 1 of the (review, label) tuple holds the integer label
single_example[1]

<tf.Tensor: shape=(), dtype=int64, numpy=0>

# train test split

In [9]:
# pull the two supervised splits out of the loaded dataset dict
train_dataset = imdb['train']
test_dataset = imdb['test']

# every element is a (review, label) pair; project out one side at a time
train_reviews = train_dataset.map(lambda text, _target: text)
train_labels = train_dataset.map(lambda _text, target: target)

test_reviews = test_dataset.map(lambda text, _target: text)
test_labels = test_dataset.map(lambda _text, target: target)

* `map` is used to split each `(review, label)` pair into two separate datasets: one with only the reviews and one with only the labels

# encoding and padding

In [14]:
# initialize the layer
# max_tokens=1000 -> cap the vocabulary at 1000 entries (only the most frequent tokens are kept)
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=1000)

# adapt the layer on the training reviews -> this builds the vocabulary
vectorize_layer.adapt(train_reviews)

def padding_func(sequences, maxlen=120, truncating='post', padding='pre'):
  """Pad or truncate every sequence of a dataset to one common length.

  Designed to be passed to `tf.data.Dataset.apply`, which supplies only the
  dataset; use `functools.partial` to override the keyword defaults.

  Args:
    sequences: finite, non-empty `tf.data.Dataset` whose elements are token
      sequences that may differ in length.
    maxlen: length every output sequence is forced to.
    truncating: 'pre' or 'post' - which end is cut when a sequence is
      longer than `maxlen`.
    padding: 'pre' or 'post' - which end receives zeros when a sequence is
      shorter than `maxlen`.

  Returns:
    A `tf.data.Dataset` with one fixed-length sequence per input element.

  Raises:
    ValueError: if the dataset's cardinality is unknown, infinite or zero,
      since the whole dataset has to be gathered into a single batch.
  """

  # negative cardinalities are TensorFlow's markers for infinite/unknown sizes
  num_sequences = int(sequences.cardinality())
  if num_sequences < 1:
    raise ValueError(
        'padding_func needs a finite, non-empty dataset with known '
        f'cardinality, got cardinality {num_sequences}')

  # a ragged batch can hold sequences of different lengths; using the dataset
  # size as the batch size gathers every sequence into one ragged tensor
  sequences = sequences.ragged_batch(batch_size=num_sequences)
  sequences = sequences.get_single_element()

  # padding / truncation to a fixed length
  padded_sequences = tf.keras.utils.pad_sequences(sequences.numpy(), maxlen=maxlen,
                                                  truncating=truncating, padding=padding)
  # back to a dataset with one padded sequence per element
  padded_sequences = tf.data.Dataset.from_tensor_slices(padded_sequences)
  return padded_sequences

* ragged tensor -> a tensor whose rows (here, the sequences) can have different lengths
* `batch_size=sequences.cardinality()` -> the batch size equals the number of sequences, so the whole dataset is processed as one batch
* `maxlen=120, truncating='post', padding='pre'` -> every sequence ends up with the same length (120): a sequence that is too long is truncated at the end, and a sequence that is too short is pre-padded (zeros are added at the front).

In [15]:
# text -> integer token ids -> fixed-length padded sequences
train_sequences = (
    train_reviews
    .map(lambda review: vectorize_layer(review))
    .apply(padding_func)
)
test_sequences = (
    test_reviews
    .map(lambda review: vectorize_layer(review))
    .apply(padding_func)
)

In [16]:
# display the dataset's element spec to confirm each element is one padded sequence
train_sequences

<_TensorSliceDataset element_spec=TensorSpec(shape=(120,), dtype=tf.int32, name=None)>

# combine train and test data with the labels

In [17]:
# pair every padded sequence with its label again: elements become (sequence, label)
train_pair = (train_sequences, train_labels)
test_pair = (test_sequences, test_labels)

train_dataset_vectorized = tf.data.Dataset.zip(train_pair)
test_dataset_vectorized = tf.data.Dataset.zip(test_pair)

In [19]:
# input-pipeline settings
BATCH_SIZE = 32                          # examples per training step
SHUFFLE_BUFFER_SIZE = 1000               # size of the shuffle buffer
PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE  # let tf.data choose the prefetch depth

In [27]:
# Training pipeline: cache the vectorized data, shuffle the examples, then batch.
# prefetch is the LAST step so that complete batches are prepared in the
# background while the model works on the current one (prefetching before
# batch() would only buffer individual examples).
train_dataset_final = (train_dataset_vectorized
                       .cache()
                       .shuffle(SHUFFLE_BUFFER_SIZE)
                       .batch(BATCH_SIZE)
                       .prefetch(PREFETCH_BUFFER_SIZE)
                      )

# Evaluation pipeline: same as above but without shuffling, so the element
# order stays fixed.
test_dataset_final = (test_dataset_vectorized
                      .cache()
                      .batch(BATCH_SIZE)
                      .prefetch(PREFETCH_BUFFER_SIZE)
                    )

# modeling

In [43]:
# size of each learned word vector
embedding_dim = 16
# number of entries in the vocabulary learned by the vectorization layer
vocab_size = len(vectorize_layer.get_vocabulary())

In [44]:
# Embedding -> Flatten -> small dense classifier, assembled layer by layer
model = tf.keras.Sequential()
# each input is one padded sequence of 120 token ids
model.add(tf.keras.Input(shape=(120,)))
# map every token id to an embedding_dim-sized vector
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
# collapse the (sequence, embedding) grid into one flat vector
model.add(tf.keras.layers.Flatten())
# hidden layer
model.add(tf.keras.layers.Dense(6, activation='relu'))
# single sigmoid unit for binary classification
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

* the output of the embedding layer is a 2D array per review, with shape (sentence length, embedding dimension), e.g. (120, 16) here
* flatten it or use `GlobalAveragePooling1D()`
* feed into dense layer neural network to do classification

In [45]:
# the sigmoid output is paired with binary cross-entropy; accuracy is tracked as the metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# print the layer-by-layer overview
model.summary()

# train data

In [46]:
# train for a fixed number of passes over the training data and
# score the test split at the end of every epoch
num_epochs = 10
model.fit(
    train_dataset_final,
    validation_data=test_dataset_final,
    epochs=num_epochs,
)

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.6212 - loss: 0.6273 - val_accuracy: 0.7919 - val_loss: 0.4361
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8055 - loss: 0.4203 - val_accuracy: 0.8014 - val_loss: 0.4297
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8430 - loss: 0.3561 - val_accuracy: 0.7846 - val_loss: 0.4769
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8785 - loss: 0.2961 - val_accuracy: 0.7759 - val_loss: 0.5149
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9043 - loss: 0.2392 - val_accuracy: 0.7658 - val_loss: 0.5947
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9264 - loss: 0.1906 - val_accuracy: 0.7602 - val_loss: 0.6762
Epoch 7/10
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x7ed4ee169d10>

In [47]:
# loss and accuracy measured on the training data (returned as [loss, accuracy])
model.evaluate(train_dataset_final)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9954 - loss: 0.0281


[0.02304774336516857, 0.9966400265693665]

In [48]:
# sigmoid outputs of the model for every review in the test pipeline
model.predict(test_dataset_final)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


array([[9.4791508e-01],
       [9.9696141e-01],
       [1.2008879e-05],
       ...,
       [1.8286351e-04],
       [9.9448359e-01],
       [9.9916488e-01]], dtype=float32)

# see the embeddings

In [36]:
import io

In [39]:
# Standalone pipeline: raw text -> token ids (vectorize_layer) -> embedding vectors.
# NOTE(review): the Embedding layer below is newly constructed in this cell, so
# it is a different object from the Embedding layer inside `model`; nothing in
# this notebook trains it - confirm that this is intended before using its weights.
embedding_model = tf.keras.Sequential([
    vectorize_layer,
    tf.keras.layers.Embedding(input_dim=len(vectorize_layer.get_vocabulary()),
                              output_dim=embedding_dim)
])

In [41]:
# Take the embedding matrix from the TRAINED classifier (`model`).
# The standalone `embedding_model` holds a separately created Embedding layer
# that was never trained, so its weights are only the initial values and
# would make the exported vectors meaningless.
embedding_layer = next(
    (layer for layer in model.layers
     if isinstance(layer, tf.keras.layers.Embedding)),
    None,  # no Embedding layer found in the model
)

# check that the layer exists and actually has weights
if embedding_layer is not None and embedding_layer.weights:
  # the first weight array is the lookup table with one row per vocabulary entry
  embedding_weights = embedding_layer.get_weights()[0]
else:
  print("Warning: Embedding layer has no weights. The model might not have been trained or built correctly.")
  embedding_weights = None

In [42]:
# fail early with a clear message instead of leaving two empty files behind
if embedding_weights is None:
  raise ValueError('embedding_weights is None - extract the embedding weights before exporting')

vocabulary = vectorize_layer.get_vocabulary()

# `with` closes both files even if a write fails part-way through
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  # start at 1: index 0 is skipped (TextVectorization reserves it for padding)
  for word_num in range(1, len(vocabulary)):
    word_name = vocabulary[word_num]
    word_embedding = embedding_weights[word_num]
    # one word per line in meta.tsv, its tab-separated vector on the same line of vecs.tsv
    out_m.write(word_name + '\n')
    out_v.write('\t'.join([str(x) for x in word_embedding]) + '\n')


* `meta.tsv` -> words from vocabulary (as words' label in visualization)
* `vecs.tsv` -> vector from embedding layer (words' position in embedding spaces)
* Tool: [projector.tensorflow.org](https://projector.tensorflow.org/) (visualize and explore the semantic relationships between words)
