<a href="https://colab.research.google.com/github/fboldt/aulasann/blob/main/aula12f_embedding_bidir_lstm_with_mask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup
!cat aclImdb/train/pos/4077_10.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  50.1M      0  0:00:01  0:00:01 --:--:-- 50.1M
I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drin

In [2]:
import os, pathlib, shutil, random
# Carve a validation set out of the training data: 20% of each class is moved
# from aclImdb/train/<class> to aclImdb/val/<class>.
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
train_pos_dir = train_dir / "pos"
train_neg_dir = train_dir / "neg"
val_pos_dir = val_dir / "pos"
val_neg_dir = val_dir / "neg"
for category in ("neg", "pos"):
  category_train_dir = train_dir / category
  category_val_dir = val_dir / category

  # Re-run safety: if this class was already split, do not move another 20%.
  # Files with the same name in train/ can only be re-extracted copies of the
  # validation reviews, so remove them to keep the two sets disjoint.
  if category_val_dir.is_dir():
    already_split = os.listdir(category_val_dir)
    if already_split:
      for fname in already_split:
        duplicate = category_train_dir / fname
        if duplicate.exists():
          duplicate.unlink()
      continue

  # os.listdir returns names in arbitrary order, so sort before the seeded
  # shuffle; otherwise the same seed selects different files per machine.
  files = sorted(os.listdir(category_train_dir))
  random.Random(1337).shuffle(files)
  num_val_samples = int(0.2 * len(files))
  # Explicit start index: files[-0:] would select *every* file when the
  # class has fewer than five samples.
  val_files = files[len(files) - num_val_samples:]

  os.makedirs(category_val_dir, exist_ok=True)
  for fname in val_files:
    shutil.move(category_train_dir / fname,
                category_val_dir / fname)

In [3]:
from tensorflow import keras
# Build one batched text dataset per split; the class labels come from the
# pos/ and neg/ sub-directories of each split folder.
batch_size = 32
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in ("aclImdb/train", "aclImdb/val", "aclImdb/test")
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [4]:
from tensorflow.keras.layers import TextVectorization
# Turn raw review strings into fixed-length sequences of integer token ids.
max_length = 600    # every review is cut or padded to this many tokens
max_tokens = 20000  # vocabulary size limit
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# The vocabulary is learned from the training texts only (labels dropped).
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)


def _vectorize_batch(text, label):
  """Replace the text of a (text, label) batch with its token ids."""
  return text_vectorization(text), label


int_train_ds, int_val_ds, int_test_ds = (
    split_ds.map(_vectorize_batch, num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [5]:
# Peek at the first vectorized training batch: shapes, dtypes and one sample.
for inputs, targets in int_train_ds.take(1):
  for caption, shown in (
      ("inputs.shape:", inputs.shape),
      ("targets.shape:", targets.shape),
      ("inputs.dtype:", inputs.dtype),
      ("targets.dtype:", targets.dtype),
      ("inputs[0]:", inputs[0]),
      ("targets[0]:", targets[0]),
  ):
    print(caption, shown)

inputs.shape: (32, 600)
targets.shape: (32,)
inputs.dtype: <dtype: 'int64'>
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(
[   10   283  2026     1     2    18   748     3    10   410     2    18
  2677     5     2   185    12  2026     1   105  1505     3  4050     1
     1     3     1   560    31     2   167   827     3    54     2  1559
     5   112     2    65     1  2264     1    44    47  6339    37     1
    16   451    72    69    26   247    54     1    23    24    53  4093
    13    13     2  1113     7  5878    34  2026   578     3   105  1370
    24     1     6     2  1270   112     7     1  4762     7     4   371
 12262    12     7   307   491    33    25   202   214    12   176   838
    89    98     1     8    25    99    28   883     1    87    28  2056
 13993     4  4310 13995    36   120    22   433    43 16377    42 16267
     3     7   165     6     2  2264  3937    12    28    69   684    27
  1945  4762  1033     6   873     1     3     8     2   749     5 

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Sentiment classifier: token ids -> embedding -> bidirectional LSTM -> sigmoid.
# Sequences of any length are accepted (shape=(None,)).
inputs = keras.Input(shape=(None,), dtype="int64")

# mask_zero=True turns token id 0 into a mask that is propagated to the
# recurrent layer instead of being processed as a real token.
embedded = layers.Embedding(
    input_dim=max_tokens,
    output_dim=256,
    mask_zero=True,
)(inputs)

# One LSTM reads the sequence forwards and a second one reads it backwards.
x = layers.Bidirectional(layers.LSTM(units=32))(embedded)
x = layers.Dropout(rate=0.5)(x)

# A single sigmoid unit for the binary positive/negative decision.
outputs = layers.Dense(units=1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [9]:
# Train for 10 epochs, keeping only the best checkpoint on disk, then reload
# that checkpoint and score it on the held-out test set.
checkpoint_path = "embedding_bi_lstm_with_mask.keras"
callbacks = [
    keras.callbacks.ModelCheckpoint(checkpoint_path, save_best_only=True),
]
history = model.fit(
    int_train_ds.cache(),
    validation_data=int_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# Evaluate the saved best model rather than the weights from the last epoch.
model = keras.models.load_model(checkpoint_path)
test_loss, test_acc = model.evaluate(int_test_ds)
print(f"Test Loss: {test_loss:.3f}")
print(f"Test Accuracy: {test_acc:.3f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 45ms/step - accuracy: 0.6845 - loss: 0.5666 - val_accuracy: 0.8160 - val_loss: 0.4023
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 41ms/step - accuracy: 0.8591 - loss: 0.3336 - val_accuracy: 0.8574 - val_loss: 0.3346
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 41ms/step - accuracy: 0.8968 - loss: 0.2660 - val_accuracy: 0.8814 - val_loss: 0.2995
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 40ms/step - accuracy: 0.9119 - loss: 0.2237 - val_accuracy: 0.8682 - val_loss: 0.3753
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 40ms/step - accuracy: 0.9356 - loss: 0.1781 - val_accuracy: 0.8694 - val_loss: 0.3393
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 40ms/step - accuracy: 0.9543 - loss: 0.1326 - val_accuracy: 0.8736 - val_loss: 0.3579
Epoch 7/10
[1m6