# FC book Ch. 11 Natural language processing. IMDB reviews.

## Prepare IMDB data

In [1]:
import os, pathlib, shutil, random

"""
DATA ALREADY PREPARED

base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir / category / fname)
"""

'\nDATA ALREADY PREPARED\n\nbase_dir = pathlib.Path("aclImdb")\nval_dir = base_dir / "val"\ntrain_dir = base_dir / "train"\nfor category in ("neg", "pos"):\n    os.makedirs(val_dir / category)\n    files = os.listdir(train_dir / category)\n    random.Random(1337).shuffle(files)\n    num_val_samples = int(0.2 * len(files))\n    val_files = files[-num_val_samples:]\n    for fname in val_files:\n        shutil.move(train_dir / category / fname, val_dir / category / fname)\n'

In [2]:
from tensorflow import keras
batch_size = 32
train_ds = keras.utils.text_dataset_from_directory("aclImdb/train", batch_size=batch_size )
val_ds = keras.utils.text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

2024-01-14 21:54:27.499605: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-14 21:54:27.499648: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-14 21:54:27.500841: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-14 21:54:27.507146: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 20000 files belonging to 2 classes.


2024-01-14 21:54:29.407539: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-14 21:54:29.445835: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-01-14 21:54:29.446247: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


## Listing 11.2
Displaying the shapes and dtypes of the first batch

In [3]:
for inputs, targets in train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
targets[0]: tf.Tensor(1, shape=(), dtype=int32)


## Listing 11.3
Preprocessing our datasets with a TextVectorization layer

In [4]:
from tensorflow.keras.layers import TextVectorization

text_vectorization = TextVectorization( max_tokens=20000, output_mode="multi_hot")
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
binary_1gram_train_ds = train_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map( lambda x, y: (text_vectorization(x), y),num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

## Listing 11.4
Inspecting the output of our binary unigram dataset

In [5]:
for inputs, targets in binary_1gram_train_ds:
    print("inputs.shape:", inputs.shape)
    print("inputs.dtype:", inputs.dtype)
    print("targets.shape:", targets.shape)
    print("targets.dtype:", targets.dtype)
    print("inputs[0]:", inputs[0])
    print("targets[0]:", targets[0])
    break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


## Listing 11.5
Our model-building utility

In [6]:
from tensorflow import keras
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    return model

## Listing 11.6
Training and testing the binary unigram model

In [7]:
model = get_model()
model.summary()



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [8]:

callbacks = [ keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True) ]
model.fit(binary_1gram_train_ds.cache(), validation_data=binary_1gram_val_ds.cache(), epochs=10, callbacks=callbacks)

model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Epoch 1/10
  1/625 [..............................] - ETA: 15:37 - loss: 0.6977 - accuracy: 0.5000

2024-01-14 21:55:29.153124: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f10388fda30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-01-14 21:55:29.153164: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 2060, Compute Capability 7.5
2024-01-14 21:55:29.158210: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-01-14 21:55:29.170753: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1705287329.207333   29883 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.881


## Listing 11.7
Configuring the TextVectorization layer to return bigrams

In [9]:
text_vectorization = TextVectorization( ngrams=2,max_tokens=20000,output_mode="multi_hot")

## Listing 11.8
Training and testing the binary bigram model

In [10]:
text_vectorization.adapt(text_only_train_ds)

binary_2gram_train_ds = train_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map( lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

model = get_model()
model.summary()

callbacks = [keras.callbacks.ModelCheckpoint("binary_2gram.keras",save_best_only=True)]

model.fit(binary_2gram_train_ds.cache(), validation_data=binary_2gram_val_ds.cache(), epochs=10, callbacks=callbacks)
model = keras.models.load_model("binary_2gram.keras")

print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.891


## Listing 11.9
Configuring the TextVectorization layer to return token counts

If you’re doing text classification, knowing how many times a word occurs in a sample
is critical: any sufficiently long movie review may contain the word “terrible” regard-
less of sentiment, but a review that contains many instances of the word “terrible” is
likely a negative one.

Here’s how you’d count bigram occurrences with the TextVectorization layer.

In [11]:
text_vectorization = TextVectorization( ngrams=2, max_tokens=20000, output_mode="count")

## Listing 11.10 
Configuring TextVectorization to return TF-IDF-weighted outputs

In [12]:
text_vectorization = TextVectorization(ngrams=2, max_tokens=20000, output_mode="tf_idf")

## Listing 11.11 
Training and testing the TF-IDF bigram model

In [13]:
text_vectorization.adapt(text_only_train_ds)
tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y),num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y),num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y),num_parallel_calls=4)

model = get_model()
model.summary()

callbacks = [keras.callbacks.ModelCheckpoint("tfidf_2gram.keras",save_best_only=True)]
model.fit(tfidf_2gram_train_ds.cache(), validation_data=tfidf_2gram_val_ds.cache(), epochs=10, callbacks=callbacks)
model = keras.models.load_model("tfidf_2gram.keras")

print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                320016    
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.894


## Listing 11.12 
Preparing integer sequence datasets

In [14]:
from tensorflow.keras import layers
max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization( max_tokens=max_tokens, output_mode="int", output_sequence_length=max_length)

text_vectorization.adapt(text_only_train_ds)
int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

## Listing 11.13 
A sequence model built on one-hot encoded vector sequences

In [15]:
import tensorflow as tf

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirection  (None, 64)                5128448   
 al)                                                             
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5128513 (19.56 MB)
Trainable params: 5128513 (19.56 MB)
Non-trainable params: 0 (0.00 Byte)
___________________

## Listing 11.14 
## Was not able to execute this listing
## Listings 11.21 - 11.24 Remains 

In [19]:
#callbacks = [ keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras", save_best_only=True) ]

In [18]:
#model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

In [25]:
#model = keras.models.load_model("one_hot_bidir_lstm.keras")
#print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")