In [1]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  48.9M      0  0:00:01  0:00:01 --:--:-- 48.9M


In [2]:
import os, pathlib, shutil, random
from tensorflow import keras
batch_size = 32
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category, exist_ok = True)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)
text_only_train_ds = train_ds.map(lambda x, y: x)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [3]:
from tensorflow.keras import layers

max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [4]:
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'the',
 'a',
 'and',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'br',
 'was',
 'as',
 'for',
 'with',
 'but',
 'movie',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'who',
 'from',
 'so',
 'like',
 'her',
 'or',
 'just',
 'about',
 'has',
 'if',
 'out',
 'some',
 'there',
 'what',
 'good',
 'more',
 'when',
 'very',
 'my',
 'even',
 'she',
 'no',
 'up',
 'would',
 'which',
 'only',
 'time',
 'really',
 'story',
 'were',
 'their',
 'had',
 'see',
 'can',
 'me',
 'than',
 'we',
 'much',
 'well',
 'get',
 'been',
 'will',
 'into',
 'bad',
 'also',
 'because',
 'do',
 'first',
 'great',
 'people',
 'other',
 'how',
 'most',
 'him',
 'dont',
 'made',
 'then',
 'movies',
 'could',
 'films',
 'make',
 'them',
 'way',
 'any',
 'too',
 'after',
 'characters',
 'think',
 'watch',
 'many',
 'two',
 'being',
 'character',
 'seen',
 'never',
 'little',
 'plot',
 'best',
 'acting',
 'whe

In [5]:
for inputs, targets in int_train_ds:
    print(inputs[31])
    break

tf.Tensor(
[   10  1686    11   638   168  2385    15   170     5     1     9    14
    21     4  4155  2148   192    62    17  3094   249     4   168  1349
    45    23   178     6    68   140   164   138   104    57    21    59
   353  1187 14234  3848     4  1848  3591     9    63     7   646   259
   619   134    57    29   122   261    21     6     3    50    20   644
   101   391     3   383   250   120    11   120    63   159    70  1188
    10   796   144   334     4  2046 12739   709     6   138  1751   127
     3  3089    16   122   243    11 12073   198   109     1   532  2638
   325    18   130    10     1   484   301   403   104   366   314     6
    68     2   216  1987    44     6  1401     8     2    98     5  1696
     4   719     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0    

In [6]:
import tensorflow as tf
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirectiona  (None, 64)               5128448   
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 5,128,513
Trainable params: 5,128,513
Non-trainable params: 0
___________________________________________________

In [7]:
print(embedded)

KerasTensor(type_spec=TensorSpec(shape=(None, None, 20000), dtype=tf.float32, name=None), name='tf.one_hot/one_hot:0', description="created by layer 'tf.one_hot'")


In [None]:
callbacks = [
    keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=5, callbacks=callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/5