In [1]:
# Common imports
import numpy as np
import os

In [2]:
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

In [3]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
(X_train, y_test), (X_valid, y_valid) = keras.datasets.imdb.load_data()
X_train.shape, y_test.shape, X_valid.shape, y_valid.shape

((25000,), (25000,), (25000,), (25000,))

In [5]:
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [6]:
word_index = keras.datasets.imdb.get_word_index()

# The integers 0, 1, and 2 are special: they represent the padding token, the start-of-sequence (SoS) and unknown words
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[id_] = token
" ".join([id_to_word[id_] for id_ in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [7]:
import tensorflow_datasets as tfds
# as_supervised: if True, the returned tf.data.Dataset will have a 2-tuple structure (input, label)
# with_info: if True, tfds.load will return the tuple (tf.data.Dataset, tfds.core.DatasetInfo) containing the info associated with the builder.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

In [8]:
datasets.keys()

dict_keys(['test', 'train', 'unsupervised'])

In [9]:
info

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=1.0.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word

In [10]:
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples

In [11]:
train_size, test_size

(25000, 25000)

In [12]:
for X_batch, y_batch in datasets["train"].batch(5).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, "= Positive" if label else "= Negative")
        print()

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative

Review: Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Moun ...
Label: 0 = Negative

Review: This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful perf ...
Label: 1 = Positive

Review: As others have mentioned, all the women that go nude in 

In [13]:
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.substr(X_batch, 0, 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

In [14]:
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(5, 59), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>', b'<pad>',
         b'<pad>', b'<pad>', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'c

In [15]:
from collections import Counter

vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [16]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [17]:
len(vocabulary)

53893

In [18]:
vocab_size = 10000
truncated_vocabulary = [
    word for word, count in vocabulary.most_common()[:vocab_size]]

In [19]:
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b"This movie was awesoommmee".split():
    print(word_to_id.get(word) or vocab_size)

22
12
11
10000


In [20]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [21]:
table.lookup(tf.constant([b"This movie was awesoommmee".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10429]])>

In [22]:
X_batch

<tf.Tensor: shape=(8, 60), dtype=string, numpy=
array([[b'Red', b'Eye', b'is', b'not', b'the', b'kind', b'of', b'movie',
        b"that's", b'going', b'to', b'win', b'the', b'Palme', b"D'or",
        b'but', b'Wes', b'Craven', b'has', b'never', b'been', b'that',
        b'kind', b'of', b'director', b'anyway', b'and', b'his',
        b'branding', b'is', b'a', b'good', b'indication', b'of', b'what',
        b'a', b'film', b'goer', b'can', b'expect', b'The', b'fact',
        b'that', b'Red', b'Eye', b'is', b'a', b'tight', b'little',
        b'undemanding', b'package', b'at', b'minutes', b'is', b'part',
        b'of', b'its', b'<pad>', b'<pad>', b'<pad>'],
       [b'What', b'is', b'left', b'of', b'Planet', b'Earth', b'is',
        b'populated', b'by', b'a', b'few', b'poor', b'and', b'starving',
        b'rag', b'tag', b'survivors', b'They', b'must', b'eat', b'bugs',
        b'and', b'insects', b'or', b'whatever', b'after', b'a',
        b'poison', b'war', b'or', b'something', b'has', b'nea

In [23]:
X_batch.shape

TensorShape([8, 60])

In [24]:
y_batch

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([1, 0, 0, 0, 1, 0, 0, 1])>

In [25]:
def encode_words(X_batch, y_batch):
    return table.lookup(X_batch), y_batch

In [27]:
train_set = datasets["train"].repeat().batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [47]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [44]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])

In [49]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 128)         1408000   
_________________________________________________________________
gru_12 (GRU)                 (None, None, 128)         99072     
_________________________________________________________________
gru_13 (GRU)                 (None, 128)               99072     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 1,606,273
Trainable params: 1,606,273
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [61]:
test_set = datasets["test"].repeat().batch(32).map(preprocess)
test_set = test_set.map(encode_words).prefetch(1)

a = model.predict(test_set.take(1))

In [62]:
for X_batch, y_batch in test_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  135    26    77 ...     0     0     0]
 [   74 10791   776 ...     0     0     0]
 [ 3078   755 10210 ...     0     0     0]
 ...
 [ 5703  9425  8036 ...     0     0     0]
 [  274     6    21 ...     0     0     0]
 [    6    98     9 ...     0     0     0]], shape=(32, 64), dtype=int64)
tf.Tensor([1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 1 1], shape=(32,), dtype=int64)


In [75]:
b = table.lookup(tf.constant([b"The movie is so bad and wasting my time".split()]))

In [76]:
model.predict(b)

array([[0.17164907]], dtype=float32)

In [73]:
b = table.lookup(tf.constant([b"The movie is awesome. I can watch it again and again".split()]))

In [74]:
model.predict(b)

array([[0.9461711]], dtype=float32)