In [1]:
import numpy as np

In [2]:
import tensorflow as tf

In [4]:
from tensorflow import keras

In [6]:
tf.random.set_seed(42)

In [8]:
import tensorflow_datasets as tfds

In [9]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\Divyam\tensorflow_datasets\imdb_reviews\plain_text\1.0.0...[0m


HBox(children=(HTML(value='Dl Completed...'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='…

HBox(children=(HTML(value='Dl Size...'), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'…





HBox(children=(HTML(value='Generating splits...'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Generating train examples...'), FloatProgress(value=1.0, bar_style='info', layout=L…

HBox(children=(HTML(value='Shuffling imdb_reviews-train.tfrecord...'), FloatProgress(value=0.0, max=25000.0), …

HBox(children=(HTML(value='Generating test examples...'), FloatProgress(value=1.0, bar_style='info', layout=La…

HBox(children=(HTML(value='Shuffling imdb_reviews-test.tfrecord...'), FloatProgress(value=0.0, max=25000.0), H…

HBox(children=(HTML(value='Generating unsupervised examples...'), FloatProgress(value=1.0, bar_style='info', l…

HBox(children=(HTML(value='Shuffling imdb_reviews-unsupervised.tfrecord...'), FloatProgress(value=0.0, max=500…

[1mDataset imdb_reviews downloaded and prepared to C:\Users\Divyam\tensorflow_datasets\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


In [10]:
print(datasets.keys())

dict_keys([Split('train'), Split('test'), Split('unsupervised')])


In [12]:
train_size = info.splits["train"].num_examples
test_size = info.splits["test"].num_examples

print(train_size, test_size)

25000 25000


In [14]:
for X_batch, Y_batch in datasets["train"].batch(2).take(1):
    for review, label in zip(X_batch.numpy(), Y_batch.numpy()):
        print("Review:", review.decode("utf-8")[:200], "...")
        print("Label:", label, "= Positive" if label else "= Negative")
        print()
        

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting  ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However  ...
Label: 0 = Negative



In [20]:
def preprocess(X_batch, Y_batch):
    X_batch = tf.strings.substr(X_batch, 0 , 300)
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value = b"<pad>"), Y_batch

In [21]:
preprocess(X_batch, Y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [23]:
from collections import Counter 

In [24]:
vocabulary = Counter()

In [25]:
for X_batch, Y_batch in datasets["train"].batch(2).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [26]:
vocabulary.most_common()[:5]

[(b'<pad>', 63155),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431)]

In [27]:
len(vocabulary )

53893

In [28]:
vocab_size = 10000

In [30]:
truncated_vocabulary = [word for word, count in vocabulary.most_common()[:vocab_size]]

In [31]:
words = tf.constant(truncated_vocabulary)

In [32]:
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)

In [33]:
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)

In [34]:
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [35]:
table.lookup(tf.constant([b"This movie was faaaaaantastic".split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]], dtype=int64)>

In [36]:
def encode_words(X_batch, Y_batch):
    return table.lookup(X_batch), Y_batch

In [38]:
train_set = datasets["train"].repeat().batch(32).map(preprocess)

In [41]:
train_set = train_set.map(encode_words).prefetch(1)

In [43]:
test_set = datasets["test"].batch(1000).map(preprocess)

In [44]:
test_set = test_set.map(encode_words)

In [45]:
for X_batch , Y_batch in train_set.take(1):
    print(X_batch)
    print(Y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [46]:
embed_size = 128

In [49]:
from tensorflow.keras import Model

In [52]:
model = keras.models.Sequential([keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                                                      mask_zero = True,
                                                      input_shape = [None]),
                               keras.layers.GRU(4, return_sequences = True),
                               keras.layers.GRU(2),
                               keras.layers.Dense(1, activation="sigmoid")])

In [53]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [54]:
import time

In [55]:
start = time.time()

In [57]:
model.fit(train_set, steps_per_epoch = train_size //32, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x21e75a619a0>

In [58]:
end = time.time()

In [59]:
print(end-start)

160.04403018951416


In [60]:
model.evaluate(test_set)



[0.5337879061698914, 0.7559599876403809]