In [None]:
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Vocabulary size handed to the IMDB loader through `num_words`.
max_words = 20_000
# Intended maximum sequence length, in tokens.
max_len = 200

# Load the IMDB reviews as integer-encoded sequences together with their labels.
(train_sequences, train_labels), (test_sequences, test_labels) = (
    tf.keras.datasets.imdb.load_data(num_words=max_words)
)


def preprocess(sequences, labels):
    """Return the sequences untouched, paired with the labels cast to int32.

    Args:
        sequences: the encoded sequences; passed through as-is (same object).
        labels: array-like object exposing ``astype`` (e.g. a numpy array).

    Returns:
        A ``(sequences, labels_int32)`` tuple.
    """
    labels_int32 = labels.astype(np.int32)
    return sequences, labels_int32

# Cast the labels of both splits to int32 (the sequences pass through unchanged),
# which matches the int32 label spec declared for the tf.data pipeline further down.
train_sequences, train_labels = preprocess(train_sequences, train_labels)
test_sequences, test_labels = preprocess(test_sequences, test_labels)

In [None]:
# Raw word -> index mapping that ships with the dataset.
vocabulary = tf.keras.datasets.imdb.get_word_index()

# `load_data()` was called with its default `index_from=3`, so every word id in
# the loaded sequences is shifted by 3 relative to `vocabulary`, and the ids
# 0, 1 and 2 are reserved for padding, start-of-sequence and out-of-vocabulary.
# Apply the same shift here so that both lookups agree with the actual sequences.
# (Despite their names, these dictionaries map whole words, not characters.)
index_offset = 3
char_to_ind = {word: ind + index_offset for word, ind in vocabulary.items()}
ind_to_char = {ind: word for word, ind in char_to_ind.items()}
ind_to_char.update({0: "<PAD>", 1: "<START>", 2: "<OOV>"})

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
def gen():
    """Yield one ``(sequence, label)`` pair per training example, in order."""
    yield from zip(train_sequences, train_labels)
def test_gen():
    """Yield one ``(sequence, label)`` pair per test example, in order."""
    yield from zip(test_sequences, test_labels)

In [None]:
# Both splits share one element layout: a variable-length int32 vector
# (the sequence) and a scalar int32 (the label).
example_signature = (
    tf.TensorSpec(shape=(None,), dtype=tf.int32),
    tf.TensorSpec(shape=(), dtype=tf.int32),
)

# Wrap the Python generators as datasets of single (sequence, label) examples.
train_data = tf.data.Dataset.from_generator(gen, output_signature=example_signature)
test_data = tf.data.Dataset.from_generator(test_gen, output_signature=example_signature)

Padded batch

In [None]:
# Alternative 1: plain padded batching (each batch is padded to its longest sequence).
# The result is stored under separate names on purpose: the bucketing cell below
# batches `train_data` / `test_data` itself and needs them to still yield single
# examples. Overwriting them here would make that cell bucket whole batches
# (tf.shape(sequence)[0] would be the batch size, not the sequence length) and
# hand rank-3 inputs to the model.
train_data_padded = train_data.padded_batch(32)
test_data_padded = test_data.padded_batch(32)
print(train_sequences.shape, test_sequences.shape)

(25000,) (25000,)


Bucketing

In [None]:
def _sequence_length(sequence, label):
    """Length of a single example, taken as the size of its first axis."""
    return tf.shape(sequence)[0]


# Bucket boundaries 50, 100, ..., 500; one batch size per bucket, plus one for
# the overflow bucket that collects everything at or beyond the last boundary.
buckets = list(range(50, 501, 50))
bucket_batch_size = [32 for _ in range(len(buckets) + 1)]

# NOTE(review): this has to run on datasets that still yield single
# (sequence, label) examples. On an already batched dataset the length function
# would return the batch size instead of the sequence length.
train_data = train_data.bucket_by_sequence_length(
    _sequence_length,
    bucket_boundaries=buckets,
    bucket_batch_sizes=bucket_batch_size,
)

test_data = test_data.bucket_by_sequence_length(
    _sequence_length,
    bucket_boundaries=buckets,
    bucket_batch_sizes=bucket_batch_size,
)

In [None]:
# NOTE!!
# You should probably still remove very long sequences (anything longer than
# some cutoff, e.g. `max_len` tokens) before converting the data to a dataset.

In [None]:
# here's a very simple toy example for a keras lstm
# the "hidden dimensions" are just randomly chosen. 
# you probably don't want to use a hidden size of 12 =) (but maybe it's actually really good?)


# Embedding comes first to replace one-hot vectors.
#    mask_zero=True to prevent computations on padded time steps.
# Then an arbitrary number of RNN layers.
# Deeper RNN layers take as input sequence the state sequence of the layer before,
# so all RNN layers except the last one should use return_sequences=True.
# Finally, a Dense layer for the output, since the output computation is *not*
# included in the RNN cells; all cells provided by Keras only compute the states.
# The output layer has 2 units, i.e. one logit per class (the labels are 0 and 1):
# the training cell uses SparseCategoricalCrossentropy(from_logits=True), which
# indexes the logits with the integer label, so a single output unit would leave
# label 1 out of range.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 20, mask_zero=True),
    tf.keras.layers.LSTM(12, return_sequences=True),
    tf.keras.layers.LSTM(15),
    tf.keras.layers.Dense(2),
])


# FYI: the active LSTM line below builds the same kind of layer as the two
# commented-out lines taken together (an LSTMCell wrapped in a generic RNN layer).
# The dedicated LSTM layer can use a much more efficient implementation, so it
# will be SOOO much faster. Try it yourself!
#rnn_cell = tf.keras.layers.LSTMCell(12)
#rnn = tf.keras.layers.RNN(rnn_cell, return_sequences=False)
rnn = tf.keras.layers.LSTM(units=12, return_sequences=False)

In [None]:
# Sparse categorical cross-entropy on raw logits: it takes integer class labels
# and expects the model to emit one logit per class.
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])

# Train for 5 epochs; the test split doubles as the validation set here.
history = model.fit(train_data, validation_data=test_data, epochs=5)

Epoch 1/5




ValueError: ignored

In [None]:
# calling RNN layers is easy!
# They take a dense float tensor of shape (batch, time, features), so the ragged
# `train_sequences` (an object array of variable-length lists) cannot be one-hot
# encoded directly. Take a handful of reviews, pad/truncate them to `max_len`,
# and one-hot encode only that small rectangle: encoding all 25000 reviews at
# depth `max_words` would not fit in memory anyway.
sample_batch = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences[:4], maxlen=max_len, padding="post", truncating="post")
one_hot_batch = tf.one_hot(sample_batch, depth=max_words)
# Index 0 is the padding value, so mask those steps out of the recurrence.
rnn(one_hot_batch, mask=tf.constant(sample_batch != 0))

ValueError: ignored