<a href="https://colab.research.google.com/github/devulapallia1/LSTM/blob/main/592ML_tf_keras_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# reference: www.tensorflow.org/text/tutorials/text_classification_rnn

import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

# Turn off the TFDS progress bars so the download/prepare step below
# does not flood the cell output.
tfds.disable_progress_bar()


In [None]:
# Load the IMDB reviews dataset as (text, label) pairs.
# Labels are binary sentiment: 1 = positive review, 0 = negative review.
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

train_dataset = dataset['train']
test_dataset = dataset['test']

# Show the structure of a single element: (string text, int64 label).
train_dataset.element_spec

Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...
Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [None]:
# Inspect the raw data: print the text and label of the first 4 reviews.
for example, label in train_dataset.take(4):
  review_text, review_label = example.numpy(), label.numpy()
  print('text: ', review_text)
  print('label: ', review_label)

text:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0
text:  b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rubbish. 

In [None]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Build both input pipelines from the raw splits held in `dataset`, not from
# `train_dataset` / `test_dataset` themselves. This keeps the cell idempotent:
# re-running it no longer batches an already-batched dataset.
#
# .shuffle:  randomly draws elements from a buffer of BUFFER_SIZE samples, so
#            the order in which reviews reach the network is randomised.
# .batch:    groups samples into chunks of BATCH_SIZE.
# .prefetch: prepares upcoming elements in the background while the current
#            ones are being consumed.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = dataset['test'].batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [None]:
# Peek at one shuffled training batch; only its first 4 reviews and
# their labels are printed.
for example, label in train_dataset.take(1):
  batch_texts, batch_labels = example.numpy(), label.numpy()
  print('texts: ', batch_texts[:4])
  print()
  print('labels: ', batch_labels[:4])

texts:  [b'"Phantasm" of 1979 was a highly atmospheric, creepy, scary and very original Horror flick, and, in one word, cult. The first sequel of 1988 was gory, witty, action-packed and highly entertaining. After the first sequel however, "Phantasm" creator Don Coscarelly apparently lacked new ideas. "Phantasm III - Lord Of The Dead" of 1994 is certainly not a complete failure, it even is quite entertaining, but there is no more originality, and the desperate attempts to bring in something new, are at times tiresome, which makes it quite disappointing in comparison to its predecessors. <br /><br />- SPOILERS - <br /><br />Quite in the beginning, we are introduced the secret behind the mysterious sentinel spheres (the brain-sucking flying silver balls) is unraveled. Thenceforward, a number of unnecessary and annoying new characters (such as Tim, a "Home Alone"-style little kid who happens to be great at shooting, an Rocky, a tough and super-cool nunchaku-swinging black chick with a crew

In [None]:
# TextVectorization reference:
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization

VOCAB_SIZE = 1000

# Text -> integer-token encoder with a vocabulary capped at VOCAB_SIZE entries.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# The vocabulary is learned from the review text only, so drop the labels
# before calling `adapt`.
train_text_only = train_dataset.map(lambda text, _label: text)
encoder.adapt(train_text_only)

# Show the first 100 vocabulary entries.
vocab = np.array(encoder.get_vocabulary())
vocab[:100]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but',
       'film', 'on', 'not', 'you', 'are', 'his', 'have', 'he', 'be',
       'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so',
       'like', 'her', 'just', 'or', 'about', 'has', 'if', 'out', 'some',
       'there', 'what', 'good', 'when', 'more', 'very', 'even', 'she',
       'my', 'no', 'up', 'would', 'which', 'only', 'time', 'really',
       'story', 'their', 'were', 'had', 'see', 'can', 'me', 'than', 'we',
       'much', 'well', 'been', 'get', 'will', 'into', 'also', 'because',
       'other', 'do', 'people', 'bad', 'great', 'first', 'how', 'most',
       'him', 'dont', 'made', 'then', 'movies', 'make', 'films', 'could',
       'way', 'them', 'any'], dtype='<U14')

In [None]:
# Encode a positive sample review into integer token ids.
example = ('The movie was cool. The animation and the graphics '
           'were out of this world. I would recommend this movie.')

# Keep at most the first 20 token ids. With the vocabulary shown above,
# 'the' maps to 2 and 'movie' maps to 18.
token_ids = encoder(example)
encoded_example = token_ids[:20].numpy()
encoded_example


array([  2,  18,  14, 652,   2, 737,   3,   2,   1,  66,  46,   5,  11,
       188,  10,  59, 368,  11,  18])

In [None]:

# Encode a negative sample review into integer token ids.
example = ('The movie was not good. The animation and the graphics '
           'were terrible. I would not recommend this movie.')

# Keep at most the first 20 token ids. As before, 'the' maps to 2 and
# 'movie' maps to 18.
token_ids = encoder(example)
encoded_example = token_ids[:20].numpy()
encoded_example


array([  2,  18,  14,  22,  50,   2, 737,   3,   2,   1,  66, 384,  10,
        59,  22, 368,  11,  18])

In [None]:
# Layers are listed in the order the data flows through them.
layer_stack = [
    # Raw strings -> integer token ids.
    encoder,
    # Token ids -> 64-dimensional vectors. `mask_zero=True` enables masking
    # on index 0 so variable sequence lengths are handled.
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True),
    # First bidirectional LSTM: returns the full sequence so that the next
    # recurrent layer has a sequence to consume.
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)),
    # Second bidirectional LSTM: returns only its final output.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # Fully connected hidden layer.
    tf.keras.layers.Dense(64, activation='relu'),
    # Single output unit with no activation.
    tf.keras.layers.Dense(1),
]

model = tf.keras.Sequential(layer_stack)

In [None]:
# Sanity check before training: run one review through the model.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')

sample_batch = np.array([sample_text])  # a batch holding a single review
predictions = model.predict(sample_batch)

# The model has not been trained yet, so this output is not meaningful.
print(predictions[0])

[0.00064196]


In [None]:
def logit_binary_accuracy(y_true, y_pred):
  """Per-example 0/1 correctness for a model that outputs raw logits.

  The model's last layer is `Dense(1)` with no activation, so `y_pred` holds
  logits. A logit above 0 corresponds to a probability above 0.5, hence the
  decision threshold here is 0 rather than 0.5.

  Args:
    y_true: binary labels (0 or 1) for the batch.
    y_pred: raw logits produced by the model for the batch.

  Returns:
    A 1-D float tensor with 1.0 where the thresholded prediction equals the
    label and 0.0 elsewhere.
  """
  # Flatten both tensors so that (batch,) labels and (batch, 1) logits are
  # compared element by element instead of being broadcast.
  logits = tf.reshape(y_pred, [-1])
  labels = tf.cast(tf.reshape(y_true, [-1]), logits.dtype)
  predicted = tf.cast(logits > 0.0, logits.dtype)
  return tf.cast(tf.equal(labels, predicted), logits.dtype)


# Choose the loss function, optimizer, and performance metric.
# `from_logits=True` matches the activation-free output layer. Plain
# metrics=['accuracy'] would threshold the logits at 0.5 and misreport
# accuracy, so a logit-aware metric is used. It is still reported under the
# name 'accuracy', keeping the history keys unchanged.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=[tf.keras.metrics.MeanMetricWrapper(
        logit_binary_accuracy, name='accuracy')])

# Train the model. Validation after each epoch uses only the first 30 batches
# of the test set to keep it quick.
history = model.fit(train_dataset, epochs=3,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Score one negative and one positive review with the trained model.
# Each review is sent as its own single-element batch.
negative_review = ('The movie was not good. The animation and the graphics '
                   'were terrible. I would not recommend this movie.')
positive_review = ('The movie was cool. The animation and the graphics '
                   'were out of this world. I would recommend this movie.')

for sample_text in (negative_review, positive_review):
  predictions = model.predict(np.array([sample_text]))
  print(predictions)

[[-0.00651081]]
