# Sentiment Analysis with RNNs (Recurrent Neural Networks)

### 1. Import Libraries and Data

In [2]:
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds

### 2. Preview the Data and Text Preprocessing

In [3]:
reviews_data, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete4P4EKY/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete4P4EKY/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete4P4EKY/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [4]:
reviews_data

{'test': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 'train': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>,
 'unsupervised': <PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>}

In [5]:
train_data = reviews_data['train']
test_data = reviews_data['test']

In [6]:
# train_data.shape

In [7]:
# Preview 5 sample review from training data
for feature, label in train_data.take(5):
    print('feature: ', feature.numpy())
    print('label: ', label.numpy())

feature:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0
feature:  b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep because the film was rub

In [8]:
# Optimize for Memory

# .prefetch() will take the next batch of data from permanent storage,
# load it into temporary storage (RAM) while the model is learning
# from the preceeding batch.

# .batch() will set the number of data to be fed to the model at one go

# buffer_size in .shuffle() is the maximum number of tokens in any batch 

train_data = train_data.shuffle(buffer_size = 10000).batch(64).prefetch(tf.data.AUTOTUNE)

test_data = test_data.batch(64).prefetch(tf.data.AUTOTUNE)

In [9]:
# Text Preprocessing

# Max size of the vocabulary in the collection
vocabSize = 1000

# Encoder for converting text feature to encoded features
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens = vocabSize
)

# Apply the encoding
# We use .map() to extract all of the features (text reviews)
# so that we can encode them with the encoder.
feature_tokens = train_data.map( lambda review, sentiment: review )
encoder.adapt(feature_tokens)

In [10]:
# Before and After of Text Preprocessing

In [11]:
# feature_before_encoding = list(train_data.take(1))[0]
feature.numpy()

b'As others have mentioned, all the women that go nude in this film are mostly absolutely gorgeous. The plot very ably shows the hypocrisy of the female libido. When men are around they want to be pursued, but when no "men" are around, they become the pursuers of a 14 year old boy. And the boy becomes a man really fast (we should all be so lucky at this age!). He then gets up the courage to pursue his true love.'

In [12]:
encoder(feature)[0:100].numpy()

array([ 15, 377,  26,   1,  32,   2, 362,  12, 138,   1,   8,  11,  20,
        24, 639, 412,   1,   2, 114,  53,   1, 265,   2,   1,   5,   2,
       653,   1,  51, 346,  24, 184,  35, 178,   6,  28,   1,  19,  51,
        57, 346,  24, 184,  35, 396,   2,   1,   5,   4,   1, 336, 165,
       444,   3,   2, 444, 440,   4, 133,  63, 828,  72, 139,  32,  28,
        38,   1,  31,  11, 582,  27,  92, 202,  58,   2,   1,   6,   1,
        25, 281, 116])

### 3. Build the RNN Model

In [13]:
model_rnn = tf.keras.Sequential()

# Encoder Layer
# Preprocessing Text: (1) Vectorization, (2) Stop-Word Removal, (3) Encoding
model_rnn.add( encoder )

# Embedding Layer
# Reduce the dimensions of the data set
model_rnn.add(
    tf.keras.layers.Embedding(
        input_dim = len(encoder.get_vocabulary()),
        output_dim = 64,
        mask_zero = True
    )
 )

# Bidirectional LSTM (Long Short-Term Memory) Layer
model_rnn.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64)
    )
)

# Densely Connected Layer
model_rnn.add(
    tf.keras.layers.Dense(64, activation='relu')
)

# Output Layer
model_rnn.add(
    tf.keras.layers.Dense(1)
)

In [15]:
model_rnn.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy']
)

In [16]:
model_rnn.fit( train_data, epochs=1 )



<keras.callbacks.History at 0x7f538c93be10>

### 4. Evaluate the Metrics

In [17]:
# .evaluate() will have the model predict the labels
# for the test data. Then, it will compare the predicted labels
# again the actual labels, and calculate both the loss and accuracy
# of the model

# Train vs Test Metrics
# We do this to determine whether model is overfitting or not
loss_train, accuracy_train =  model_rnn.evaluate(train_data)
print('Loss for Train', loss_train)
print('Accuracy for Train', accuracy_train)


loss_test, accuracy_test =  model_rnn.evaluate(test_data)
print('Loss for Test', loss_test)
print('Accuracy for Test', accuracy_test)

Loss for Train 0.4996262788772583
Accuracy for Train 0.8438400030136108
Loss for Test 0.5493530035018921
Accuracy for Test 0.8329600095748901


### 5. Predict Sentiment of Text

In [23]:
sample_reviews = [
    "My eyes hurt. My ears are in pain. What just happened today. You call this a movie? You call the people actors? Forget my money. Just make me forgot I ever a movie starring ",
    "What a wonderful movie. I never slept so well in my life. The actors did not change their pitch, which drowned their voice.",
    "Wow, what a cast. Legendary movie. Great cinematography and music!",
    "Good movie. Not bad.",
    "What a movie. Starring Kristen Stewart and Rami Malek."
]

In [24]:
predicted_sentiments = model_rnn.predict(sample_reviews)

print(predicted_sentiments)

[[0.35946417]
 [0.98870605]
 [0.9030119 ]
 [0.04187424]
 [0.39204413]]
