# Sentiment Analysis with Recurrent Neural Networks (RNN)

### Import Data and Libraries

In [1]:
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt

In [2]:
# TensorFlow provides sample datasets for experimenting
# as_supervised will give you the collection for supervised learning
reviewsData, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZUDTOX/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZUDTOX/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteZUDTOX/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [3]:
# Preview the 'reviewsData'
reviewsData

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [4]:
# Extract from 'reviewsData' the train and test datasets
trainData = reviewsData['train']
testData = reviewsData['test']

In [5]:
for feature, label in trainData.take(5):
  print('feature (text): ', feature.numpy())
  print('label (binary): ', label.numpy())

feature (text):  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label (binary):  0
feature (text):  b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep b

In [6]:
# bufferSize the maximum number of tokens (words) in any text feature
bufferSize = 10000
# batchSize is the number of samples to propogate through the network
batchSize = 64

In [7]:
# .shuffle() will randomize the dataset
# .batch() set the batch size of sampling
# .prefetch() will cache the data while an existing batch is being propogated

trainData = trainData.shuffle( buffer_size=bufferSize ).batch( batchSize ).prefetch(tf.data.AUTOTUNE)

testData = testData.batch( batchSize ).prefetch(tf.data.AUTOTUNE)

### Text Preprocessing

In [8]:
# Vocabulary is the total number of unique
# words in the collection
vocabSize = 1000

# Tokenization
# Split the words into a vector of tokens
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=vocabSize)

# Text Encoding
# Use the .map() method extract only the features (reviews) and exlude the label (sentiment)
featureTokens = trainData.map( lambda review, sentiment:  review)
encoder.adapt(featureTokens)

In [9]:
# Preview some of the terms before encoding
feature.numpy()[0:100]

b'As others have mentioned, all the women that go nude in this film are mostly absolutely gorgeous. Th'

In [10]:
# Preview some of the terms after encoding
encoder(feature)[0:100].numpy()

array([ 15, 377,  26,   1,  32,   2, 362,  12, 138,   1,   8,  11,  20,
        24, 639, 412,   1,   2, 114,  53,   1, 265,   2,   1,   5,   2,
       653,   1,  51, 346,  24, 184,  35, 178,   6,  28,   1,  19,  51,
        57, 346,  24, 184,  35, 396,   2,   1,   5,   4,   1, 336, 165,
       444,   3,   2, 444, 440,   4, 133,  63, 828,  72, 139,  32,  28,
        38,   1,  31,  11, 582,  27,  92, 202,  58,   2,   1,   6,   1,
        25, 281, 116])

In [11]:
# Extract the vocabulary as NumPy array
# and preview of the vocabulary words
vocab = np.array( encoder.get_vocabulary() )
vocab[0:100]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but',
       'film', 'on', 'not', 'you', 'are', 'his', 'have', 'he', 'be',
       'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so',
       'like', 'her', 'just', 'or', 'about', 'has', 'if', 'out', 'some',
       'there', 'what', 'good', 'when', 'more', 'very', 'even', 'she',
       'my', 'no', 'up', 'would', 'which', 'only', 'time', 'really',
       'story', 'their', 'were', 'had', 'see', 'can', 'me', 'than', 'we',
       'much', 'well', 'been', 'get', 'will', 'into', 'also', 'because',
       'other', 'do', 'people', 'bad', 'great', 'first', 'how', 'most',
       'him', 'dont', 'made', 'then', 'movies', 'make', 'films', 'could',
       'way', 'them', 'any'], dtype='<U14')

### Building the Deep Learning Model

In [12]:
modelRNN = tf.keras.Sequential()

# Add Encoder Layer
modelRNN.add( encoder )

# Add Embedding Layer
modelRNN.add(
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        mask_zero=True
    )   
)

# Add Bidirectional Layer
modelRNN.add(
  tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(64)
  )
)

# Add Dense Layer
modelRNN.add(
  tf.keras.layers.Dense(64, activation='relu')   
)

# Add Output Layer
modelRNN.add(
  tf.keras.layers.Dense(1)   
)

In [13]:
# Compile the model
modelRNN.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy']
)

In [14]:
# Fit the model (i.e, train the model)
modelRNN.fit( trainData, epochs=1 )



<keras.callbacks.History at 0x7f3caf968fd0>

In [15]:
loss_test, accuracy_test = modelRNN.evaluate(testData)

print('Loss', loss_test)
print('Accuracy', accuracy_test)

Loss 7.631711959838867
Accuracy 0.5030400156974792


In [31]:
sampleMovieReviews = [
    'Watched the movie with the family. Everyone loved it! Funny, educating, and inspirational! Definitely recommend it watching.',
    'This is a movie?! More like a nightmare! What the hell did I just watch?? Where did you find the cast? Are they humans?',
    'I loved this movie very much! Would definitely watch again!',
]

predictedSentiments = modelRNN.predict( np.array(sampleMovieReviews) )

print( predictedSentiments )

[[0.7079204 ]
 [0.02038823]
 [0.67212397]]
