# Sentiment Analysis with Recurrent Neural Networks

### Import Data and Libraries

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [7]:
# Load the IMDB movie-review dataset through TensorFlow Datasets.
#   as_supervised=True -> every example is a (review text, label) tuple
#   with_info=True     -> the dataset metadata is returned as `info`
reviewsData, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

In [8]:
# Show the available splits; each one is a tf.data.Dataset of (text, label) pairs.
reviewsData

{'test': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'train': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>,
 'unsupervised': <PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>}

In [9]:
# Pick out the two labelled splits; the 'unsupervised' split is not used here.
trainData, testData = reviewsData['train'], reviewsData['test']

In [10]:
# Peek at the first five (review, label) pairs of the training split.
# Note: `feature` and `label` stay bound to the last pair after the loop, and
# `feature` is reused by later cells as a sample review.
for feature, label in trainData.take(5):
  print(
      f"feature (movie review): {feature}",
      f"label (sentiment): {label}",
      sep="\n",
  )

feature (movie review): b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label (sentiment): 0
feature (movie review): b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion

### Text Preprocessing

In [14]:
# Maximum size of the vocabulary learned by the vectorization layer. The cap
# includes the reserved padding ('') and out-of-vocabulary ('[UNK]') entries;
# words outside the most frequent tokens are encoded as '[UNK]'.
vocabSize = 1000

# Tokenize and Encode
# Steps performed by the TextVectorization layer:
# 1. Standardize each example (usually lowercasing + punctuation stripping)
# 2. Split each example into substrings (usually words)
# 3. Recombine substrings into tokens (usually ngrams)
# 4. Index tokens (associate a unique int value with each token)
# 5. Transform each example using this index, either into a vector of ints or a dense float vector.

encoded_vectors = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens=vocabSize)

# Keep only the review text (drop the labels): the vocabulary is learned from
# the training reviews alone.
featureTokens = trainData.map(lambda review, sentiment: review)

# adapt() expects a batched dataset; batching also avoids one adapt step per
# individual review. `featureTokens` itself is left unbatched.
encoded_vectors.adapt(featureTokens.batch(64))

In [16]:
# First 100 bytes of the sample review (`feature` is the last review left
# over from the preview loop above).
feature.numpy()[:100]

b'As others have mentioned, all the women that go nude in this film are mostly absolutely gorgeous. Th'

In [17]:
# The same sample review after vectorization: one integer token id per word
# (at most the first 100 ids are shown).
encoded_vectors(feature).numpy()[:100]

array([ 15, 377,  26,   1,  32,   2, 362,  12, 138,   1,   8,  11,  20,
        24, 639, 412,   1,   2, 114,  53,   1, 265,   2,   1,   5,   2,
       653,   1,  51, 346,  24, 184,  35, 178,   6,  28,   1,  19,  51,
        57, 346,  24, 184,  35, 396,   2,   1,   5,   4,   1, 336, 165,
       444,   3,   2, 444, 440,   4, 133,  63, 828,  72, 139,  32,  28,
        38,   1,  31,  11, 582,  27,  92, 202,  58,   2,   1,   6,   1,
        25, 281, 116])

In [18]:
# Learned vocabulary as a NumPy array; the position of a token in this array
# is the integer id the vectorization layer emits for it.
vocab = np.array(encoded_vectors.get_vocabulary())
vocab[:100]

array(['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i',
       'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but',
       'film', 'on', 'not', 'you', 'are', 'his', 'have', 'he', 'be',
       'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so',
       'like', 'her', 'just', 'or', 'about', 'has', 'if', 'out', 'some',
       'there', 'what', 'good', 'when', 'more', 'very', 'even', 'she',
       'my', 'no', 'up', 'would', 'which', 'only', 'time', 'really',
       'story', 'their', 'were', 'had', 'see', 'can', 'me', 'than', 'we',
       'much', 'well', 'been', 'get', 'will', 'into', 'also', 'because',
       'other', 'do', 'people', 'bad', 'great', 'first', 'how', 'most',
       'him', 'dont', 'made', 'then', 'movies', 'make', 'films', 'could',
       'way', 'them', 'any'], dtype='<U14')

### Building the Neural Network

In [None]:
# Sentiment classifier: raw review text in, probability of a positive review out.
modelRNN = tf.keras.Sequential()

# Add Encoder Layer
# The TextVectorization layer adapted in the preprocessing section turns each
# review string into a sequence of integer token ids.
modelRNN.add( encoded_vectors )

# Add Embedding Layer
# Maps every token id to a trainable 64-dimensional vector. input_dim must
# match the vocabulary size of the encoder. mask_zero=True makes downstream
# layers ignore id 0, which is the padding token ('') in the vocabulary.
modelRNN.add(
    tf.keras.layers.Embedding(
        input_dim=len(encoded_vectors.get_vocabulary()),
        output_dim=64,
        mask_zero=True
    )
)

# Add Bidirectional Layer
# Runs an LSTM over the sequence in both directions and combines the results.
modelRNN.add(
  tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(64)
  )
)

# Add Dense Layer
modelRNN.add(
  tf.keras.layers.Dense(64, activation='relu')
)

# Add Output Layer
# A single sigmoid unit so the model emits a probability in [0, 1]. This is
# what BinaryCrossentropy(from_logits=False) and the 'accuracy' metric
# (thresholded at 0.5) expect.
modelRNN.add(
  tf.keras.layers.Dense(1, activation='sigmoid')
)

In [None]:
# Configure training: loss, optimizer and reported metrics.
# from_logits=False means the loss interprets the model outputs as
# probabilities in [0, 1], so the output layer is expected to apply a sigmoid.
binaryLoss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

modelRNN.compile(
    optimizer='adam',
    loss=binaryLoss,
    metrics=['accuracy'],
)

### Train & Test

In [None]:
# Fit the model (i.e., train the model).
# Keras treats every element of a tf.data.Dataset as one batch, and the
# recurrent layer needs a leading batch dimension, so the individual
# (review, label) pairs are shuffled and grouped into batches of 64 first.
# TextVectorization pads the reviews of a batch to a common length.
modelRNN.fit(
    trainData.shuffle(10000).batch(64),
    epochs=1
)

### Metrics

In [None]:
# Evaluate on the held-out test split. The split holds individual
# (review, label) pairs, so it is batched before being handed to Keras.
# No shuffling is needed for evaluation.
# evaluate() returns the loss followed by the metrics registered in compile().
loss_test, accuracy_test = modelRNN.evaluate(testData.batch(64))

print('Loss', loss_test)
print('Accuracy', accuracy_test)

### Predict Sample Reviews

In [None]:
# A few hand-written reviews to sanity-check the trained model.
sampleMovieReviews = [
    'Watched the movie with the family. Everyone loved it! Funny, educating, and inspirational! Definitely recommend it watching.',
    'This is a movie?! More like a nightmare! What the hell did I just watch?? Where did you find the cast? Are they humans?',
    'I loved this movie very much! Would definitely watch again!',
]

# The model takes raw strings, so the reviews are passed as one array of text;
# predict() returns one output row per review.
reviewBatch = np.array(sampleMovieReviews)
predictedSentiments = modelRNN.predict(reviewBatch)

print(predictedSentiments)