# IMDb Review Sentiment Analysis

In [1]:
# Importing required libraries

from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import numpy as np
import os

Using TensorFlow backend.


In [2]:
# Number of unique words
VOCAB_SIZE = 88584

# Max length of a review
MAXLEN = 250

BATCH_SIZE = 64

In [3]:
#Loading the data into training and testing sets

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words = VOCAB_SIZE)

In [4]:
# Observing a review
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 22665,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 21631,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 19193,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 10311,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 31050,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 12118,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5

# Preprocessing Data

In [5]:
# Padding the review data so that they are all the same length

train_data = sequence.pad_sequences(train_data, MAXLEN)
test_data = sequence.pad_sequences(test_data, MAXLEN)

# Building the Model

In [6]:
model = keras.Sequential([
            keras.layers.Embedding(VOCAB_SIZE, 32),          # word embedding layer
            keras.layers.LSTM(32),                           # long short term memory layer
            keras.layers.Dense(1, activation = "sigmoid")    # dense layer w/ sigmoid activation function
])

In [7]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          2834688   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,843,041
Trainable params: 2,843,041
Non-trainable params: 0
_________________________________________________________________


# Compilation & Training

In [8]:
model.compile(loss = "binary_crossentropy", optimizer = "rmsprop", metrics = ["accuracy"])

# Larger epochs were initially used, but validation accuracy did not significantly improve
trained_model = model.fit(train_data, train_labels, epochs = 4, validation_split = 0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [9]:
# Evaluation of the model

results = model.evaluate(test_data, test_labels)



In [10]:
print(results) # the model is roughly 85% accurate

[0.35944056918621065, 0.8566799759864807]


# Making Predictions

Since the IMDb reviews are encoded, any test review that is written for prediction needs to be encoded in the same way so the model can understand it. In order to do this,, the encodings from the imdb dataset will be loaded and used to encode new data.

In [11]:
# IMDb word mappings
word_index = imdb.get_word_index()

def encode_text(text):
    
    # converting text into tokens
    tokens = keras.preprocessing.text.text_to_word_sequence(text) 
    
    # if word in tokens is in the IMDb word index, replace location in list
    # otherwise 0 as an unknown placeholder
    tokens = [word_index[word] if word in word_index else 0 for word in tokens]
    
    return sequence.pad_sequences([tokens], MAXLEN)[0]

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [12]:
text = "That movie was absolutely incredible"
encoded = encode_text(text)
print(encoded)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [13]:
# Decode function

reverse_word_index = {value: key for (key, value) in word_index.items()}

def decode_integers(integers):
    PAD = 0
    text = ""
    for num in integers:
        if num != PAD:
            text += reverse_word_index[num] + " "
    
    return text[:-1]

In [14]:
# testing function with 'encoded' variable

print(decode_integers(encoded))

that movie was absolutely incredible


In [20]:
# Making an actual prediction

def predict(text):
    encoded_text = encode_text(text)
    pred = np.zeros((1, 250))         # normalized model reviews are 250 (MAXLEN) words
    pred[0] = encoded_text
    result = model.predict(pred)
    print(result[0])
    
    if result[0] > 0.5:
        print("This review is classified as postive.")
    else:
        print("This review is classified as negative.")

In [36]:
positive_review = "That movie was so awesome! I really loved it and I can't wait to watch it again!"
predict(positive_review)

negative_review = "That movie sucked, I hated it. One of the worst movies I've ever seen."
predict(negative_review)

[0.7857643]
This review is classified as postive.
[0.35191002]
This review is classified as negative.


In [39]:
neutral_review = "This movie was ok."
predict(neutral_review)

[0.48639053]
This review is classified as negative.
