In [2]:
import logging

import numpy as np
import tensorflow as tf
from numpy import array
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing import sequence
# Disable the "tensorflow" logger so its log records do not clutter cell output.
tf_logger = logging.getLogger("tensorflow")
tf_logger.disabled = True

In [3]:
# Fetch the "IMDB Movie Review" data, restricting the vocabulary to the
# 10000 most commonly used words.
vocab_size = 10000
train_split, test_split = tf.keras.datasets.imdb.load_data(num_words=vocab_size)
x_train, y_train = train_split
x_test, y_test = test_split

# Human-readable class names, indexed by the integer class label (0 or 1).
class_names = ["Negative", "Positive"]

In [4]:
# Build the word -> integer lookup from the index shipped with the dataset.
# Every index is shifted up by 3 so that ids 0-3 are free for the "special"
# tokens below (this matches the offset load_data applies by default, per the
# Keras documentation).
word_index = {
    word: rank + 3
    for word, rank in tf.keras.datasets.imdb.get_word_index().items()
}

# Map the "special" ids onto human-readable placeholders.
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNKNOWN>": 2,
    "<UNUSED>": 3,
})

# Inverted lookup (integer -> word), used by decode_review.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}
def decode_review(text):
    """Turn an encoded review back into a space-separated string of words.

    Args:
        text: iterable of integer word ids, looked up in reverse_word_index.

    Returns:
        The decoded words joined by single spaces; any id that is missing
        from reverse_word_index is rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

In [5]:
# Concatenate the training and test reviews so that the statistics below
# describe the whole dataset.
allreviews = np.concatenate((x_train, x_test), axis=0)

# Review lengths across the combined dataset (computed once, reused below).
result = [len(review) for review in allreviews]
print("Maximum review length: {}".format(max(result)))
print("Minimum review length: {}".format(min(result)))
print("Mean review length: {}".format(np.mean(result)))

# Index of the review to display. Change this number to inspect a
# different review.
sample_index = 60
sample_review = x_train[sample_index]
sample_label = y_train[sample_index]

# The review and its class exactly as stored in the dataset.
print("Machine readable Review")
print("Review Text :" + str(sample_review))
print("Review Sentiment :" + str(sample_label))

# The same review and its class in human-readable form.
print("Human Readable Review")
print("Review Text: " + decode_review(sample_review))
print("Review Sentiment: " + class_names[sample_label])

Maximum review length: 2494
Minimum review length: 7
Mean review length: 234.75892
Machine readable Review
Review Text :[1, 13, 219, 14, 33, 4, 2, 22, 1413, 12, 16, 373, 175, 2711, 1115, 1026, 430, 939, 16, 23, 2444, 25, 43, 697, 89, 12, 16, 170, 8, 130, 262, 19, 32, 4, 665, 7, 4, 2, 322, 5, 4, 1520, 7, 4, 86, 250, 10, 10, 4, 249, 173, 16, 4, 3891, 6, 19, 4, 167, 564, 5, 564, 1325, 36, 805, 8, 216, 638, 17, 2, 21, 25, 100, 376, 507, 4, 2110, 15, 79, 125, 23, 567, 13, 2134, 233, 36, 4852, 2, 5, 81, 1672, 10, 10, 92, 437, 129, 58, 13, 69, 8, 401, 61, 1432, 39, 1286, 46, 7, 12]
Review Sentiment :0
Human Readable Review
Review Text: <START> i saw this at the <UNKNOWN> film festival it was awful every clichéd violent rich boy fantasy was on display you just knew how it was going to end especially with all the shots of the <UNKNOWN> wife and the rape of the first girl br br the worst part was the q a with the director writer and writer producer they tried to come across as <UNKNOWN> but you 

In [6]:
# Fixed length that every review is brought to.
review_length = 500

# Pad short reviews / truncate long ones so all sequences have the same length.
x_train = sequence.pad_sequences(x_train, maxlen=review_length)
x_test = sequence.pad_sequences(x_test, maxlen=review_length)

# Check the size of our datasets. Review data for both test and training
# should contain 25000 reviews of 500 integers. Class data should contain
# 25000 values, one for each review. Class values are 0 or 1, indicating a
# negative or positive review.
for caption, dataset in (
    ("Shape Training Review Data: ", x_train),
    ("Shape Training Class Data: ", y_train),
    ("Shape Test Review Data: ", x_test),
    ("Shape Test Class Data: ", y_test),
):
    print(caption + str(dataset.shape))

# Note: padding is added to the start of a review, not the end.
print("Human Readable Review Text (post padding): " + decode_review(x_train[60]))

Shape Training Review Data: (25000, 500)
Shape Training Class Data: (25000,)
Shape Test Review Data: (25000, 500)
Shape Test Class Data: (25000,)
Human Readable Review Text (post padding): <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <

In [7]:
# Build the network as a sequential stack of layers, listed in the order
# the data flows through them.
model = tf.keras.models.Sequential([
    # The Embedding layer provides a spatial mapping (word embedding) of all
    # the individual words in our training set. Words close to one another
    # share context and/or meaning. This mapping is learned during training.
    tf.keras.layers.Embedding(
        input_dim=vocab_size,        # The size of our vocabulary
        output_dim=32,               # Dimensions to which each word is mapped
        input_length=review_length,  # Length of input sequences
    ),
    # Dropout layers fight overfitting and force the model to learn multiple
    # representations of the same data by randomly disabling neurons in the
    # learning phase.
    tf.keras.layers.Dropout(rate=0.25),  # Randomly disable 25 % of neurons
    # NOTE(review): this is a second, identical Dropout applied back-to-back
    # with the one above, so only about 0.75 * 0.75 = 56 % of the embedding
    # activations survive during training. It looks like a copy-paste
    # duplicate, but it is kept because the recorded summary and metrics
    # below were produced with it - confirm whether it is intended.
    tf.keras.layers.Dropout(rate=0.25),  # Randomly disable 25 % of neurons
    # The LSTM layer looks at the sequence of words in the review, along with
    # their word embeddings, and uses both to determine the sentiment of a
    # given review. (tf.keras.layers.LSTM is expected to pick a GPU-optimised
    # implementation when one is available - see the TensorFlow docs.)
    tf.keras.layers.LSTM(units=32),  # 32 LSTM units in this layer
    # A further dropout layer after the LSTM, with the same aim as the ones
    # above.
    tf.keras.layers.Dropout(rate=0.25),  # Randomly disable 25 % of neurons
    # All LSTM units are connected to a single node in the dense layer. A
    # sigmoid activation function determines the output from this node - a
    # value between 0 and 1. Closer to 0 indicates a negative review, closer
    # to 1 indicates a positive review.
    tf.keras.layers.Dense(
        units=1,               # Single unit
        activation='sigmoid',  # Sigmoid activation (output from 0 to 1)
    ),
])

# Compile the model
model.compile(
    loss=tf.keras.losses.binary_crossentropy,  # Loss function
    optimizer=tf.keras.optimizers.Adam(),      # Optimiser function
    metrics=['accuracy'],                      # Reporting metric
)

# Display a summary of the model's structure
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           320000    
                                                                 
 dropout (Dropout)           (None, 500, 32)           0         
                                                                 
 dropout_1 (Dropout)         (None, 500, 32)           0         
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 328,353
Trainable params: 328,353
Non-trainable params: 0

In [8]:
# Train the LSTM on the training data
history = model.fit(
    # Training data: features (reviews) and classes (positive or negative)
    x=x_train,
    y=y_train,
    # Number of samples to work through before updating the internal model
    # parameters via back propagation. The higher the batch size, the more
    # memory you need.
    batch_size=256,
    # An epoch is one iteration over the entire training data.
    epochs=3,
    # The model will set apart this fraction of the training data, will not
    # train on it, and will evaluate the loss and any model metrics on this
    # data at the end of each epoch.
    validation_split=0.2,
    # Show training progress
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
# Get model predictions for the test data.
# classification_report is imported with the other dependencies in the import
# cell at the top of the notebook, so a missing scikit-learn install fails
# immediately instead of after the model has been trained.
predicted_scores = model.predict(x_test)

# The sigmoid output lies between 0 and 1: scores strictly above 0.5 become
# class 1 ("Positive"), everything else class 0 ("Negative").
predictions = (predicted_scores > 0.5).astype("int32")

# Per-class precision / recall / F1 against the true test labels.
print(classification_report(y_test, predictions, target_names=class_names))

              precision    recall  f1-score   support

    Negative       0.83      0.93      0.88     12500
    Positive       0.92      0.81      0.86     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000

