In [144]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): no cell visible in this notebook reads from the mounted drive - confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [147]:
import warnings
warnings.filterwarnings('ignore')

from tensorflow.keras.datasets import imdb
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from random import randint
from zipfile import ZipFile
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords 

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow.keras.backend as K
import tensorflow.keras.preprocessing as preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dropout, Flatten, Dense, Embedding, LSTM
from tensorflow.keras.layers import Input, Bidirectional
from tensorflow.keras.constraints import max_norm, unit_norm
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [148]:
# Download the NLTK stopword corpus (the captured output shows it is skipped when already up to date).
# NOTE(review): no cell visible in this notebook uses the stopword list - confirm it is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## **1. Import and analyse the data set**

In [149]:
%%time

# Import and analyse the data set.
# vocab_size: size of the vocabulary, i.e. how many distinct word ids are kept.
# maxlen: number of words kept from each review; set to 100 to train the network better.
vocab_size = 10000
maxlen = 100

# num_words = vocab_size restricts the data set to the most frequent words
# (word ids are ordered by frequency).
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocab_size)

CPU times: user 4.6 s, sys: 539 ms, total: 5.14 s
Wall time: 5.25 s


In [150]:
# Print the shape of train and test data.
# At this stage every split is 1-D: one entry per review, each review being a
# variable-length sequence of word ids.
print("Shape of X_train:",X_train.shape)
print("Shape of y_train:",y_train.shape)
print("Shape of X_test:",X_test.shape)
print("Shape of y_test:",y_test.shape)

Shape of X_train: (25000,)
Shape of y_train: (25000,)
Shape of X_test: (25000,)
Shape of y_test: (25000,)


In [151]:
# Print the unique labels in y_train to confirm this is a two-class problem.
print("Labels: ", np.unique(y_train))

Labels:  [0 1]


In [152]:
# Longest review (in word ids) of each split, measured before any padding/truncation.
print("Max length of sequence in X_train: ", max(map(len, X_train)))
print("Max length of sequence in X_test: ", max(map(len, X_test)))

Max length of sequence in X_train:  2494
Max length of sequence in X_test:  2315


## **2. Perform relevant sequence padding on the data.**

In [153]:
# Average number of word ids per review in the training split.
np.mean(list(map(len, X_train)))

238.71364

In [154]:
# Perform relevant sequence padding on the data.
# Make all sequences the same length: every review becomes exactly maxlen (100) ids.
# With the Keras defaults, shorter reviews are zero-padded and longer ones truncated, both at the front.
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test =  pad_sequences(X_test, maxlen=maxlen)

## **3. Perform following data analysis**

### **Print shape of features and labels**

In [155]:
# Print shape of features and labels.
# After padding the features are 2-D (reviews x maxlen); the labels stay 1-D.
print("Shape of features:")
print ('X_train after padding contains', X_train.shape[0], 'rows and', X_train.shape[1], 'columns.')
print ('X_test after padding contains', X_test.shape[0], 'rows and', X_test.shape[1], 'columns.')
print("\n")
print("Shape of labels:")
print ('y_train contains', y_train.shape[0], 'rows.')
print ('y_test contains', y_test.shape[0], 'rows.')

Shape of features:
X_train after padding contains 25000 rows and 100 columns.
X_test after padding contains 25000 rows and 100 columns.


Shape of labels:
y_train contains 25000 rows.
y_test contains 25000 rows.


In [156]:
# Sanity checks on the padded training data: the largest word id should be below
# vocab_size, and the longest row should be exactly maxlen.
print('Maximum value of a word index:', max(max(sequence) for sequence in X_train))
print('Maximum length num words of review in train:', max(len(sequence) for sequence in X_train))

Maximum value of a word index: 9999
Maximum length num words of review in train: 100


In [157]:
# Count how often each label occurs across the train and test splits combined.
# NOTE(review): `unique` and `counts` are not used by any cell visible in this notebook.
unique, counts = np.unique(np.concatenate((y_train, y_test)), return_counts=True)
# Lookup table used to display labels as Positive/Negative rather than 1/0.
sentiments = {1: 'Positive', 0: 'Negative'}

### **Print value of any one feature and its label**

In [158]:
# Print the number of word ids in two fixed sample reviews (index 0 and index 100).
# Note: both are 100 long because every review was padded/truncated to maxlen = 100.

print("1st reveiw in dataset", X_train[0].shape[0])
print("100th reveiw in dataset", X_train[100].shape[0])

1st reveiw in dataset 100
100th reveiw in dataset 100


In [159]:
# Print the value of any one feature and its label.
# random.randint includes BOTH endpoints, so the upper bound must be the last valid
# row index: randint(0, 25000) could return 25000 and raise IndexError on X_train[loc].
loc = randint(0, len(X_train) - 1)
print('Value:', loc)

# Print the value of the feature at the loc location
print("Value of the feature:", X_train[loc])

Value: 13721
Value of the feature: [ 688    8    4  192   15   12   80   30 8326   34    2  108   13  586
  683 3685  808    6   78   22   21   94   24   66  441  345   42   33
  222   12  218    8  259  120    4  559    7 1086 3685  808  218    6
  441   22   88  164   44   12    9  204  285   43  166   25  140   51
  774   13  110   14  159 3685  808 7247  125    5  497    8 2208   11
   23  285   39    2 1026    8  344  584   63 2886  195  199   28    2
 9213   11   12 3685  808  218    6   78   20   21   12 1015    8 5475
   83 7649]


In [160]:
# Show the label of the same sample, translated to Positive/Negative via `sentiments`.
print("Value of the label:", sentiments.get(y_train[loc]))

Value of the label: Negative


## **4. Decode the feature value to get original sentence**

In [161]:
# Decode the feature value to get original sentence
def decode_feature(sequence):
    """Print the words of an encoded review, separated by single spaces.

    Args:
        sequence: iterable of integer word ids as stored in X_train / X_test.

    Each id is shifted down by 3 before it is looked up in the reversed IMDB
    word index; ids with no entry after the shift are printed as '?'.
    Nothing is returned - the decoded text is written to stdout.
    """
    word_to_id = imdb.get_word_index()
    id_to_word = {word_id: word for word, word_id in word_to_id.items()}
    words = []
    for idx in sequence:
        words.append(id_to_word.get(idx - 3, '?'))
    print(' '.join(words))

In [162]:
# Decode the sample picked earlier (index `loc`) and show its sentiment label next to the text.
decode_feature(X_train[loc])
print('\n')
print('The sentiment for the above review is:', sentiments.get(y_train[loc]))
print('\n')

due to the fact that it will be overshadowed by ? films i wouldn't call blank check a bad film but its not really entertaining either or at least it isn't to anyone over the age of 6 blank check isn't a entertaining film because nothing about it is original everything just makes you go what haven't i seen this before blank check rips off and tries to cash in on everything from ? rich to home alone which strangely enough both have ? culkin in it blank check isn't a bad movie but it deserves to fade into obscurity


The sentiment for the above review is: Negative




## **5. Design, train, tune and test a sequential model.**

In [163]:
def base_model():
    """Build and compile the baseline classifier.

    Architecture: a 100-dimensional embedding over `vocab_size` word ids for
    inputs of length `maxlen`, flattened and fed to a single sigmoid unit.
    Compiled with the Adam optimizer, binary cross-entropy loss and accuracy
    as the reported metric.

    Returns:
        The compiled (untrained) Sequential model.
    """
    classifier = Sequential([
        Embedding(input_dim = vocab_size, output_dim = 100, input_length = maxlen),
        Flatten(),
        Dense(1, activation = 'sigmoid'),
    ])
    classifier.compile(
        optimizer = 'adam',
        loss = 'binary_crossentropy',
        metrics = ['accuracy'],
    )
    return classifier

In [164]:
def lstm_model():
    """Build and compile the LSTM classifier.

    Architecture: 100-dimensional embedding -> dropout (0.2) -> LSTM with 128
    units (tanh, input dropout 0.2, unit-norm constraints on kernel, recurrent
    kernel and bias) -> Dense(150, relu) -> dropout (0.2) -> single sigmoid unit.
    Compiled with the Adam optimizer, binary cross-entropy loss and accuracy
    as the reported metric.

    Returns:
        The compiled (untrained) Sequential model.

    Note: a later cell rebinds the global name `lstm_model` to the returned
    model, so this builder is no longer reachable under that name afterwards.
    """
    network = Sequential([
        Embedding(input_dim = vocab_size, output_dim = 100, input_length = maxlen),
        Dropout(0.2),
        LSTM(
            128,
            activation = 'tanh',
            dropout = 0.2,
            kernel_constraint = unit_norm(),
            recurrent_constraint = unit_norm(),
            bias_constraint = unit_norm(),
        ),
        Dense(150, activation = 'relu'),
        Dropout(0.2),
        Dense(1, activation = 'sigmoid'),
    ])
    network.compile(
        optimizer = 'adam',
        loss = 'binary_crossentropy',
        metrics = ['accuracy'],
    )
    return network

In [165]:
model = base_model()

In [166]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 flatten_5 (Flatten)         (None, 10000)             0         
                                                                 
 dense_8 (Dense)             (None, 1)                 10001     
                                                                 
Total params: 1,010,001
Trainable params: 1,010,001
Non-trainable params: 0
_________________________________________________________________


In [167]:
%%time

# Fit the baseline model for 15 epochs with a batch size of 512.
# validation_split = 0.2 holds out 20% of the training data for validation.
model.fit(X_train, y_train, epochs = 15, batch_size = 512, validation_split = 0.2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CPU times: user 36.6 s, sys: 1.62 s, total: 38.2 s
Wall time: 41.7 s


<keras.callbacks.History at 0x7f75f90a0bd0>

In [168]:
# Evaluate the baseline model on the test set.
# The model was compiled with one loss and one metric, so evaluate() returns
# (loss, accuracy): `scores` holds the binary cross-entropy loss, not a score list.
scores, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Score :", scores)
print("Accuracy :", accuracy)

Score : 0.3921610414981842
Accuracy : 0.8475199937820435


In [169]:
# Compare training vs. test performance of the baseline model; a large gap
# between the two indicates over-fitting.
train_loss, train_acc = model.evaluate(X_train, y_train)
print('Training Loss: %.4f and Accuracy: %.2f%%' % (train_loss, train_acc * 100))

test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test Loss: %.4f and Accuracy: %.2f%%' % (test_loss, test_acc * 100))

Training Loss: 0.0883 and Accuracy: 96.86%
Test Loss: 0.3922 and Accuracy: 84.75%


In [170]:
# Let's try with an LSTM model.
# NOTE(review): this rebinds the name `lstm_model` from the builder function to the
# model it returns. Re-running this cell without first re-running the cell that
# defines the function calls the model object instead of the builder. Later cells
# (fit, evaluate, predict_sample) read the global `lstm_model`, so a rename has to
# be applied to all of them together.
lstm_model = lstm_model()

In [171]:
lstm_model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 100)          1000000   
                                                                 
 dropout_3 (Dropout)         (None, 100, 100)          0         
                                                                 
 lstm_2 (LSTM)               (None, 128)               117248    
                                                                 
 dense_9 (Dense)             (None, 150)               19350     
                                                                 
 dropout_4 (Dropout)         (None, 150)               0         
                                                                 
 dense_10 (Dense)            (None, 1)                 151       
                                                                 
Total params: 1,136,749
Trainable params: 1,136,749
No

In [172]:
%%time

# Fit the LSTM model for 5 epochs with a batch size of 128, holding out 20% of
# the training data for validation.
lstm_model.fit(X_train, y_train, epochs = 5, batch_size = 128, validation_split = 0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 9min 31s, sys: 33.2 s, total: 10min 4s
Wall time: 6min 24s


<keras.callbacks.History at 0x7f75e8a01b10>

In [173]:
%%time

# Compare training vs. test performance of the LSTM model.
# NOTE: this overwrites the train_loss/train_acc/test_loss/test_acc values
# computed earlier for the baseline model.
train_loss, train_acc = lstm_model.evaluate(X_train, y_train)
print('Training Loss: %.4f and Accuracy: %.2f%%' % (train_loss, train_acc * 100))

test_loss, test_acc = lstm_model.evaluate(X_test, y_test)
print('Test Loss: %.4f and Accuracy: %.2f%%' % (test_loss, test_acc * 100))

Training Loss: 0.1585 and Accuracy: 94.98%
Test Loss: 0.5459 and Accuracy: 83.09%
CPU times: user 2min 14s, sys: 11.3 s, total: 2min 25s
Wall time: 2min 11s


## **6. Use the designed model to print the prediction on any one sample.**

In [174]:
def predict_sample(sample):
    """Encode a raw review, score it with the trained LSTM and print the sentiment.

    Args:
        sample: review text. It is lower-cased and split on whitespace; no
            punctuation handling is applied.

    The review is encoded the same way as the training data: every word id from
    the IMDB word index is shifted by 3, and any word that is unknown or whose
    shifted id falls outside the `vocab_size` vocabulary is replaced by the
    out-of-vocabulary id 2. The sequence is then padded/truncated to `maxlen`
    and passed to the global `lstm_model`.

    Prints the review together with 'positive' (score > 0.5) or 'negative'.
    Returns None.
    """
    index_from = 3  # offset applied to every word id in the encoded data set
    oov_id = 2      # id the encoded data set uses for out-of-vocabulary words
    word_index = imdb.get_word_index()

    encoded_review = []
    # split() without an argument ignores repeated/leading/trailing whitespace,
    # which would otherwise produce empty "words".
    for word in sample.lower().split():
        raw_id = word_index.get(word)
        token = oov_id if raw_id is None else raw_id + index_from
        # The Embedding layer only has vocab_size rows, so rarer words must be
        # mapped to the out-of-vocabulary id instead of being passed through.
        if token >= vocab_size:
            token = oov_id
        encoded_review.append(token)

    # Use the same sequence length the model was trained with.
    review_padded = pad_sequences([encoded_review], maxlen=maxlen)
    pred = lstm_model.predict(review_padded)
    # predict() returns one row with one sigmoid output for the single review.
    score = float(pred[0][0])
    sentiment = 'positive' if score > 0.5 else 'negative'
    print("Review: {0}\n\tSentiment: {1}".format(sample, sentiment))

In [175]:
# Two hand-written reviews used to spot-check the model.
# NOTE(review): `negatvieReview` is misspelled; the name is kept because the next
# cell refers to it.
positiveReview = "i liked this wonderful movie"
negatvieReview = "i did not like this awkward movie"

In [176]:
# Print the predicted sentiment for each hand-written review.
for review in [positiveReview, negatvieReview]:
  predict_sample(review)

Review: i liked this wonderful movie
	Sentiment: positive
Review: i did not like this awkward movie
	Sentiment: negative
