In [82]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN
from tensorflow.keras.models import load_model

In [2]:
## ----here it will only consider those words/tokens for any sentense which appeared in top 10000 most appeared word in whole vocubolary
## ----it mean maximum distinct tokens we can have is 10000.
max_feature = 10000 
(x_train, y_train),(x_test, y_test) = imdb.load_data(num_words= max_feature)

In [56]:
dic = imdb.get_word_index()

array([[   0,    0,    0, ...,   19,  178,   32],
       [   0,    0,    0, ...,   16,  145,   95],
       [   0,    0,    0, ...,    7,  129,  113],
       ...,
       [   0,    0,    0, ...,    4, 3586,    2],
       [   0,    0,    0, ...,   12,    9,   23],
       [   0,    0,    0, ...,  204,  131,    9]])

In [5]:
from tensorflow.keras.preprocessing import sequence

max_len = 500 ## here it will help us to filterour those sample reviews where feature is less than 500

x_train = sequence.pad_sequences(x_train, maxlen= max_len, padding= 'pre')
x_test = sequence.pad_sequences(x_test, maxlen= max_len, padding= 'pre')
x_train


array([[   0,    0,    0, ...,   19,  178,   32],
       [   0,    0,    0, ...,   16,  145,   95],
       [   0,    0,    0, ...,    7,  129,  113],
       ...,
       [   0,    0,    0, ...,    4, 3586,    2],
       [   0,    0,    0, ...,   12,    9,   23],
       [   0,    0,    0, ...,  204,  131,    9]])

In [6]:
model = Sequential()
## input_dim: here we define the numbers of all tokens we have
## input_legth:  here we define input length of each review

model.add(Embedding(input_dim=max_feature, output_dim= 128, input_length= max_len))
model.add(SimpleRNN(128, activation= 'relu'))
model.add(Dense(1, activation= 'sigmoid'))

model.compile(optimizer= 'adam', loss= 'binary_crossentropy', metrics= 'mae')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 128)          1280000   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               32896     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,313,025
Trainable params: 1,313,025
Non-trainable params: 0
_________________________________________________________________


In [8]:
from tensorflow.keras.callbacks import EarlyStopping
earlystopping = EarlyStopping(monitor= 'val_loss', patience= 5, restore_best_weights= True)
history = model.fit(x_train, y_train, epochs= 1, verbose= 1, batch_size= 10, callbacks=[earlystopping] , validation_split= 0.2)



In [80]:
def preprocessing(sample):
    example_review =sample
    l_word = example_review.lower().split()
    val_word = [dic.get(word, 2)+ 3 for word in l_word]
    padded_seq = sequence.pad_sequences([val_word], maxlen= 500)
    return padded_seq

def prediction(seq):
    result= model.predict(seq)[0][0]

    if result > 0.5:
        print("Review is positive with {:.2f}".format(result*100),"%")
    else:
        print("Review is negative with {:.2f}".format(result*100),"%")

    


In [81]:
example_review = "This movie was fantastic! The acting was great and the plot was thrilling."

padded_seq = preprocessing(example_review)
prediction(padded_seq)

Review is negative with 35.98 %


In [84]:
model.save('Embedding_RNN.h5')