In [34]:
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Bidirectional, Dense, Embedding
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
from termcolor import colored
from collections import Counter
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#import nltk
#nltk.download('popular')


In [17]:
df_reviews=pd.read_csv('IMDB Dataset.csv')
print('original data')
print(df_reviews.head(3))
# Preprocessing function
from bs4 import BeautifulSoup
import string

def preprocess_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text


# Apply preprocessing to the 'review' column
df_reviews['review'] = df_reviews['review'].apply(preprocess_text)
print ('preprocessed data')
# Display the preprocessed DataFrame
print(df_reviews.head(3))

original data
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive


  text = BeautifulSoup(text, "html.parser").get_text()


preprocessed data
                                              review sentiment
0  one reviewer mentioned watching 1 oz episode y...  positive
1  wonderful little production filming technique ...  positive
2  thought wonderful way spend time hot summer we...  positive


In [18]:
review=df_reviews.iloc[0,0]
print('review: ' ,colored(review,'green'))
sentiment=df_reviews.iloc[0,1]
print('sentiment: ',colored(sentiment,'red'))

review:  [32mone reviewer mentioned watching 1 oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far awayi would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate tu

In [47]:
total_reviews=df_reviews['review']
total_sentiments=df_reviews['sentiment']
vocab_size=5000
tokenizer = Tokenizer(num_words=vocab_size )
tokenizer.fit_on_texts(total_reviews)
tokenizer.word_index

{'movie': 1,
 'film': 2,
 'one': 3,
 'like': 4,
 'time': 5,
 'good': 6,
 'character': 7,
 'get': 8,
 'even': 9,
 'story': 10,
 'would': 11,
 'make': 12,
 'see': 13,
 'really': 14,
 'scene': 15,
 'much': 16,
 'well': 17,
 'people': 18,
 'great': 19,
 'bad': 20,
 'also': 21,
 'show': 22,
 'first': 23,
 'dont': 24,
 'way': 25,
 'thing': 26,
 'made': 27,
 'could': 28,
 'think': 29,
 'life': 30,
 'go': 31,
 'know': 32,
 'watch': 33,
 'love': 34,
 'many': 35,
 'seen': 36,
 'actor': 37,
 'two': 38,
 'plot': 39,
 'say': 40,
 'never': 41,
 'look': 42,
 'acting': 43,
 'end': 44,
 'little': 45,
 'best': 46,
 'year': 47,
 'ever': 48,
 'better': 49,
 'take': 50,
 'man': 51,
 'come': 52,
 'still': 53,
 'work': 54,
 'find': 55,
 'part': 56,
 'want': 57,
 'something': 58,
 'give': 59,
 'lot': 60,
 'back': 61,
 'director': 62,
 'real': 63,
 'im': 64,
 'guy': 65,
 'watching': 66,
 'doesnt': 67,
 'performance': 68,
 'didnt': 69,
 'play': 70,
 'woman': 71,
 'actually': 72,
 'though': 73,
 'funny': 74,
 'a

In [20]:
tokenizer.word_counts

OrderedDict([('one', 52648),
             ('reviewer', 942),
             ('mentioned', 1029),
             ('watching', 8925),
             ('1', 2311),
             ('oz', 254),
             ('episode', 4760),
             ('youll', 2602),
             ('hooked', 274),
             ('right', 6558),
             ('exactly', 1951),
             ('happened', 1994),
             ('methe', 86),
             ('first', 16841),
             ('thing', 16080),
             ('struck', 266),
             ('brutality', 136),
             ('unflinching', 31),
             ('scene', 20696),
             ('violence', 1990),
             ('set', 5996),
             ('word', 3537),
             ('go', 14280),
             ('trust', 590),
             ('show', 16863),
             ('faint', 98),
             ('hearted', 127),
             ('timid', 46),
             ('pull', 1086),
             ('punch', 431),
             ('regard', 425),
             ('drug', 1559),
             ('sex', 3218),
      

In [21]:
# # Limit the vocabulary size to the top 20,000 words
# vocab_size = 20000
# tokenizer.word_index = {k: v for k, v in tokenizer.word_index.items() if v <= vocab_size}
# tokenizer.word_index[tokenizer.oov_token] = vocab_size + 1
# tokenizer.num_words = vocab_size + 2

# # Get the vocabulary size
# vocabulary_size = len(tokenizer.word_index)
# print("Vocabulary size:", vocabulary_size)
tokenizer.document_count

50000

In [22]:
sequences = tokenizer.texts_to_sequences(total_reviews)
sequences

[[3,
  1021,
  944,
  66,
  414,
  3109,
  174,
  367,
  2918,
  105,
  499,
  484,
  23,
  26,
  2984,
  3109,
  4986,
  15,
  486,
  128,
  105,
  251,
  31,
  1573,
  22,
  22,
  905,
  2034,
  2051,
  638,
  281,
  486,
  3148,
  227,
  252,
  371,
  3109,
  255,
  2356,
  589,
  754,
  1247,
  399,
  4316,
  1959,
  1041,
  1960,
  1823,
  817,
  242,
  230,
  4127,
  3411,
  399,
  244,
  3808,
  1186,
  866,
  2273,
  215,
  3829,
  1654,
  41,
  136,
  11,
  40,
  185,
  1061,
  22,
  550,
  88,
  31,
  22,
  453,
  2422,
  701,
  94,
  249,
  3932,
  2304,
  163,
  701,
  1139,
  701,
  67,
  831,
  97,
  23,
  174,
  48,
  116,
  2984,
  1448,
  2029,
  306,
  40,
  1422,
  190,
  1337,
  970,
  3109,
  91,
  230,
  440,
  1256,
  486,
  486,
  1935,
  2713,
  4894,
  269,
  454,
  8,
  150,
  17,
  640,
  610,
  4894,
  545,
  1041,
  550,
  334,
  572,
  1171,
  1041,
  385,
  66,
  3109,
  104,
  317,
  3456,
  3005,
  8,
  784,
  3680,
  350],
 [291,
  45,
  239,
  1234,


In [23]:
len(sequences)

50000

In [48]:
from keras.utils import pad_sequences
# Get the minimum and the maximum length of reviews
print("Max length of a review:: ", len(max((sequences), key=len)))
print("Min length of a review:: ", len(min((sequences), key=len)))
# Keeping a fixed length of all reviews to max words
#max_words = len(max((sequences), key=len))
#sequences = pad_sequences(sequences,padding='post',maxlen=max_words)
max_length=200
sequences = pad_sequences(sequences,maxlen=max_length)
sequences.shape

Max length of a review::  200
Min length of a review::  200


(50000, 200)

In [25]:
sequences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    3, 1021,
        944,   66,  414, 3109,  174,  367, 2918,  105,  499,  484,   23,
         26, 2984, 3109, 4986,   15,  486,  128,  105,  251,   31, 1573,
         22,   22,  905, 2034, 2051,  638,  281,  486, 3148,  227,  252,
        371, 3109,  255, 2356,  589,  754, 1247,  399, 4316, 1959, 1041,
       1960, 1823,  817,  242,  230, 4127, 3411,  399,  244, 3808, 1186,
        866, 2273,  215, 3829, 1654,   41,  136,   11,   40,  185, 1061,
         22,  550,   88,   31,   22,  453, 2422,  701,   94,  249, 3932,
       2304,  163,  701, 1139,  701,   67,  831,   

In [26]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,SimpleRNN,Embedding,Flatten

In [54]:
model = Sequential()

model.add(SimpleRNN(500,input_shape=(max_length,1),return_sequences=False))
model.add(Dense(1,activation='sigmoid'))

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_4 (SimpleRNN)    (None, 500)               251000    
                                                                 
 dense_9 (Dense)             (None, 1)                 501       
                                                                 
Total params: 251,501
Trainable params: 251,501
Non-trainable params: 0
_________________________________________________________________


In [28]:
from sklearn.preprocessing import LabelBinarizer
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(total_sentiments)
print(sentiment_data.shape)
print(sentiment_data)

(50000, 1)
[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


In [29]:
X_train=sequences[800:40000]
y_train=sentiment_data[800:40000]
X_valid=sequences[:800]
y_valid=sentiment_data[:800]
X_test=sequences[40000:]
y_test=sentiment_data[40000:]
print(X_test[0])
print(y_test[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0   23   57   40 3293  862 2208
  156    1 2272 1155   33  125    2    1  823  368  118  161  353  118
 1230  206  108  209  414    2  339   52   43  388    7 4903   56  216
  494   10    6  321   27  279   20   65   13  191  920   58  604 3174
    7    1 3426  197 1083  122  441   11   52 1732  287 2272    1  609
 1969 

In [33]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.fit(X_train,y_train,batch_size=64,epochs=5,validation_data=(X_valid,y_valid))
# Printing model score on test data
print()
print("RNN Score on test dataset---> ", model.evaluate(X_test, y_test, verbose=0))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

RNN Score on test dataset--->  [0.6941046118736267, 0.4975999891757965]


In [35]:
#Revised model with added embeddings layer
model2 = Sequential()
# fixing embedding size
embd_len = 128
 
# Creating a RNN model
model2.add(Embedding(vocab_size+1,embd_len,input_length=max_length))
model2.add(SimpleRNN(128,return_sequences=False))
model2.add(Dense(1,activation='sigmoid'))

model2.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 128)          640128    
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 673,153
Trainable params: 673,153
Non-trainable params: 0
_________________________________________________________________


In [53]:
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model2.fit(X_train,y_train,batch_size=64,epochs=5,validation_data=(X_valid,y_valid))
# Printing model score on test data
print()
print("RNN Score on test dataset---> ", model2.evaluate(X_test, y_test, verbose=0))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

RNN Score on test dataset--->  [0.42620375752449036, 0.8507999777793884]


In [52]:
#GRU model
model3 = Sequential()
# fixing embedding size
embd_len = 128
 
# Creating a RNN model
model3.add(Embedding(vocab_size+1,embd_len,input_length=max_length))
model3.add(GRU(embd_len,return_sequences=False))
model3.add(Dense(1,activation='sigmoid'))

model3.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 200, 128)          640128    
                                                                 
 gru_1 (GRU)                 (None, 128)               99072     
                                                                 
 dense_8 (Dense)             (None, 1)                 129       
                                                                 
Total params: 739,329
Trainable params: 739,329
Non-trainable params: 0
_________________________________________________________________


In [38]:
model3.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model3.fit(X_train,y_train,batch_size=64,epochs=5,validation_data=(X_valid,y_valid))
# Printing model score on test data
print()
print("RNN Score on test dataset---> ", model3.evaluate(X_test, y_test, verbose=0))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

RNN Score on test dataset--->  [0.3541611135005951, 0.8574000000953674]


In [51]:
model4=Sequential()
model4.add(Embedding(input_dim=vocab_size,output_dim=embd_len,input_length=max_length))
model4.add(LSTM(embd_len))
model4.add(Dense(1,activation='sigmoid'))

model4.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 200, 128)          640000    
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dense_7 (Dense)             (None, 1)                 129       
                                                                 
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


In [50]:
model4.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model4.fit(X_train,y_train,batch_size=64,epochs=5,validation_data=(X_valid,y_valid))
# Printing model score on test data
print()
print("RNN Score on test dataset---> ", model4.evaluate(X_test, y_test, verbose=0))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

RNN Score on test dataset--->  [0.3544605076313019, 0.867900013923645]


In [41]:
def sentiment_predictor(review):
    sequence=tokenizer.texts_to_sequences([review])
    padded_sequence=pad_sequences(sequence,maxlen=200)
    predict_result=model4.predict(padded_sequence)
    predict_sentiment="Positive" if predict_result[0][0] > 0.5 else "Negative"
    return predict_sentiment


    

In [42]:
new_review="Excellent Movie. Recommended to watch"
sentiment_new=sentiment_predictor(new_review)
print(f"The Movie review is :  {sentiment_new}")

The Movie review is :  Positive


In [43]:
new_review="Worst Movie. Never watch it"
sentiment_new=sentiment_predictor(new_review)
print(f"The Movie review is :  {sentiment_new}")

The Movie review is :  Negative


In [44]:
new_review="Overall Movie was good. However performance of some actors needs improvement"
sentiment_new=sentiment_predictor(new_review)
print(f"The Movie review is :  {sentiment_new}")

The Movie review is :  Negative


In [45]:
new_review="Movie was below average"
sentiment_new=sentiment_predictor(new_review)
print(f"The Movie review is :  {sentiment_new}")

The Movie review is :  Negative


In [46]:
new_review="Very good movie and acting"
sentiment_new=sentiment_predictor(new_review)
print(f"The Movie review is :  {sentiment_new}")

The Movie review is :  Positive
