In [32]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk

In [12]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/user/nltk_data...


True

In [23]:
data = pd.read_csv('data/IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
def remove_tags(string):
    #print(string)
    removelist = ""
    result = re.sub('','',string)          #remove HTML tags
    result = re.sub('https://.*','',result)   #remove URLs
    result = re.sub(r'[^w'+removelist+']', ' ',result)    #remove non-alphanumeric characters 
    result = result.lower()
    
    return result
data['review']=data['review'].apply(lambda cw : remove_tags(cw))

                      w                             w                                                                               w             w                                                           w                                                     w                           w                                   w                                         w                  w                                                                                  w                                                                       w                                                                                                                 w                                                w                                                                                                                                                                                                                               w                 w                                   w                  

KeyboardInterrupt: 

In [18]:
data['review']

0                              w                       ...
1          w                                           ...
2                       w     w         w              ...
3                                   w                  ...
4                                                      ...
                               ...                        
49995                                 w                ...
49996                                                  ...
49997                                                  ...
49998                                  w               ...
49999                                                  ...
Name: review, Length: 50000, dtype: object

In [24]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
data

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought wonderful way spend time hot summer ...,positive
3,Basically there's family little boy (Jake) thi...,negative
4,"Petter Mattei's ""Love Time Money"" visually stu...",positive
...,...,...
49995,I thought movie right good job. It creative or...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I Catholic taught parochial elementary schools...,negative
49998,I'm going disagree previous comment side Malti...,negative


In [26]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st
data['review'] = data.review.apply(lemmatize_text)
data

Unnamed: 0,review,sentiment
0,One reviewer mentioned watching 1 Oz episode h...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought wonderful way spend time hot summer ...,positive
3,Basically there's family little boy (Jake) thi...,negative
4,"Petter Mattei's ""Love Time Money"" visually stu...",positive
...,...,...
49995,I thought movie right good job. It creative or...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I Catholic taught parochial elementary school ...,negative
49998,I'm going disagree previous comment side Malti...,negative


In [28]:
s = 0.0
for i in data['review']:
    word_list = i.split()
    s = s + len(word_list)
print("Average length of each review : ",s/data.shape[0])
pos = 0
for i in range(data.shape[0]):
    if data.iloc[i]['sentiment'] == 'positive':
        pos = pos + 1
neg = data.shape[0]-pos
print("Percentage of reviews with positive sentiment is "+str(pos/data.shape[0]*100)+"%")
print("Percentage of reviews with negative sentiment is "+str(neg/data.shape[0]*100)+"%")

Average length of each review :  136.17788
Percentage of reviews with positive sentiment is 50.0%
Percentage of reviews with negative sentiment is 50.0%


In [29]:
reviews = data['review'].values
labels = data['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [30]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

In [33]:
# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

In [35]:
vocab_size

3000

In [36]:
# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

In [37]:
num_epochs = 5
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, verbose=1, 
                    validation_split=0.1)

Epoch 1/5
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 57ms/step - accuracy: 0.7365 - loss: 0.5135 - val_accuracy: 0.8560 - val_loss: 0.3281
Epoch 2/5
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 64ms/step - accuracy: 0.8842 - loss: 0.2856 - val_accuracy: 0.8840 - val_loss: 0.3059
Epoch 3/5
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 62ms/step - accuracy: 0.9079 - loss: 0.2345 - val_accuracy: 0.8925 - val_loss: 0.2759
Epoch 4/5
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 60ms/step - accuracy: 0.9199 - loss: 0.2013 - val_accuracy: 0.8672 - val_loss: 0.3151
Epoch 5/5
[1m1055/1055[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 60ms/step - accuracy: 0.9437 - loss: 0.1583 - val_accuracy: 0.8784 - val_loss: 0.3248


In [38]:
prediction = model.predict(test_padded)
# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step
Accuracy of prediction on test set :  0.87312


In [40]:
# reviews on which we need to predict
sentence = [
    "The cinematography was breathtaking and emotionally powerful",
    "This might be the worst film I've watched all year",
    "The dialogue was awkward but the visual effects were spectacular",
    "I couldn't stop smiling throughout the entire movie",
    "The story lacked depth and the characters were one-dimensional",
    "Despite the poor reviews, I found it thoroughly entertaining",
    "The acting was wooden and the plot made no sense whatsoever",
    "A beautiful film with stunning performances by the entire cast",
    "The pacing was terrible but the soundtrack was memorable",
    "This movie touched my heart and left me thinking about it for days"
]
# convert to a sequence
sequences = tokenizer.texts_to_sequences(sentence)
# pad the sequence
padded = pad_sequences(sequences, padding='post', maxlen=max_length)
# Get labels based on probability 1 if p>= 0.5 else 0
prediction = model.predict(padded)
pred_labels = []
for i in prediction:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
for i in range(len(sentence)):
    print(sentence[i])
    if pred_labels[i] == 1:
        s = 'Positive'
    else:
        s = 'Negative'
    print("Predicted sentiment : ",s)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
The cinematography was breathtaking and emotionally powerful
Predicted sentiment :  Positive
This might be the worst film I've watched all year
Predicted sentiment :  Negative
The dialogue was awkward but the visual effects were spectacular
Predicted sentiment :  Positive
I couldn't stop smiling throughout the entire movie
Predicted sentiment :  Negative
The story lacked depth and the characters were one-dimensional
Predicted sentiment :  Negative
Despite the poor reviews, I found it thoroughly entertaining
Predicted sentiment :  Negative
The acting was wooden and the plot made no sense whatsoever
Predicted sentiment :  Negative
A beautiful film with stunning performances by the entire cast
Predicted sentiment :  Positive
The pacing was terrible but the soundtrack was memorable
Predicted sentiment :  Negative
This movie touched my heart and left me thinking about it for days
Predicted sentiment :  Positive


In [42]:
# After training
model.save('sentiment_model.h5')  # Save entire model (architecture + weights + optimizer state)

# Alternatively, save just the weights
model.save_weights('sentiment_model.weights.h5')


