In [14]:
# importing libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

Loading and preprocessing the data

In [15]:
 

# Read the IMDB reviews CSV (path is relative to the working directory)
# and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')
data.head(n=5)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [16]:
# Check the class balance of the label column.
sentiment_labels = data['sentiment']
sentiment_labels.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [17]:

# Encode the string labels as integers (positive -> 1, negative -> 0).
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell after the column is already encoded leaves it intact
# instead of turning every label into NaN (which a dict-based .map would do).
label_to_int = {'positive': 1, 'negative': 0}
data['sentiment'] = data['sentiment'].map(lambda label: label_to_int.get(label, label))

# Fail fast if the column holds anything other than the two known classes.
assert data['sentiment'].isin([0, 1]).all(), "unexpected value in the sentiment column"


In [18]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
split_frames = train_test_split(data, test_size=0.2, random_state=42)
train_data, test_data = split_frames

In [19]:
# Fit the tokenizer on the training reviews only, then convert both splits
# into integer sequences padded/truncated to a fixed length of 200.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])


def _encode_reviews(texts):
    """Convert raw review texts into padded integer sequences of length 200."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=200)


X_train = _encode_reviews(train_data["review"])
X_test = _encode_reviews(test_data["review"])

In [20]:
# Target vectors: the sentiment column of each split.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

Building the Long Short-Term Memory (LSTM) model

In [21]:
# Seed the Python, NumPy and backend random number generators so that weight
# initialisation and dropout are repeatable when the notebook is re-run (to
# the extent the backend ops themselves are deterministic). The train/test
# split is already pinned with random_state=42; this does the same for the model.
set_random_seed(42)

# Embedding -> LSTM -> single sigmoid unit for binary sentiment.
model = Sequential()
# input_dim matches the num_words=5000 passed to the Tokenizer.
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

In [22]:
# Configure training: Adam optimizer, binary cross-entropy loss (the model
# ends in a single sigmoid unit), and accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [23]:
# Train for 5 epochs with batches of 64, holding out 20% of the training
# data for validation.
# The returned History object is kept so the per-epoch metrics can be
# inspected or plotted later, and so the cell does not end by echoing the
# object's repr as its output.
history = model.fit(X_train, Y_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 546ms/step - accuracy: 0.7857 - loss: 0.4605 - val_accuracy: 0.8512 - val_loss: 0.3478
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 482ms/step - accuracy: 0.8572 - loss: 0.3430 - val_accuracy: 0.8673 - val_loss: 0.3232
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 468ms/step - accuracy: 0.8634 - loss: 0.3248 - val_accuracy: 0.8576 - val_loss: 0.3335
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 428ms/step - accuracy: 0.8802 - loss: 0.2890 - val_accuracy: 0.8589 - val_loss: 0.3381
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m224s[0m 447ms/step - accuracy: 0.8922 - loss: 0.2661 - val_accuracy: 0.8786 - val_loss: 0.3235


<keras.src.callbacks.history.History at 0x2279a140d30>

In [24]:
# Score the trained model on the held-out test split and report both metrics.
test_metrics = model.evaluate(X_test, Y_test)
loss, accuracy = test_metrics
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 50ms/step - accuracy: 0.8809 - loss: 0.3142
Test Loss: 0.3142399191856384
Test Accuracy: 0.8809000253677368


In [25]:
# building predictive system 

def predict_sentiment(review, threshold=0.5, maxlen=200):
    """Classify a single review as "positive" or "negative".

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    review : str
        Raw review text.
    threshold : float, default 0.5
        The review is labelled "positive" when the model output is strictly
        greater than this value, otherwise "negative".
    maxlen : int, default 200
        Length the token sequence is padded/truncated to. The default is the
        value used when building X_train / X_test.

    Returns
    -------
    str
        Either "positive" or "negative".
    """
    # texts_to_sequences works on a collection of texts, hence the
    # one-element list.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    # verbose=0 skips the progress bar that would otherwise be printed on
    # every single-review call.
    prediction = model.predict(padded_sequence, verbose=0)
    # One row (one review) and one column (the single sigmoid output unit).
    score = prediction[0][0]
    return "positive" if score > threshold else "negative"

In [26]:
# Smoke-test the classifier on an unambiguously positive review.
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(review=new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
The sentiment of the review is: positive


In [27]:
# Smoke-test the classifier on a mildly negative review that uses negation.
new_review = "This movie was not that good"
sentiment = predict_sentiment(review=new_review)
print(f"The sentiment of the review is: {sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
The sentiment of the review is: negative
