<a href="https://colab.research.google.com/github/devdandekar24/Sentiment-Analysis/blob/main/Sentiment_Analysis_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### LSTM: Long Short-Term Memory

In [None]:
!pip install kaggle



Importing the dependencies

In [None]:
import os
import json

from zipfile import ZipFile
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Data collection

In [None]:
# Read the Kaggle API credentials from the local kaggle.json file.
# The context manager guarantees the file handle is closed once parsing is done
# (the bare open() call previously left it open for the life of the kernel).
with open("kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dict = json.load(credentials_file)


In [None]:
# Publish the Kaggle credentials as environment variables for this process.
os.environ.update(
    KAGGLE_USERNAME=kaggle_dict["username"],
    KAGGLE_KEY=kaggle_dict["key"],
)


In [None]:
import kagglehub

# Fetch the latest version of the IMDB 50k movie-review dataset; `path` is the
# local directory that kagglehub reports for the downloaded files.
path = kagglehub.dataset_download(
    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
)

# Show the downloaded file names so the CSV name used later can be confirmed.
print(os.listdir(path))

['IMDB Dataset.csv']


In [None]:
# Locate the CSV inside the downloaded dataset directory and load it into a DataFrame.
csvfile=os.path.join(path,'IMDB Dataset.csv')
df=pd.read_csv(csvfile)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(50000, 2)

In [None]:
# Preview the first five rows: the review text and its sentiment label.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Preview the last five rows as well.
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [None]:
# Count the reviews per sentiment label to check the class balance.
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
#
# An explicit map + int64 cast is used instead of chained `replace(..., inplace=True)`
# calls: those relied on pandas silently downcasting the object column to int,
# which pandas >= 2.2 flags as deprecated. Without the downcast the labels would
# stay in an object-dtype column.
sentiment_codes = {"positive": 1, "negative": 0}

# Skip the work when the column is already numeric so re-running the cell is safe.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    encoded_sentiment = df["sentiment"].map(sentiment_codes)
    # map() yields NaN for anything outside the mapping; fail loudly rather than
    # passing unlabeled rows on to training.
    if encoded_sentiment.isna().any():
        raise ValueError(
            "Unexpected sentiment label; expected only 'positive' or 'negative'."
        )
    df["sentiment"] = encoded_sentiment.astype("int64")

  df.replace({"sentiment":{"negative":0}},inplace=True)


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state pins the shuffle.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=123,
)

In [None]:
# Confirm the sizes of the two splits.
print(f"{train_data.shape} and {test_data.shape}")

(40000, 2) and (10000, 2)


Data preprocessing

In [None]:
# Fit the tokenizer on the training reviews only (vocabulary capped by num_words),
# so nothing from the test split influences the vocabulary.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Convert each split to integer sequences and pad them to a common length of
# 200, applying the same steps to train and then test.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(split["review"]), maxlen=200)
    for split in (train_data, test_data)
)

In [None]:
# Inspect the padded training matrix (large arrays are abbreviated when printed).
# The leading zeros in some rows appear to be padding for shorter reviews.
print(x_train)

[[   8    1 3521 ...    5   11   27]
 [   0    0    0 ...   95    5   64]
 [   0    0    0 ... 2248    9  417]
 ...
 [   0    0    0 ... 2036  155  550]
 [   0    0    0 ...    4    1  105]
 [ 919  121   65 ...  230  731    9]]


In [None]:
# Inspect the padded test matrix in the same way.
print(x_test)

[[   0    0    0 ...   20   11 3689]
 [   7    3  596 ...  673  444  444]
 [ 316 1498  176 ... 1121    4    9]
 ...
 [   0    0    0 ...   82 1221    2]
 [ 294   16  106 ...  330  114   96]
 [   0    0    0 ...    3 3532   53]]


In [None]:
# Target vectors: the sentiment column of each split.
y_train, y_test = train_data["sentiment"], test_data["sentiment"]

#### Building the LSTM model

In [None]:
# Assemble the network as a single layer stack:
#   1. Embedding: maps token ids (vocabulary of 5000) to 128-dimensional vectors.
#   2. LSTM: 128 units, with dropout on both the inputs and the recurrent state.
#   3. Dense: one sigmoid unit producing a score between 0 and 1.
model = Sequential(
    [
        Embedding(input_dim=5000, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Build the model for batches of 200-token sequences before printing the
# layer-by-layer summary.
model.build(input_shape=(None, 200))  # 200 = max sequence length
model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss for the
# two-class target, and accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 5 epochs in mini-batches of 64, reserving 20% of the training rows
# for validation after each epoch.
model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 409ms/step - accuracy: 0.7238 - loss: 0.5427 - val_accuracy: 0.8056 - val_loss: 0.4264
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 422ms/step - accuracy: 0.8534 - loss: 0.3473 - val_accuracy: 0.8389 - val_loss: 0.3679
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 381ms/step - accuracy: 0.8817 - loss: 0.2904 - val_accuracy: 0.8631 - val_loss: 0.3304
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 406ms/step - accuracy: 0.9059 - loss: 0.2384 - val_accuracy: 0.8777 - val_loss: 0.3099
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 381ms/step - accuracy: 0.8947 - loss: 0.2649 - val_accuracy: 0.8665 - val_loss: 0.3325


<keras.src.callbacks.history.History at 0x7a047b3d3790>

Model Evaluation

In [None]:
# Score the trained model on the held-out test split and report both metrics.
loss,accuracy=model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 108ms/step - accuracy: 0.8705 - loss: 0.3237
Test Loss: 0.32420217990875244
Test Accuracy: 0.8695999979972839


##### Building the predictive system

In [None]:
def predict_sentiment(review: str, threshold: float = 0.5) -> str:
    """Classify a single review as "positive" or "negative".

    Uses the notebook-level ``tokenizer`` and ``model`` objects, applying the
    same tokenize-and-pad steps (maxlen=200) used for the training data.

    Args:
        review: Raw review text to classify.
        threshold: Cut-off applied to the model's output score. Scores strictly
            greater than the threshold are labelled "positive". Defaults to
            0.5, which matches the previous hard-coded behaviour.

    Returns:
        "positive" or "negative".

    Raises:
        ValueError: If ``threshold`` is outside the range [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be within [0, 1], got {threshold!r}")

    # Wrap the review in a list: the tokenizer works on a batch of texts.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)

    # One row in, one row out; the single output unit holds the score.
    prediction = model.predict(padded_sequence)
    score = float(prediction[0][0])
    return "positive" if score > threshold else "negative"

In [None]:
# Example: classify one hand-written review with the trained model and print the label.
new_review="This movie was not fantastic. I hated it."
sentiment= predict_sentiment(new_review)
print(f"The sentiment of review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
The sentiment of review is: negative
