<a href="https://colab.research.google.com/github/gousesada/Movie-Review-Sentiment-Analysis-Using-LSTM-Networks/blob/main/GenAIsentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Read the IMDB reviews CSV: the 'review' column holds the raw text and
# the 'sentiment' column holds the label for each review.
data = pd.read_csv('/content/IMDB Dataset.csv')
texts, labels = (data[column].values for column in ('review', 'sentiment'))

In [18]:
# Stratified train/test split of the raw reviews. No test_size is given, so
# scikit-learn's default split is used. random_state is pinned so that a
# Restart & Run All reproduces exactly the same partition.
X, y = data['review'].values, data['sentiment'].values
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)
print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')

shape of train data is (37500,)
shape of test data is (12500,)


In [21]:
# Download the NLTK 'stopwords' corpus into the local nltk_data directory.
# NLTK reports the package as up-to-date when it is already installed.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
# Example function to tokenize the dataset
def tokenize(x_train, y_train, x_test, y_test, stop_words=None):
    """Split reviews into lowercase word tokens and drop stop words.

    Args:
        x_train: Iterable of raw training review strings.
        y_train: Training labels; passed through unchanged.
        x_test: Iterable of raw test review strings.
        y_test: Test labels; passed through unchanged.
        stop_words: Optional collection of lowercase words to discard.
            When None, NLTK's English stop-word list is loaded, which
            requires nltk.download('stopwords') to have been run.

    Returns:
        A tuple (train_tokens, y_train, test_tokens, y_test) where each
        *_tokens value is a list holding one list of token strings per
        input review, in the original order.
    """
    import re

    if stop_words is None:
        # Imported lazily so callers that supply their own stop words do
        # not need NLTK (or its downloaded corpus) to be available.
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # A token is a run of lowercase letters, digits or apostrophes, so
    # contractions such as "didn't" survive as a single token.
    word_pattern = re.compile(r"[a-z0-9']+")

    def _clean(review):
        """Return the non-stop-word tokens of one review."""
        words = word_pattern.findall(str(review).lower())
        return [word for word in words if word not in stop_words]

    train_tokens = [_clean(review) for review in x_train]
    test_tokens = [_clean(review) for review in x_test]
    return train_tokens, y_train, test_tokens, y_test

In [4]:
# Encode labels
# Fit on the raw 'sentiment' column rather than on `labels` itself, so that
# re-running this cell does not re-fit the encoder on already-encoded
# integers (which would replace the original class names in le.classes_).
# LabelEncoder assigns integer codes in sorted class order; inspect
# le.classes_ to confirm which sentiment maps to 0 and which to 1.
le = LabelEncoder()
labels = le.fit_transform(data['sentiment'].values)
le, labels

(LabelEncoder(), array([1, 1, 1, ..., 0, 0, 0]))

In [5]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Convert the raw reviews into fixed-length integer sequences.
max_words = 10000  # vocabulary cap handed to the Tokenizer (num_words)
max_len = 100      # every review is padded/truncated to this many tokens

# The vocabulary is learned from the training reviews only, so no
# information from the test set leaks into the word index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Map each split to word-index sequences, then pad/truncate to max_len.
# Both splits are fully converted before the names are rebound.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=max_len)
    for split in (X_train, X_test)
)

In [7]:
# Build the model: embedding -> two stacked LSTMs -> dense head with a
# single sigmoid unit for binary sentiment classification.
model = Sequential([
    # `input_length` is deprecated on Keras 3's Embedding layer, so it is
    # omitted here; the sequence length is inferred from the data passed
    # to fit().
    Embedding(max_words, 128),
    # return_sequences=True hands the full per-timestep output to the
    # second LSTM instead of only the final state.
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the single sigmoid output unit.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [9]:
# Stop training once validation loss has gone 3 epochs without improving,
# and roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
early_stopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x79869bed26e0>

In [10]:
# Fit for at most 10 epochs, holding out 20% of the training data for
# validation; early stopping may end the run sooner.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 152ms/step - accuracy: 0.7334 - loss: 0.5121 - val_accuracy: 0.8614 - val_loss: 0.3229
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 143ms/step - accuracy: 0.8985 - loss: 0.2620 - val_accuracy: 0.8724 - val_loss: 0.3077
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 137ms/step - accuracy: 0.9352 - loss: 0.1767 - val_accuracy: 0.8604 - val_loss: 0.3636
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 134ms/step - accuracy: 0.9610 - loss: 0.1150 - val_accuracy: 0.8614 - val_loss: 0.4572
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 136ms/step - accuracy: 0.9733 - loss: 0.0844 - val_accuracy: 0.8484 - val_loss: 0.4858


In [11]:
# Score the trained model on the held-out test split and report both
# metrics to four decimal places.
loss, accuracy = model.evaluate(x=X_test, y=y_test)
for metric_name, metric_value in (("Loss", loss), ("Accuracy", accuracy)):
    print(f"Test {metric_name}: {metric_value:.4f}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 39ms/step - accuracy: 0.8727 - loss: 0.3029
Test Loss: 0.3049
Test Accuracy: 0.8710


In [34]:
# Function to predict sentiment
def predict_sentiment(text, threshold=0.7):
    """Classify a single review string as "Positive" or "Negative".

    The text is converted with the notebook's fitted `tokenizer`, padded to
    `max_len`, and scored by `model`. The raw score is printed before the
    label is returned.

    Args:
        text: The review text to classify.
        threshold: Score strictly above which the review is labelled
            "Positive". Defaults to 0.7, which is stricter than the 0.5
            cut-off conventionally used for a sigmoid output; pass
            threshold=0.5 for the conventional decision rule.

    Returns:
        "Positive" if the model score exceeds `threshold`, else "Negative".

    Raises:
        ValueError: If `threshold` lies outside the interval [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be between 0 and 1, got {threshold!r}")

    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)
    # predict() returns one row per input with one score per row; take the
    # single score for the single input.
    prediction = model.predict(padded)[0][0]
    print(prediction)
    return "Positive" if prediction > threshold else "Negative"


In [35]:
# Example usage: classify one short review and report the label.
example_text = "i'm unhappy!"
predicted_label = predict_sentiment(example_text)
print(f"Sentiment: {predicted_label}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
0.56337774
Sentiment: Negative
