In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
l=LabelEncoder()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv(r'C:\Users\Lenovo\Downloads\aiml\dl\rnn\review analysis\IMDB Dataset.csv')  # Replace with your filename
df.dropna(inplace=True)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
def clean_text(text):
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    words = text.split()
    words = [w for w in words if not w in stop_words]
    return " ".join(words)
df['clean_review'] = df['review'].apply(clean_text)

In [4]:
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_review'])
sequences = tokenizer.texts_to_sequences(df['clean_review'])
padded_sequences = pad_sequences(sequences, maxlen=200, padding='post', truncating='post')

In [5]:
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
labels = df['sentiment'].values
labels = df['sentiment'].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [7]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=200))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [8]:
history = model.fit(X_train, y_train, epochs=10, batch_size=256, validation_split=0.2)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 288ms/step - accuracy: 0.5050 - loss: 0.6935 - val_accuracy: 0.5015 - val_loss: 0.6925
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 231ms/step - accuracy: 0.5174 - loss: 0.6912 - val_accuracy: 0.5196 - val_loss: 0.6907
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 230ms/step - accuracy: 0.5462 - loss: 0.6855 - val_accuracy: 0.5140 - val_loss: 0.6866
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 241ms/step - accuracy: 0.5537 - loss: 0.6611 - val_accuracy: 0.5314 - val_loss: 0.6718
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 232ms/step - accuracy: 0.5665 - loss: 0.6393 - val_accuracy: 0.5205 - val_loss: 0.6916
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 237ms/step - accuracy: 0.5628 - loss: 0.6389 - val_accuracy: 0.5420 - val_loss: 0.6922
Epoch 7/10

In [9]:
y_pred = (model.predict(X_test) > 0.5).astype('int32')                     
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 22ms/step
Accuracy: 0.6993
              precision    recall  f1-score   support

           0       0.64      0.89      0.75      4961
           1       0.82      0.51      0.63      5039

    accuracy                           0.70     10000
   macro avg       0.73      0.70      0.69     10000
weighted avg       0.73      0.70      0.69     10000



In [10]:
# Save model
model.save('lstm_sentiment_model.h5')

# Save tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)




In [11]:
# Load model & tokenizer
from tensorflow.keras.models import load_model
with open('tokenizer.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

model = load_model('lstm_sentiment_model.h5')

# Preprocessing function (same as before)
def clean_text(text):
    import re
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

# Predict function
def predict_sentiment(review):
    cleaned = clean_text(review)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=200, padding='post', truncating='post')
    pred = model.predict(padded)[0][0]
    sentiment = 'Positive' if pred >= 0.5 else 'Negative'
    return sentiment, pred

# Test it
user_input = "I loved the movie! It was fantastic and inspiring."
sentiment, confidence = predict_sentiment(user_input)
print(f"Review: {user_input}")
print(f"Predicted Sentiment: {sentiment} ({confidence:.2f})")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step
Review: I loved the movie! It was fantastic and inspiring.
Predicted Sentiment: Positive (0.73)
