In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
data = pd.read_csv("/content/IMDB Dataset.csv")

In [17]:
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [4]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower().split()
    return " ".join([word for word in text if word not in stop_words])

data['review'] = data['review'].apply(clean_text)
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})

In [5]:
vocab_size = 15000
max_len = 250

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['review'])
X = tokenizer.texts_to_sequences(data['review'])
X = pad_sequences(X, maxlen=max_len)

y = data['sentiment'].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len))
model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [9]:
history = model.fit(X_train, y_train, epochs=1, batch_size=64, validation_split=0.2)


[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m345s[0m 689ms/step - accuracy: 0.8411 - loss: 0.3812 - val_accuracy: 0.8731 - val_loss: 0.3075


In [10]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 89ms/step

Test Accuracy: 0.8734

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.86      0.87      4961
           1       0.86      0.89      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [18]:
def predict_review(review_text):
    cleaned = clean_text(review_text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)
    prediction = model.predict(padded)[0][0]
    sentiment = "Positive" if prediction >= 0.5 else "Negative"  # <-- ✅ CORRECT LOGIC
    print(f"\nReview Sentiment: {sentiment} (Confidence: {prediction:.2f})")

In [23]:
user_input = input("\nEnter your movie review: ")
predict_review(user_input)


Enter your movie review: No logic to the story... Bad bad storyline. Bad acting. The mood is dull and boring. Really dont do this again...i sincerly hope the writer/director can find other work, and a hint dont put this on your CV." 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step

Review Sentiment: Negative (Confidence: 0.02)
