In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
def preprocess_text(text):
    text = re.sub(r'<[^>]+>', '', text)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    processed_text = ' '.join(tokens)
    return processed_text
df['processed_review'] = df['review'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [3]:
df.drop('review', axis=1, inplace=True)

In [4]:
df[df.duplicated]

Unnamed: 0,sentiment,processed_review
3537,negative,quite producers appalling adaptation trying im...
3769,positive,favourite police series time turns tvfilm work...
4391,positive,beautiful film pure cassavetes style gena rowl...
6352,negative,liked grinch movie go watch near good seussian...
6479,negative,want much believe quote specifically english s...
...,...,...
49912,positive,incredible piece drama powerful hits found fil...
49950,negative,brief episode appeared one night gallery show ...
49984,negative,hello derrick cannon welcome first ever cannon...
49986,negative,movie disgrace major league franchise live min...


In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df

Unnamed: 0,sentiment,processed_review
0,positive,one reviewers mentioned watching 1 oz episode ...
1,positive,wonderful little production filming technique ...
2,positive,thought wonderful way spend time hot summer we...
3,negative,basically theres family little boy jake thinks...
4,positive,petter matteis love time money visually stunni...
...,...,...
49995,positive,thought movie right good job wasnt creative or...
49996,negative,bad plot bad dialogue bad acting idiotic direc...
49997,negative,catholic taught parochial elementary schools n...
49998,negative,im going disagree previous comment side maltin...


In [7]:
df.isnull().sum()

Unnamed: 0,0
sentiment,0
processed_review,0


In [8]:
df[df['sentiment'] == 'positive'].shape

(24883, 2)

In [9]:
df[df['sentiment'] == 'negative'].shape

(24695, 2)

In [10]:
X = df.drop('sentiment', axis=1)
y = df['sentiment']

In [11]:
X

Unnamed: 0,processed_review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...
...,...
49995,thought movie right good job wasnt creative or...
49996,bad plot bad dialogue bad acting idiotic direc...
49997,catholic taught parochial elementary schools n...
49998,im going disagree previous comment side maltin...


In [12]:
y

Unnamed: 0,sentiment
0,positive
1,positive
2,positive
3,negative
4,positive
...,...
49995,positive
49996,negative
49997,negative
49998,negative


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [15]:
X_train = X_train['processed_review'].tolist()
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)
X_train_sequences = word_tokenizer.texts_to_sequences(X_train)
X_test_sequences = word_tokenizer.texts_to_sequences(X_test)
max_length = max([len(seq) for seq in X_train_sequences])
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')
vocab_length = len(word_tokenizer.word_index) + 1
print(f"Vocabulary Length: {vocab_length}")
print(f"X_train_padded Shape: {X_train_padded.shape}")
print(f"X_test_padded Shape: {X_test_padded.shape}")

Vocabulary Length: 191579
X_train_padded Shape: (39662, 1429)
X_test_padded Shape: (1, 1429)


In [16]:
X_train_padded, X_test_padded, y_train, y_test = train_test_split(X_train_padded, y_train, test_size=0.2)

In [17]:
print(f"X_train_padded Shape: {X_train_padded.shape}")
print(f"y_train Shape: {y_train.shape}")
print(f"X_test_padded Shape: {X_test_padded.shape}")
print(f"y_test Shape: {y_test.shape}")

X_train_padded Shape: (31729, 1429)
y_train Shape: (31729,)
X_test_padded Shape: (7933, 1429)
y_test Shape: (7933,)


Trying LSTM



In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Flatten, Dropout
model = Sequential([
    Embedding(input_dim=vocab_length, output_dim=100),
    LSTM(2, return_sequences=True),
    Dropout(0.9),
    Flatten(),
    Dropout(0.9),
    Dense(1, activation='sigmoid')
])

In [19]:
model.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

In [20]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=1, verbose=1, factor=0.5, min_lr=0.00001)

In [21]:
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=16, validation_data=(X_test_padded, y_test),
                    callbacks=[early_stopping, lr_reduction])

Epoch 1/10
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1979s[0m 995ms/step - accuracy: 0.5423 - loss: 0.6790 - val_accuracy: 0.8389 - val_loss: 0.4100 - learning_rate: 0.0010
Epoch 2/10
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1983s[0m 985ms/step - accuracy: 0.8364 - loss: 0.3854 - val_accuracy: 0.8761 - val_loss: 0.3011 - learning_rate: 0.0010
Epoch 3/10
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 952ms/step - accuracy: 0.9565 - loss: 0.1262
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2018s[0m 994ms/step - accuracy: 0.9565 - loss: 0.1262 - val_accuracy: 0.8881 - val_loss: 0.3414 - learning_rate: 0.0010
Epoch 4/10
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 986ms/step - accuracy: 0.9880 - loss: 0.0415
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
[1m1984/1984[0m [32m━━━━━━━━━

In [22]:
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Loss: {test_loss}")

[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 117ms/step - accuracy: 0.8790 - loss: 0.2907
Test Accuracy: 0.8760872483253479
Test Loss: 0.3010702133178711


In [23]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_padded)
y_pred = (y_pred > 0.5).astype(int).flatten()
print(classification_report(y_test, y_pred))

[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 116ms/step
              precision    recall  f1-score   support

           0       0.95      0.80      0.87      4012
           1       0.82      0.96      0.88      3921

    accuracy                           0.88      7933
   macro avg       0.89      0.88      0.88      7933
weighted avg       0.89      0.88      0.88      7933



Trying DNN

In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Dropout
model = Sequential([
    Embedding(input_dim=vocab_length, output_dim=100),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Flatten(),
    Dense(1, activation='sigmoid')
])

In [43]:
model.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

In [45]:
model.summary()

In [26]:
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=16, validation_data=(X_test_padded, y_test),
                    callbacks=[early_stopping, lr_reduction])

Epoch 1/10
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m704s[0m 353ms/step - accuracy: 0.7078 - loss: 0.5166 - val_accuracy: 0.8848 - val_loss: 0.2838 - learning_rate: 0.0010
Epoch 2/10
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 339ms/step - accuracy: 0.9434 - loss: 0.1523
Epoch 2: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m719s[0m 342ms/step - accuracy: 0.9434 - loss: 0.1523 - val_accuracy: 0.8831 - val_loss: 0.3214 - learning_rate: 0.0010
Epoch 3/10
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 331ms/step - accuracy: 0.9762 - loss: 0.0718
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
[1m1984/1984[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m665s[0m 333ms/step - accuracy: 0.9762 - loss: 0.0718 - val_accuracy: 0.8859 - val_loss: 0.3604 - learning_rate: 5.0000e-04
Epoch 4/10
[1m1984/1984[0m [32m━━━━━━━━

In [27]:
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Loss: {test_loss}")

[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.8856 - loss: 0.2835
Test Accuracy: 0.8847850561141968
Test Loss: 0.28376343846321106


In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.80      0.87      4012
           1       0.82      0.96      0.88      3921

    accuracy                           0.88      7933
   macro avg       0.89      0.88      0.88      7933
weighted avg       0.89      0.88      0.88      7933



Trying CNN

In [53]:
word_tokenizer = Tokenizer(num_words=10000)
word_tokenizer.fit_on_texts(X_train)

X_train_sequences = word_tokenizer.texts_to_sequences(X_train)
X_test_sequences = word_tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')

vocab_length = 10000


In [55]:
word_tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
word_tokenizer.fit_on_texts(X_train)

X_train_sequences = word_tokenizer.texts_to_sequences(X_train)
X_test_sequences = word_tokenizer.texts_to_sequences(X_test)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post')

vocab_length = min(10000, len(word_tokenizer.word_index) + 1)

In [59]:
min_len = min(len(X_train_padded), len(y_train))

X_train_padded = X_train_padded[:min_len]
y_train = y_train[:min_len]


In [61]:
from tensorflow.keras import layers, models

max_length = X_train_padded.shape[1]
model = models.Sequential([
    layers.Embedding(input_dim=vocab_length, output_dim=100, input_length=max_length),
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),
    layers.MaxPooling1D(pool_size=2),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [62]:
history = model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_data=(X_test_padded, y_test))

Epoch 1/5
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 668ms/step - accuracy: 0.5054 - loss: 0.6947 - val_accuracy: 0.5020 - val_loss: 0.6931
Epoch 2/5
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m308s[0m 632ms/step - accuracy: 0.5705 - loss: 0.6837 - val_accuracy: 0.5016 - val_loss: 0.7006
Epoch 3/5
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 636ms/step - accuracy: 0.7786 - loss: 0.4759 - val_accuracy: 0.4987 - val_loss: 0.8736
Epoch 4/5
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 657ms/step - accuracy: 0.9034 - loss: 0.1952 - val_accuracy: 0.5022 - val_loss: 1.1677
Epoch 5/5
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 654ms/step - accuracy: 0.9283 - loss: 0.1225 - val_accuracy: 0.4964 - val_loss: 1.6216


In [63]:
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test)
print(f"Test Accuracy: {test_accuracy}")
print(f"Test Loss: {test_loss}")

[1m199/199[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.4884 - loss: 1.6497
Test Accuracy: 0.4963756799697876
Test Loss: 1.6215596199035645


The CNN has the worst performance of all 3 models