SENTIMENT ANALYSIS FOR IMDB MOVIE REVIEW

IMPORT THE NECESSARY LIBRARIES

In [2]:
## Create custom LSTM model for sentiment analysis of IMDB movie reviews from Kaggle using Pytorch
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset # for loading datasets from Hugging Face 
import numpy as np
import re
import string
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print("âœ… All libraries imported successfully!")
print("ðŸ“Š Pandas version:", pd.__version__)

âœ… All libraries imported successfully!
ðŸ“Š Pandas version: 2.3.3


In [3]:
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

LOAD THE DATASET FROM KAGGLE

In [4]:
ds = load_dataset("stanfordnlp/imdb")
df = pd.DataFrame(ds['train'])
print("Dataset loaded successfully! Shape:", df.shape)
print(df.head())

Dataset loaded successfully! Shape: (25000, 2)
                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0


In [5]:
## preprocessing dataset
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove expanded contractions
    contraction_mapping = {"can't": "cannot", 
                           "won't": "will not", 
                           "n't": " not", 
                           "'re": " are", 
                           "'s": " is", 
                           "'d": " would", 
                           "'ll": " will"}
    for contraction, expansion in contraction_mapping.items():
        text = re.sub(r"\b{}\b".format(contraction), expansion, text)
    return text

print("âœ… Preprocessing function defined successfully!")

âœ… Preprocessing function defined successfully!


In [6]:
## Tokenization and Vocabulary Building

MAX_VOCAB = 20000
MIN_FREQ = 2
MAXLEN = 256
EMBED_DIM = 128
HIDDEN_DIM = 128
OUT_DIM = 1
NUM_LAYERS = 1
BATCH_SIZE = 32
EPOCHS = 10
DROP_OUT = 0.5
PAD_IDX = 1
UNK_IDX = 1

from collections import Counter

def tokenize(text):
    return text.split()

def build_vocab(sentences, min_freq=2):
    token_counter = Counter()
    for sentence in sentences:
        tokens = sentence.split()
        token_counter.update(tokens)
    vocab = {token: idx + 2 for idx, (token, freq) in enumerate(token_counter.items()) if freq >= min_freq}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

def encode_sentence(sentence, vocab, maxlen=200):
    tokens = sentence.split()
    encoded = [vocab.get(token, UNK_IDX) for token in tokens]
    if len(encoded) < maxlen:
        encoded += [PAD_IDX] * (maxlen - len(encoded))
    else:
        encoded = encoded[:maxlen]
    return encoded
print("âœ… Tokenization and vocabulary building functions defined successfully!")    

âœ… Tokenization and vocabulary building functions defined successfully!


In [7]:
df['review'] = df['text'].apply(preprocess_text) ## preprocess reviews
vocab = build_vocab(df['review'], min_freq=MIN_FREQ) ## build vocabulary    
print("âœ… Vocabulary built successfully! Vocabulary size:", len(vocab))

df['encoded_review'] = df['review'].apply(lambda x: encode_sentence(x, vocab, maxlen=MAXLEN)) ## encode reviews
print("âœ… Reviews encoded successfully!")

âœ… Vocabulary built successfully! Vocabulary size: 56175
âœ… Reviews encoded successfully!


In [8]:
## Split data into train, validation, and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42, stratify=train_data['label'])
print("âœ… Data split into train, validation, and test sets successfully!")
print("Train size:", len(train_data), "Validation size:", len(val_data), "Test size:", len(test_data))  

âœ… Data split into train, validation, and test sets successfully!
Train size: 18000 Validation size: 2000 Test size: 5000


In [9]:
class IMDBDataset(Dataset): ## custom dataset class
    def __init__(self, data): ## initialize with data
        self.reviews = data['encoded_review'].tolist() ## list of encoded reviews
        self.labels = data['label'].tolist() ## list of labels
    
    def __len__(self): ## length of dataset
        return len(self.reviews) ## return length of reviews
    
    def __getitem__(self, idx): ## get item by index
        return torch.tensor(self.reviews[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.float) ## return review and label as tensors
print("âœ… Custom Dataset class defined successfully!")
 

âœ… Custom Dataset class defined successfully!


In [10]:
## Create DataLoaders
train_dataset = IMDBDataset(train_data)
val_dataset = IMDBDataset(val_data)
test_dataset = IMDBDataset(test_data)   
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
print("âœ… DataLoaders created successfully!")

âœ… DataLoaders created successfully!


In [11]:
## Define LSTM Model
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, out_dim, num_layers, drop_out):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PAD_IDX)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=drop_out)
        self.fc = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(drop_out)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1])
        out = self.fc(hidden)
        return self.sigmoid(out).squeeze()
print("âœ… LSTM model class defined successfully!")

âœ… LSTM model class defined successfully!


In [12]:
vocab_size = len(vocab) ## vocabulary size
model = SentimentLSTM(vocab_size, EMBED_DIM, HIDDEN_DIM, OUT_DIM, NUM_LAYERS, DROP_OUT) ## instantiate model
model = model.to(device) ## move model to device
print("âœ… Model instantiated and moved to device:", device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
print("âœ… Loss function and optimizer defined successfully!")

âœ… Model instantiated and moved to device: mps
âœ… Loss function and optimizer defined successfully!


In [13]:
## Training and Evaluation Functions
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        epoch_acc = 0
        for reviews, labels in train_loader:
            reviews, labels = reviews.to(device), labels.to(device)
            optimizer.zero_grad()
            predictions = model(reviews)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            preds = (predictions >= 0.5).float()
            epoch_acc += (preds == labels).sum().item()
        epoch_loss /= len(train_loader)
        epoch_acc /= len(train_loader.dataset)
        val_loss, val_acc = evaluate_model(model, val_loader, criterion)
        print(f'Epoch {epoch+1}/{epochs} | Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}')

## Model Evaluation      
def evaluate_model(model, data_loader, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for reviews, labels in data_loader:
            reviews, labels = reviews.to(device), labels.to(device)
            predictions = model(reviews)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            preds = (predictions >= 0.5).float()
            epoch_acc += (preds == labels).sum().item()
    epoch_loss /= len(data_loader)
    epoch_acc /= len(data_loader.dataset)
    return epoch_loss, epoch_acc
print("âœ… Training and evaluation functions defined successfully!")
train_model(model, train_loader, val_loader, criterion, optimizer, EPOCHS)
print("âœ… Model training completed!")
test_loss, test_acc = evaluate_model(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}')
print("âœ… Model evaluation on test set completed!")

âœ… Training and evaluation functions defined successfully!
Epoch 1/10 | Train Loss: 0.6944 | Train Acc: 0.5023 | Val Loss: 0.6941 | Val Acc: 0.4985
Epoch 2/10 | Train Loss: 0.6934 | Train Acc: 0.5032 | Val Loss: 0.6938 | Val Acc: 0.4990
Epoch 3/10 | Train Loss: 0.6931 | Train Acc: 0.5057 | Val Loss: 0.6936 | Val Acc: 0.4980
Epoch 4/10 | Train Loss: 0.6927 | Train Acc: 0.5082 | Val Loss: 0.6934 | Val Acc: 0.5025
Epoch 5/10 | Train Loss: 0.6925 | Train Acc: 0.5111 | Val Loss: 0.6933 | Val Acc: 0.4975
Epoch 6/10 | Train Loss: 0.6922 | Train Acc: 0.5157 | Val Loss: 0.6932 | Val Acc: 0.4990
Epoch 7/10 | Train Loss: 0.6919 | Train Acc: 0.5171 | Val Loss: 0.6932 | Val Acc: 0.4990
Epoch 8/10 | Train Loss: 0.6917 | Train Acc: 0.5192 | Val Loss: 0.6931 | Val Acc: 0.5000
Epoch 9/10 | Train Loss: 0.6914 | Train Acc: 0.5202 | Val Loss: 0.6930 | Val Acc: 0.5025
Epoch 10/10 | Train Loss: 0.6912 | Train Acc: 0.5209 | Val Loss: 0.6930 | Val Acc: 0.5030
âœ… Model training completed!
Test Loss: 0.6925 |

In [14]:
from sklearn.metrics import confusion_matrix

In [15]:
classification_report and confusion_matrix
def get_predictions(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for reviews, labels in data_loader:
            reviews = reviews.to(device)
            predictions = model(reviews)
            preds = (predictions >= 0.5).float().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.numpy())
    return np.array(all_preds), np.array(all_labels)
test_preds, test_labels = get_predictions(model, test_loader)
print("Classification Report:")
print(classification_report(test_labels, test_preds, target_names=['negative', 'positive']))
cm = confusion_matrix(test_labels, test_preds)
print("Confusion Matrix:")
print(cm)

Classification Report:
              precision    recall  f1-score   support

    negative       0.51      0.84      0.64      2500
    positive       0.55      0.19      0.28      2500

    accuracy                           0.52      5000
   macro avg       0.53      0.52      0.46      5000
weighted avg       0.53      0.52      0.46      5000

Confusion Matrix:
[[2112  388]
 [2028  472]]


In [16]:
# Single-example prediction
example = "I absolutely loved this movie! The acting was superb and the story was touching."
processed = preprocess_text(example)
encoded = encode_sentence(processed, vocab, maxlen=MAXLEN)
input_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(device)

model.eval()
with torch.no_grad():
    prob = model(input_tensor).item()

label = 'positive' if prob >= 0.5 else 'negative'
print(f"Review: {example}")
print(f"Processed: {processed}")
print(f"Predicted sentiment: {label} (prob={prob:.4f})")

Review: I absolutely loved this movie! The acting was superb and the story was touching.
Processed: i absolutely loved this movie the acting was superb and the story was touching
Predicted sentiment: negative (prob=0.4971)
