## IMDb Sentiment Classification using PyTorch RNN
 ------------------------------------------------
### Objective:
 - Build a Recurrent Neural Network (RNN) for binary sentiment classification
 - Use IMDb dataset (positive / negative reviews)
 - Train, evaluate, and summarize model performance

### Framework: PyTorch
### Model: Embedding + SimpleRNN + Linear


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from collections import Counter
import matplotlib.pyplot as plt
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [3]:
dataset = load_dataset("imdb")
train_data = dataset["train"]
test_data = dataset["test"]

print(f"Train samples: {len(train_data)}, Test samples: {len(test_data)}")


README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train samples: 25000, Test samples: 25000


In [4]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")

def build_vocab(data, min_freq=2):
    counter = Counter()
    for example in data:
        counter.update(tokenizer(example["text"]))
    vocab = {"<pad>": 0, "<unk>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

vocab = build_vocab(train_data)
print("Vocab size:", len(vocab))


Vocab size: 51718


In [5]:
def encode_text(text, vocab, max_len=200):
    tokens = tokenizer(text)
    ids = [vocab.get(tok, vocab["<unk>"]) for tok in tokens[:max_len]]
    ids += [vocab["<pad>"]] * (max_len - len(ids))
    return torch.tensor(ids)

class IMDbDataset(Dataset):
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        label = self.data[idx]["label"]
        return encode_text(text, self.vocab), torch.tensor(label, dtype=torch.float32)

train_dataset = IMDbDataset(train_data, vocab)
test_dataset = IMDbDataset(test_data, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [6]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        out = self.fc(hidden[-1])
        return self.sigmoid(out).squeeze()

model = RNNModel(len(vocab), embed_dim=64, hidden_dim=128).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [7]:
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss, total_acc = 0, 0
    for text, label in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        text, label = text.to(device), label.to(device)
        optimizer.zero_grad()
        pred = model(text)
        loss = criterion(pred, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += ((pred > 0.5) == label).sum().item()
    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f} | Acc: {total_acc/len(train_dataset):.4f}")


Epoch 1/3: 100%|█████████████████████████████████████████████████████████████████████| 782/782 [15:58<00:00,  1.23s/it]


Epoch 1 | Loss: 0.6968 | Acc: 0.5019


Epoch 2/3: 100%|█████████████████████████████████████████████████████████████████████| 782/782 [10:09<00:00,  1.28it/s]


Epoch 2 | Loss: 0.6925 | Acc: 0.5212


Epoch 3/3: 100%|█████████████████████████████████████████████████████████████████████| 782/782 [15:41<00:00,  1.20s/it]

Epoch 3 | Loss: 0.6916 | Acc: 0.5237





In [9]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for text, label in test_loader:
        text, label = text.to(device), label.to(device)
        output = model(text)
        pred = (output > 0.5).float()
        total += label.size(0)
        correct += (pred == label).sum().item()
print(f"Test Accuracy: {correct / total:.4f}")


Test Accuracy: 0.5793
