In [1]:
# Imports

import os
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from tqdm import tqdm

import warnings
warnings.simplefilter('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [7]:
if torch.cuda.is_available():
    device = "cuda" # Use NVIDIA GPU (if available)
elif torch.backends.mps.is_available():
    device = "mps" # Use Apple Silicon GPU (if available)
else:
    device = "cpu" # Default to CPU if no GPU is available
    
print(device)

mps


In [2]:
# Custom Dataset to Read Local IMDB Files

class IMDBDataset(Dataset):
    """
    Custom PyTorch Dataset for IMDB movie reviews.
    Expects directory structure with 'pos' and 'neg' subfolders.
    """
    def __init__(self, root_dir, vocab=None, tokenizer=None, max_length=256):
        self.root_dir = root_dir
        self.max_length = max_length

        # Collect all file paths & labels
        self.samples = []
        for label_name in ["pos", "neg"]:
            label = 1 if label_name == "pos" else 0
            folder_path = os.path.join(root_dir, label_name)
            for filename in os.listdir(folder_path):
                if filename.endswith(".txt"):
                    self.samples.append(
                        (os.path.join(folder_path, filename), label)
                    )

        # Use provided tokenizer or fall back to a simple regex-based one
        self.tokenizer = tokenizer if tokenizer else self.basic_tokenizer

        # Build vocab if not provided (e.g., training split)
        if vocab:
            self.vocab = vocab
        else:
            self.vocab = self.build_vocab()

    def basic_tokenizer(self, text):
        """
        Basic tokenizer: 
        - Lowercases
        - Splits on word boundaries
        - Strips punctuation
        """
        text = text.lower()
        tokens = re.findall(r"\b\w+\b", text)
        return tokens

    def build_vocab(self, min_freq=2):
        """
        Construct a vocabulary dictionary from the training set.
        Rare tokens (freq < min_freq) are ignored.
        """
        counter = Counter()
        print(" Building vocabulary...")
        for path, _ in tqdm(self.samples):
            with open(path, "r", encoding="utf-8") as f:
                tokens = self.tokenizer(f.read())
                counter.update(tokens)

        # Reserve 0 for <PAD>, 1 for <UNK>
        vocab = {"<PAD>": 0, "<UNK>": 1}
        for token, freq in counter.items():
            if freq >= min_freq:
                vocab[token] = len(vocab)

        print(f" Vocab size: {len(vocab):,}")
        return vocab

    def numericalize(self, tokens):
        """
        Convert list of tokens into list of vocab indices.
        """
        return [self.vocab.get(t, self.vocab["<UNK>"]) for t in tokens]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        tokens = self.tokenizer(text)
        token_ids = self.numericalize(tokens)[: self.max_length]
        return torch.tensor(token_ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

In [3]:
# Collate Function for Batching

def collate_batch(batch):
    """
    Collate function for DataLoader.
    - Pads sequences in a batch to the same length
    - Stacks labels into a tensor
    """
    sequences, labels = zip(*batch)
    padded_seqs = pad_sequence(
        sequences
        , batch_first=True
        , padding_value=0  # <PAD> index
    )
    labels = torch.stack(labels)
    return padded_seqs, labels

In [4]:
# Build Datasets & DataLoaders

train_dir = '../../ML_Tinkering_Python/data/aclImdb/train/'
test_dir  = '../../ML_Tinkering_Python/data/aclImdb/test/'

# Build train dataset (and vocab)
train_dataset = IMDBDataset(
    root_dir=train_dir
    , vocab=None     # triggers vocab building
)

# Build test dataset using the same vocab
test_dataset = IMDBDataset(
    root_dir=test_dir
    , vocab=train_dataset.vocab
)

train_loader = DataLoader(
    train_dataset
    , batch_size=32
    , shuffle=True
    , collate_fn=collate_batch
)

test_loader = DataLoader(
    test_dataset
    , batch_size=32
    , shuffle=False
    , collate_fn=collate_batch
)


 Building vocabulary...


100%|███████████████████████████████████| 25000/25000 [00:09<00:00, 2608.16it/s]


 Vocab size: 47,010


In [5]:
# Simple RNN-based Classification Model

class RNNClassifier(nn.Module):
    """
    Simple LSTM-based text classification model.
    - Embedding layer
    - LSTM encoder
    - Linear layer → binary output
    """
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embed_dim
            , hidden_size=hidden_dim
            , batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len)
        embedded = self.embedding(x)             # (batch, seq_len, embed_dim)
        _, (hidden, _) = self.lstm(embedded)     # hidden: (1, batch, hidden_dim)
        hidden = hidden.squeeze(0)              # (batch, hidden_dim)
        logits = self.fc(hidden)                # (batch, num_classes)
        return logits

In [8]:
# Instantiate model & key params ~ [loss fn (criterion) & optim]

model = RNNClassifier(vocab_size=len(train_dataset.vocab)).to(device)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(
    model.parameters()
    , lr=1e-3
)

model

RNNClassifier(
  (embedding): Embedding(47010, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [9]:
# Training Loop (Basic)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for x_batch, y_batch in tqdm(
        train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"
    ):
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        logits = model(x_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x_batch.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == y_batch).sum().item()
        total += x_batch.size(0)

    avg_loss = total_loss / total
    acc = correct / total
    print(f"Train Loss: {avg_loss:.4f} | Acc: {acc:.4f}")

RNNClassifier(
  (embedding): Embedding(47010, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

Epoch 1/3: 100%|██████████████████████████████| 782/782 [00:53<00:00, 14.72it/s]

Train Loss: 0.6929 | Acc: 0.5133





RNNClassifier(
  (embedding): Embedding(47010, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

Epoch 2/3: 100%|██████████████████████████████| 782/782 [00:50<00:00, 15.48it/s]

Train Loss: 0.6892 | Acc: 0.5253





RNNClassifier(
  (embedding): Embedding(47010, 128, padding_idx=0)
  (lstm): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

Epoch 3/3: 100%|██████████████████████████████| 782/782 [00:50<00:00, 15.50it/s]

Train Loss: 0.6619 | Acc: 0.5979





In [None]:
# Quick Evaluation on Test Set

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        logits = model(x_batch)
        preds = logits.argmax(dim=1)
        correct += (preds == y_batch).sum().item()
        total += x_batch.size(0)

print(f"Test Accuracy: {correct / total:.4f}")
