In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import numpy as np
from collections import Counter
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
)

import warnings
# Silence every warning for the rest of the kernel session: 'ignore' with no
# category argument applies to all Warning subclasses.
# NOTE(review): this also hides deprecation notices from the imported
# libraries — confirm that blanket suppression is intended.
warnings.simplefilter('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Show the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# 1. Parameters
# NOTE(review): BATCH_SIZE, EMBED_DIM, HIDDEN_DIM, EPOCHS and DEVICE are not
# consumed in the cells shown here; presumably they configure the DataLoader,
# the model and the training loop further down — confirm against those cells.
BATCH_SIZE = 64
EMBED_DIM = 128
HIDDEN_DIM = 128
EPOCHS = 5
MAX_VOCAB_SIZE = 25_000  # passed as max_tokens when the vocabulary is built
MAX_SEQ_LEN = 256        # reviews are truncated / padded to this many token ids
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
RANDOM_SEED = 42
# Seed every RNG library the notebook imports so that "Restart Kernel -> Run
# All" is reproducible. NumPy is seeded first so torch.manual_seed stays the
# last expression of the cell and its displayed output is unchanged.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x16a23d330>

In [7]:
import os

def load_imdb_data(data_dir, split):
    """Load IMDB reviews from ``<data_dir>/<split>/{pos,neg}/*.txt``.

    Args:
        data_dir: Root directory of the dataset.
        split: Name of the sub-directory to read (e.g. ``'train'`` or ``'test'``).

    Returns:
        A list of ``(label, text)`` tuples where ``label`` is ``'pos'`` or
        ``'neg'`` and ``text`` is the full content of one ``.txt`` file.
        All ``'pos'`` samples come first, then all ``'neg'`` samples; within a
        label the samples are ordered by file name.

    Raises:
        OSError: If a label directory or a review file cannot be read
            (propagated from ``os.listdir`` / ``open``).
    """
    data = []
    for label in ('pos', 'neg'):
        labeled_dir = os.path.join(data_dir, split, label)
        # os.listdir returns entries in arbitrary, filesystem-dependent order;
        # sorting makes the sample order identical across machines and runs.
        for fname in sorted(os.listdir(labeled_dir)):
            if not fname.endswith('.txt'):
                continue
            with open(os.path.join(labeled_dir, fname), encoding='utf-8') as f:
                data.append((label, f.read()))
    return data

# Read the two dataset splits from disk (train first, then test).
train_data = load_imdb_data(
    data_dir='../../ML_Tinkering_Python/data/aclImdb', split='train'
)
test_data = load_imdb_data(
    data_dir='../../ML_Tinkering_Python/data/aclImdb', split='test'
)

In [8]:
# 3. Build Vocabulary
# Single tokenizer instance shared by vocabulary building and text encoding.
tokenizer = get_tokenizer(tokenizer="basic_english")

def yield_tokens(data_list):
    """Lazily tokenize the text of each ``(label, text)`` pair in ``data_list``.

    The label is ignored; one tokenizer result is yielded per sample, in
    input order.
    """
    for _label, review_text in data_list:
        review_tokens = tokenizer(review_text)
        yield review_tokens

# Build the vocabulary from the training split only, capped at MAX_VOCAB_SIZE
# entries and including the two special tokens.
vocab = build_vocab_from_iterator(
    iterator=yield_tokens(train_data),
    max_tokens=MAX_VOCAB_SIZE,
    specials=["<unk>", "<pad>"],
)
# Tokens missing from the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

In [9]:
# 4. Text + Label Preprocessing
# Directory-name label -> integer class index.
label_map = dict(neg=0, pos=1)

def process_text(text):
    """Encode ``text`` as a fixed-length tensor of vocabulary ids.

    The text is tokenized, mapped through ``vocab``, cut to at most
    ``MAX_SEQ_LEN`` ids and right-padded with the ``"<pad>"`` id when shorter.

    Returns:
        A 1-D ``torch.long`` tensor with exactly ``MAX_SEQ_LEN`` elements.
    """
    ids = vocab(tokenizer(text))[:MAX_SEQ_LEN]
    shortfall = MAX_SEQ_LEN - len(ids)
    if shortfall > 0:
        ids.extend([vocab["<pad>"]] * shortfall)
    return torch.tensor(ids, dtype=torch.long)

def process_label(label):
    """Return the class index of ``label`` (looked up in ``label_map``) as a
    scalar ``torch.long`` tensor. Unknown labels raise ``KeyError``."""
    class_index = label_map[label]
    return torch.tensor(class_index, dtype=torch.long)

In [10]:
# 5. Create PyTorch Dataset
class IMDBDataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-encoded IMDB reviews.

    Every ``(label, text)`` pair is converted once, in ``__init__``, into a
    ``(process_text(text), process_label(label))`` tuple and kept in memory.
    """

    def __init__(self, samples):
        # Encode eagerly so that item access is a plain list lookup.
        self.samples = [
            (process_text(text), process_label(label))
            for label, text in samples
        ]

    def __len__(self):
        """Number of encoded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the encoded ``(text, label)`` tuple stored at ``idx``."""
        return self.samples[idx]

# Encode both splits up front; all preprocessing happens inside the constructor.
train_dataset = IMDBDataset(samples=train_data)
test_dataset = IMDBDataset(samples=test_data)