In [1]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cpu
!pip install datasets -q


Found existing installation: torch 2.4.1
Uninstalling torch-2.4.1:
  Successfully uninstalled torch-2.4.1
Found existing installation: torchvision 0.19.1
Uninstalling torchvision-0.19.1:
  Successfully uninstalled torchvision-0.19.1
Found existing installation: torchaudio 2.4.1
Uninstalling torchaudio-2.4.1:
  Successfully uninstalled torchaudio-2.4.1
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.0.1
  Downloading https://download.pytorch.org/whl/cpu/torch-2.0.1%2Bcpu-cp38-cp38-win_amd64.whl (174.1 MB)
     ------------------------------------- 174.1/174.1 MB 16.9 MB/s eta 0:00:00
Collecting torchvision==0.15.2
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.15.2%2Bcpu-cp38-cp38-win_amd64.whl (1.2 MB)
     ---------------------------------------- 1.2/1.2 MB 2.7 MB/s eta 0:00:00
Collecting torchaudio==2.0.2
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.0.2%2Bcpu-cp38-cp38-win_amd64.whl (2.1 MB)
     ----------------------


[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import re
from collections import Counter
from itertools import chain

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Fetch the IMDB dataset by name via `datasets.load_dataset`
imdb = load_dataset(path="imdb")

In [None]:
# Tokenizer with basic preprocessing (lowercasing, alphabetic filtering)
# Compiled once at definition time instead of on every call.
_TOKEN_PATTERN = re.compile(r'\b[a-z]{3,}\b')


def simple_tokenizer(text):
    """Split ``text`` into lowercase alphabetic tokens of three or more letters.

    The text is lowercased first; every maximal run of ``a``-``z`` characters
    that is at least three letters long and delimited by word boundaries is
    returned. Digits, punctuation and shorter words are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance (empty if none match).
    """
    return _TOKEN_PATTERN.findall(text.lower())

In [None]:
# Build the vocabulary from the training split, dropping rare tokens
MIN_FREQ = 5

# Count every token produced by the tokenizer over all training texts
counter = Counter()
for example in imdb['train']:
    counter.update(simple_tokenizer(example['text']))

# Keep only tokens seen at least MIN_FREQ times
filtered = {}
for word, freq in counter.items():
    if freq >= MIN_FREQ:
        filtered[word] = freq

# Real words start at id 2; ids 0 and 1 are reserved for the special tokens
vocab = {word: idx for idx, word in enumerate(filtered, start=2)}
vocab['[PAD]'] = 0
vocab['[UNK]'] = 1

# Reverse lookup: id -> token
inv_vocab = {idx: word for word, idx in vocab.items()}

In [None]:
# Convert raw review text into vocabulary ids
def encode(example):
    """Turn one dataset row into token ids, carrying its label through.

    Tokens missing from ``vocab`` are mapped to the ``[UNK]`` id.
    """
    input_ids = []
    for token in simple_tokenizer(example['text']):
        input_ids.append(vocab.get(token, vocab['[UNK]']))
    return {'input_ids': input_ids, 'label': example['label']}


# Encode the dataset and drop the raw text column
tokenized_imdb = imdb.map(encode, remove_columns=['text'])

In [None]:
# Padding
def pad(batch, max_len=None, pad_id=None):
    """Right-pad every sequence in ``batch`` to a common length and tensorize it.

    Args:
        batch: Mapping-like object with an ``'input_ids'`` entry (a sequence of
            token-id sequences) and a ``'label'`` entry (one label per sequence).
        max_len: Optional upper bound on the padded length. Sequences longer
            than this are truncated on the right. ``None`` (the default) pads
            everything to the longest sequence, which is the original behavior.
        pad_id: Id used for padding positions. ``None`` (the default) uses
            ``vocab['[PAD]']`` from the notebook's vocabulary.

    Returns:
        A dict with ``'input_ids'`` (int64 tensor, one row per sequence) and
        ``'labels'`` (tensor built from ``batch['label']``).

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len is not None and max_len < 0:
        raise ValueError("max_len must be non-negative")
    if pad_id is None:
        pad_id = vocab['[PAD]']

    # Read the column a single time; it is reused for both the length scan and
    # the padding pass (column access may be expensive for dataset objects).
    sequences = list(batch['input_ids'])

    # default=0 keeps an empty batch from raising inside max().
    longest = max((len(seq) for seq in sequences), default=0)
    target_len = longest if max_len is None else min(longest, max_len)

    padded = []
    for seq in sequences:
        kept = list(seq[:target_len])
        padded.append(kept + [pad_id] * (target_len - len(kept)))

    return {
        'input_ids': torch.tensor(padded, dtype=torch.long),
        'labels': torch.tensor(batch['label']),
    }

In [None]:
# Dataset Wrapper
from torch.utils.data import Dataset, DataLoader

class IMDbDataset(Dataset):
    """Map-style dataset exposing rows of an encodings mapping.

    ``encodings`` must provide ``'input_ids'`` and ``'labels'`` entries that
    can be indexed by position; item ``idx`` is the pair of values found at
    that position in each entry.
    """

    # Entries copied into every returned item, in this order.
    _FIELDS = ('input_ids', 'labels')

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of rows is taken from the 'input_ids' entry.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {field: self.encodings[field][idx] for field in self._FIELDS}

In [None]:
# Pad each split, wrap it in the Dataset class, then build the loaders
train_dataset = IMDbDataset(pad(tokenized_imdb['train']))
test_dataset = IMDbDataset(pad(tokenized_imdb['test']))

# Only the training loader is shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64)

In [None]:
# Linear Classifier
class LinearSentimentClassifier(nn.Module):
    """Sum of per-token scalar weights squashed through a sigmoid.

    Note: this class name is defined again in a later cell of this notebook;
    that later definition replaces this one before the model is instantiated.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # A 1-dimensional embedding acts as one learnable weight per token id.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=1)

    def forward(self, input_ids):
        # Look up one scalar per position, then add them up along dim 1.
        per_token = self.embedding(input_ids)
        summed = per_token.sum(dim=1)
        score = summed.squeeze(1)
        return torch.sigmoid(score)

In [None]:
# Updated Model with Bias and Init Fixes
class LinearSentimentClassifier(nn.Module):
    """Bag-of-words logistic classifier: one scalar weight per token plus a bias.

    The prediction for a sequence is ``sigmoid(sum(weight[token]) + bias)``.
    Padding positions must not take part in that sum, otherwise the amount of
    padding (rather than the words) drives the logit. The padding row is
    therefore registered as ``padding_idx`` (so it receives no gradient) and
    is kept at zero (so it adds nothing to the sum).

    Args:
        vocab_size: Number of rows in the embedding table.
        padding_idx: Id of the padding token; defaults to 0, which is the id
            this notebook assigns to ``[PAD]``. Pass ``None`` to treat every
            id as a regular, trainable token.
    """

    def __init__(self, vocab_size, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 1, padding_idx=padding_idx)
        self.bias = nn.Parameter(torch.zeros(1))
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        if self.embedding.padding_idx is not None:
            # The custom init above overwrote the padding row with a random
            # value; reset it so padded positions contribute exactly zero.
            with torch.no_grad():
                self.embedding.weight[self.embedding.padding_idx].zero_()

    def forward(self, input_ids):
        """Return one probability per row of ``input_ids``.

        Each id is looked up as a single scalar, the scalars are summed along
        dim 1, the bias is added, and the result goes through a sigmoid.
        """
        embedded = self.embedding(input_ids).sum(dim=1).squeeze(1)
        logits = embedded + self.bias
        return torch.sigmoid(logits)

# Train Function with Debug Logging
# Seed PyTorch's global RNG before anything random happens so that the weight
# initialization below is repeatable from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# Model, loss and optimizer used by train(). The model outputs probabilities
# (it applies the sigmoid itself), hence BCELoss rather than a logits-based loss.
model = LinearSentimentClassifier(len(vocab))
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

def train(num_epochs=4, log_every=200):
    """Train the module-level ``model`` on ``train_loader`` and print progress.

    Relies on the notebook globals ``model``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``vocab``. Prints a debug line every ``log_every``
    batches and a loss/accuracy summary after each epoch. Returns ``None``.

    Args:
        num_epochs: Number of passes over ``train_loader`` (default 4).
        log_every: Batch interval between debug lines (default 200).

    Raises:
        ValueError: If ``log_every`` is smaller than 1 or ``train_loader``
            contains no batches.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError
        # when the epoch averages are computed.
        raise ValueError("train_loader is empty; nothing to train on")

    # Loop-invariant upper bound for token ids.
    max_token_id = len(vocab) - 1

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        correct = 0
        total = 0
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()
            inputs = batch['input_ids']
            labels = batch['labels'].float()  # BCELoss needs float targets

            # Clamp to avoid index errors
            inputs = torch.clamp(inputs, max=max_token_id)

            outputs = model(inputs)

            # Binary predictions for accuracy (threshold the probabilities at 0.5)
            preds = (outputs >= 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Debug info every `log_every` batches
            if i % log_every == 0:
                print(f"  Batch {i}, Loss: {loss.item():.4f}, Outputs avg: {outputs.mean().item():.4f}, Labels avg: {labels.mean().item():.4f}")

        avg_loss = total_loss / num_batches
        accuracy = correct / total * 100
        print(f"Epoch {epoch+1}, Avg Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

train()


In [None]:
# Rank vocabulary entries by their learned scalar weight
weights = model.embedding.weight.data.squeeze()

# Pair each known id with its token and weight
word_weights = []
for idx, value in enumerate(weights.tolist()):
    if idx in inv_vocab:
        word_weights.append((inv_vocab[idx], value))

# Ascending order: lowest weights first, highest weights last
sorted_words = sorted(word_weights, key=lambda pair: pair[1])

print("\n🔻 Most negative words:")
print([word for word, _weight in sorted_words[:20]])

print("\n🔺 Most positive words:")
print([word for word, _weight in sorted_words[-20:]])