In [1]:
# Colab cell 1: Install & import
!pip install --quiet tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import os
import re
import tarfile
import urllib.request
from collections import Counter
from pathlib import Path

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Colab cell 2: Load IMDB reviews (raw text) via TFDS
ds_train = tfds.load('imdb_reviews', split='train', as_supervised=True)
ds_test  = tfds.load('imdb_reviews', split='test',  as_supervised=True)

train_texts, train_labels = [], []
for text, label in tfds.as_numpy(ds_train):
    train_texts.append(text.decode('utf-8'))
    train_labels.append(int(label))

test_texts, test_labels = [], []
for text, label in tfds.as_numpy(ds_test):
    test_texts.append(text.decode('utf-8'))
    test_labels.append(int(label))

2025-07-09 18:44:20.128026: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752111860.153165  675836 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752111860.161493  675836 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752111860.181614  675836 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752111860.181642  675836 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752111860.181646  675836 computation_placer.cc:177] computation placer alr

In [3]:
print(f"Train samples: {len(train_texts)}, Test samples: {len(test_texts)}")
print(f"data head: {train_texts[:2]}")

Train samples: 25000, Test samples: 25000
data head: ["This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.", 'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell asleep b

In [4]:
def clean(text):
    """Clean text by removing HTML tags, special characters, and extra spaces."""
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text.lower()  # Convert to lowercase

def tokenize(text):
    """Tokenize text into words."""
    #print("cleaning text")
    text = clean(text)
    #print("tokenizing text")
    return text.split()


def build_vocabulary(texts, vocab_size=10000):
    for text in texts:
        tokens = tokenize(text)
        counter = Counter(tokens)
        vocab = {}
        for word, count in counter.most_common(vocab_size):
            vocab[word] = count
          
    return vocab

vocab = build_vocabulary(train_texts)

In [5]:
def text_to_bow(text, vocab):
  tokens = tokenize(text)
  bow = np.zeros(len(vocab), dtype=np.float32)
  for token in tokens:
    if token in vocab:
        index = list(vocab.keys()).index(token)
        bow[index] += 1
  return bow

In [6]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        bow = text_to_bow(text, self.vocab)
        return torch.tensor(bow, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

In [7]:
batch_size = 32
train_dataset = IMDBDataset(train_texts, train_labels, vocab)
test_dataset = IMDBDataset(test_texts, test_labels, vocab)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [8]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_units=128):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc3 = nn.Linear(hidden_units, 2)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x
    
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model = MLP(input_size=len(vocab), hidden_units=256).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

In [9]:
def train_model(model, loader, criterion, optimizer, device, epochs=10):
  model.train()
  for epoch in range(epochs):
      running_loss = 0.0
      for inputs, labels in loader:
          inputs, labels = inputs.to(device), labels.to(device)
          optimizer.zero_grad()
          outputs = model(inputs)
          loss = criterion(outputs, labels)
          loss.backward()
          optimizer.step()
          running_loss += loss.item()
      print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(loader):.4f}")

In [10]:
def evaluate_model(model, loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(all_labels, all_preds))

In [11]:
print(f"Train samples: {len(train_texts)}, Test samples: {len(test_texts)}")
train_model(model, train_loader, criterion, optimizer, device, epochs=25)
evaluate_model(model, test_loader, device)

Train samples: 25000, Test samples: 25000
Epoch [1/25], Loss: 0.5987
Epoch [2/25], Loss: 0.5556
Epoch [3/25], Loss: 0.5451
Epoch [4/25], Loss: 0.5410
Epoch [5/25], Loss: 0.5355
Epoch [6/25], Loss: 0.5297
Epoch [7/25], Loss: 0.5240
Epoch [8/25], Loss: 0.5198
Epoch [9/25], Loss: 0.5159
Epoch [10/25], Loss: 0.5124
Epoch [11/25], Loss: 0.5084
Epoch [12/25], Loss: 0.5048
Epoch [13/25], Loss: 0.4957
Epoch [14/25], Loss: 0.4937
Epoch [15/25], Loss: 0.4909
Epoch [16/25], Loss: 0.4869
Epoch [17/25], Loss: 0.4792
Epoch [18/25], Loss: 0.4752
Epoch [19/25], Loss: 0.4709
Epoch [20/25], Loss: 0.4674
Epoch [21/25], Loss: 0.4613
Epoch [22/25], Loss: 0.4587
Epoch [23/25], Loss: 0.4519
Epoch [24/25], Loss: 0.4455
Epoch [25/25], Loss: 0.4457
Accuracy: 0.7262
              precision    recall  f1-score   support

           0       0.73      0.72      0.73     12500
           1       0.72      0.73      0.73     12500

    accuracy                           0.73     25000
   macro avg       0.73      0.7

In [12]:
# test the model with a sample text
sample_text = "This movie was fantastic! I loved it"
sample_text_a = "good good good good good"
sample_bow = text_to_bow(sample_text, vocab)
sample_tensor = torch.tensor(sample_bow, dtype=torch.float32).unsqueeze(0).to(device)
model.eval()
with torch.no_grad():
    output = model(sample_tensor)
    print(f"Raw output: {output}")
    _, predicted = torch.max(output, 1)
    sentiment = "Positive" if predicted.item() == 1 else "Negative"
    print(f"Sample text sentiment: {sentiment}")

Raw output: tensor([[ 0.1469, -0.1198]])
Sample text sentiment: Negative
