In [1]:
from datasets import load_dataset

dataset = load_dataset("imdb")

In [2]:
import re

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
from tqdm.notebook import tqdm

In [5]:
from collections import defaultdict


In [6]:
corpus = set()
for example in dataset["train"]:
    # split on whitespace and punctuation
    for word in re.split(r"\W+", example["text"].lower()):
        # corpus[word] += 1
        corpus.add(word)

corpus = sorted(corpus)

print(len(corpus))
print(list(corpus)[:10])

def text_to_bow(text) -> torch.Tensor:
    bow = torch.zeros(len(corpus) + 1)
    for word in re.split(r"\W+", text.lower()):
        if word in corpus:
            bow[corpus.index(word)] += 1
        else:
            bow[-1] += 1
    return bow / bow.sum()

print(text_to_bow("hello world how are you doing today"))

74892
['', '0', '00', '000', '0000000000001', '00001', '00015', '000s', '001', '003830']
tensor([0., 0., 0.,  ..., 0., 0., 0.])


In [7]:
class ModelBow(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(len(corpus) + 1, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, bow):
        return self.sigmoid(self.linear(bow))
    
model_bow = ModelBow()


In [14]:
def train_on_imdb(model, lr=0.001, epochs=10, batch_size=32):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.BCELoss()
    dataloader = torch.utils.data.DataLoader(dataset["train"], batch_size=batch_size, shuffle=True)
    for epoch in tqdm(range(epochs)):
        for batch in dataloader:
            optimizer.zero_grad()
            bow = torch.stack([text_to_bow(text) for text in batch["text"]])
            y = torch.Tensor(batch["label"], dtype=torch.float32, device=bow.device)
            y_hat = model(bow)
            loss = loss_fn(y_hat, y)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch} loss: {loss.item()}")

In [15]:
train_on_imdb(model_bow)

  0%|          | 0/10 [00:00<?, ?it/s]

TypeError: new() received an invalid combination of arguments - got (Tensor, device=torch.device, dtype=torch.dtype), but expected one of:
 * (*, torch.device device)
 * (torch.Storage storage)
 * (Tensor other)
 * (tuple of ints size, *, torch.device device)
      didn't match because some of the keywords were incorrect: dtype
 * (object data, *, torch.device device)
      didn't match because some of the keywords were incorrect: dtype


In [10]:
def evaluate_on_imdb(model, batch_size=32):
    correct = 0
    for _, i in tqdm(enumerate(range(0, len(dataset["test"]), batch_size)), total=len(dataset["test"])//batch_size):
        batch = dataset["test"][i:i+batch_size]
        X = torch.stack([text_to_bow(text) for text in batch["text"]])
        y = torch.tensor([label for label in batch["label"]], dtype=torch.float32)
        y_pred = model(X)
        correct += (y_pred.round() == y.unsqueeze(1)).sum().item()
    print(f"Accuracy: {correct / len(dataset['test'])}")
    print(f"Correct: {correct}")
    print(f"Total: {len(dataset['test'])}")

evaluate_on_imdb(model_bow)

  0%|          | 0/781 [00:00<?, ?it/s]

KeyboardInterrupt: 