In [1]:
from collections import defaultdict
import numpy as np

import torch
from torch import nn
from torch.optim import Adam

In [2]:
device = "mps" if torch.backends.mps.is_available() else "cpu"

## BOW model

In [3]:
class BoW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(BoW, self).__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        self.embedding = nn.Embedding(
                                num_embeddings = vocab_size,
                                embedding_dim = embedding_dim,
                                device = device,
                        )
        
        nn.init.xavier_uniform_(self.embedding.weight)
        self.bias = nn.Parameter(torch.zeros(embedding_dim, device=device), requires_grad=True)

        
    def forward(self, words):
        out = self.embedding(words)
        out = out.sum(dim=0) + self.bias
        out = out.view(1, -1)
        return out

# Loading the model

In [4]:
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [5]:
def read_dataset(path: str):
    with open(path, "r") as f:
        for line in f:
            try:
                line = f.readline().lower().strip().split(" ||| ")
                text_class, text = line[0], line[1]
                yield ([w2i[word] for word in text.split(" ")], t2i[text_class])
            except:
                pass

In [6]:
train = list(read_dataset("../data/classes/train.txt"))
vocab_size = len(w2i)
n_classes = len(t2i)

In [7]:
vocab_size

11402

In [8]:
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("../data/classes/dev.txt"))

# Training the model

In [9]:
bow_model = BoW(vocab_size, n_classes)

In [10]:
loss_criterion = nn.CrossEntropyLoss()
optimizer = Adam(bow_model.parameters())

In [11]:
# Just 10 epochs as the goal is not to train a real model
# but just to see if the implementation is working
for i in range(10):
    train_loss = 0
    test_accuracy = 0
    for words, sentence_class in train:
        words = torch.tensor(words, device=device)
        sentence_class = torch.tensor([sentence_class], device=device)
        predictions = bow_model(words)
        loss = loss_criterion(predictions, sentence_class)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Iteration {i} - Train loss: {train_loss/len(train)}")
    with torch.no_grad():
        for words, sentence_class in dev:
            words = torch.tensor(words, device=device)
            predictions = bow_model(words)
            predicted_class = np.argmax(predictions.detach().cpu().numpy())
            if predicted_class == sentence_class:
                test_accuracy += 1
    print(f"Iteration {i} - Test accuracy: {test_accuracy/len(dev)}")

Iteration 0 - Train loss: 1.3278593930942513
Iteration 0 - Test accuracy: 0.2672727272727273
Iteration 1 - Train loss: 1.0759641071816397
Iteration 1 - Test accuracy: 0.27636363636363637
Iteration 2 - Train loss: 0.8615163682510009
Iteration 2 - Test accuracy: 0.2781818181818182
Iteration 3 - Train loss: 0.7069003791077418
Iteration 3 - Test accuracy: 0.2781818181818182
Iteration 4 - Train loss: 0.5909722992027185
Iteration 4 - Test accuracy: 0.2963636363636364
Iteration 5 - Train loss: 0.5014758727509766
Iteration 5 - Test accuracy: 0.30363636363636365
Iteration 6 - Train loss: 0.43082522152232405
Iteration 6 - Test accuracy: 0.3054545454545455
Iteration 7 - Train loss: 0.37401477615296674
Iteration 7 - Test accuracy: 0.30727272727272725
Iteration 8 - Train loss: 0.32759979818790647
Iteration 8 - Test accuracy: 0.3145454545454546
