In [1]:
# Bag Of Words classifier to detect language


In [2]:
import torch
import torch.nn.functional as F
from torch import nn 
from torch import nn, optim

In [3]:
# train and test data manually

In [4]:
training_data = [
        ("Veinte paginas".lower().split(), "Spanish"),
        ("I will visit the library".lower().split(), "English"),
        ("I am reading a book".lower().split(), "English"),
        ("This is my favourite chapter".lower().split(), "English"),
        ("Estoy en la biblioteca".lower().split(), "Spanish"),
        ("Tengo un libro".lower().split(), "Spanish")
        ]

test_data = [
        ("Estoy leyendo".lower().split(), "Spanish"),
        ("This is not my favourite book".lower().split(), "English")
        ]

In [5]:

# generate vocabulary --  I think this is very ineffcient
# NB BOW require the use of test set otherwise there would be never seen words

In [6]:
word_dict = {}
i = 0
for words, language in training_data + test_data:
    for word in words:
        if word not in word_dict:
            word_dict[word] = i
            i +=1

In [7]:
corpus_size = len(word_dict)
languages = 2
label_index = {'Spanish': 0, "English": 1}

In [8]:
# set up the classifier
class BowClasssifier(nn.Module):
    
    def __init__(self, languages, corpus_size):
        
        super(BowClasssifier, self).__init__()
        self.linear = nn.Linear(corpus_size, languages)
        
    def forward(self, bow_vec):
        
        l1 = F.log_softmax(self.linear(bow_vec), dim=1)
        return l1

In [9]:
# make util functions
def make_bow_vec(sentence, word_dict):
    bow_vec = torch.zeros(len(word_dict))
    for word in sentence:
        bow_vec[word_dict[word]] += 1
    return bow_vec.view(1, -1)

def make_target(label, label_index):
    return torch.LongTensor([label_index[label]])


In [10]:
# inistantiate

In [11]:
model = BowClasssifier(languages, corpus_size)
loss_function =  nn.NLLLoss()
opt = optim.SGD(model.parameters(), lr = 0.1)

In [12]:
epochs = 100
for epoch in range(epochs):
    
    # transform sentence-label pair into bow vector-label
    
    for sentence, lan  in training_data:
        # zero gradient 
        model.zero_grad()
        x_s = make_bow_vec(sentence, word_dict)
        y_s = make_target(lan, label_index)
        pred = model(x_s)
        loss = loss_function(pred, y_s)
        loss.backward()
        opt.step()
    if epoch % 10 ==0:
        print(f"Epoch {epoch}/{epochs} loss: {loss.item()}")
    
    

Epoch 0/100 loss: 0.8330457210540771
Epoch 10/100 loss: 0.1341915875673294
Epoch 20/100 loss: 0.06795433908700943
Epoch 30/100 loss: 0.04517393186688423
Epoch 40/100 loss: 0.03376471623778343
Epoch 50/100 loss: 0.026934580877423286
Epoch 60/100 loss: 0.022393571212887764
Epoch 70/100 loss: 0.019158320501446724
Epoch 80/100 loss: 0.01673750951886177
Epoch 90/100 loss: 0.014858413487672806


In [13]:
# evaluate on test set

In [14]:
# deactivate autograd for evaluation/inference
with torch.no_grad():
    for sentence, lan  in test_data:
        x_s = make_bow_vec(sentence, word_dict)
        y_s = make_target(lan, label_index)
        pred = model(x_s)
        _, y_hat = torch.max(pred, 1)
        
        print(f"{' '.join(sentence)} is in {y_hat.item()}, and true value is {y_s.item()}")
        
        

estoy leyendo is in 0, and true value is 0
this is not my favourite book is in 1, and true value is 1


In [15]:
word_dict

{'veinte': 0,
 'paginas': 1,
 'i': 2,
 'will': 3,
 'visit': 4,
 'the': 5,
 'library': 6,
 'am': 7,
 'reading': 8,
 'a': 9,
 'book': 10,
 'this': 11,
 'is': 12,
 'my': 13,
 'favourite': 14,
 'chapter': 15,
 'estoy': 16,
 'en': 17,
 'la': 18,
 'biblioteca': 19,
 'tengo': 20,
 'un': 21,
 'libro': 22,
 'leyendo': 23,
 'not': 24}

In [16]:
word_dict["some"]

KeyError: 'some'

In [None]:
make_bow_vec(["not"], word_dict)

In [None]:
# model is simple so even with these few data points it knows how to recognise the language
# let's interpret the model paramaters:

In [None]:
#for an imput word, give  me the model paramater

def return_param(word):
    index = word_dict[word]
    for p in model.parameters():
        dims = len(p.size())
        if dims == 2:
            print(f"{word} : spanish {p[0][index].item()}" )
            print(f"{word} : english {p[1][index].item()}" )


In [None]:
return_param("not")

In [None]:
for p in model.parameters():
    print(p)