# DataLoader

In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np


class AnyData(Dataset):
    
    def __init__(self):
        with open('/Users/georgychernousov/studying-ml/word2vec/reviews.txt','r') as f:
            reviews = f.readlines()
        self.words = ' '.join(reviews).split()[:1000000]
        vocab = set(self.words)
        vocab_size = len(vocab)
        self.n_namples = vocab_size
        self.word_to_ix = {word:ix for ix, word in enumerate(vocab)}
        self.ix_to_word = {ix:word for ix, word in enumerate(vocab)}
        
    
    def __getitem__(self, index):
        """
        return x, y, where x - context words, y target word
        """
        left_context = self.words[index-2:index]
        left_bias = 2 - len(left_context)
        right_context = self.words[index+1:index+3+left_bias]
        if len(right_context) < 2:
            left_context.extend(self.words[index-4:index-2])
        
        return torch.tensor([self.word_to_ix[i] for i in [*left_context, *right_context]], dtype=torch.long), \
    torch.tensor(self.word_to_ix[self.words[index]], dtype=torch.long),
        
    def __len__(self):
        return self.n_namples
    
dataset = AnyData()
dataset[0]
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
dataiter = iter(dataloader)
data_1 = dataiter.next()

# CBOW model

In [4]:
import torch.nn as nn

class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # self.embeddings = nn.EmbeddingBag(vocab_size, embedding_dim, sparse=True)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim = -1)
        

    def forward(self, inputs):
        embeds = self.embeddings(inputs)# .view(1,-1)
        embeds = torch.mean(embeds, dim=1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = torch.tensor([word_to_ix[word]])
        return self.embeddings(word).view(1,-1)

# Train

In [5]:
EMDEDDING_DIM = 100
model = CBOW(len(dataset), EMDEDDING_DIM)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

dataset = AnyData()
dataloader = DataLoader(dataset=dataset, batch_size=128, shuffle=True)

from tqdm import tqdm
for epoch in range(50):
    trainingloss = 0
    for context, target in tqdm(dataloader):
        optimizer.zero_grad()
        # offset = torch.cumsum(torch.tensor([0, *[4 for i in list(range(context.size()[0] - 1))]]), dim=0)
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        trainingloss += loss.item()
    # if epoch % 1 == 0:
    print(f'Epoch {epoch}, total loss {trainingloss}')

100%|██████████| 254/254 [00:25<00:00,  9.96it/s]


Epoch 0, total loss 2641.356616973877


100%|██████████| 254/254 [00:23<00:00, 10.67it/s]


Epoch 1, total loss 2638.795205116272


100%|██████████| 254/254 [00:23<00:00, 10.59it/s]


Epoch 2, total loss 2636.218873023987


100%|██████████| 254/254 [00:23<00:00, 11.01it/s]


Epoch 3, total loss 2633.6315956115723


 21%|██        | 53/254 [00:04<00:18, 10.87it/s]


KeyboardInterrupt: 