# DataLoader

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np


class AnyData(Dataset):
    """CBOW-style dataset over a whitespace-tokenised text file.

    Item ``i`` is a pair ``(context, target)``: ``context`` is a 1-D
    ``torch.long`` tensor with the vocabulary ids of the words surrounding
    position ``i`` and ``target`` is a 0-D ``torch.long`` tensor with the id
    of the word at position ``i``.

    Attributes:
        words: the token list read from the file (at most ``max_words``).
        word_to_ix: mapping word -> vocabulary id.
        ix_to_word: mapping vocabulary id -> word.
        vocab_size: number of distinct words.
        n_namples: value returned by ``len()``; equals ``vocab_size``
            (name kept as-is for existing callers).
        window: number of context words taken on each side of the target.
    """

    DEFAULT_PATH = '/Users/georgychernousov/studying-ml/word2vec/reviews.txt'

    def __init__(self, path=DEFAULT_PATH, max_words=1000000, window=2):
        """Read the corpus and build the vocabulary.

        Args:
            path: text file to read; defaults to the original hard-coded file.
            max_words: keep only the first ``max_words`` tokens
                (``None`` keeps everything).
            window: context words to take on each side of the target.
        """
        with open(path, 'r', encoding='utf-8') as f:
            # split() with no argument splits on any whitespace, newlines
            # included, so the file does not need to be joined line by line.
            self.words = f.read().split()[:max_words]
        self.window = window

        # Sort the vocabulary so ids are identical from run to run; the
        # iteration order of a set of strings is not stable across processes.
        vocab = sorted(set(self.words))
        self.vocab_size = len(vocab)
        # NOTE(review): len(dataset) is the vocabulary size, not the number of
        # token positions, so a DataLoader only visits the first `vocab_size`
        # positions. Kept unchanged because the training cell passes
        # len(dataset) to the model as its vocabulary size.
        self.n_namples = self.vocab_size
        self.word_to_ix = {word: ix for ix, word in enumerate(vocab)}
        self.ix_to_word = dict(enumerate(vocab))

    def __getitem__(self, index):
        """Return ``(context, target)`` for the word at position ``index``.

        The context is the ``2 * window`` nearest neighbours of the target in
        corpus order. Near either end of the corpus the window is shifted
        inwards so the context keeps the same length (it is shorter only when
        the corpus itself has fewer than ``2 * window + 1`` words).

        Raises:
            IndexError: if ``index`` is outside the token list.
        """
        n_words = len(self.words)
        if index < 0:
            index += n_words
        if not 0 <= index < n_words:
            raise IndexError(f'index {index} is out of range for {n_words} words')

        span = 2 * self.window
        # Clamp the window start so that [start, start + span] stays inside
        # the corpus instead of relying on slice wrap-around/truncation.
        start = max(0, min(index - self.window, n_words - span - 1))
        stop = min(n_words, start + span + 1)
        context = [
            self.word_to_ix[self.words[pos]]
            for pos in range(start, stop)
            if pos != index
        ]
        target = self.word_to_ix[self.words[index]]
        return (
            torch.tensor(context, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

    def __len__(self):
        """Return ``n_namples`` (the vocabulary size, see note in __init__)."""
        return self.n_namples
    
# Smoke test: build the dataset, look at one item, and pull a single
# shuffled batch through a DataLoader.
dataset = AnyData()
dataset[0]
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
dataiter = iter(dataloader)
# Use the built-in next(); the iterator's `.next()` method is a Python 2 era
# alias that current PyTorch DataLoader iterators do not provide.
data_1 = next(dataiter)

# CBOW model

In [None]:
import torch.nn as nn

class CBOW(torch.nn.Module):
    """Continuous bag-of-words model.

    The context word embeddings are averaged, passed through a 128-unit ReLU
    hidden layer and projected to log-probabilities over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table / output classes.
        embedding_dim: size of each word embedding.
        word_to_ix: optional word -> id mapping used by
            ``get_word_emdedding`` when no mapping is passed to it.
    """

    def __init__(self, vocab_size, embedding_dim, word_to_ix=None):
        super().__init__()
        self.word_to_ix = word_to_ix

        # out: batch x embedding_dim (after averaging the context)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()

        # out: batch x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities of the target word for each context.

        Args:
            inputs: integer tensor of context word ids, either ``(context,)``
                for a single example or ``(batch, context)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)``; a single example yields
            ``(1, vocab_size)``.
        """
        # Normalise to (batch, context). Averaging is done per row so that
        # examples in a batch are not mixed into one context vector.
        if inputs.dim() < 2:
            inputs = inputs.reshape(1, -1)
        else:
            inputs = inputs.reshape(inputs.shape[0], -1)
        embeds = self.embeddings(inputs).mean(dim=1)
        hidden = self.activation_function1(self.linear1(embeds))
        return self.activation_function2(self.linear2(hidden))

    def get_word_emdedding(self, word, word_to_ix=None):
        """Return the ``(1, embedding_dim)`` embedding of ``word``.

        Args:
            word: the word to look up.
            word_to_ix: word -> id mapping. When omitted, the mapping given
                to the constructor is used, then a module-level
                ``word_to_ix`` if one exists.

        Raises:
            ValueError: if no mapping is available.
            KeyError: if ``word`` is not in the mapping.
        """
        if word_to_ix is None:
            word_to_ix = self.word_to_ix
        if word_to_ix is None:
            # Earlier versions read a global named `word_to_ix`; keep
            # honouring it when it is defined.
            word_to_ix = globals().get('word_to_ix')
        if word_to_ix is None:
            raise ValueError(
                'no word_to_ix mapping available: pass one to '
                'get_word_emdedding() or to the CBOW constructor'
            )
        index = torch.tensor(
            [word_to_ix[word]],
            dtype=torch.long,
            device=self.embeddings.weight.device,
        )
        return self.embeddings(index).view(1, -1)

    # Correctly spelled alias; the original name stays for existing callers.
    get_word_embedding = get_word_emdedding




# Train

In [None]:
EMDEDDING_DIM = 100

# Build the data first so the model is sized from the dataset it is actually
# trained on.
dataset = AnyData()
dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)

# Size the model from the vocabulary mapping itself rather than from
# len(dataset), which only happens to equal the vocabulary size.
model = CBOW(len(dataset.word_to_ix), EMDEDDING_DIM)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

from tqdm import tqdm
for epoch in range(50):
    # Plain float accumulator: keeping the summed loss as a tensor would hold
    # every sample's autograd graph alive until the end of the epoch.
    total_loss = 0.0

    for context, target in tqdm(dataloader):
        # Update after every batch. Stepping once per epoch on the summed
        # loss makes a single very large step; the recorded run shows the
        # total loss growing from epoch 0 to epoch 2 with that scheme.
        optimizer.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    if epoch % 2 == 0:
        print(f'Epoch {epoch}, total loss {total_loss}')

100%|██████████| 32424/32424 [01:09<00:00, 467.01it/s]


Epoch 0, total loss 337272.5625


100%|██████████| 32424/32424 [01:09<00:00, 464.92it/s]
100%|██████████| 32424/32424 [01:07<00:00, 483.41it/s]


Epoch 2, total loss 1403462.0
