# DataLoader

In [25]:
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np


class TXTDataset(Dataset):
    """Word2vec training samples drawn from a whitespace-tokenised text file.

    Every item is a ``(center, context, context_neg)`` triple:

    * ``center`` - vocabulary index (plain ``int``) of the word at the
      requested corpus position;
    * ``context`` - ``LongTensor`` holding the indices of the ``2 * WINDOW``
      words around that position. At the corpus edges the window is shifted
      inwards instead of being truncated, so the length stays constant and
      items can be stacked by the default collate function;
    * ``context_neg`` - NumPy array with ``neg_samples_amount`` distinct
      indices drawn uniformly from the vocabulary. They are not filtered
      against the true context words.

    For CBOW training use ``context`` as the input and ``center`` as the target.

    Args:
        path: text file to read; defaults to ``DEFAULT_PATH``.
        max_words: only this many leading tokens of the file are kept.
        neg_samples_amount: number of negative samples per item (intended to
            be a multiple of the context window size).
    """

    DEFAULT_PATH = '/Users/georgychernousov/studying-ml/word2vec/reviews.txt'
    WINDOW = 2  # words taken on each side of the center word

    def __init__(self, path=DEFAULT_PATH, max_words=1000000, neg_samples_amount=10):
        with open(path, 'r') as f:
            # split() tokenises on any whitespace, line breaks included.
            self.words = f.read().split()[:max_words]

        # Sorting pins the word -> index mapping. Iterating a bare set would
        # give a different numbering after every kernel restart.
        vocab = sorted(set(self.words))
        self.vocab_size = len(vocab)
        self.n_namples = self.vocab_size  # legacy (misspelled) name, kept for existing callers
        self.word_to_ix = {word: ix for ix, word in enumerate(vocab)}
        self.ix_to_word = dict(enumerate(vocab))
        self.neg_samples_amount = neg_samples_amount

    def __getitem__(self, index):
        """Return ``(center, context, context_neg)`` for corpus position ``index``.

        Raises:
            IndexError: if ``index`` does not address a word of the corpus.
        """
        n_words = len(self.words)
        if not 0 <= index < n_words:
            raise IndexError(f'position {index} is outside the corpus of {n_words} words')

        # Clamp a (2 * WINDOW + 1)-wide span to the corpus. Clamping the start
        # explicitly matters: a negative slice start would wrap around to the
        # end of the list and silently drop the real left neighbours.
        span = 2 * self.WINDOW + 1
        stop = min(n_words, max(0, index - self.WINDOW) + span)
        start = max(0, stop - span)
        neighbours = self.words[start:index] + self.words[index + 1:stop]

        context = torch.tensor([self.word_to_ix[w] for w in neighbours], dtype=torch.long)
        center = self.word_to_ix[self.words[index]]
        context_neg = np.random.choice(self.n_namples, self.neg_samples_amount, replace=False)

        return center, context, context_neg

    def __len__(self):
        # NOTE(review): this is the vocabulary size, not the number of corpus
        # positions, so one epoch only visits the first `vocab_size` tokens.
        # It is kept because the notebook sizes its models with `len(dataset)`;
        # switch to `len(self.words)` once those call sites use `vocab_size`.
        return self.n_namples
    
# Smoke test: pull a single batch of two samples and display it.
dataset = TXTDataset()
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)
dataiter = iter(dataloader)
# Use the built-in next(): the iterator's own `.next()` method is not
# available on current PyTorch DataLoader iterators.
data_1 = next(dataiter)
data_1

[tensor([31079,  9553]),
 tensor([[26402, 32155, 17128, 12960],
         [10328, 21830, 10530,  5701]]),
 tensor([[28396, 12114,  7512,  7679,  5026, 13825, 30690, 29736, 14684, 12741],
         [ 9162, 24900, 18068, 14040, 12931, 13494, 17641, 20840, 10903, 15989]])]

# CBOW model (without negative sampling)

In [2]:
import torch.nn as nn

class CBOW(torch.nn.Module):
    """Continuous bag-of-words model with a full softmax output layer.

    The context word embeddings are averaged, passed through a 128-unit ReLU
    hidden layer and projected to log-probabilities over the vocabulary.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            width of the output layer).
        embedding_dim: size of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()

        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax(dim=-1)
        self.init_emb(embedding_dim)

    def init_emb(self, embedding_dim):
        """Initialise the embedding table uniformly in ``[-0.5/dim, 0.5/dim]``.

        This mirrors the initialisation of the original word2vec code.

        :return: None
        """
        initrange = 0.5 / embedding_dim
        # In-place update under no_grad instead of reaching through `.data`.
        with torch.no_grad():
            self.embeddings.weight.uniform_(-initrange, initrange)

    def forward(self, inputs):
        """Return log-probabilities of the target word.

        Args:
            inputs: integer tensor of context word indices; dimension 1 is the
                context axis that gets averaged.

        Returns:
            Tensor whose last dimension has ``vocab_size`` log-probabilities.
        """
        embeds = self.embeddings(inputs)
        embeds = torch.mean(embeds, dim=1)  # average over the context words
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word, word_to_ix=None):
        """Return the embedding of ``word`` as a ``(1, embedding_dim)`` tensor.

        Args:
            word: the word to look up.
            word_to_ix: mapping from word to vocabulary index, e.g.
                ``dataset.word_to_ix``. When omitted, a module-level
                ``word_to_ix`` is used if one exists (the legacy behaviour).

        Raises:
            NameError: if no mapping is passed and no module-level
                ``word_to_ix`` is defined.
            KeyError: if ``word`` is not in the mapping.
        """
        if word_to_ix is None:
            try:
                word_to_ix = globals()['word_to_ix']
            except KeyError:
                raise NameError(
                    "no vocabulary available: pass word_to_ix=dataset.word_to_ix"
                ) from None
        # Build the index on the same device as the weights so the lookup
        # also works after the model has been moved off the CPU.
        index = torch.tensor([word_to_ix[word]], device=self.embeddings.weight.device)
        return self.embeddings(index).view(1, -1)

# SkipGram model (with negative sampling)

In [29]:
import torch.nn as nn
import torch.nn.functional as F

class SkipGram(nn.Module):
    """Skip-gram model trained with negative sampling.

    Two embedding tables are kept: ``in_emb`` for center words and ``out_emb``
    for context (positive and negative) words.

    Args:
        vocab_size: number of distinct words.
        embedding_dim: size of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        self.in_emb = nn.Embedding(vocab_size, embedding_dim)
        self.out_emb = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center_word, context_words, neg_context):
        '''
        Compute the negative-sampling loss for a batch.

        center_word: indices of the center words, [batch_size]
        context_words: indices of the words inside the context window,
            [batch_size, n_context]
        neg_context: indices of negatively sampled words that are treated as
            not belonging to the window, [batch_size, n_negative]
        return: per-sample loss, [batch_size]
        '''
        center_word_emb = self.in_emb(center_word)  # [batch, embed]
        context_words_emb = self.out_emb(context_words)  # [batch, n_context, embed]
        neg_emb = self.out_emb(neg_context)  # [batch, n_negative, embed]

        center_col = center_word_emb.unsqueeze(2)  # [batch, embed, 1]

        # Squeeze only the trailing singleton dimension. A bare squeeze() would
        # also drop the batch dimension for a batch of one sample and make the
        # sum over dim 1 below fail.
        log_pos = torch.bmm(context_words_emb, center_col).squeeze(2)
        log_neg = torch.bmm(neg_emb, -center_col).squeeze(2)

        log_pos = F.logsigmoid(log_pos).sum(1)
        log_neg = F.logsigmoid(log_neg).sum(1)

        loss = log_pos + log_neg
        return -loss

# Train CBOW

In [24]:
from datetime import datetime as dt
from tqdm import tqdm

EMDEDDING_DIM = 100

# Build the data first so the model is sized from the dataset created in this
# cell rather than from whatever `dataset` an earlier cell left in the kernel.
dataset = TXTDataset()
dataloader = DataLoader(dataset=dataset, batch_size=1024, shuffle=True)

# The vocabulary mapping has one entry per distinct word.
model = CBOW(len(dataset.word_to_ix), EMDEDDING_DIM)
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

for epoch in range(5):
    start = dt.now()
    trainingloss = 0
    # TXTDataset yields (center, context, negatives). CBOW predicts the center
    # word from its context and has no use for the negative samples.
    for target, context, _ in dataloader:
        optimizer.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        trainingloss += loss.item()  # sum of per-batch mean losses
    print(f'Epoch {epoch}, time {(dt.now() - start).total_seconds():.2f} sec., total loss {trainingloss}')

Epoch 0, time 16.21 sec., total loss 332.0977602005005
Epoch 1, time 15.20 sec., total loss 332.08157539367676
Epoch 2, time 15.46 sec., total loss 332.0683536529541
Epoch 3, time 16.01 sec., total loss 332.05159091949463
Epoch 4, time 16.43 sec., total loss 332.0367784500122


# Train SkipGram

In [30]:
from datetime import datetime as dt
from tqdm import tqdm

EMDEDDING_DIM = 100

# Build the data first so the model is sized from the dataset created in this
# cell rather than from whatever `dataset` an earlier cell left in the kernel.
dataset = TXTDataset()
dataloader = DataLoader(dataset=dataset, batch_size=1024, shuffle=True)

# The vocabulary mapping has one entry per distinct word.
model = SkipGram(len(dataset.word_to_ix), EMDEDDING_DIM)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10):
    start = dt.now()
    trainingloss = 0
    for center, context, context_neg in dataloader:
        optimizer.zero_grad()
        # The model returns one loss value per sample; average over the batch.
        loss = model(center, context, context_neg).mean()
        loss.backward()
        optimizer.step()
        trainingloss += loss.item()  # sum of per-batch mean losses
    print(f'Epoch {epoch}, time {(dt.now() - start).total_seconds():.2f} sec., total loss {trainingloss}')

Epoch 0, time 24.77 sec., total loss 1789.4531364440918
Epoch 1, time 25.28 sec., total loss 1783.4533309936523
Epoch 2, time 25.11 sec., total loss 1772.236515045166
Epoch 3, time 23.39 sec., total loss 1772.7364120483398
Epoch 4, time 23.45 sec., total loss 1765.78959274292
Epoch 5, time 23.30 sec., total loss 1757.301284790039
Epoch 6, time 24.06 sec., total loss 1760.1774139404297
Epoch 7, time 24.75 sec., total loss 1749.7579040527344
Epoch 8, time 24.64 sec., total loss 1739.440170288086
Epoch 9, time 25.10 sec., total loss 1738.682041168213
