# Assignment 1.3: Naive word2vec (40 points)

This task can be formulated very simply. Follow this [paper](https://arxiv.org/pdf/1411.2738.pdf) and implement word2vec as a two-layer neural network with matrices $W$ and $W'$. One matrix projects words into a low-dimensional 'hidden' space, and the other projects them back into the high-dimensional vocabulary space.

![word2vec](https://i.stack.imgur.com/6eVXZ.jpg)

You can use TensorFlow/PyTorch and code from your previous task.

## Results of this task: (30 points)
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCA, t-SNE); you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)

## Extra questions: (10 points)
 * Intrinsic evaluation: you can find datasets [here](http://download.tensorflow.org/data/questions-words.txt)
 * Extrinsic evaluation: you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation.

Again, it is **highly recommended** that you read this [paper](https://arxiv.org/pdf/1411.2738.pdf).

Example of visualization in tensorboard:
https://projector.tensorflow.org

Example of 2D visualisation:

![2dword2vec](https://www.tensorflow.org/images/tsne.png)

In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

from pathlib import Path
from pprint import pprint

# Placeholder token that stands in for rare words and for the padding added
# around the corpus.
UNK_TOKEN = '<UNK>'

# Seed every random number generator used in this notebook from one place so
# that runs are reproducible: numpy drives the batch shuffling, torch drives
# the weight initialisation of the model.
SEED = 4242
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
# Set to False to force CPU execution even when CUDA is available.
USE_GPU = True

# CUDA is chosen only when it is both requested and actually available;
# the availability check is skipped entirely when USE_GPU is False.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [3]:
from collections import Counter


class CBOWBatcher:
    """Build a vocabulary from a tokenized corpus and serve shuffled CBOW batches.

    Words that occur ``threshold`` times or fewer are mapped to ``UNK_TOKEN``.
    The corpus is padded with ``window_size`` ``UNK_TOKEN`` ids on both sides so
    that every original token can be used as a center word.

    Attributes set by ``__init__``:
        window_size: number of context words taken on each side of the center.
        threshold: words must occur more often than this to get their own id.
        word2ind / ind2word: token <-> id mappings (``UNK_TOKEN`` has the last id).
        tokens: list of ids for the padded corpus.
        vocab_size: number of distinct ids, including ``UNK_TOKEN``.
    """

    THRESHOLD = 5

    def __init__(self, dataset, window_size=2, threshold=THRESHOLD):
        """Index ``dataset`` (a sequence of word strings) and print corpus statistics."""
        self.window_size = window_size
        self.threshold = threshold
        counts = Counter(dataset)
        # Walk the counter (one entry per distinct word) instead of the whole
        # corpus. Counter preserves first-occurrence order, so the ids are the
        # same on every run, unlike iteration over a set of strings.
        # A literal UNK_TOKEN in the corpus is folded into the unknown bucket so
        # it cannot clobber the id table.
        frequent = [w for w, n in counts.items()
                    if n > self.threshold and w != UNK_TOKEN]
        self.word2ind = {w: i for i, w in enumerate(frequent)}
        self.word2ind[UNK_TOKEN] = len(self.word2ind)
        self.ind2word = {i: w for w, i in self.word2ind.items()}
        # Only the ids are stored; rare words collapse to the UNK id and the
        # corpus is padded so that border tokens also get a full context.
        unk = self.word2ind[UNK_TOKEN]
        padding = [unk] * window_size
        self.tokens = padding + [self.word2ind.get(w, unk) for w in dataset] + padding
        # Every id comes from word2ind, so its size is the vocabulary size.
        self.vocab_size = len(self.word2ind)
        pprint(f'Corpus size: {len(dataset)}')
        pprint(f'Actual count of words used: {self.vocab_size}')
        pprint(f'{len(dataset)} words in dataset tokenized to {len(self.tokens)} tokens')

    def get_batch(self, batch_size=512):
        """Yield one pass over the corpus as shuffled ``(X, y)`` batches.

        Args:
            batch_size: maximum number of samples per batch; the last batch may
                be smaller.

        Yields:
            ``(X, y)`` tensors moved to the notebook-level ``device``. ``X`` has
            shape ``(batch, 2 * window_size)`` and holds the context ids, ``y``
            has shape ``(batch,)`` and holds the center ids.
        """
        # Use the window this batcher was built with, not a notebook global.
        window = self.window_size
        tokens = np.asarray(self.tokens, dtype=np.int64)
        # Positions of the context words relative to the window start,
        # skipping the center position.
        offsets = np.asarray([i for i in range(2 * window + 1) if i != window],
                             dtype=np.int64)
        # One window start per original (unpadded) token, in random order.
        starts = np.random.permutation(len(tokens) - 2 * window)
        for begin in range(0, len(starts), batch_size):
            chunk = starts[begin:begin + batch_size]
            # Broadcasting (batch, 1) + (2 * window,) gives every context index.
            context = tokens[chunk[:, None] + offsets]
            center = tokens[chunk + window]
            yield torch.from_numpy(context).to(device=device),\
                  torch.from_numpy(center).to(device=device)


In [4]:
# The text8 corpus is expected in the current working directory.
test8_Data = Path.cwd() / 'text8'
# Explicit encoding keeps the read independent of the platform default.
with test8_Data.open(encoding='utf-8') as f:
    # 1. simple cleaning: lowering all the words
    text8 = [a.lower() for line in f for a in line.split()]
# The file is fully read at this point, so the vocabulary is built after the
# handle has been closed.
batcher = CBOWBatcher(text8, threshold=6)

'Corpus size: 17005207'
'Actual count of words used: 58113'
'17005207 words in dataset tokenized to 17005211 tokens'


In [5]:
class CBOWW2V(nn.Module):
    """CBOW-style model: embedding -> ReLU -> mean over context -> linear scores.

    Args:
        vocab_size: number of rows in the embedding table and number of output
            scores per sample.
        embedding_size: base embedding width; each stored vector has
            ``embedding_size * window * 2`` components.
        hidden_size: not used by the model; kept so existing calls keep working.
        window: number of context words on each side of the center word.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size=256, window=2):
        super().__init__()
        width = embedding_size * window * 2
        self.embed = nn.Embedding(vocab_size, width)
        self.relu = nn.ReLU(inplace=False)
        self.W1 = nn.Linear(width, vocab_size)
        nn.init.xavier_normal_(self.W1.weight)

    def forward(self, x):
        """Return vocabulary scores of shape ``(batch, vocab_size)``.

        ``x`` holds context word ids with shape ``(batch, context)``.
        """
        # Embed every context word, apply the non-linearity and average the
        # context vectors. Averaging before the output layer gives the same
        # scores as averaging the per-word predictions (the layer is affine)
        # but avoids a (batch, context, vocab_size) intermediate tensor.
        hidden = self.relu(self.embed(x)).mean(dim=1)
        return self.W1(hidden)

    def get_word_emdedding(self, word, word2ind=None):
        """Return the embedding vector of ``word`` as a ``(1, width)`` tensor.

        Args:
            word: a vocabulary index, or a token string when ``word2ind`` is given.
            word2ind: optional token -> index mapping used to resolve strings.

        Raises:
            ValueError: if ``word`` is a string and no mapping was supplied.
            KeyError: if ``word`` is not present in ``word2ind``.
        """
        if isinstance(word, str):
            if word2ind is None:
                raise ValueError('word2ind mapping is required to look up a token string')
            index = word2ind[word]
        else:
            index = int(word)
        # Create the index on the same device as the embedding table so the
        # lookup also works after the model has been moved to the GPU.
        idx = torch.tensor([index], dtype=torch.long, device=self.embed.weight.device)
        return self.embed(idx).view(1, -1)

    # Correctly spelled alias; the original name is kept for existing callers.
    get_word_embedding = get_word_emdedding


def test_CBOWW2V_shapes():
    """Smoke test: a batch of context ids yields one score per vocabulary word."""
    batch_size, vocab_size, window_size = 64, 50, 2
    # All-zero ids are enough here, only the output shape is checked.
    context_ids = torch.zeros((batch_size, window_size * 2), dtype=torch.long)
    scores = CBOWW2V(vocab_size, 42)(context_ids)
    expected = torch.Size([batch_size, vocab_size])
    assert scores.size() == expected, scores.size()


test_CBOWW2V_shapes()


In [16]:
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm


# A progress line is printed every EACH_PRINT training steps.
EACH_PRINT = 100
# TensorBoard writer that train_model logs the loss scalars to.
writer = SummaryWriter() 
def train_model(model, optimizer, epochs=1, max_steps=None):
    """Train ``model`` with cross-entropy loss on batches from the notebook-level ``batcher``.

    Loss values are logged to the notebook-level TensorBoard ``writer`` and a
    progress line is printed every ``EACH_PRINT`` steps.

    Args:
        model: module mapping a ``(batch, context)`` id tensor to
            ``(batch, vocab)`` scores.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of passes over the batches.
        max_steps: maximum number of batches processed per epoch; ``None``
            (the default) processes every batch.
    """
    criterion = nn.CrossEntropyLoss()
    # Running step counter across epochs so TensorBoard points from different
    # epochs do not overwrite each other.
    global_step = 0
    model.train()
    for e in range(epochs):
        total_loss = 0.0
        t = tqdm(batcher.get_batch(1024), desc=f'Epoch {e}')
        for step, (x, y) in enumerate(t):
            # None means "no limit"; comparing an int with None would raise.
            if max_steps is not None and step >= max_steps:
                break
            x = x.to(device=device, dtype=torch.long)
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            batch_loss = criterion(scores, y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # Work with a plain float so no device tensors are accumulated
            # or passed to the logging helpers.
            current_loss = batch_loss.item()
            total_loss += current_loss
            average_loss = total_loss / (step + 1)
            writer.add_scalar('Current loss/train', current_loss, global_step)
            writer.add_scalar('Total loss/train', total_loss, global_step)
            writer.add_scalar('Average loss/train', average_loss, global_step)
            t.set_postfix(loss=current_loss)
            if not step % EACH_PRINT:
                pprint(f'Iteration {step}, current loss = {current_loss:.4f}, average loss = {average_loss:.4f}')
            global_step += 1


In [7]:
pprint(device)

# Hyper-parameters for this run.
window_size = 2
embedding_size = 222
learning_rate = 1.568

# Build the model on the selected device and attach an averaged-SGD optimizer.
model = CBOWW2V(batcher.vocab_size, embedding_size).to(device=device)
optimizer = optim.ASGD(model.parameters(), lr=learning_rate)

# Defaults of train_model are used: a single epoch with no step limit.
train_model(model, optimizer)

device(type='cuda')


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 0', max=1.0, style=ProgressStyle(…

'Iteration 0, loss = 10.9621'
'Iteration 100, loss = 8.4109'
'Iteration 200, loss = 8.2269'
'Iteration 300, loss = 9.8921'
'Iteration 400, loss = 10.7657'
'Iteration 500, loss = 7.2920'
'Iteration 600, loss = 7.5492'
'Iteration 700, loss = 8.0928'
'Iteration 800, loss = 10.1871'
'Iteration 900, loss = 7.1046'
'Iteration 1000, loss = 9.2898'
'Iteration 1100, loss = 6.7658'
'Iteration 1200, loss = 7.3498'
'Iteration 1300, loss = 7.7469'
'Iteration 1400, loss = 8.1395'
'Iteration 1500, loss = 7.5857'
'Iteration 1600, loss = 6.9428'
'Iteration 1700, loss = 7.1464'
'Iteration 1800, loss = 7.0798'
'Iteration 1900, loss = 6.7013'
'Iteration 2000, loss = 7.1450'
'Iteration 2100, loss = 6.9293'
'Iteration 2200, loss = 7.1913'
'Iteration 2300, loss = 6.7560'
'Iteration 2400, loss = 6.8238'
'Iteration 2500, loss = 6.8997'
'Iteration 2600, loss = 6.3742'
'Iteration 2700, loss = 6.6186'
'Iteration 2800, loss = 6.8392'
'Iteration 2900, loss = 7.0350'
'Iteration 3000, loss = 6.7030'
'Iteration 3100, 

In [21]:
# results for the model are:
# 16607/? [4:12:24<00:00, 1.10it/s, loss=tensor(6.5550, device='cuda:0')]
# This is 1 epoch on the whole corpus

model

CBOWW2V(
  (embed): Embedding(58113, 888)
  (relu): ReLU()
  (W1): Linear(in_features=888, out_features=58113, bias=True)
)

![Loss](imgs/LossGraph.png)

In [23]:
pprint(device)

# Hyper-parameters for this run: same model size, ten times smaller step.
window_size = 2
embedding_size = 222
learning_rate = 0.1568

# A fresh model and optimizer, independent of the earlier `model`.
model2 = CBOWW2V(batcher.vocab_size, embedding_size).to(device=device)
optimizer2 = optim.ASGD(model2.parameters(), lr=learning_rate)

# Ten short epochs, each limited by max_steps.
train_model(model2, optimizer2, epochs=10, max_steps=700)

device(type='cuda')


RuntimeError: CUDA error: unknown error