# Assignment 1.3: Naive word2vec (40 points)

This task can be formulated very simply. Follow this [paper](https://arxiv.org/pdf/1411.2738.pdf) and implement word2vec as a two-layer neural network with matrices $W$ and $W'$. One matrix projects words into a low-dimensional 'hidden' space, and the other projects them back into the high-dimensional vocabulary space.

![word2vec](https://i.stack.imgur.com/6eVXZ.jpg)

You can use TensorFlow/PyTorch and code from your previous task.

## Results of this task: (30 points)
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCA, t-SNE); you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)

## Extra questions: (10 points)
 * Intrinsic evaluation: you can find datasets [here](http://download.tensorflow.org/data/questions-words.txt)
 * Extrinsic evaluation: you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation.

Again. It is **highly recommended** to read this [paper](https://arxiv.org/pdf/1411.2738.pdf)

Example of visualization in tensorboard:
https://projector.tensorflow.org

Example of 2D visualisation:

![2dword2vec](https://www.tensorflow.org/images/tsne.png)

In [1]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim

from pathlib import Path
from pprint import pprint

# Placeholder token that stands in for rare words and for window padding.
UNK_TOKEN = '<UNK>'

# Seed every random number generator used in this notebook. NumPy drives the
# batch shuffling; torch drives the weight initialisation, so it has to be
# seeded as well for runs to be reproducible.
np.random.seed(4242)
random.seed(4242)
torch.manual_seed(4242)

In [2]:
# Set to False to force CPU execution even when a CUDA device is present.
USE_GPU = True

# CUDA is chosen only when it is both requested and actually available;
# otherwise everything runs on the CPU.
device = torch.device('cuda' if USE_GPU and torch.cuda.is_available() else 'cpu')

device

device(type='cuda')

In [3]:
from collections import Counter


class CBOWBatcher:
    """Turn a tokenized corpus into shuffled CBOW (context, center) batches.

    Words that occur ``threshold`` times or fewer are replaced by ``UNK_TOKEN``.
    The corpus is stored as a list of integer indices, padded on both sides
    with ``window_size`` UNK indices so that every real word can be a center.

    Attributes:
        window_size: number of context words taken on each side of the center.
        threshold: a word is kept only if its count is strictly greater.
        word2ind / ind2word: vocabulary mappings (UNK_TOKEN has the last index).
        tokens: the padded corpus as a list of vocabulary indices.
        vocab_size: number of entries in the vocabulary, UNK_TOKEN included.
    """

    THRESHOLD = 5

    def __init__(self, dataset, window_size=2, threshold=THRESHOLD):
        """Build the vocabulary and index the corpus.

        Args:
            dataset: sequence of word tokens (the whole corpus, in order).
            window_size: context radius around each center word.
            threshold: minimum count (exclusive) for a word to be kept.
        """
        self.window_size = window_size
        self.threshold = threshold
        counts = Counter(dataset)
        # Counter keeps first-occurrence order, so indices are assigned
        # deterministically (iterating a set of strings is not stable between
        # interpreter runs). UNK_TOKEN is excluded here because it always gets
        # the dedicated last index below.
        kept = [w for w, n in counts.items()
                if n > self.threshold and w != UNK_TOKEN]
        self.word2ind = {w: i for i, w in enumerate(kept)}
        self.word2ind[UNK_TOKEN] = len(self.word2ind)
        self.ind2word = {i: w for w, i in self.word2ind.items()}

        unk = self.word2ind[UNK_TOKEN]
        padding = [unk] * window_size
        # Only indices are stored; rare words collapse to the UNK index.
        self.tokens = padding + [self.word2ind.get(w, unk) for w in dataset] + padding
        # Every index in `tokens` comes from word2ind, so the vocabulary size
        # is simply the size of the mapping.
        self.vocab_size = len(self.word2ind)
        pprint(f'Corpus size: {len(dataset)}')
        pprint(f'Actual count of words used: {self.vocab_size}')
        pprint(f'{len(dataset)} words in dataset tokenized to {len(self.tokens)} tokens')

    @staticmethod
    def _to_tensors(contexts, centers):
        """Convert python lists of indices to long tensors on the global `device`."""
        return (torch.tensor(contexts, dtype=torch.long, device=device),
                torch.tensor(centers, dtype=torch.long, device=device))

    def get_batch(self, batch_size=512):
        """Yield one epoch of batches in a random order.

        Args:
            batch_size: maximum number of (context, center) pairs per batch.

        Yields:
            ``(X, y)`` where ``X`` has shape ``(n, 2 * window_size)`` and holds
            the context indices, and ``y`` has shape ``(n,)`` and holds the
            center indices. ``n == batch_size`` except possibly for the last
            batch of the epoch.
        """
        # Use the instance's window, not a module-level variable.
        window = self.window_size
        tokens = self.tokens
        contexts, centers = [], []
        # One center position per real (non-padding) token of the corpus.
        for start in np.random.permutation(len(tokens) - 2 * window):
            center = start + window
            # `window` tokens to the left plus `window` tokens to the right.
            contexts.append(tokens[start:center] + tokens[center + 1:center + window + 1])
            centers.append(tokens[center])
            if len(centers) == batch_size:
                yield self._to_tensors(contexts, centers)
                # Start a fresh buffer once the consumer asks for the next batch.
                contexts, centers = [], []
        if centers:
            # The corpus ended before the last batch was full.
            yield self._to_tensors(contexts, centers)


In [4]:
# The text8 corpus is expected in the current working directory.
test8_Data = Path.cwd() / 'text8'
with test8_Data.open() as f:
    # 1. simple cleaning: split on whitespace and lower-case every word
    text8 = []
    for line in f:
        text8.extend(map(str.lower, line.split()))
    # Build the vocabulary / batcher; words seen 6 times or fewer become <UNK>.
    batcher = CBOWBatcher(text8, threshold=6)

'Corpus size: 17005207'
'Actual count of words used: 58113'
'17005207 words in dataset tokenized to 17005211 tokens'


In [5]:
class CBOWW2V(nn.Module):
    """CBOW-style network: embedding -> ReLU -> linear scores over the vocabulary.

    Each context word is embedded and scored independently; the per-word
    scores are then averaged over the context to give one score vector per
    sample.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size=256, window=2):
        """Create the layers.

        Args:
            vocab_size: number of rows of the embedding / size of the output.
            embedding_size: base embedding width; the actual embedding width
                is ``embedding_size * window * 2``.
            hidden_size: accepted for interface compatibility; not used.
            window: context radius used to scale the embedding width.
        """
        super().__init__()
        width = embedding_size * window * 2
        self.embed = nn.Embedding(vocab_size, width)
        self.relu = nn.ReLU(inplace=False)
        self.W1 = nn.Linear(width, vocab_size)
        nn.init.xavier_normal_(self.W1.weight)

    def forward(self, x):
        """Score the vocabulary for every sample.

        Args:
            x: long tensor of context word indices, one row per sample
               (the shape test in this file uses ``(batch, 2 * window)``).

        Returns:
            Tensor with one row of ``vocab_size`` scores per sample.
        """
        # look up the embedding of every context word
        x = self.embed(x)
        # non-linearity applied directly to the embeddings
        x = self.relu(x)
        # project every context word to vocabulary scores
        x = self.W1(x)
        # one prediction per sample: average the scores over the context words
        return x.mean(dim=1)

    def get_word_emdedding(self, word, word2ind=None):
        """Return the embedding row of ``word`` as a ``(1, width)`` tensor.

        Args:
            word: either a vocabulary index (``int``) or a token to look up.
            word2ind: optional token -> index mapping (for example the
                batcher's ``word2ind``). Needed when ``word`` is a token.

        Returns:
            The matching row of ``self.embed``, reshaped to ``(1, -1)``.
        """
        if word2ind is not None:
            index = word2ind[word]
        elif isinstance(word, int):
            index = word
        else:
            # Legacy behaviour: fall back to a module-level `word_to_ix`
            # mapping. NOTE(review): no such global is defined in the visible
            # part of this file - prefer passing `word2ind` explicitly.
            index = word_to_ix[word]
        # Build the index on the same device as the embedding table so the
        # lookup also works after the model has been moved to the GPU.
        idx = torch.tensor([index], dtype=torch.long, device=self.embed.weight.device)
        return self.embed(idx).view(1, -1)


def test_CBOWW2V_shapes():
    """Smoke test: a batch of context indices yields (batch, vocab) scores."""
    context_len, n_samples, n_words = 2 * 2, 64, 50
    net = CBOWW2V(n_words, 42)
    # An all-zero index batch is enough to check the output shape.
    contexts = torch.zeros((n_samples, context_len), dtype=torch.long)
    out = net(contexts)
    expected = torch.Size([n_samples, n_words])
    assert out.size() == expected, out.size()


# Run the shape check as soon as the cell is executed.
test_CBOWW2V_shapes()


In [6]:
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm


# train_model prints a progress line whenever the step index is a multiple of this.
EACH_PRINT = 100
# TensorBoard writer that train_model logs its loss scalars to.
writer = SummaryWriter() 
def train_model(model, optimizer, epochs=1, max_steps=None):
    """Train ``model`` on batches from the module-level ``batcher``.

    Loss values are logged to the module-level TensorBoard ``writer`` and a
    progress line is printed every ``EACH_PRINT`` steps.

    Args:
        model: network mapping a batch of context indices to vocabulary scores.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of passes over the batcher.
        max_steps: if given, an epoch is cut short once the step index exceeds
            this value (so ``max_steps + 1`` batches are processed); ``None``
            means the whole epoch is used.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    # Running step counter for TensorBoard, so that points from later epochs
    # extend the curves instead of being written over the earlier ones.
    global_step = 0
    for e in range(epochs):
        total_loss = 0.0
        t = tqdm(batcher.get_batch(1024), desc=f'Epoch {e}')
        for step, (x, y) in enumerate(t):
            # Comparing an int with None raises TypeError, so guard it.
            if max_steps is not None and step > max_steps:
                break
            x = x.to(device=device, dtype=torch.long)
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            batch_loss = criterion(scores, y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            # .item() gives a plain float, detached from the autograd graph.
            current = batch_loss.item()
            total_loss += current
            average_loss = total_loss / (step + 1)
            writer.add_scalar('Current loss/train', current, global_step)
            writer.add_scalar('Total loss/train', total_loss, global_step)
            writer.add_scalar('Average loss/train', average_loss, global_step)
            global_step += 1
            t.set_postfix(loss=current)
            if not step % EACH_PRINT:
                pprint(f'Iteration {step}, current loss = {current:.4f}, average loss = {average_loss:.4f}')


In [7]:
# pprint(device)

# learning_rate = 1.568
# embedding_size = 222
# window_size = 2
# model = CBOWW2V(batcher.vocab_size, embedding_size)
# model = model.to(device=device)
# optimizer = optim.ASGD(model.parameters(), lr=learning_rate)

# train_model(model, optimizer)

device(type='cuda')


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 0', max=1.0, style=ProgressStyle(…

'Iteration 0, loss = 10.9621'
'Iteration 100, loss = 8.4109'
'Iteration 200, loss = 8.2269'
'Iteration 300, loss = 9.8921'
'Iteration 400, loss = 10.7657'
'Iteration 500, loss = 7.2920'
'Iteration 600, loss = 7.5492'
'Iteration 700, loss = 8.0928'
'Iteration 800, loss = 10.1871'
'Iteration 900, loss = 7.1046'
'Iteration 1000, loss = 9.2898'
'Iteration 1100, loss = 6.7658'
'Iteration 1200, loss = 7.3498'
'Iteration 1300, loss = 7.7469'
'Iteration 1400, loss = 8.1395'
'Iteration 1500, loss = 7.5857'
'Iteration 1600, loss = 6.9428'
'Iteration 1700, loss = 7.1464'
'Iteration 1800, loss = 7.0798'
'Iteration 1900, loss = 6.7013'
'Iteration 2000, loss = 7.1450'
'Iteration 2100, loss = 6.9293'
'Iteration 2200, loss = 7.1913'
'Iteration 2300, loss = 6.7560'
'Iteration 2400, loss = 6.8238'
'Iteration 2500, loss = 6.8997'
'Iteration 2600, loss = 6.3742'
'Iteration 2700, loss = 6.6186'
'Iteration 2800, loss = 6.8392'
'Iteration 2900, loss = 7.0350'
'Iteration 3000, loss = 6.7030'
'Iteration 3100, 

In [7]:
# results for the model are:
# 16607/? [4:12:24<00:00, 1.10it/s, loss=tensor(6.5550, device='cuda:0')]
# This is 1 epoch on the whole corpus


![Loss](imgs/LossGraph.png)

In [8]:
pprint(device)

# Hyper-parameters of this run; window_size equals the default context radius
# of both CBOWBatcher and CBOWW2V.
learning_rate, embedding_size, window_size = 0.01568, 222, 2

# Build the network on the selected device and attach an averaged-SGD optimizer.
model2 = CBOWW2V(batcher.vocab_size, embedding_size).to(device=device)
optimizer2 = optim.ASGD(model2.parameters(), lr=learning_rate)

# Ten shortened epochs, each stopped by max_steps=700.
train_model(model2, optimizer2, epochs=10, max_steps=700)

device(type='cuda')


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 0', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 10.9762, average loss = 10.9762'
'Iteration 100, current loss = 9.1245, average loss = 9.9924'
'Iteration 200, current loss = 8.6543, average loss = 9.4664'
'Iteration 300, current loss = 8.5092, average loss = 9.1626'
'Iteration 400, current loss = 8.1165, average loss = 8.9625'
'Iteration 500, current loss = 8.1539, average loss = 8.8114'
'Iteration 600, current loss = 8.1073, average loss = 8.7006'
'Iteration 700, current loss = 8.0480, average loss = 8.6094'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 1', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 8.0200, average loss = 8.0200'
'Iteration 100, current loss = 7.9176, average loss = 7.9958'
'Iteration 200, current loss = 8.2346, average loss = 7.9812'
'Iteration 300, current loss = 7.7683, average loss = 7.9580'
'Iteration 400, current loss = 7.8098, average loss = 7.9380'
'Iteration 500, current loss = 7.8590, average loss = 7.9228'
'Iteration 600, current loss = 7.6081, average loss = 7.9035'
'Iteration 700, current loss = 7.6734, average loss = 7.8843'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 2', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.8092, average loss = 7.8092'
'Iteration 100, current loss = 7.7972, average loss = 7.7396'
'Iteration 200, current loss = 7.8740, average loss = 7.7219'
'Iteration 300, current loss = 7.5432, average loss = 7.7078'
'Iteration 400, current loss = 7.7078, average loss = 7.7026'
'Iteration 500, current loss = 7.6802, average loss = 7.6945'
'Iteration 600, current loss = 7.6871, average loss = 7.6861'
'Iteration 700, current loss = 7.7036, average loss = 7.6776'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 3', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.4463, average loss = 7.4463'
'Iteration 100, current loss = 7.6481, average loss = 7.5834'
'Iteration 200, current loss = 7.5962, average loss = 7.5765'
'Iteration 300, current loss = 7.5070, average loss = 7.5743'
'Iteration 400, current loss = 7.4437, average loss = 7.5735'
'Iteration 500, current loss = 7.5256, average loss = 7.5656'
'Iteration 600, current loss = 7.3709, average loss = 7.5587'
'Iteration 700, current loss = 7.7177, average loss = 7.5501'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 4', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.4954, average loss = 7.4954'
'Iteration 100, current loss = 7.3805, average loss = 7.4977'
'Iteration 200, current loss = 7.2157, average loss = 7.4743'
'Iteration 300, current loss = 7.5926, average loss = 7.4833'
'Iteration 400, current loss = 7.6093, average loss = 7.4751'
'Iteration 500, current loss = 7.3370, average loss = 7.4715'
'Iteration 600, current loss = 7.4374, average loss = 7.4688'
'Iteration 700, current loss = 7.4879, average loss = 7.4610'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 5', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.4032, average loss = 7.4032'
'Iteration 100, current loss = 7.4551, average loss = 7.4248'
'Iteration 200, current loss = 7.4735, average loss = 7.4195'
'Iteration 300, current loss = 7.3905, average loss = 7.4114'
'Iteration 400, current loss = 7.3889, average loss = 7.4013'
'Iteration 500, current loss = 7.1531, average loss = 7.3975'
'Iteration 600, current loss = 7.4002, average loss = 7.3927'
'Iteration 700, current loss = 7.4430, average loss = 7.3905'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 6', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.3436, average loss = 7.3436'
'Iteration 100, current loss = 7.2757, average loss = 7.3565'
'Iteration 200, current loss = 7.5975, average loss = 7.3543'
'Iteration 300, current loss = 7.3533, average loss = 7.3464'
'Iteration 400, current loss = 7.3375, average loss = 7.3378'
'Iteration 500, current loss = 7.4013, average loss = 7.3348'
'Iteration 600, current loss = 7.3426, average loss = 7.3331'
'Iteration 700, current loss = 7.4967, average loss = 7.3293'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 7', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.3705, average loss = 7.3705'
'Iteration 100, current loss = 7.3923, average loss = 7.3241'
'Iteration 200, current loss = 7.3180, average loss = 7.3164'
'Iteration 300, current loss = 7.2032, average loss = 7.3077'
'Iteration 400, current loss = 7.4335, average loss = 7.3008'
'Iteration 500, current loss = 7.2299, average loss = 7.2984'
'Iteration 600, current loss = 7.1218, average loss = 7.2937'
'Iteration 700, current loss = 7.3347, average loss = 7.2914'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 8', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.3831, average loss = 7.3831'
'Iteration 100, current loss = 7.4332, average loss = 7.2559'
'Iteration 200, current loss = 7.1456, average loss = 7.2596'
'Iteration 300, current loss = 7.1779, average loss = 7.2586'
'Iteration 400, current loss = 7.2656, average loss = 7.2500'
'Iteration 500, current loss = 7.1989, average loss = 7.2483'
'Iteration 600, current loss = 7.2343, average loss = 7.2429'
'Iteration 700, current loss = 7.1757, average loss = 7.2410'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 9', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.2151, average loss = 7.2151'
'Iteration 100, current loss = 7.4060, average loss = 7.2226'
'Iteration 200, current loss = 7.2664, average loss = 7.2244'
'Iteration 300, current loss = 7.2240, average loss = 7.2145'
'Iteration 400, current loss = 7.2233, average loss = 7.2115'
'Iteration 500, current loss = 6.9893, average loss = 7.2129'
'Iteration 600, current loss = 7.1606, average loss = 7.2116'
'Iteration 700, current loss = 7.0183, average loss = 7.2109'


In [9]:
# Keep training the same model2 / optimizer2 pair with the same settings as the previous cell.
train_model(model2, optimizer2, epochs=10, max_steps=700)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 0', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.1260, average loss = 7.1260'
'Iteration 100, current loss = 7.2119, average loss = 7.1957'
'Iteration 200, current loss = 7.1208, average loss = 7.1992'
'Iteration 300, current loss = 7.2154, average loss = 7.1937'
'Iteration 400, current loss = 7.2971, average loss = 7.1917'
'Iteration 500, current loss = 7.0597, average loss = 7.1894'
'Iteration 600, current loss = 7.1239, average loss = 7.1866'
'Iteration 700, current loss = 7.0536, average loss = 7.1827'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 1', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.1181, average loss = 7.1181'
'Iteration 100, current loss = 7.0128, average loss = 7.1522'
'Iteration 200, current loss = 7.2486, average loss = 7.1653'
'Iteration 300, current loss = 7.0744, average loss = 7.1659'
'Iteration 400, current loss = 7.1537, average loss = 7.1665'
'Iteration 500, current loss = 7.1731, average loss = 7.1631'
'Iteration 600, current loss = 7.3547, average loss = 7.1577'
'Iteration 700, current loss = 7.1556, average loss = 7.1505'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 2', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.2332, average loss = 7.2332'
'Iteration 100, current loss = 7.3226, average loss = 7.1279'
'Iteration 200, current loss = 7.2767, average loss = 7.1306'
'Iteration 300, current loss = 7.1263, average loss = 7.1325'
'Iteration 400, current loss = 7.1830, average loss = 7.1329'
'Iteration 500, current loss = 7.1389, average loss = 7.1331'
'Iteration 600, current loss = 7.1645, average loss = 7.1325'
'Iteration 700, current loss = 7.1424, average loss = 7.1307'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 3', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.1788, average loss = 7.1788'
'Iteration 100, current loss = 7.2195, average loss = 7.0941'
'Iteration 200, current loss = 7.3154, average loss = 7.1046'
'Iteration 300, current loss = 7.2892, average loss = 7.1053'
'Iteration 400, current loss = 7.1160, average loss = 7.1058'
'Iteration 500, current loss = 6.9340, average loss = 7.1031'
'Iteration 600, current loss = 7.0431, average loss = 7.1018'
'Iteration 700, current loss = 7.1764, average loss = 7.1005'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 4', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.1245, average loss = 7.1245'
'Iteration 100, current loss = 7.2808, average loss = 7.1102'
'Iteration 200, current loss = 7.0386, average loss = 7.0929'
'Iteration 300, current loss = 7.0899, average loss = 7.0934'
'Iteration 400, current loss = 7.1474, average loss = 7.0888'
'Iteration 500, current loss = 6.9376, average loss = 7.0882'
'Iteration 600, current loss = 6.9950, average loss = 7.0852'
'Iteration 700, current loss = 7.1026, average loss = 7.0837'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 5', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.0162, average loss = 7.0162'
'Iteration 100, current loss = 7.0702, average loss = 7.0735'
'Iteration 200, current loss = 6.9705, average loss = 7.0729'
'Iteration 300, current loss = 6.8781, average loss = 7.0696'
'Iteration 400, current loss = 6.9721, average loss = 7.0711'
'Iteration 500, current loss = 7.0846, average loss = 7.0718'
'Iteration 600, current loss = 7.1897, average loss = 7.0751'
'Iteration 700, current loss = 6.9750, average loss = 7.0767'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 6', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.1536, average loss = 7.1536'
'Iteration 100, current loss = 7.2375, average loss = 7.0802'
'Iteration 200, current loss = 6.8845, average loss = 7.0730'
'Iteration 300, current loss = 7.1155, average loss = 7.0666'
'Iteration 400, current loss = 7.1253, average loss = 7.0634'
'Iteration 500, current loss = 7.2385, average loss = 7.0590'
'Iteration 600, current loss = 6.9251, average loss = 7.0557'
'Iteration 700, current loss = 7.0387, average loss = 7.0583'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 7', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 6.9969, average loss = 6.9969'
'Iteration 100, current loss = 7.0110, average loss = 7.0370'
'Iteration 200, current loss = 7.2653, average loss = 7.0400'
'Iteration 300, current loss = 6.8691, average loss = 7.0404'
'Iteration 400, current loss = 6.9650, average loss = 7.0352'
'Iteration 500, current loss = 7.0372, average loss = 7.0321'
'Iteration 600, current loss = 6.9309, average loss = 7.0357'
'Iteration 700, current loss = 6.9789, average loss = 7.0344'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 8', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.0260, average loss = 7.0260'
'Iteration 100, current loss = 7.0643, average loss = 7.0293'
'Iteration 200, current loss = 7.0534, average loss = 7.0363'
'Iteration 300, current loss = 7.1356, average loss = 7.0355'
'Iteration 400, current loss = 7.1113, average loss = 7.0372'
'Iteration 500, current loss = 7.1767, average loss = 7.0370'
'Iteration 600, current loss = 6.9815, average loss = 7.0316'
'Iteration 700, current loss = 7.2865, average loss = 7.0299'


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Epoch 9', max=1.0, style=ProgressStyle(…

'Iteration 0, current loss = 7.0352, average loss = 7.0352'
'Iteration 100, current loss = 7.1777, average loss = 7.0201'
'Iteration 200, current loss = 6.9152, average loss = 7.0106'
'Iteration 300, current loss = 7.0912, average loss = 7.0170'
'Iteration 400, current loss = 7.1072, average loss = 7.0194'
'Iteration 500, current loss = 6.9492, average loss = 7.0202'
'Iteration 600, current loss = 6.9867, average loss = 7.0197'
'Iteration 700, current loss = 6.9986, average loss = 7.0178'


In [10]:
# Save the network's parameters (its state_dict) to PATH.
# NOTE(review): neither `model` nor `PATH` is assigned in the visible live cells
# (the cell that created `model` is commented out, and the network trained
# above is `model2`) - confirm which model and which path are intended.
torch.save(model.state_dict(), PATH)

CBOWW2V(
  (embed): Embedding(58113, 888)
  (relu): ReLU()
  (W1): Linear(in_features=888, out_features=58113, bias=True)
)