# Assignment 1.3: Naive word2vec (40 points)

This task can be formulated very simply. Follow this [paper](https://arxiv.org/pdf/1411.2738.pdf) and implement word2vec like a two-layer neural network with matrices $W$ and $W'$. One matrix projects words to low-dimensional 'hidden' space and the other - back to high-dimensional vocabulary space.

![word2vec](https://i.stack.imgur.com/6eVXZ.jpg)

You can use TensorFlow/PyTorch and code from your previous task.

## Results of this task: (30 points)
 * trained word vectors (mention somewhere, how long it took to train)
 * plotted loss (so we can see that it has converged)
 * function to map token to corresponding word vector
 * beautiful visualizations (PCE, T-SNE), you can use TensorBoard and play with your vectors in 3D (don't forget to add screenshots to the task)

## Extra questions: (10 points)
 * Intrinsic evaluation: you can find datasets [here](http://download.tensorflow.org/data/questions-words.txt)
 * Extrinsic evaluation: you can use [these](https://medium.com/@dataturks/rare-text-classification-open-datasets-9d340c8c508e)

Also, you can find any other datasets for quantitative evaluation.

Again. It is **highly recommended** to read this [paper](https://arxiv.org/pdf/1411.2738.pdf)

Example of visualization in tensorboard:
https://projector.tensorflow.org

Example of 2D visualisation:

![2dword2vec](https://www.tensorflow.org/images/tsne.png)

In [0]:
import random, torch, collections
import torch.nn as nn
from pprint import pprint
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [0]:
!wget http://mattmahoney.net/dc/text8.zip
!unzip text8.zip
with open('text8') as text_file:
    corpus = text_file.read().split()

--2020-02-20 14:58:00--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.75
Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.75|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.zip.1’


2020-02-20 14:58:15 (2.00 MB/s) - ‘text8.zip.1’ saved [31344016/31344016]

Archive:  text8.zip
replace text8? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [18]:
pprint(' '.join(word for word in corpus[:100]))

('anarchism originated as a term of abuse first used against early working '
 'class radicals including the diggers of the english revolution and the sans '
 'culottes of the french revolution whilst the term is still used in a '
 'pejorative way to describe any act that used violent means to destroy the '
 'organization of society it has also been taken up as a positive label by '
 'self defined anarchists the word anarchism is derived from the greek without '
 'archons ruler chief king anarchism as a political philosophy is the belief '
 'that rulers are unnecessary and should be abolished although there are '
 'differing')


In [0]:
VOCABULARY_SIZE = 10000
UNK = '<UNK>'

def create_dataset(corpus, vocab_size=VOCABULARY_SIZE, unk_token=UNK):
    dataset = []
    counter_dict = collections.Counter(corpus)
    vocab = counter_dict.most_common(VOCABULARY_SIZE)
    words = [x[0] for x in vocab]
    words.append(unk_token)
    min_allowed_freq = vocab[-1][1]
    #use only high-frequency words
    #change all other words to UNK
    for _, word in enumerate(corpus):
        if counter_dict[word] > min_allowed_freq:
            dataset.append(word)
        else:
            dataset.append(unk_token)
        
    word2idx = {word: idx for (idx, word) in enumerate(words)}
    idx2word = {idx: word for (idx, word) in enumerate(words)}
    return dataset, word2idx, idx2word, len(words)

In [0]:
data, word2idx, idx2word, vocab_size = create_dataset(corpus)

In [0]:
class Batcher(object):
    def __init__(self,dataset, window_size, batch_size, word2idx, idx2word):
        self.dataset = dataset
        self.window_size = window_size
        self.batch_size = batch_size
        self.word2idx = word2idx
        self.idx2word = idx2word

    def __iter__(self):
        return self
    
    def __next__(self):
        global index
        batch = []
        labels = []
        dataset = self.dataset 
        window_size = self.window_size
        batch_size = self.batch_size
        word2idx = self.word2idx
        idx2word = self.idx2word
        
        for _ in range(batch_size):
            # create a batch only if have at least
            #n words to the left and n words to the right
            #where n is window size
            if (index - window_size < 0) or (index + window_size > len(dataset)-1):
                #we need to increment index through epochs of learning
                index = (index + 1) % len(dataset)
            #now create context and batch
            else:
                #add word 
                batch.append(word2idx[dataset[index]])
                
                labels_batch = []
                sliding_window = dataset[index-window_size:index] + dataset[index+1: index+window_size+1]
                for word in sliding_window:
                    labels_batch.append(word2idx[word])
                labels.append(labels_batch)
                #again update index
                index = (index + 1) % len(dataset)
        
        return (batch, labels)

In [0]:
def onehot(batch, batch_size, voc_size):
    if batch_size > 1:
        batch_onehot = torch.zeros((batch_size, voc_size))
        for x, y in enumerate(batch):
            for i in y:
                i = int(i)
                batch_onehot[x, i] = batch_onehot[x, i] + 1
    else:
        batch_onehot = torch.zeros((voc_size))
        for x in batch:
            x = int(x)
            batch_onehot[x] = 1
    return batch_onehot

In [0]:
class Word2Vec(nn.Module):
    
    def __init__(self, input_size, hidden_size):
        super(Word2Vec, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.linear_1 = nn.Linear(input_size, hidden_size)
        self.linear_2 = nn.Linear(hidden_size, input_size)

    def forward(self, input):
        output = self.linear_1(input)
        output = F.log_softmax(self.linear_2(output), dim=1)
        return output

In [28]:
import time
part_data = data[:300000]
input_size = vocab_size 
batch_size = 1000
hidden_size = 20
window_size = 2
print_every = 100

USE_GPU = True

dtype = torch.float32 

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    
print('using device:', device)

model = Word2Vec(input_size=vocab_size, hidden_size=hidden_size)
model = model.to(device=device)
index = 0
batcher = Batcher(dataset=part_data, window_size=window_size, batch_size=batch_size, word2idx=word2idx, idx2word=idx2word)
build_batch = iter(batcher)

loss_function = nn.NLLLoss()

losses = []
it_per_ep = len(part_data) // batch_size

using device: cpu


In [36]:
start_time = time.clock()
for e in range(30000):
    batch, label = next(build_batch)
    one_hot_vector = onehot(batch=batch, batch_size=batch_size, voc_size=vocab_size)
    model.train() 
    x = onehot.to(device=device, dtype=dtype) / window_size 
    scores = model(x).to(device=device, dtype=dtype)
    loss = loss_function(scores, torch.tensor(label, device=device, dtype=torch.long))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if e % it_per_ep == 0:
        print('Iteration %d, loss = %.4lg' % (e, sum(losses[-it_per_ep:])))
        print('Time %lg' % (time.clock() - start_time))
        print()

TypeError: ignored

In [40]:
for x,y in enumerate(batch):
    print(x,y)
    break

0 406
