In [None]:
import os
import json
import nltk
import math
import random
import gensim
import pickle
import collections
import numpy as np
import urllib.request
import pandas as pd
import tensorflow as tf 

%matplotlib inline
import matplotlib.pyplot as plt

from IPython import display
from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform

# word2vec: skip gram & cbow

The __CBOW (Continuous Bag of Words)__ and __Skip-gram__ models were introduced back in 2013,
*article*:
[*Tomas Mikolov et al.*](https://arxiv.org/pdf/1301.3781v3.pdf)

* The __CBOW__ model predicts a missing word (the focus word) from its context (the surrounding words).
* The __skip gram__ model is the reverse of _CBOW_: it predicts the context from the word in focus.

* **Context** is a fixed number of words to the left and right of the word in focus (see picture below). The length of the context is defined by the "window" parameter.

![context](pics/context.png)

Comparison of the two models

![architecture](pics/architecture.png)


### Skip_gram

Consider a corpus with a sequence of words $ w_1, w_2, .., w_T $.

The objective function (which we would like to maximize) for _skip gram_ is defined as follows:


$$ AverageLogProbability = \frac{1}{T} \sum_{t=1}^{T} \sum_{-c \leqslant j\leqslant c, j \neq 0} log\ p (w_{t+j} | w_t) $$

* where $ c $ is a context length.
* $w_t$ -- focus word

The basic formulation for probability $ p (w_{t+j} | w_t) $ is calculated using __Softmax__ -

$$ p (w_h | w_i) = \frac{\exp(s(v_i, v_h))}{ \sum^{W}_{w=1} \exp(s(v_i, v_w)) } $$

where
* $w_i$ -- input focus word
* $w_h$ -- hypothesized context word for the given focus word $w_i$
* $v_i$ and $v_h$ are the vector representations of the input word $w_i$ and the hypothesis word $w_h$
* $s(v_i, v_h) = v^{T} _{h} \cdot v_{i}$
* $W$ is the number of words in vocabulary

___

### CBOW

Predict word using context.

$$ E = -log\ p(w_h\ |\ w_{1},\ w_{2},\ \dots,\ w_{c}) $$


The **probability** is the same as in the *skip gram* model, but now $v_i$ is a sum of context-word vectors.

$$ p(w_h\ |\ w_{1},\ w_{2},\ \dots,\ w_{c})  = \frac{\exp(s(v_i, v_h))}{\sum^{W}_{w=1} \exp(s(v_i, v_w))} $$


* $\ w_{1},\ w_{2},\ \dots,\ w_{c}$ -- input context words
* $w_h$ -- hypothesized focus word for the given context words
* $ v_i = \sum^{c}_{k=1} v_{w_{k}}$ -- the sum of the context-word vectors
* $ v_h$ = vector of hypothesis word
* $s(v_i, v_h) = v^{T} _{h} \cdot v_{i}$
* $W$ is the number of words in vocabulary

___

Let's implement __`CBOW`__ using the TensorFlow framework.

___

### Exercise 2: preparing the data, building a model

In this exercise you prepare the data for the word2vec neural network by tokenizing it, building a dictionary and encoding the tokens with their corresponding identifiers. 

You may want to use the functions you have written during Seminar 2

In [None]:
url = 'https://www.dropbox.com/s/klednu69tgfap8n/train.csv?dl=1'

def maybe_download(dl_url, filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        dl_url: URL to fetch when `filename` does not exist locally.
        filename: Local path of the file.
        expected_bytes: Exact size in bytes that the local file must have.

    Returns:
        `filename`, once its size has been verified.

    Raises:
        Exception: If the size of the local file differs from `expected_bytes`.
    """
    if not os.path.exists(filename):
        # The context manager closes the connection even when the read fails.
        with urllib.request.urlopen(dl_url) as response:
            payload = response.read()
        # Only create the local file once the whole payload has arrived, so a
        # failed transfer does not leave an empty file behind.
        with open(filename, "wb") as f:
            f.write(payload)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        # Show the actual size to make the mismatch easier to diagnose.
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


In [None]:
# Download the dataset (skipped when train.csv already exists locally) and
# check that the local file is exactly 101536537 bytes long.
filename = maybe_download(url, 'train.csv', 101536537) 

### 2.1

Read the texts from the CSV file and convert them into a single flat list of tokens.

In [None]:
# Load the file whose size was verified by maybe_download instead of
# repeating the hard-coded path.
csv = pd.read_csv(filename)

In [None]:
# Keep only the 'message' column; the exercise cells below work on these texts.
all_texts = csv['message']

In [None]:
# your code goes here
# (suggestion) merge the messages in all_texts into one text for the tokenizer cell below
all_texts_joint = 

In [None]:
# your code goes here
# use any tokenizer you deem necessary
# the checks in the next cell expect `tokens` to be a list whose items are str
tokens = 

In [None]:
# Sanity checks for exercise 2.1: `tokens` must be a non-empty list of strings.
assert isinstance(tokens, list), 'tokens must be a list, got %s' % type(tokens).__name__
# Checked explicitly so that an empty list fails with a clear message
# rather than an IndexError on tokens[0].
assert len(tokens) > 0, 'tokens is empty'
assert isinstance(tokens[0], str), 'tokens must contain str, got %s' % type(tokens[0]).__name__
print('done')

### 2.2

Build a dictionary {token: token_id} of 50000 most frequent tokens.

After that, make an inverse dictionary {token_id: token}

In [None]:
# Number of entries in the dictionary; the model cell also uses it as the
# number of rows of the embedding matrix.
vocabulary_size = 50000

In [None]:
def build_vocabulary(tokens, max_size=20000):
    """
    Builds a {token: token_id} vocabulary of at most max_size entries from the supplied list of tokens.

    Exercise stub: the body is left for the reader to write.

    Requirements imposed by the cells further down in this notebook:
      * the result must contain the key 'UNKN' (its id is used for out-of-vocabulary tokens);
      * when called with vocabulary_size the result must have exactly that many entries;
      * ids must be the integers 0 .. max_size - 1, because create_vec_file looks up
        reverse_dictionary[n] for every n in range(vocab_size).
    The validation-word comment in the model cell additionally assumes that low ids
    belong to the most frequent tokens.
    """
    # your code goes here
    return vocabulary

In [None]:
# {token: token_id} mapping limited to vocabulary_size entries.
dictionary = build_vocabulary(tokens, vocabulary_size)

In [None]:
# your code goes here
# inverse mapping {token_id: token} of `dictionary`
reverse_dictionary = 

In [None]:
# Both mappings must have exactly vocabulary_size entries and be exact inverses
# of each other. The size is compared against vocabulary_size rather than a
# repeated literal, so the check follows the configuration cell.
assert len(dictionary) == len(reverse_dictionary) == vocabulary_size
assert sorted(dictionary.keys()) == sorted(reverse_dictionary.values())
assert sorted(reverse_dictionary.keys()) == sorted(dictionary.values())
print('done')

### 2.3

Encode the tokens into a list of their identifiers from your 'dictionary'. Replace Out Of Vocabulary (OOV) tokens with the identifier of the 'UNKN' token.

In [None]:
def encode(tokens, token_to_id):
    """
    Converts a list of tokens into the list of their ids taken from token_to_id.

    Exercise stub: the body is left for the reader to write.

    The check cell below expects one id per input token, in the same order, and
    accepts token_to_id['UNKN'] in place of any token.
    """
    # your code goes here
    return encoded_tokens

In [None]:
# Token ids of the whole corpus; generate_batch reads this module-level list.
data = encode(tokens, dictionary)

In [None]:
# Round trip for the first five tokens: ids first, then the ids mapped back to tokens.
print('Sample encoded data', data[:5])
print('Sample decoded data', list(map(reverse_dictionary.__getitem__, data[:5])))

In [None]:
# zip() stops at the shorter sequence, so a length mismatch would otherwise
# go unnoticed.
assert len(data) == len(tokens), 'encode must return one id per token'
unknown_id = dictionary['UNKN']  # looked up once instead of on every iteration
for t, tid in zip(tokens, data):
    # Every id must decode back to its token unless it is the OOV id.
    assert ((reverse_dictionary[tid] == t) or (tid == unknown_id))
print('done')

### 2.4

You are done with data preparation, now train the word2vec model. You don't need to change anything in this section.

Read the comments to get a better understanding of what is happening

In [None]:
data_index = 0

def generate_batch(batch_size, num_skips, skip_window):
    """Build one batch of (center word, context word) id pairs from `data`.

    Reads the module-level token-id sequence `data` starting at the
    module-level cursor `data_index` (wrapping around at the end of the
    sequence) and moves the cursor, so consecutive calls walk through the corpus.

    Args:
        batch_size: number of pairs to produce; must be a multiple of num_skips.
        num_skips: number of distinct context words sampled for each center
            word; must not exceed 2 * skip_window.
        skip_window: number of words on each side of the center word that
            count as its context.

    Returns:
        (batch, labels): int32 arrays of shape (batch_size,) and
        (batch_size, 1). batch[k] is the id of a center word and labels[k, 0]
        is the id of one word from its window.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1 # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Every window position except the center one is a candidate context word.
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size // num_skips):
        # Sample num_skips distinct context positions without replacement.
        chosen_positions = random.sample(context_positions, num_skips)
        for j, pos in enumerate(chosen_positions):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[pos]
        # Slide the window one word to the right.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # The window has already moved one step past the last center word used.
    # Rewind the cursor to the start of that window so the next call continues
    # with the very next word as center instead of skipping `span` words.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

# Show the first eight tokens of the corpus, then one tiny batch built from them.
print('data:', [reverse_dictionary[token_id] for token_id in data[:8]])

for num_skips, skip_window in [(2, 1)]:
    # Start from the beginning of the corpus so the batch lines up with the tokens above.
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    print('    batch:', [reverse_dictionary[center_id] for center_id in batch])
    print('    labels:', [reverse_dictionary[context_id] for context_id in labels.reshape(8)])

### Tensorflow word2vec computational graph

In [None]:
# Hyper-parameters of the model and of the periodic nearest-neighbour check.
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID. These are the
# most frequent words only if the dictionary assigns ids in decreasing
# frequency order -- this depends on build_vocabulary.
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
# valid_size distinct ids drawn from range(valid_window).
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

    # Input data.
    # train_dataset / train_labels are fed with the two arrays returned by
    # generate_batch; valid_dataset holds the fixed validation word ids.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
    # Variables.
    # One embedding_size-dimensional row per vocabulary entry, both for the
    # input embeddings and for the output (softmax) weights.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    softmax_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)
    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                                   labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities 
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
    # Compute the similarity between the validation words and all embeddings.
    # We use the cosine similarity: every embedding row is divided by its L2
    # norm, so the dot products computed by matmul are cosines. Row i of
    # `similarity` compares validation word i with every vocabulary entry.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

### Training loop

In [None]:
num_steps = 100001
report_every = 2000  # steps between two loss reports / nearest-neighbour printouts
top_k = 8 # number of nearest neighbors
lh = []  # loss history: one entry per report

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
        _, l = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += l
        if step % report_every == 0:
            # At step 0 only one batch has been accumulated, so there is nothing to average.
            if step > 0:
                average_loss = average_loss / report_every
            display.clear_output(wait=True)
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            lh.append(average_loss)
            average_loss = 0

            plt.figure(figsize=(14, 10))

            plt.title("Training loss, step {}/{}".format(step, num_steps))
            plt.xlabel("#step")
            plt.ylabel("loss")
            # lh[n] was recorded at step n * report_every. Plot against the real
            # step number so that the axis matches its "#step" label.
            plt.plot([n * report_every for n in range(len(lh))], lh, 'b')
            plt.show()

            # Print the current nearest neighbours of every validation word.
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by decreasing similarity; position 0 is skipped because
                # it is the validation word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k+1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    # Unit-length embeddings, extracted while the session is still open.
    final_embeddings = normalized_embeddings.eval()

### Save the word vectors and dictionaries for later use

In [None]:
def create_vec_file(final_emb_mtx, vocab_size, vec_size, filename, id_to_token=None):
    """Write embeddings to a text file, one token per line.

    The first line is "<vocab_size> <vec_size>"; each following line holds a
    token followed by the components of its vector, separated by single spaces.

    Args:
        final_emb_mtx: indexable by token id; final_emb_mtx[n] is an iterable
            with the vector components of token n.
        vocab_size: number of rows to write (ids 0 .. vocab_size - 1).
        vec_size: vector dimensionality written to the header line.
        filename: path of the output file (overwritten if it exists).
        id_to_token: optional {token_id: token} mapping; defaults to the
            module-level `reverse_dictionary`.
    """
    if id_to_token is None:
        id_to_token = reverse_dictionary
    # Explicit UTF-8: the locale-default encoding cannot represent every token
    # on all platforms, which would make the write fail on non-ASCII tokens.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(str(vocab_size)+' '+str(vec_size) + '\n')
        for n in range(vocab_size):
            s = ' '.join([id_to_token[n]] + [str(num) for num in final_emb_mtx[n]])
            f.write(s + '\n')

In [None]:
# Store the embeddings as text and both dictionaries as a pickle.
create_vec_file(final_embeddings, vocab_size=vocabulary_size, vec_size=embedding_size, filename='simple_cbow.w2v')
# The context manager flushes and closes the file as soon as the dump is done.
with open('dict_rdict.pkl', "wb") as pkl_file:
    pickle.dump([dictionary, reverse_dictionary], pkl_file)

### t-SNE projection of word vectors into a 2-dimensional space

In [None]:
# topical_words.json is indexed below as {topic name: iterable of words}.
with open('topical_words.json') as topics_file:  # closed as soon as it is parsed
    dd = json.load(topics_file)
# Plot color per topic. Kept under its own name so that `colors` below is only
# ever the per-word list.
topic_colors = {'colors':'r', 'vegetables':'g', 'numbers':'b', 'professions': 'GoldenRod'}
# Collect the embedding, topic and color of every topical word in the vocabulary.
wd = {}
for k in dd:
    for w in dd[k]:
        if w in dictionary:
            wd[w] = {'emb': final_embeddings[dictionary[w]], 'label': k, 'color': topic_colors[k]}

M = np.array([wd[k]['emb'] for k in wd])
labels = [wd[k]['label'] for k in wd]
colors = [wd[k]['color'] for k in wd]

# Square matrix of pairwise cosine distances between the selected embeddings.
P = squareform(pdist(M, metric='cosine'))
# init='random' is stated explicitly: a precomputed distance matrix cannot be
# combined with PCA initialisation, which is the default in recent scikit-learn.
tsne2 = TSNE(n_components=2, random_state=34, metric='precomputed', init='random', n_iter=9001)
Y = tsne2.fit_transform(P)

In [None]:
plt.figure(figsize=(20,20))
# Row i of Y belongs to the i-th key of wd, the order used to build the
# embedding matrix.
topics_in_legend = set()
for i, k in enumerate(wd):
    topic = wd[k]['label']
    # Only the first point of each topic carries a visible label, so the legend
    # gets one entry per topic; labels starting with '_' are ignored by it.
    point_label = '_nolegend_' if topic in topics_in_legend else topic
    topics_in_legend.add(topic)
    plt.scatter([Y[i][0]], [Y[i][1]], color = wd[k]['color'], label=point_label)
    plt.annotate(k, xy=(Y[i][0], Y[i][1]), xytext=(0, 0), textcoords='offset points', 
                 color=wd[k]['color'], fontsize=12)

# Without a legend the per-point labels were never shown.
plt.legend(fontsize=12)
plt.title('T-SNE on word2vec representations, 4 topics', fontsize=16)
plt.show()