## Implementation of Word Embedding 

### References
[1](https://github.com/fchollet/keras/blob/master/examples/mnist_siamese_graph.py)
[2](http://sebastianruder.com/word-embeddings-1/index.html)
[3](http://sebastianruder.com/word-embeddings-softmax/index.html)
[4](https://www.tensorflow.org/versions/r0.7/tutorials/word2vec/index.html)

## load text data, generate positive, negative word-context pairs

In [1]:
import numpy as np

In [2]:
## read text
import re
text8 = open("../../data/text8").read()
words = re.findall(r"\w+", text8)
len(words)

17005207

In [3]:
from collections import Counter
def build_vocab(seq, vocab_size=10000, sample_power = 3./4):
    """return 
    1. ind_seq
    2. word2ind, 
    3. ind2word
    4. unigram_distribution
    """
    vocab = Counter(words).most_common(vocab_size)
    vocab_words = set([w for w,c in vocab])
    ## remove uncommon words essentially enlarge context windows
    filtered_seq = [w for w in seq if w in vocab_words]
    word2ind = dict([w, i] for i, (w,c) in enumerate(vocab, 0))
    ind2word = dict([i, w] for i, (w,c) in enumerate(vocab, 0))
    ind_seq = map(word2ind.get, filtered_seq)
    
    unigram_probs = np.array([c for _, c in vocab]).astype(np.float32)
    unigram_probs *= sample_power
    unigram_probs /= unigram_probs.sum()
    
    
    return ind_seq, word2ind, ind2word, unigram_probs

In [4]:
%time ind_seq, word2ind, ind2word, unigram_probs = build_vocab(words)
print len(ind_seq), len(word2ind), len(unigram_probs)

CPU times: user 7.11 s, sys: 100 ms, total: 7.21 s
Wall time: 7.08 s
15268026 10000 10000


In [5]:
from sklearn.utils import shuffle
def generate_pairs(seq, unigram_probs, n = 100000, window_size = 5):
    iwords = np.random.randint(window_size, 
                              len(seq)-window_size-1, 
                              size = n)
    pospairs = np.array([[seq[i], c] 
               for i in iwords
               for c in seq[i-window_size:i] + seq[i+1:i+window_size+1]
               ])
    negwords = np.random.choice(len(unigram_probs), 
                               size = len(pospairs), 
                               p = unigram_probs)
    negpairs = np.c_[pospairs[:, 0], negwords]
    pairs = np.r_[pospairs, negpairs]
    targets = np.r_[np.ones(len(pospairs)), np.zeros(len(negpairs))]
    pairs, targets = shuffle(pairs, targets)
    return pairs, targets

In [14]:

%%time 
pairs, targets = generate_pairs(ind_seq, unigram_probs, 
                                n = 1000000,
                                window_size=5)

CPU times: user 13 s, sys: 320 ms, total: 13.4 s
Wall time: 13.3 s


In [15]:
#n * window_size * 2 * 2
print pairs.shape, targets.shape, targets.mean()

(20000000, 2) (20000000,) 0.5


## models

In [23]:
from keras import models, layers
import keras.backend as K

In [129]:
word_dim = 100

word_input = layers.Input(shape = (1, 1), dtype="int32")
word_embed_layer = layers.Embedding(len(word2ind)+1, output_dim = word_dim, )
word_vec = layers.Flatten()(word_embed_layer(word_input))

context_input = layers.Input(shape = (1, 1), dtype="int32")
context_embed_layer = layers.Embedding(len(word2ind)+1, output_dim = word_dim, )
context_vec = layers.Flatten()(context_embed_layer(context_input))

In [130]:
def euclidean_distance(vects):
    """x, y are K variables - batch vectors
    """
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))


def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)


def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))

In [131]:
dist_layer = layers.Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)
dist = dist_layer([word_vec, context_vec])

In [132]:
model = models.Model(input = [word_input, context_input], output = dist)
model.compile(loss = contrastive_loss, optimizer = "rmsprop")

In [136]:
model.fit([pairs[:, 0].reshape(-1, 1, 1), pairs[:, 1].reshape(-1, 1, 1)], 
          targets, 
          batch_size = 64, nb_epoch=1, 
         verbose = 0)

KeyboardInterrupt: 

In [147]:
wordvecs = word_embed_layer.get_weights()[0][1:, :]
wordvecs.shape

(10000, 100)

In [176]:
from sklearn.metrics import pairwise_distances_argmin_min, euclidean_distances
i = word2ind["two"]
v = wordvecs[i, :]
dist = euclidean_distances(wordvecs, [v]).ravel()

In [177]:
nn = dist.argsort()[:10]

In [178]:
map(ind2word.get, nn)

['two',
 'religious',
 'limited',
 'members',
 'team',
 'philosophers',
 'find',
 'lesser',
 'peasant',
 'renaissance']