## Implement word2vec (COBW + Negative Sampling) via Keras

### Use preprocessing
- Keras preprocessing packages, texts, sequences, and images provide good tools to convert data into convenient formats for NN
- texts package focus more on converting text data (strings) to sequences
- sequences package usually deals with sequence of indices (hashes)

### Use functional API to access layer weights

In [1]:
import keras
keras.__version__

Using TensorFlow backend.


'1.0.4'

In [96]:
from keras.preprocessing import text, sequence
from keras import models, layers
import keras.backend as K
import numpy as np
from collections import Counter

### Read text8 and build voculary table

In [68]:
vocab_size = 10000

texts = open("../../data/text8").read()
words = text.text_to_word_sequence(texts)
wc = Counter(words).most_common(vocab_size)


word2ind = dict([(w, i) for i, (w, c) in enumerate(wc, 1)])
word2ind["UNKNOWN"] = 0
ind2word = dict([(i, w) for i, (w, c) in enumerate(wc, 1)])
ind2word[0] = "UNKNOWN"

sequences = np.array(map(lambda w: word2ind.get(w, 0), words))
unknown_count = np.sum(sequences==0)

## unigram sampling table
sampling_table = np.array([unknown_count]+
                          [c for (w, c) in wc]).astype(np.float32)
sampling_table /= sampling_table.sum()
## downsampling
sampling_table = np.power(sampling_table, 3./4)

### generate negative sampling

In [88]:
%%time 
data = sequence.skipgrams(sequences, vocab_size, 
                   window_size=5, sampling_table=sampling_table)
X, y = data
words, contexts = zip(*X)
words, contexts = np.array(words), np.array(contexts)
y = np.asarray(y)
print words.shape, contexts.shape, y.shape

(7001606,) (7001606,) (7001606,)
CPU times: user 1min, sys: 1.71 s, total: 1min 2s
Wall time: 1min


### build embedding model via functional API

In [224]:
word_dim = 50

word_seq = layers.Input(shape = (1, ), dtype = "int32")
word_embed_layer = layers.Embedding(vocab_size, word_dim, input_length=1)
word_vec = word_embed_layer(word_seq)
word_vec = layers.Flatten()(word_vec)


context_seq = layers.Input(shape = (1, ), dtype = "int32")
context_embed_layer = layers.Embedding(vocab_size, word_dim, input_length=1)
context_vec = context_embed_layer(context_seq)
context_vec = layers.Flatten()(context_vec)


dist = layers.Lambda(lambda (x,y): K.batch_dot(x,y, axes=1) )([word_vec, context_vec])
probs = layers.Activation("sigmoid")(dist)

model = models.Model(input = [word_seq, context_seq], output = [probs])
model.compile("rmsprop", "binary_crossentropy")



In [230]:
model.fit([words, contexts], y, batch_size = 1, nb_epoch=1, verbose=0)

KeyboardInterrupt: 