### 1. Import all necessary libraries

In [1]:
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.models import Sequential
from keras.layers.core import Dense, Lambda
from keras.layers.embeddings import Embedding
import keras.backend as K

Using TensorFlow backend.


In [None]:

# Hyper-parameters read by the model-building cell.
# vocab_size: number of distinct word IDs; embed_size: width of each word vector.
vocab_size, embed_size = 5000, 300

# window_size (int): largest allowed distance between the two words of a
# positive pair. The embedding layer is fed window_size*2 word IDs per row.
window_size = 1

### 2. Network architecture

In [2]:
# Network overview: rows of context word IDs go in, one probability per
# vocabulary entry comes out. The layers are created as named objects first
# and then stacked in order.


def _average_context(embedded):
    """Return the mean of the embedded context vectors, taken along axis 1."""
    return K.mean(embedded, axis=1)


# Step 1/2 - the inputs are context word IDs, all looked up in one shared
# embedding table initialised with small random (glorot_uniform) weights.
# Every ID becomes a vector of length embed_size, so each input row turns
# into a (2*window_size, embed_size) matrix.
context_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    embeddings_initializer='glorot_uniform',
    input_length=2 * window_size,
)

# Step 3 - collapse that matrix to a single embed_size vector by averaging
# the context embeddings.
context_average = Lambda(_average_context, output_shape=(embed_size,))

# Step 4/5 - project the averaged vector onto vocab_size outputs. The softmax
# activation normalises them into a probability distribution over the
# vocabulary; the ID with the highest probability is the predicted target word.
target_distribution = Dense(
    vocab_size,
    kernel_initializer='glorot_uniform',
    activation='softmax',
)

model = Sequential([context_embedding, context_average, target_distribution])
model.compile(optimizer="adadelta", loss='categorical_crossentropy')

# Embedding matrix: first weight array of the first layer. No fit call
# precedes this line in the cell, so these are the initial values.
weights = model.layers[0].get_weights()[0]