###### Model definitions

In [8]:
# Base class

class Word2Vec:
    """Shared helpers for the word2vec model variants defined in this file.

    ``train`` relies on the subclass having set ``self.input``,
    ``self.labels``, ``self.loss`` and ``self.train_op``.
    """

    def get_variable(self, shape, name, init='normal'):
        """Create (or fetch) a randomly initialised TensorFlow variable.

        Args:
            shape: shape of the variable.
            name: name passed to ``tf.get_variable``.
            init: ``'uniform'`` draws from U(-1, 1); any other value draws
                from a truncated normal with standard deviation ``STDDEV``.

        Returns:
            The variable returned by ``tf.get_variable``.
        """
        if init == 'uniform':
            initializer = tf.random_uniform(shape=shape, minval=-1, maxval=1)
        else:
            initializer = tf.truncated_normal(shape=shape, stddev=STDDEV)
        return tf.get_variable(name, initializer=initializer)

    def fc(self, x, dim_in, dim_out, name):
        """Build a fully connected layer ``x @ W + b``.

        Args:
            x: input tensor whose last dimension is ``dim_in``.
            dim_in: number of input features.
            dim_out: number of output features.
            name: prefix for the variable names (``<name>_W``, ``<name>_b``).

        Returns:
            Tuple ``(output, W, b)``.
        """
        W = self.get_variable(name=name+"_W", shape=[dim_in, dim_out])
        b = self.get_variable(name=name+"_b", shape=[dim_out])
        return tf.matmul(x, W) + b, W, b

    def _placeholder_feed(self, batch_x, batch_y):
        """Return a feed dict covering only the inputs that are placeholders.

        Models whose ``input``/``labels`` come from a dataset iterator need
        no feeding, so those tensors are skipped.
        """
        feed = {}
        for tensor, value in ((self.input, batch_x), (self.labels, batch_y)):
            if tensor.op.type == "Placeholder":
                feed[tensor] = value
        return feed

    def train(self, session, train_x, train_y):
        """Run one pass over the batches and return the mean loss per entry.

        Args:
            session: TensorFlow session used to run the training op.
            train_x: indexable collection of input batches.
            train_y: indexable collection of label batches, aligned with
                ``train_x``.

        Returns:
            Sum of all loss values divided by the number of loss entries,
            or 0 when no entries were produced.
        """
        total_loss = 0.0
        total_entries = 0
        batches_count = len(train_x)
        for i in trange(batches_count):
            batch_x, batch_y = train_x[i], train_y[i]
            # Placeholder-based models fail without a feed; iterator-based
            # models get an empty feed dict and run exactly as before.
            feed_dict = self._placeholder_feed(batch_x, batch_y)
            _, loss = session.run([self.train_op, self.loss], feed_dict=feed_dict)
            # A scalar loss has no len(); treat it as a single entry.
            loss = np.atleast_1d(loss)
            total_loss += np.sum(loss)
            total_entries += len(loss)
        if total_entries == 0:
            return 0
        return total_loss / total_entries

## Vanilla SkipGram

- Input size equal to size of the vocabulary
- Single hidden layer
- Output layer with softmax of size of the vocabulary
- Loss function - cross entropy
- Gradient Descent
- Embedding - first layer - `W + b` (with broadcasting)


In [9]:
class SkipGram(Word2Vec):
    """Plain skip-gram model: two fully connected layers with softmax loss.

    Both ``input`` and ``labels`` are float placeholders with
    ``vocab_size`` columns. The first layer's weights and bias are kept
    on the instance and used as the embedding.
    """

    def __init__(self, vocab_size, embedding_size):
        """Build the graph: placeholders, both layers, loss and train op."""
        vocab_shape = [None, vocab_size]
        self.input = tf.placeholder(tf.float32, shape=vocab_shape, name="input")
        self.labels = tf.placeholder(tf.float32, shape=vocab_shape, name="labels")

        # First layer: vocabulary -> embedding space. Its parameters are
        # stored because get_embedding() reads them.
        hidden, first_W, first_b = self.fc(
            self.input, vocab_size, embedding_size, name="layer_1")
        self.hidden_layer = hidden

        # Second layer: embedding space -> vocabulary logits. Only the
        # output tensor is needed.
        self.logits = self.fc(
            self.hidden_layer, embedding_size, vocab_size, name="layer_2")[0]

        self.W = first_W
        self.b = first_b

        # Cross-entropy loss minimised with plain gradient descent.
        self.loss = tf.nn.softmax_cross_entropy_with_logits(
            labels=self.labels, logits=self.logits)
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
        self.train_op = optimizer.minimize(self.loss)

    def get_embedding(self, session):
        """Evaluate and return ``W + b`` of the first layer."""
        embedding = self.W + self.b
        return session.run(embedding)
        

## SkipGram with NegativeSampling

- Input accepts values of `word2int` - dimension `[BATCH_SIZE]`
- One hidden layer, but used as embedding lookup (no bias)
- Output layer with sampled softmax; Computes softmax based on one positive and several negative samples
- Adam
- Embedding - first layer - `W`

In [10]:
class SkipGramNegativeSampling(Word2Vec):
    """Skip-gram model trained with a sampled softmax loss.

    The first layer is an embedding lookup (no bias); the output layer has
    its own weights and bias and is evaluated on sampled classes only.
    """

    def __init__(self, vocab_size, embedding_size):
        """Build the graph.

        Args:
            vocab_size: number of classes / rows in the embedding matrix.
            embedding_size: dimensionality of each embedding vector.
        """
        # Inputs and labels are read from the module-level `iterator`
        # instead of placeholders, so training needs no feed dict.
        self.input, self.labels = iterator.get_next()
        self.vocab_size = vocab_size

        self.embedding = self.get_variable(name="layer_1_W", shape=[vocab_size, embedding_size], init="uniform")
        inputs = tf.nn.embedding_lookup(self.embedding, self.input)

        W = self.get_variable(name="layer_2_W", shape=[vocab_size, embedding_size])
        b = self.get_variable(name="layer_2_b", shape=[vocab_size])

        self.negative_sampling(W, b, inputs)

    def negative_sampling(self, W, b, inputs):
        """Define ``self.loss`` and ``self.train_op`` for the output layer.

        Args:
            W: output-layer weights, shape ``[vocab_size, embedding_size]``.
            b: output-layer biases, shape ``[vocab_size]``.
            inputs: embedded input vectors.
        """
        # Use the dedicated output weights `W`. Passing `self.embedding`
        # here ties input and output vectors and leaves `W` untrained.
        # `tf.nn.nce_loss` accepts the same arguments if NCE is preferred.
        self.loss = tf.nn.sampled_softmax_loss(
            weights=W,
            biases=b,
            labels=self.labels,
            inputs=inputs,
            num_sampled=NUM_SAMPLED,
            num_classes=self.vocab_size)
        # Adam minimises the mean of the per-example losses.
        cost = tf.reduce_mean(self.loss)
        self.train_op = tf.train.AdamOptimizer().minimize(cost)

    def get_embedding(self, session):
        """Evaluate and return the embedding matrix."""
        return session.run(self.embedding)
    

In [11]:
class HuffmannNode:
    """Inner node of the Huffman tree, holding one trainable vector.

    ``left`` and ``right`` are not set by the constructor; they exist only
    after ``set_left`` / ``set_right`` have been called.
    """

    def __init__(self, dimension):
        """Create the node's vector of length ``dimension``, drawn from U(-1, 1)."""
        initial_value = tf.random_uniform(shape=[dimension], minval=-1, maxval=1)
        self.representation = tf.Variable(initial_value)

    def set_left(self, node):
        """Attach ``node`` as the left child."""
        self.left = node

    def set_right(self, node):
        """Attach ``node`` as the right child."""
        self.right = node

In [12]:
class SkipGramHierarchicalSoftmax(Word2Vec):
    """Skip-gram model whose output layer is a hierarchical softmax.

    Words are the leaves of a Huffman tree. The probability of a word is
    the product of sigmoid decisions taken at each inner node on the path
    from the root to that word's leaf.
    """

    def __init__(self, vocab_size, embedding_size, word_infrequency, word2int):
        """Build the graph.

        Args:
            vocab_size: number of words in the vocabulary.
            embedding_size: dimensionality of embeddings and node vectors.
            word_infrequency: counter-like object (must provide
                ``most_common``) keyed by word; it is not modified.
            word2int: mapping from word to its integer index.

        Raises:
            ValueError: if some vocabulary index is never reached while
                traversing the tree.
        """
        self.input  = tf.placeholder(tf.int64, shape=[None], name="input")
        self.labels = tf.placeholder(tf.int64, shape=[None, 1], name="labels")
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.word2int = word2int

        # One probability tensor per word, filled in by the tree traversal.
        self.probabilities = [None] * vocab_size

        self.embedding = self.get_variable(name="layer_1_W", shape=[vocab_size, embedding_size], init="uniform")
        x = tf.nn.embedding_lookup(self.embedding, self.input)

        self.root = self.build_huffmann_tree(word_infrequency)
        self.i = 0
        # Start with probability 1 for every example in the batch.
        self.traverse_huffman_tree(self.root, x, tf.ones_like(x[:, 0]))

        missing = [index for index, p in enumerate(self.probabilities) if p is None]
        if missing:
            raise ValueError(
                "no Huffman leaf for vocabulary indices: %s" % missing[:10])

        # Shape [batch, vocab_size]: row = example, column = word index.
        self.final_probabilities = tf.stack(self.probabilities, axis=1)

        # Select, for every example, the probability of its own label.
        label_mask = tf.one_hot(
            tf.reshape(self.labels, [-1]),
            depth=vocab_size,
            dtype=self.final_probabilities.dtype)
        self.labels_probabilities = tf.expand_dims(
            tf.reduce_sum(self.final_probabilities * label_mask, axis=1), axis=1)

        # The small constant keeps log() finite when a probability underflows.
        self.loss = -tf.log(self.labels_probabilities + 1e-10)
        self.train_op = tf.train.AdamOptimizer().minimize(self.loss)

    def build_huffmann_tree(self, word_infrequency):
        """Merge the two top-ranked entries repeatedly until one root is left.

        Works on a copy so the caller's counter is left untouched.

        Returns:
            The root: a ``HuffmannNode``, or the single word itself when
            the counter holds only one entry.
        """
        remaining = word_infrequency.copy()
        while len(remaining) > 1:
            # The counter stores "infrequency", so most_common yields the
            # entries to merge first (presumably the least frequent words).
            (first, first_weight), (second, second_weight) = remaining.most_common(2)

            node = HuffmannNode(dimension=self.embedding_size)
            node.set_left(first)
            node.set_right(second)

            remaining.pop(first)
            remaining.pop(second)
            remaining[node] = first_weight + second_weight

        root = remaining.most_common(1)[0][0]
        return root

    def traverse_huffman_tree(self, node, v_wi, probability):
        """Record the path probability of every leaf below ``node``.

        Args:
            node: a ``HuffmannNode`` or a word (``str``) leaf.
            v_wi: embedded inputs, one row per example.
            probability: probability accumulated on the path so far.
        """
        if isinstance(node, str):
            self.i += 1
            self.probabilities[self.word2int[node]] = probability
            if self.i % 100 == 0:
                print(self.i, "/", self.vocab_size)
            return None

        # Reduce over the embedding axis only, so each example keeps its
        # own score instead of sharing one summed over the whole batch.
        score = tf.reduce_sum(v_wi * node.representation, axis=-1)
        self.traverse_huffman_tree(node.left, v_wi,
                                   probability * tf.sigmoid(-score))
        self.traverse_huffman_tree(node.right, v_wi,
                                   probability * tf.sigmoid(score))

    def get_embedding(self, session):
        """Evaluate and return the embedding matrix."""
        return session.run(self.embedding)
