In [12]:
#各个optimizer都差不太多，因此归并到同一个ipynb了

import tensorflow as tf
import math
# Start from an empty default graph (presumably so that re-running this notebook
# cell does not collide with ops/variables left over from an earlier run — confirm).
tf.reset_default_graph()

class Word2Vec:
    """Embedding table plus an NCE training loss, built with the TF 1.x graph API.

    The constructor creates the embedding weights under
    ``<base_scope_name>/variables``. Calling the instance performs an embedding
    lookup; ``get_nce_loss`` adds the NCE output weights/biases and returns the
    mean NCE loss.

    Bookkeeping attributes (plain Python containers, filled as the graph is built):
        trainable_variables: variables created by this model (embedding weights,
            then ``nce_weights`` / ``nce_biases`` once a loss has been requested).
        output_variables: every tensor returned by ``__call__``, in call order.
        losses: every tensor returned by ``get_nce_loss``, in call order.
        scopes: every scope name entered so far, in order (repeats included).
    """

    def __init__(self, vocabulary_size, emb_dim, base_scope_name="word2vec"):
        """Create the embedding layer and eagerly build its weights.

        Args:
            vocabulary_size: number of rows of the embedding table; also used as
                the number of classes for the NCE loss.
            emb_dim: width of each embedding vector.
            base_scope_name: prefix for every variable/name scope this model opens.
        """
        self.trainable_variables = list()
        self.output_variables = list()
        self.losses = list()
        self.scopes = list()
        self.base_scope_name = base_scope_name
        self.vocabulary_size = vocabulary_size
        self.emb_dim = emb_dim

        scope = "{0}/variables".format(self.base_scope_name)
        self.scopes.append(scope)
        with tf.variable_scope(name_or_scope=scope):
            self.scope = tf.get_variable_scope()
            self.emb_layer = tf.keras.layers.Embedding(input_dim=vocabulary_size, output_dim=emb_dim,
                                                       name="emb_layer")
            # Keras layers are lazy: defining one does not create its weights, so
            # build() is called by hand to force construction here. Per the original
            # author's note, on this TF version (v1.12.3) the input_shape passed to
            # Embedding.build() is ignored, hence the placeholder None.
            self.emb_layer.build(input_shape=None)
            self.trainable_variables.extend(self.emb_layer.trainable_variables)

    def __call__(self, input_batch, *args, **kwargs):
        """Look up embeddings for ``input_batch``.

        Args:
            input_batch: tensor of word ids handed to the embedding layer.
            *args, **kwargs: accepted for call-signature compatibility; unused.

        Returns:
            The embedding layer's output tensor (also recorded in
            ``self.output_variables``).
        """
        scope = "{0}/outputs".format(self.base_scope_name)
        self.scopes.append(scope)
        with tf.name_scope(scope):
            emb_output = self.emb_layer(input_batch)
            self.output_variables.append(emb_output)
        return emb_output

    def get_nce_loss(self, input_batch, label_batch, num_sampled, label_true_num=1):
        """Build the mean NCE loss for a batch.

        A fresh embedding lookup of ``input_batch`` is added to the graph on every
        call. The NCE weights/biases are created on the first call and shared by
        later calls.

        Args:
            input_batch: tensor of word ids, forwarded to ``__call__``.
            label_batch: target class ids, passed as ``labels`` to ``tf.nn.nce_loss``.
            num_sampled: number of negative classes sampled per batch.
            label_true_num: number of true classes per example (``num_true``).

        Returns:
            Scalar tensor: the batch mean of ``tf.nn.nce_loss`` (also recorded in
            ``self.losses``).
        """
        emb_output = self(input_batch)
        scope = "{0}/variables".format(self.base_scope_name)
        self.scopes.append(scope)
        # AUTO_REUSE creates the variables on the first call and returns the same
        # ones afterwards; without it a second get_nce_loss() call fails inside
        # tf.get_variable because the names already exist in this scope.
        with tf.variable_scope(name_or_scope=scope, reuse=tf.AUTO_REUSE):
            # NOTE(review): the stddev is scaled by vocabulary_size; the usual
            # word2vec recipe scales by the embedding width instead — confirm this
            # is intended.
            nce_weights = tf.get_variable(name="nce_weights", shape=[self.vocabulary_size, self.emb_dim],
                                          initializer=tf.truncated_normal_initializer(
                                              stddev=1.0 / math.sqrt(self.vocabulary_size)))
            nce_biases = tf.get_variable(name="nce_biases", shape=[self.vocabulary_size, ],
                                         initializer=tf.zeros_initializer(), dtype=tf.float32)
            # Track each variable once, even when the loss is requested repeatedly.
            # Identity comparison avoids relying on how variables implement ==.
            for variable in (nce_weights, nce_biases):
                if not any(variable is known for known in self.trainable_variables):
                    self.trainable_variables.append(variable)

        scope = "{}/losses".format(self.base_scope_name)
        self.scopes.append(scope)
        with tf.name_scope(scope):
            nce_loss = tf.reduce_mean(
                tf.nn.nce_loss(
                    weights=nce_weights, biases=nce_biases, labels=label_batch, inputs=emb_output,
                    num_sampled=num_sampled, num_classes=self.vocabulary_size, num_true=label_true_num))
            self.losses.append(nce_loss)
        return nce_loss

# Experiment: split optimizer.minimize() into compute_gradients() + apply_gradients()
# and fetch both in a single sess.run() to see which variable values come back.

# Tiny model: vocabulary_size=5, emb_dim=2.
word2vec=Word2Vec(5,2)
# Each dataset element is one batch: two word ids (given as floats) per input element.
input_ds_sample=tf.data.Dataset.from_tensor_slices([[1.,3.],[3.,4.]])
# label_ds_sample=tf.data.Dataset.from_tensor_slices([[[1,2],[2,3]]])
# A single label element of shape (2, 1): one true class per input id.
label_ds_sample=tf.data.Dataset.from_tensor_slices([[[1],[2]]])


input_batch_sample=input_ds_sample.make_one_shot_iterator().get_next()
label_batch_sample=label_ds_sample.make_one_shot_iterator().get_next()
# Stand-alone lookup; get_nce_loss() below performs its own lookup of the same
# batch tensor, so this result is not used again in this cell.
output_batch_sample=word2vec(input_batch_sample)
# num_sampled=1 negative class, label_true_num=1 true class per example.
loss=word2vec.get_nce_loss(input_batch_sample,label_batch_sample,1,1)


# Plain SGD with learning rate 1.
optimizer=tf.train.GradientDescentOptimizer(1)

# minimize() taken apart: `gradient` is a list of (gradient, variable) pairs and
# `apply` is the op that performs the update.
gradient=optimizer.compute_gradients(loss)
apply=optimizer.apply_gradients(gradient)
print(tf.trainable_variables())
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Values before the update: index 1 is nce_weights and index -1 is nce_biases
    # (see the printed variable list).
    print(sess.run(tf.trainable_variables()[1]))
    print(sess.run(tf.trainable_variables()[-1]))
    # Fetch the (gradient, variable) pairs and run the update in the SAME run call.
    # NOTE(review): nothing here orders the variable reads relative to the update,
    # so a fetched variable value may be either pre- or post-update — confirm
    # against the tf.Session.run documentation.
    print(sess.run([gradient,apply]))
    # print(sess.run([loss,gradient]))
    # Values after the update, for comparison with the ones printed above.
    print(sess.run(tf.trainable_variables()[1]))
    print(sess.run(tf.trainable_variables()[-1]))

"""
这是我写word2vec的一个例子，我把minimize拆开成为了计算梯度与应用梯度
按理来讲，gradient是一个list，每个元素都是一个tuple，tuple[0]是梯度，tuple[1]是被计算梯度的变量，注意这个变量还没有被梯度下降，也就是梯度下降之前的值
但是当我运行了print(sess.run([gradient,apply]))的时候，会发现最后一个nce_biases变量变成了梯度下降之后的结果
并且，高潮是，结果仍然是正确的，所有的变量都被正确地梯度下降了，nce_biases并没有被梯度下降两次
但如果我只运行print(sess.run([loss,gradient]))，打印出来的结果又都是正常的了，即nce_biases就是一堆0

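我思考了一下为什么会出现这个结果，我给出的猜想是，
sess.run会先触发graph上的计算，当graph上的计算都完成了，才把值!同时地!返回回来，也就是说，打印的gradient是apply运行之后的结果了。
补充：这个猜想还不够准确——如果所有值都是在apply之后才返回，那么nce_weights也应该显示为更新后的值，但实际只有nce_biases变了。
更可能的解释是：gradient里的tuple[1]是变量本身，在同一次sess.run中，读取变量的值与apply之间没有数据依赖或控制依赖，二者的执行顺序是不确定的，所以有的变量读到更新前的值，有的读到更新后的值。
如果想确定地拿到更新前的值，应该先单独sess.run(gradient)，或者用tf.control_dependencies显式约束执行顺序。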
"""

[<tf.Variable 'word2vec/variables/embeddings:0' shape=(5, 2) dtype=float32>, <tf.Variable 'word2vec/variables/nce_weights:0' shape=(5, 2) dtype=float32_ref>, <tf.Variable 'word2vec/variables/nce_biases:0' shape=(5,) dtype=float32_ref>]
[[-0.5325353   0.10459913]
 [ 0.34093955  0.35427415]
 [ 0.11339732 -0.5679613 ]
 [ 0.46859023  0.3219796 ]
 [ 0.1926802  -0.19246615]]
[0. 0. 0. 0. 0.]
[[(IndexedSlicesValue(values=array([[0.10771506, 0.11192796],
       [0.13088524, 0.18375723]], dtype=float32), indices=array([1, 3], dtype=int32), dense_shape=array([5, 2], dtype=int32)), array([[-0.04970081,  0.03095512],
       [ 0.02433792, -0.01457626],
       [ 0.0433158 , -0.00957564],
       [-0.0334233 ,  0.00683755],
       [ 0.03496585,  0.01258318]], dtype=float32)), (IndexedSlicesValue(values=array([[-0.00223987,  0.00134148],
       [ 0.00232732, -0.00047611],
       [-0.00367604, -0.00316339]], dtype=float32), indices=array([1, 2, 1]), dense_shape=array([5, 2], dtype=int32)), array([[-0.53

'\n这是我写word2vec的一个例子，我把minimize拆开成为了计算梯度与应用梯度\n按理来讲，gradient是一个list，每个元素都是一个tuple，tuple[0]是梯度，tuple[1]是被计算梯度的变量，注意这个变量还没有被梯度下降，也就是梯度下降之前的值\n但是当我运行了\n'

<tf.Tensor 'IteratorGetNext:0' shape=(2,) dtype=float32>