From 92500f0f1451914a7c68efdd71158b8cd03103c1 Mon Sep 17 00:00:00 2001
From: Larry Yan
Date: Wed, 31 Jul 2019 19:59:09 +0800
Subject: [PATCH] fix(encoder): add netvlad and netfv

---
 gnes/encoder/video/incep_mixture.py         |  96 ++++++++
 gnes/encoder/video/mixture_core/__init__.py |   0
 gnes/encoder/video/mixture_core/model.py    | 245 ++++++++++++++++++++
 3 files changed, 341 insertions(+)
 create mode 100644 gnes/encoder/video/incep_mixture.py
 create mode 100644 gnes/encoder/video/mixture_core/__init__.py
 create mode 100644 gnes/encoder/video/mixture_core/model.py

diff --git a/gnes/encoder/video/incep_mixture.py b/gnes/encoder/video/incep_mixture.py
new file mode 100644
index 00000000..052960b3
--- /dev/null
+++ b/gnes/encoder/video/incep_mixture.py
@@ -0,0 +1,96 @@
+# Tencent is pleased to support the open source community by making GNES available.
+#
+# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+
+import numpy as np
+from PIL import Image
+
+from ..base import BaseVideoEncoder
+from ...helper import batching, batch_iterator, get_first_available_gpu
+
+
+class IncepMixtureEncoder(BaseVideoEncoder):
+    """Video encoder: Inception-V4 frame features aggregated into a
+    clip-level representation by a NetVLAD/NetFV mixture model."""
+
+    def __init__(self, model_dir_inception: str,
+                 model_dir_mixture: str,
+                 batch_size: int = 64,
+                 select_layer: str = 'PreLogitsFlatten',
+                 use_cuda: bool = False,
+                 feature_size: int = 300,
+                 vocab_size: int = 28,
+                 cluster_size: int = 256,
+                 method: str = 'netvlad',
+                 input_size: int = 1536,
+                 multitask_method: str = 'Attention',
+                 vocab_size_2: int = None,
+                 inception_size_x: int = 299,
+                 inception_size_y: int = 299,
+                 *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.model_dir_inception = model_dir_inception
+        self.model_dir_mixture = model_dir_mixture
+        self.batch_size = batch_size
+        self.select_layer = select_layer
+        self.use_cuda = use_cuda
+        self.cluster_size = cluster_size
+        self.feature_size = feature_size
+        self.vocab_size = vocab_size
+        # size of the 2nd label vocabulary; must be set to match the mixture
+        # checkpoint since the model is built with use_2nd_label=True
+        self.vocab_size_2 = vocab_size_2
+        self.method = method
+        self.input_size = input_size
+        self.multitask_method = multitask_method
+        # 299x299 is the default input resolution of Inception-V4
+        self.inception_size_x = inception_size_x
+        self.inception_size_y = inception_size_y
+
+    def post_init(self):
+        import tensorflow as tf
+        from ..image.inception_cores.inception_v4 import inception_v4
+        from ..image.inception_cores.inception_utils import inception_arg_scope
+        from .mixture_core.model import NetFV
+        import os
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(get_first_available_gpu())
+
+        # graph 1: Inception-V4 for frame-level features
+        g = tf.Graph()
+        with g.as_default():
+            arg_scope = inception_arg_scope()
+            inception_v4.default_image_size = self.inception_size_x
+            self.inputs = tf.placeholder(tf.float32, (None,
+                                                      self.inception_size_x,
+                                                      self.inception_size_y, 3))
+
+            with tf.contrib.slim.arg_scope(arg_scope):
+                self.logits, self.end_points = inception_v4(self.inputs,
+                                                            is_training=False,
+                                                            dropout_keep_prob=1.0)
+
+            config = tf.ConfigProto(log_device_placement=False)
+            if self.use_cuda:
+                config.gpu_options.allow_growth = True
+            self.sess = tf.Session(config=config)
+            self.saver = tf.train.Saver()
+            self.saver.restore(self.sess, self.model_dir_inception)
+
+        # graph 2: NetVLAD/NetFV mixture model for clip-level aggregation
+        g2 = tf.Graph()
+        with g2.as_default():
+            config = tf.ConfigProto(log_device_placement=False)
+            if self.use_cuda:
+                config.gpu_options.allow_growth = True
+            self.sess2 = tf.Session(config=config)
+            self.mix_model = NetFV(feature_size=self.feature_size,
+                                   cluster_size=self.cluster_size,
+                                   vocab_size=self.vocab_size,
+                                   vocab_size_2=self.vocab_size_2,
+                                   input_size=self.input_size,
+                                   use_2nd_label=True,
+                                   multitask_method=self.multitask_method,
+                                   method=self.method,
+                                   is_training=False)
+            saver = tf.train.Saver(max_to_keep=1)
+            self.sess2.run(tf.global_variables_initializer())
+            saver.restore(self.sess2, self.model_dir_mixture)
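
For review context, a minimal construction sketch of the new encoder (not part of the patch). The checkpoint paths are placeholders, and the frame-level encode() method is outside this diff; the sketch only exercises __init__ and post_init, which build and restore both graphs once real checkpoints are in place:

    from gnes.encoder.video.incep_mixture import IncepMixtureEncoder

    encoder = IncepMixtureEncoder(
        model_dir_inception='/path/to/inception_v4/model.ckpt',   # placeholder path
        model_dir_mixture='/path/to/netvlad_mixture/model.ckpt',  # placeholder path
        method='netvlad',   # or 'fvnet' / 'pooling', per NetFV.build_model below
        use_cuda=False)
    encoder.post_init()  # builds the Inception-V4 and mixture graphs, restores weights
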
diff --git a/gnes/encoder/video/mixture_core/__init__.py b/gnes/encoder/video/mixture_core/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/gnes/encoder/video/mixture_core/model.py b/gnes/encoder/video/mixture_core/model.py
new file mode 100644
index 00000000..6a4a0631
--- /dev/null
+++ b/gnes/encoder/video/mixture_core/model.py
@@ -0,0 +1,245 @@
+# Tencent is pleased to support the open source community by making GNES available.
+#
+# Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+
+
+class NetFV:
+    """Trainable pooling of frame-level features into a clip-level
+    representation, via NetFV, NetVLAD or max-pooling."""
+
+    def __init__(self, feature_size,
+                 cluster_size,
+                 vocab_size,
+                 method='netvlad',
+                 input_size=None,
+                 use_2nd_label=False,
+                 vocab_size_2=None,
+                 add_batch_norm=True,
+                 is_training=False,
+                 use_weights=True,
+                 save_dir=None,
+                 multitask_method=None,
+                 l2_penalty=1e-6):
+        if input_size is None:
+            self.input_size = feature_size
+        else:
+            self.input_size = input_size
+        self.feature_size = feature_size
+        self.is_training = is_training
+        self.vocab_size = vocab_size
+        self.use_2nd_label = use_2nd_label
+        self.vocab_size_2 = vocab_size_2
+        self.add_batch_norm = add_batch_norm
+        self.cluster_size = cluster_size
+        self.use_weights = use_weights
+        self.l2_penalty = l2_penalty
+        self.method = method
+        self.multitask_method = multitask_method
+        self.build_model()
+        self.build_loss()
+
+    @staticmethod
+    def rand_init(feature_size):
+        return tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size))
+
+    def build_model(self):
+        self.feeds = tf.placeholder(tf.float32, [None, None, self.input_size])
+        # project raw frame features down to feature_size
+        self.inputs = tf.layers.dense(self.feeds, self.feature_size)
+        self.weights = tf.placeholder(tf.float32, [None, self.vocab_size])
+        self.max_frames = tf.shape(self.inputs)[1]
+        if self.method == 'fvnet':
+            self.build_fvnet()
+        elif self.method == 'netvlad':
+            self.build_netvlad()
+        elif self.method == 'pooling':
+            self.build_pooling()
+
+    def build_pooling(self):
+        # max-pooling over the frame axis
+        self.repre = tf.layers.dense(self.inputs, self.feature_size)
+        self.repre = tf.reduce_max(self.repre, axis=1)
+
+    def build_fvnet(self):
+        reshaped_input = tf.reshape(self.inputs, [-1, self.feature_size])
+        cluster_weights = tf.get_variable("cluster_weights",
+                                          [self.feature_size, self.cluster_size],
+                                          initializer=NetFV.rand_init(self.feature_size))
+
+        covar_weights = tf.get_variable("covar_weights",
+                                        [self.feature_size, self.cluster_size],
+                                        initializer=NetFV.rand_init(self.feature_size))
+
+        # keep covariances positive and bounded away from zero
+        covar_weights = tf.square(covar_weights)
+        eps = tf.constant([1e-6])
+        covar_weights = tf.add(covar_weights, eps)
+
+        tf.summary.histogram("cluster_weights", cluster_weights)
+        activation = tf.matmul(reshaped_input, cluster_weights)
+        if self.add_batch_norm:
+            activation = slim.batch_norm(activation,
+                                         center=True,
+                                         scale=True,
+                                         is_training=self.is_training,
+                                         scope="cluster_bn")
+        else:
+            cluster_biases = tf.get_variable("cluster_biases",
+                                             [self.cluster_size],
+                                             initializer=NetFV.rand_init(self.feature_size))
+            tf.summary.histogram("cluster_biases", cluster_biases)
+            activation += cluster_biases
+
+        # soft assignment of each frame to clusters
+        activation = tf.nn.softmax(activation)
+        tf.summary.histogram("cluster_output", activation)
+
+        activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
+
+        a_sum = tf.reduce_sum(activation, -2, keepdims=True)
+
+        cluster_weights2 = tf.scalar_mul(0.01, cluster_weights)
+
+        a = tf.multiply(a_sum, cluster_weights2)
+
+        activation = tf.transpose(activation, perm=[0, 2, 1])
+
+        reshaped_input = tf.reshape(reshaped_input,
+                                    [-1, self.max_frames, self.feature_size])
+        fv1 = tf.matmul(activation, reshaped_input)
+
+        fv1 = tf.transpose(fv1, perm=[0, 2, 1])
+
+        # computing second-order FV statistics
+        a2 = tf.multiply(a_sum, tf.square(cluster_weights2))
+
+        b2 = tf.multiply(fv1, cluster_weights2)
+        fv2 = tf.matmul(activation, tf.square(reshaped_input))
+
+        fv2 = tf.transpose(fv2, perm=[0, 2, 1])
+        fv2 = tf.add_n([a2, fv2, tf.scalar_mul(-2, b2)])
+
+        fv2 = tf.divide(fv2, tf.square(covar_weights))
+        fv2 = tf.subtract(fv2, a_sum)
+
+        # flatten and L2-normalize (the original repeated this reshape +
+        # normalize pair verbatim; the second pass was a no-op and is dropped)
+        fv2 = tf.reshape(fv2, [-1, self.cluster_size * self.feature_size])
+        fv2 = tf.nn.l2_normalize(fv2, 1)
+
+        fv1 = tf.subtract(fv1, a)
+        fv1 = tf.divide(fv1, covar_weights)
+        # intra-normalization per cluster, then global L2 after flattening
+        fv1 = tf.nn.l2_normalize(fv1, 1)
+        fv1 = tf.reshape(fv1, [-1, self.cluster_size * self.feature_size])
+        fv1 = tf.nn.l2_normalize(fv1, 1)
+
+        self.repre = tf.concat([fv1, fv2], 1)
+        self.repre = tf.layers.dense(self.repre, self.feature_size)
+
+    def build_netvlad(self):
+        reshaped_input = tf.reshape(self.inputs, [-1, self.feature_size])
+        cluster_weights = tf.get_variable("cluster_weights",
+                                          [self.feature_size, self.cluster_size],
+                                          initializer=NetFV.rand_init(self.feature_size))
+        activation = tf.matmul(reshaped_input, cluster_weights)
+        if self.add_batch_norm:
+            activation = slim.batch_norm(activation,
+                                         center=True,
+                                         scale=True,
+                                         is_training=self.is_training,
+                                         scope="cluster_bn")
+        else:
+            cluster_biases = tf.get_variable("cluster_biases",
+                                             [self.cluster_size],
+                                             initializer=NetFV.rand_init(self.feature_size))
+            activation += cluster_biases
+        # soft assignment of each frame to clusters
+        activation = tf.nn.softmax(activation)
+        activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])
+
+        a_sum = tf.reduce_sum(activation, -2, keepdims=True)
+
+        cluster_weights2 = tf.get_variable("cluster_weights2",
+                                           [1, self.feature_size, self.cluster_size],
+                                           initializer=NetFV.rand_init(self.feature_size))
+
+        a = tf.multiply(a_sum, cluster_weights2)
+        activation = tf.transpose(activation, perm=[0, 2, 1])
+
+        reshaped_input = tf.reshape(reshaped_input,
+                                    [-1, self.max_frames, self.feature_size])
+        vlad = tf.matmul(activation, reshaped_input)
+        vlad = tf.transpose(vlad, perm=[0, 2, 1])
+        # subtract soft-assigned cluster centers: residual aggregation
+        vlad = tf.subtract(vlad, a)
+
+        # intra-normalization per cluster, then global L2 after flattening
+        vlad = tf.nn.l2_normalize(vlad, 1)
+
+        vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
+        vlad = tf.nn.l2_normalize(vlad, 1)
+        self.repre = vlad
+
+    def build_loss(self):
+        self.probabilities = tf.layers.dense(self.repre,
+                                             self.vocab_size,
+                                             activation=tf.nn.tanh)
+        self.probabilities = tf.layers.dense(self.probabilities, self.vocab_size)
+        self.probabilities = tf.nn.softmax(self.probabilities)
+
+        self.label = tf.placeholder(tf.int32, [None, self.vocab_size])
+        # one-hot (or multi-hot) labels as floats, optionally re-weighted
+        labels = tf.cast(self.label, tf.float32)
+        if self.use_weights:
+            labels = labels * self.weights
+        self.loss = -tf.log(tf.reduce_sum(labels * self.probabilities, axis=1) + 1e-9)
+        self.loss = tf.reduce_mean(self.loss)
+        self.pred = tf.argmax(self.probabilities, 1)
+        self.avg_diff = tf.cast(tf.equal(tf.argmax(self.label, 1), self.pred), tf.float32)
+        self.avg_diff = tf.reduce_mean(self.avg_diff)
+
+        # add 2nd-level labels
+        if self.use_2nd_label:
+            self.label_2 = tf.placeholder(tf.int32, [None, self.vocab_size_2])
+            labels_2 = tf.cast(self.label_2, tf.float32)
+
+            if self.multitask_method is None:
+                self.probabilities2 = tf.layers.dense(self.repre,
+                                                      self.vocab_size_2,
+                                                      activation=tf.nn.tanh)
+                self.probabilities2 = tf.layers.dense(self.probabilities2, self.vocab_size_2)
+                self.probabilities2 = tf.nn.softmax(self.probabilities2)
+
+            elif self.multitask_method == 'Attention':
+                # embed 1st-level predictions, condition the 2nd-level head on them
+                self.x = tf.get_variable('emb',
+                                         shape=[self.vocab_size, self.feature_size],
+                                         dtype=tf.float32,
+                                         initializer=NetFV.rand_init(self.feature_size))
+                self.emb_label = tf.matmul(self.probabilities, self.x)
+                self.emb_concat = tf.concat([self.emb_label, self.repre], axis=1)
+                self.probabilities2 = tf.layers.dense(self.emb_concat,
+                                                      self.vocab_size_2,
+                                                      activation=tf.nn.tanh)
+                self.probabilities2 = tf.layers.dense(self.probabilities2,
+                                                      self.vocab_size_2)
+                self.probabilities2 = tf.nn.softmax(self.probabilities2)
+
+            self.loss += tf.reduce_mean(-tf.log(
+                tf.reduce_sum(labels_2 * self.probabilities2, axis=1) + 1e-9))
+            self.pred2 = tf.argmax(self.probabilities2, 1)
+            self.avg_diff2 = tf.cast(tf.equal(tf.argmax(self.label_2, 1), self.pred2), tf.float32)
+            self.avg_diff2 = tf.reduce_mean(self.avg_diff2)
+
+        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.0005,
+                                                epsilon=1e-08,
+                                                name='adam')
+        self.train_op = slim.learning.create_train_op(self.loss, self.optimizer)
+        self.eval_res = {'loss': self.loss, 'avg_diff': self.avg_diff}
+        if self.use_2nd_label:
+            self.eval_res['avg_diff2'] = self.avg_diff2
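
A minimal sketch (not part of the patch) of driving NetFV directly under TF 1.x, useful for verifying the aggregation graph in isolation; the shapes and hyperparameter values are illustrative only and mirror the encoder defaults above:

    import numpy as np
    import tensorflow as tf

    from gnes.encoder.video.mixture_core.model import NetFV

    with tf.Graph().as_default():
        model = NetFV(feature_size=300, cluster_size=256, vocab_size=28,
                      input_size=1536, method='netvlad', is_training=False)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # two clips, ten frames each, 1536-d frame features
            frames = np.random.rand(2, 10, 1536).astype(np.float32)
            repre = sess.run(model.repre, feed_dict={model.feeds: frames})
            print(repre.shape)  # (2, 76800): cluster_size * feature_size for netvlad

Note that build_netvlad returns the flattened VLAD vector as-is, while build_fvnet ends with a dense projection back to feature_size; downstream consumers need to account for the different output widths per method.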