In [54]:
import os
from data_loader.data_loader import DataLoader
import tensorflow as tf
# from nets.depth_net import D_Net
from nets.flow_net import feature_pyramid_flow, construct_model_pwc_full
# from nets.pose_net import P_Net3
from tensorflow.keras.layers import Conv2D, DepthwiseConv2D
from tensorflow.keras.layers import MaxPool2D, GlobalAveragePooling2D, Dense
from tensorflow.keras.layers import BatchNormalization, Activation
import tensorflow.contrib.slim as slim
import numpy as np

In [55]:
# Restrict this process to the listed GPU ids. The value is a comma-separated
# list written without whitespace, which is the canonical form for this variable.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# To run on CPU only, use this assignment instead:
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [56]:
def stats_graph(graph):
    """Profile ``graph`` and print its FLOP total and trainable-parameter total.

    ``tf.profiler.profile`` is invoked twice on the same graph: first with the
    float-operation options, then with the trainable-variable-parameter
    options. Both totals are printed on one line; nothing is returned.
    """
    option_builder = tf.profiler.ProfileOptionBuilder
    flop_stats = tf.profiler.profile(
        graph, options=option_builder.float_operation())
    param_stats = tf.profiler.profile(
        graph, options=option_builder.trainable_variables_parameter())
    summary = "FLOPs: {}; Trainable params: {}".format(
        flop_stats.total_float_ops, param_stats.total_parameters)
    print(summary)

In [84]:
class ShuffleNetV2():
    """Encoder-decoder network assembled from ShuffleNetV2 blocks.

    The encoder stacks an initial convolution + max-pool, three stages of
    ``shufflenet_v2_block`` and a final 1x1 convolution, and records the output
    of every stage in a ``skip`` list. The decoder repeatedly upsamples with
    ``upconv_sep``, concatenates the matching skip tensor, refines with another
    ``shufflenet_v2_block`` and emits four prediction maps through ``get_pred``.

    No graph ops are created by the constructor; call ``build_model()``.
    """

    # Number of output channels of the initial stride-2 3x3 convolution.
    first_conv_channel = 24
    
    def __init__(self, input_holder, var_scope, model_scale=1.0, shuffle_group=2, is_training=True):
        """Store the configuration used later by ``build_model``.

        Args:
            input_holder: input tensor fed to the first convolution.
            var_scope: name of the outermost variable scope.
            model_scale: width preset; one of 0.5, 1.0, 1.5, 2.0.
            shuffle_group: group count passed to the encoder's shuffle blocks.
            is_training: forwarded to ``slim.batch_norm`` through an arg_scope.

        Raises:
            ValueError: if ``model_scale`` is not one of the supported presets.
        """
        self.input = input_holder
        # NOTE(review): never assigned again in the visible code; the only
        # assignment is in the commented-out 'prediction' section below.
        self.output = None
        self.shuffle_group = shuffle_group
        self.channel_sizes = self._select_channel_size(model_scale)
        self.var_scope = var_scope
        self.is_training = is_training

    def _select_channel_size(self, model_scale):
        """Return the channel plan for ``model_scale``.

        The result is a list of ``(out_channel, repeat_times)`` pairs. In
        ``build_model`` all but the last pair configure the shuffle stages and
        the last pair's channel count configures the final 1x1 convolution.

        Raises:
            ValueError: for any scale other than 0.5, 1.0, 1.5 or 2.0.
        """
        # [(out_channel, repeat_times), (out_channel, repeat_times), ...]
        if model_scale == 0.5:
            return [(48, 4), (96, 8), (192, 4), (1024, 1)]
        elif model_scale == 1.0:
            return [(116, 4), (232, 8), (464, 4), (1024, 1)]
        elif model_scale == 1.5:
            return [(176, 4), (352, 8), (704, 4), (1024, 1)]
        elif model_scale == 2.0:
            return [(244, 4), (488, 8), (976, 4), (2048, 1)]
        else:
            raise ValueError('Unsupported model size.')

    def build_model(self):
        """Create the encoder and decoder ops under ``self.var_scope``.

        Returns:
            A tuple ``([pred1, pred2, pred3, pred4], skip[5])``: the four
            ``get_pred`` outputs ordered from the finest decoder level (pred1)
            to the coarsest (pred4), and the encoder's final feature map.
        """
        with tf.variable_scope(self.var_scope) as sc:
            with slim.arg_scope([slim.batch_norm], is_training=self.is_training):
                # skip[0]: init conv, skip[1]: max-pool, skip[2..4]: shuffle
                # stages, skip[5]: end_block. The decoder indexes this list
                # with literal positions, so it relies on exactly three stages.
                skip = []
                with tf.variable_scope('encoding'):
                    with tf.variable_scope('init_block'):
                        out = conv_bn_relu(self.input, self.first_conv_channel, 3, 2)
                        skip.append(out)
                        out = slim.max_pool2d(skip[0], 3, 2, padding='SAME')
                        skip.append(out)
                        
                    # The last channel_sizes entry is reserved for end_block.
                    for idx, block in enumerate(self.channel_sizes[:-1]):
                        with tf.variable_scope('shuffle_block_{}'.format(idx)):
                            out_channel, repeat = block

                            # First block is downsampling
                            # (debug trace of input vs. requested channels)
                            print("[Downsample] out, out_channel:", out.shape[-1], out_channel)
                            out = shufflenet_v2_block(out, out_channel, 3, 2, shuffle_group=self.shuffle_group)

                            # Rest blocks
                            for i in range(repeat-1):
                                print("[Rest] out, out_channel:", out.shape[-1], out_channel)
                                out = shufflenet_v2_block(out, out_channel, 3, shuffle_group=self.shuffle_group)

                            skip.append(out)


                    with tf.variable_scope('end_block'):
                        # 1x1 stride-1 conv: changes channels only, so skip[5]
                        # keeps the spatial size of skip[4].
                        out = conv_bn_relu(out, self.channel_sizes[-1][0], 1)
                        skip.append(out)
                # Debug trace of every recorded encoder feature map shape.
                for idx, sk in enumerate(skip):
                    print("skip[%d]:" % idx, sk.shape)
                with tf.variable_scope('decoding'):
                    # DECODING
                    # NOTE(review): every decoder block below passes
                    # shuffle_group=2 literally instead of self.shuffle_group;
                    # confirm whether that is intended.
                    # upconv_sep upsamples by 2, then resize_like brings the
                    # result back to skip[4]'s spatial size before the concat.
                    upconv6 = upconv_sep(skip[5],   512, 3, 2) #H/32
                    upconv6 = resize_like(upconv6, skip[4])
                    concat6 = tf.concat([upconv6, skip[4]], 3)
#                     iconv6  = conv(concat6,   512, 3, 1)
                    iconv6  = shufflenet_v2_block(concat6, 512, 3, stride=1, dilation=1, shuffle_group=2)
        
                    upconv5 = upconv_sep(iconv6, 256, 3, 2) #H/16
                    upconv5 = resize_like(upconv5, skip[3])
                    concat5 = tf.concat([upconv5, skip[3]], 3)
#                     iconv5  = conv(concat5,   256, 3, 1)
                    iconv5  = shufflenet_v2_block(concat5, 256, 3, stride=1, dilation=1, shuffle_group=2)
        
                    upconv4 = upconv_sep(iconv5,  128, 3, 2) #H/8
                    upconv4 = resize_like(upconv4, skip[2])
                    concat4 = tf.concat([upconv4, skip[2]], 3)
#                     iconv4  = conv(concat4,   128, 3, 1)
                    iconv4  = shufflenet_v2_block(concat4, 128, 3, stride=1, dilation=1, shuffle_group=2)
                    pred4 = get_pred(iconv4)
                    upred4  = upsample_nn(pred4, 2)

                    # From here on there is no resize_like: the concats only
                    # work if a 2x upsample lands exactly on the skip's size.
                    upconv3 = upconv_sep(iconv4,   64, 3, 2) #H/4
                    concat3 = tf.concat([upconv3, skip[1], upred4], 3)
#                     iconv3  = conv(concat3,    64, 3, 1)
                    iconv3  = shufflenet_v2_block(concat3, 64, 3, stride=1, dilation=1, shuffle_group=2)
                    pred3 = get_pred(iconv3)
                    upred3  = upsample_nn(pred3, 2)

                    upconv2 = upconv_sep(iconv3,   32, 3, 2) #H/2
                    concat2 = tf.concat([upconv2, skip[0], upred3], 3)
#                     iconv2  = conv(concat2,    32, 3, 1)
                    iconv2  = shufflenet_v2_block(concat2, 32, 3, stride=1, dilation=1, shuffle_group=2)
                    pred2 = get_pred(iconv2)
                    upred2  = upsample_nn(pred2, 2)

                    # The last level has no encoder skip; only the upsampled
                    # previous prediction is concatenated.
                    upconv1 = upconv_sep(iconv2,  16, 3, 2) #H
                    concat1 = tf.concat([upconv1, upred2], 3)
#                     iconv1  = conv(concat1,   16, 3, 1)
                    iconv1  = shufflenet_v2_block(concat1, 16, 3, stride=1, dilation=1, shuffle_group=2)
                    pred1 = get_pred(iconv1)

                    return [pred1, pred2, pred3, pred4], skip[5]

                # Disabled classification head (unreachable after the return
                # above even if uncommented in place).
                # with tf.variable_scope('prediction'):
                #     out = global_avg_pool2D(out)
                #     out = slim.conv2d(out, self.cls, 1, activation_fn=None, biases_initializer=None)
                #     out = tf.reshape(out, shape=[-1, self.cls])
                #     out = tf.identity(out, name='cls_prediction')
                #     self.output = out

def shuffle_unit(x, groups):
    """Interleave the channels of a 4-D tensor across ``groups`` groups.

    The channel axis is viewed as ``(groups, channels // groups)``, the two
    sub-axes are swapped, and the result is flattened back to the original
    channel count. Height, width and channel count are read from the static
    shape; the batch size is taken from the runtime shape.

    When the channel count is not divisible by ``groups`` the input is
    returned unchanged (no shuffle is applied).
    """
    with tf.variable_scope('shuffle_unit'):
        _, height, width, channels = x.get_shape().as_list()
        if channels % groups != 0:
            # Cannot split evenly: pass the tensor through untouched.
            return x

        grouped_shape = tf.convert_to_tensor(
            [tf.shape(x)[0], height, width, groups, channels // groups])
        grouped = tf.reshape(x, shape=grouped_shape)
        # Swap the (group, channel-within-group) axes.
        swapped = tf.transpose(grouped, tf.convert_to_tensor([0, 1, 2, 4, 3]))
        flat_shape = tf.convert_to_tensor(
            [tf.shape(swapped)[0], height, width, channels])
        return tf.reshape(swapped, shape=flat_shape)

def conv_bn_relu(x, out_channel, kernel_size, stride=1, dilation=1):
    """Bias-free convolution followed by batch norm with a ReLU activation.

    Args:
        x: input tensor.
        out_channel: number of convolution output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        dilation: passed to ``slim.conv2d`` as ``rate``.

    Returns:
        The batch-normalised, ReLU-activated tensor.
    """
    # Passing None as the scope name lets TF pick a unique 'conv_bn_relu*' scope.
    with tf.variable_scope(None, 'conv_bn_relu'):
        conv_out = slim.conv2d(
            x, out_channel, kernel_size, stride,
            rate=dilation,
            activation_fn=None,
            biases_initializer=None,
        )
        return slim.batch_norm(conv_out, activation_fn=tf.nn.relu, fused=False)

def conv_bn(x, out_channel, kernel_size, stride=1, dilation=1):
    """Bias-free convolution followed by batch norm, with no activation.

    Args:
        x: input tensor.
        out_channel: number of convolution output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        dilation: passed to ``slim.conv2d`` as ``rate``.

    Returns:
        The batch-normalised tensor (linear output).
    """
    # Passing None as the scope name lets TF pick a unique 'conv_bn*' scope.
    with tf.variable_scope(None, 'conv_bn'):
        conv_out = slim.conv2d(
            x, out_channel, kernel_size, stride,
            rate=dilation,
            activation_fn=None,
            biases_initializer=None,
        )
        return slim.batch_norm(conv_out, activation_fn=None, fused=False)

def depthwise_conv_bn(x, kernel_size, stride=1, dilation=1):
    """Depthwise convolution followed by batch norm, with no activation.

    ``slim.separable_conv2d`` is called with ``num_outputs=None`` and
    ``depth_multiplier=1``, without bias and without activation.

    Args:
        x: input tensor.
        kernel_size: depthwise kernel size.
        stride: convolution stride.
        dilation: passed to ``slim.separable_conv2d`` as ``rate``.

    Returns:
        The batch-normalised tensor (linear output).
    """
    # Passing None as the scope name lets TF pick a unique 'depthwise_conv_bn*' scope.
    with tf.variable_scope(None, 'depthwise_conv_bn'):
        depthwise_out = slim.separable_conv2d(
            x, None, kernel_size,
            depth_multiplier=1,
            stride=stride,
            rate=dilation,
            activation_fn=None,
            biases_initializer=None,
        )
        return slim.batch_norm(depthwise_out, activation_fn=None, fused=False)

def resolve_shape(x):
    """Return the spatial size ``[height, width]`` of a 4-D tensor.

    When both dimensions are known statically a plain Python list is returned.
    If either one is unknown, a tensor built from the runtime shape is
    returned instead.
    """
    with tf.variable_scope(None, 'resolve_shape'):
        _, height, width, _ = x.get_shape().as_list()
        if height is not None and width is not None:
            return [height, width]
        # At least one spatial dimension is only known at run time.
        return tf.convert_to_tensor([tf.shape(x)[1], tf.shape(x)[2]])

def global_avg_pool2D(x):
    """Average-pool a 4-D tensor over its full spatial extent.

    The pooling window is whatever ``resolve_shape`` reports for ``x``; the
    result's static shape is then set to ``[None, 1, 1, None]``.

    NOTE(review): when the spatial size is not static, ``resolve_shape``
    returns a tensor; confirm that ``slim.avg_pool2d`` accepts a tensor as
    its kernel size before relying on that path.
    """
    with tf.variable_scope(None, 'global_pool2D'):
        pooled = slim.avg_pool2d(x, resolve_shape(x), stride=1)
        pooled.set_shape([None, 1, 1, None])
        return pooled

def se_unit(x, bottleneck=2):
    """Rescale each channel of ``x`` by a learned gate in (0, 1).

    The input is average-pooled over its spatial extent, passed through a
    bias-free fully connected layer with ``bottleneck`` units and ReLU, then a
    bias-free fully connected layer back to the channel count with a sigmoid.
    The resulting per-channel weights multiply ``x``.

    NOTE(review): ``bottleneck`` is used directly as the hidden unit count,
    not as a reduction ratio of the channel count; confirm that is intended.
    """
    with tf.variable_scope(None, 'SE_module'):
        batch, _, _, channels = x.get_shape().as_list()

        squeezed = slim.avg_pool2d(x, resolve_shape(x), stride=1)
        squeezed = tf.reshape(squeezed, shape=[-1, channels])
        hidden = slim.fully_connected(
            squeezed, bottleneck,
            activation_fn=tf.nn.relu, biases_initializer=None)
        gates = slim.fully_connected(
            hidden, channels,
            activation_fn=tf.nn.sigmoid, biases_initializer=None)

        # Use the static batch size when known, otherwise the runtime one.
        if batch is not None:
            gate_shape = [batch, 1, 1, channels]
        else:
            gate_shape = tf.convert_to_tensor([tf.shape(x)[0], 1, 1, channels])
        channel_w = tf.reshape(gates, shape=gate_shape)

        return tf.multiply(x, channel_w)

def shufflenet_v2_block(x, out_channel, kernel_size, stride=1, dilation=1, shuffle_group=2):
    """Build one ShuffleNetV2 unit and return its channel-shuffled output.

    Two layouts are used:

    * ``stride == 1`` and the input already has ``out_channel`` channels: the
      input is split in half along the channel axis; one half goes through
      1x1 conv -> depthwise conv -> 1x1 conv and the other half is passed
      through untouched.
    * otherwise: the whole input feeds two branches, one
      1x1 conv -> depthwise conv -> 1x1 conv and one depthwise conv -> 1x1 conv.

    In both cases each processed branch produces ``out_channel // 2``
    channels, the branches are concatenated on the channel axis and the result
    is passed to ``shuffle_unit`` with ``shuffle_group`` groups.
    """
    with tf.variable_scope(None, 'shuffle_v2_block'):
        half_channel = out_channel // 2
        keeps_layout = stride == 1 and x.shape[-1] == out_channel

        if keeps_layout:
            processed, untouched = tf.split(x, num_or_size_splits=2, axis=3)

            processed = conv_bn_relu(processed, half_channel, 1)
            processed = depthwise_conv_bn(processed, kernel_size, stride, dilation)
            processed = conv_bn_relu(processed, half_channel, 1)

            branches = [processed, untouched]
        else:
            # Branch order matters: it fixes the auto-generated scope names.
            main_branch = conv_bn_relu(x, half_channel, 1)
            main_branch = depthwise_conv_bn(main_branch, kernel_size, stride, dilation)
            main_branch = conv_bn_relu(main_branch, half_channel, 1)

            side_branch = depthwise_conv_bn(x, kernel_size, stride, dilation)
            side_branch = conv_bn_relu(side_branch, half_channel, 1)

            branches = [main_branch, side_branch]

        merged = tf.concat(branches, axis=3)
        return shuffle_unit(merged, shuffle_group)
    

def resize_like(inputs, ref):
    """Resize ``inputs`` (nearest neighbour) to the spatial size of ``ref``.

    Args:
        inputs: 4-D tensor to resize.
        ref: 4-D tensor whose height and width are the target size.

    Returns:
        ``inputs`` itself when both tensors have the same static height and
        width; otherwise the nearest-neighbour resized tensor.

    If the height or width of ``ref`` is not known statically, the target
    size is taken from ``ref``'s runtime shape instead of failing on a
    ``None`` dimension.
    """
    in_shape = inputs.get_shape().as_list()
    ref_shape = ref.get_shape().as_list()
    in_h, in_w = in_shape[1], in_shape[2]
    ref_h, ref_w = ref_shape[1], ref_shape[2]

    if ref_h is None or ref_w is None:
        # Static size unavailable: defer the target size to run time.
        return tf.image.resize_nearest_neighbor(inputs, tf.shape(ref)[1:3])
    if in_h == ref_h and in_w == ref_w:
        return inputs
    return tf.image.resize_nearest_neighbor(inputs, [ref_h, ref_w])

def upconv(x, num_out_layers, kernel_size, scale):
    """Nearest-neighbour upsample ``x`` by ``scale``, then apply ``conv``.

    The convolution uses ``num_out_layers`` output channels, the given
    ``kernel_size`` and stride 1.
    """
    return conv(upsample_nn(x, scale), num_out_layers, kernel_size, 1)

def upsample_nn(x, ratio):
    """Nearest-neighbour upsample a 4-D tensor by an integer ``ratio``.

    Args:
        x: 4-D tensor; axes 1 and 2 are treated as height and width.
        ratio: factor applied to both height and width.

    Returns:
        The resized tensor.

    If the height or width is not known statically, the target size is
    computed from the runtime shape instead of raising on ``None * ratio``.
    """
    static_shape = x.get_shape().as_list()
    height, width = static_shape[1], static_shape[2]
    if height is None or width is None:
        # Static size unavailable: scale the runtime height/width instead.
        target_size = tf.shape(x)[1:3] * ratio
    else:
        target_size = [height * ratio, width * ratio]
    return tf.image.resize_nearest_neighbor(x, target_size)

def upconv_sep(x, num_out_layers, kernel_size, scale):
    """Nearest-neighbour upsample ``x`` by ``scale``, then apply a shuffle block.

    The block is a stride-1, dilation-1 ``shufflenet_v2_block`` with
    ``num_out_layers`` output channels and two shuffle groups.
    """
    enlarged = upsample_nn(x, scale)
    return shufflenet_v2_block(
        enlarged, num_out_layers, kernel_size,
        stride=1, dilation=1, shuffle_group=2)

def conv(x, num_out_layers, kernel_size, stride, activation_fn=tf.nn.elu, normalizer_fn=slim.batch_norm):
    """Reflect-pad ``x`` spatially, then apply a 'VALID' ``slim.conv2d``.

    The padding on each side of height and width is ``floor((kernel_size - 1) / 2)``.

    Args:
        x: 4-D input tensor.
        num_out_layers: number of output channels.
        kernel_size: scalar kernel size (also used to derive the padding).
        stride: convolution stride.
        activation_fn: activation forwarded to ``slim.conv2d``.
        normalizer_fn: normaliser forwarded to ``slim.conv2d``.
    """
    border = np.floor((kernel_size - 1) / 2).astype(np.int32)
    spatial_pad = [border, border]
    # Batch and channel axes are left unpadded.
    padded = tf.pad(x, [[0, 0], spatial_pad, spatial_pad, [0, 0]], mode='REFLECT')
    return slim.conv2d(
        padded, num_out_layers, kernel_size, stride, 'VALID',
        activation_fn=activation_fn,
        normalizer_fn=normalizer_fn)

def maxpool(x, kernel_size):
    """Reflect-pad ``x`` spatially, then apply ``slim.max_pool2d``.

    The padding on each side of height and width is
    ``floor((kernel_size - 1) / 2)``; the pooling call receives only the
    padded tensor and ``kernel_size``.
    """
    border = np.floor((kernel_size - 1) / 2).astype(np.int32)
    spatial_pad = [border, border]
    # Batch and channel axes are left unpadded.
    padded = tf.pad(x, [[0, 0], spatial_pad, spatial_pad, [0, 0]], mode='REFLECT')
    return slim.max_pool2d(padded, kernel_size)

def get_pred(x):
    """Map features to a single-channel prediction in the range (0.01, 5.01).

    A 3x3 stride-1 ``conv`` with one output channel, sigmoid activation and no
    normaliser is scaled by 5 and offset by 0.01.
    """
    squashed = conv(x, 1, 3, 1, activation_fn=tf.nn.sigmoid, normalizer_fn=None)
    return 5 * squashed + 0.01

# def Separable(x, in_channels, out_channels, stride=1, is_training=True,
#         scope='separable'):
#     with tf.variable_scope(scope):
#         # Diagonalwise Refactorization
#         # groups = in_channels
#         # groups = 16
#         groups = max(in_channels / 32, 1)
#         x = DiagonalwiseRefactorization(x, in_channels, stride, groups,
#                                         is_training, 'depthwise')

#         # Specialized Kernel
#         # x = Depthwise(x, in_channels, stride, is_training, 'depthwise')

#         # Standard Convolution
#         # x = Conv3x3(x, in_channels, in_channels, stride, is_training,
#                     # 'convolution')
#         x = Pointwise(x, in_channels, out_channels, is_training, 'pointwise')
#         return x

# def Pointwise(x, in_channels, out_channels, is_training=True,
#         scope='pointwise'):
#     with tf.variable_scope(scope):
#         w = tf.get_variable('weights', (1, 1, in_channels, out_channels),
#                             initializer=tf.contrib.layers.xavier_initializer())
#         x = tf.nn.conv2d(x, w, (1, 1, 1, 1), 'SAME', data_format='NCHW')
#         x = tf.contrib.layers.batch_norm(x, center=True, scale=True,
#                                          data_format='NCHW', fused=True,
#                                          is_training=is_training)
#         x = tf.nn.relu(x)
#         return x

# def DiagonalwiseRefactorization(x, in_channels, stride=1, groups=4,
#         is_training=True, scope='depthwise'):
#     with tf.variable_scope(scope):
#         channels = in_channels / groups
#         mask = tf.constant(get_mask(channels).tolist(), dtype=tf.float32,
#                            shape=(3, 3, channels, channels))
#         splitw = [
#             tf.get_variable('weights_%d' % _, (3, 3, channels, channels),
#                             initializer=tf.contrib.layers.xavier_initializer())
#             for _ in range(groups)
#         ]
#         splitw = [tf.multiply(w, mask) for w in splitw]
#         splitx = tf.split(x, groups, 1)
#         splitx = [tf.nn.conv2d(x, w, (1, 1, stride, stride), 'SAME',
#                                data_format='NCHW')
#                   for x, w in zip(splitx, splitw)]
#         x = tf.concat(splitx, 1)
#         x = tf.contrib.layers.batch_norm(x, center=True, scale=True,
#                                          data_format='NCHW', fused=True,
#                                          is_training=is_training)
#         x = tf.nn.relu(x)
#         return x

In [85]:
def P_Net3(image_stack, disp_bottleneck_stack, joint_encoder, weight_reg=0.0004):
    """Build the pose network and return scaled pose predictions.

    Args:
        image_stack: input tensor for the network's own convolutional encoder;
            only used when ``joint_encoder`` is false.
        disp_bottleneck_stack: feature tensor used as the input of the
            pose-specific layers when ``joint_encoder`` is true.
        joint_encoder: selects which of the two inputs above is used.
        weight_reg: L2 weight-regularisation scale for every convolution.

    Returns:
        A tensor of shape ``[batch, 1, 36]``: six consecutive groups of six
        values, each group being three translation values scaled by
        ``tran_mag`` followed by three rotation values scaled by ``rot_mag``.
    """
    # (first scope, second scope, channels, kernel) for each encoder level;
    # the first conv of a level has stride 2, the second stride 1.
    encoder_levels = [
        ('cnv1', 'cnv1b', 16, 7),
        ('cnv2', 'cnv2b', 32, 5),
        ('cnv3', 'cnv3b', 64, 3),
        ('cnv4', 'cnv4b', 128, 3),
        ('cnv5', 'cnv5b', 256, 3),
    ]
    pose_levels = [
        ('cnv6', 'cnv6b', 256, 3),
        ('cnv7', 'cnv7b', 256, 3),
    ]

    with tf.variable_scope('pose_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        conv_defaults = dict(
            normalizer_fn=None,
            weights_regularizer=slim.l2_regularizer(weight_reg),
            normalizer_params=None,
            activation_fn=tf.nn.relu,
            outputs_collections=end_points_collection)
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], **conv_defaults):
            if joint_encoder:
                inputs = disp_bottleneck_stack
            else:
                net = image_stack
                for down_scope, same_scope, channels, ksize in encoder_levels:
                    net = slim.conv2d(net, channels, [ksize, ksize], stride=2, scope=down_scope)
                    net = slim.conv2d(net, channels, [ksize, ksize], stride=1, scope=same_scope)
                inputs = net

            # Pose specific layers
            net = inputs
            for down_scope, same_scope, channels, ksize in pose_levels:
                net = slim.conv2d(net, channels, [ksize, ksize], stride=2, scope=down_scope)
                net = slim.conv2d(net, channels, [ksize, ksize], stride=1, scope=same_scope)

            # Linear 1x1 head: 6 poses x 6 values, averaged over space.
            pose_pred = slim.conv2d(
                net,
                6 * 6, [1, 1],
                scope='pred',
                stride=1,
                normalizer_fn=None,
                activation_fn=None)
            pose_avg = tf.reduce_mean(pose_pred, [1, 2])
            pose_final = tf.reshape(pose_avg, [-1, 1, 6 * 6])

            tran_mag = 1.0
            if joint_encoder:
                tran_mag = 0.001
            rot_mag = 0.01

            # Pose order: 0: src0 -> tgt, 1: tgt -> src1, 2: src0 -> src1,
            #             3: tgt -> src0, 4: src1 -> tgt, 5: src1 -> src0
            scaled_parts = []
            for pose_idx in range(6):
                start = 6 * pose_idx
                scaled_parts.append(tran_mag * pose_final[:, :, start:start + 3])
                scaled_parts.append(rot_mag * pose_final[:, :, start + 3:start + 6])
            pose_final = tf.concat(scaled_parts, axis=2)

            return pose_final

In [86]:
# One independent graph per sub-network so each can be measured separately.
Gd, Gp, Gf = tf.Graph(), tf.Graph(), tf.Graph()

batch_size, img_height, img_width = 8, 256, 832


def _trainable_params(scope_regex):
    """Collect trainable variables of the current default graph by scope.

    Returns ``(variables, count)`` where ``variables`` is the de-duplicated
    list from ``tf.get_collection`` filtered by ``scope_regex`` and ``count``
    is a scalar tensor summing the element counts of those variables.
    """
    matched = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_regex)))
    total = tf.reduce_sum([tf.reduce_prod(tf.shape(var)) for var in matched])
    return matched, total


# --- Depth network -----------------------------------------------------------
with Gd.as_default():
    depth_inputs = tf.random_uniform((batch_size, img_height, img_width, 3))

    # Previous baseline, kept for reference:
    # tgt_pred_disp, tgt_disp_bottlenecks = D_Net(depth_inputs, weight_reg=0.05, is_training=True, reuse=False)
    D_Model = ShuffleNetV2(
        input_holder=depth_inputs,
        var_scope='depth_net',
        model_scale=1.0,
        shuffle_group=2,
        is_training=True,
    )
    tgt_pred_disp, tgt_disp_bottlenecks = D_Model.build_model()

    # Whole network, encoder only, decoder only.
    var_depth, pc_depth = _trainable_params(".*(depth_net|feature_net_disp).*")
    var_enc, pc_enc = _trainable_params(".*encoding.*")
    var_dec, pc_dec = _trainable_params(".*decoding.*")

# --- Pose network ------------------------------------------------------------
with Gp.as_default():
    pose_inputs = tf.random_uniform((batch_size, img_height, img_width, 9))
    pred_poses = P_Net3(pose_inputs, None, False, 0.05)

    var_pose, pc_pose = _trainable_params(".*pose_net.*")

# --- Flow network ------------------------------------------------------------
with Gf.as_default():
    flow_inputs1 = tf.random_uniform((batch_size, img_height, img_width, 3))
    flow_inputs2 = tf.random_uniform((batch_size, img_height, img_width, 3))

    # The second pyramid reuses the variables created by the first.
    feature_tgt_flow = feature_pyramid_flow(flow_inputs1, reuse=False)
    feature_src0_flow = feature_pyramid_flow(flow_inputs2, reuse=True)
    flow_fw0 = construct_model_pwc_full(flow_inputs2, flow_inputs1, feature_src0_flow, feature_tgt_flow)

    var_flow, pc_flow = _trainable_params(".*(flow_net|feature_net_flow).*")

[Downsample] out, out_channel: 24 116
[Rest] out, out_channel: 116 116
[Rest] out, out_channel: 116 116
[Rest] out, out_channel: 116 116
[Downsample] out, out_channel: 116 232
[Rest] out, out_channel: 232 232
[Rest] out, out_channel: 232 232
[Rest] out, out_channel: 232 232
[Rest] out, out_channel: 232 232
[Rest] out, out_channel: 232 232
[Rest] out, out_channel: 232 232
[Rest] out, out_channel: 232 232
[Downsample] out, out_channel: 232 464
[Rest] out, out_channel: 464 464
[Rest] out, out_channel: 464 464
[Rest] out, out_channel: 464 464
skip[0]: (8, 128, 416, 24)
skip[1]: (8, 64, 208, 24)
skip[2]: (8, 32, 104, 116)
skip[3]: (8, 16, 52, 232)
skip[4]: (8, 8, 26, 464)
skip[5]: (8, 8, 26, 1024)
(8, 256, 832, 16)


In [83]:
# Evaluate each parameter-count tensor in a session bound to its own graph and
# print the result in millions of parameters.
with tf.Session(graph=Gd) as sess_d:
    depth_reports = (
        ("[Info]  depth size: {:.5f}M", pc_depth),
        ("[Info] encode size: {:.5f}M", pc_enc),
        ("[Info] decode size: {:.5f}M", pc_dec),
    )
    for template, count in depth_reports:
        print(template.format(sess_d.run(count) / 1000000.0))

with tf.Session(graph=Gp) as sess_p:
    pose_count = sess_p.run(pc_pose)
    print("[Info] pose size: {:.5f}M".format(pose_count / 1000000.0))

with tf.Session(graph=Gf) as sess_f:
    flow_count = sess_f.run(pc_flow)
    print("[Info] flow size: {:.5f}M".format(flow_count / 1000000.0))

[Info]  depth size: 2.83692M
[Info] encode size: 1.24551M
[Info] decode size: 1.59140M
[Info] pose size: 3.58978M
[Info] flow size: 5.11574M


In [None]:
# Print FLOPs and trainable-parameter totals for the depth, pose and flow graphs.
for graph in (Gd, Gp, Gf):
    stats_graph(graph)