In [1]:
import os
from data_loader.data_loader import DataLoader
import tensorflow as tf
from nets.depth_net import D_Net
from nets.flow_net import feature_pyramid_flow, construct_model_pwc_full
from nets.pose_net import P_Net3
from tensorflow.keras.layers import Conv2D, DepthwiseConv2D
from tensorflow.keras.layers import MaxPool2D, GlobalAveragePooling2D, Dense
from tensorflow.keras.layers import BatchNormalization, Activation
import tensorflow.contrib.slim as slim
import numpy as np

In [2]:
# Manually select one or several free GPUs. The value is written as a plain
# comma-separated list of device indices with no embedded whitespace, which is
# the canonical form for CUDA_VISIBLE_DEVICES.
# NOTE(review): this must run before the first tf.Session is created,
# otherwise the setting has no effect on an already-initialised CUDA runtime.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# use CPU only
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
def stats_graph(graph):
    """Print the total float-op count and trainable-parameter count of ``graph``.

    Runs ``tf.profiler.profile`` twice on the given graph: once with the
    float-operation options and once with the trainable-variable-parameter
    options, then prints both totals on a single line.

    Args:
        graph: the ``tf.Graph`` to profile.
    """
    option_builder = tf.profiler.ProfileOptionBuilder

    flops_profile = tf.profiler.profile(
        graph, options=option_builder.float_operation())
    params_profile = tf.profiler.profile(
        graph, options=option_builder.trainable_variables_parameter())

    print("FLOPs: {}; Trainable params: {}".format(
        flops_profile.total_float_ops, params_profile.total_parameters))

In [4]:
class ShuffleNetV2():
    """ShuffleNetV2-style encoder followed by a multi-scale up-conv decoder.

    The encoder is a stem (3x3 stride-2 conv, then 3x3 stride-2 max pool),
    one stage of ``shufflenet_v2_block`` units per entry of
    ``channel_sizes[:-1]`` and a final 1x1 conv.  Every stage output is kept
    as a skip connection for the decoder, which emits four predictions
    through ``get_pred``.
    """

    # Number of output channels of the stem convolution.
    first_conv_channel = 24

    def __init__(self, input_holder, var_scope, model_scale=1.0, shuffle_group=2, is_training=True):
        """Record the build configuration; no graph ops are created here.

        Args:
            input_holder: tensor fed to the stem convolution.
            var_scope: name of the outermost variable scope of the model.
            model_scale: 0.5, 1.0, 1.5 or 2.0; selects the stage widths.
            shuffle_group: group count passed to every shuffle block.
            is_training: forwarded to ``slim.batch_norm`` via an arg scope.

        Raises:
            ValueError: if ``model_scale`` is not one of the supported sizes.
        """
        self.input = input_holder
        self.output = None
        self.shuffle_group = shuffle_group
        self.channel_sizes = self._select_channel_size(model_scale)
        self.var_scope = var_scope
        self.is_training = is_training

    def _select_channel_size(self, model_scale):
        """Return the ``[(out_channel, repeat_times), ...]`` list for a scale.

        The last entry describes the final 1x1 convolution; the preceding
        entries describe the shuffle stages.
        """
        size_table = (
            (0.5, ((48, 4), (96, 8), (192, 4), (1024, 1))),
            (1.0, ((116, 4), (232, 8), (464, 4), (1024, 1))),
            (1.5, ((176, 4), (352, 8), (704, 4), (1024, 1))),
            (2.0, ((244, 4), (488, 8), (976, 4), (2048, 1))),
        )
        for scale, stages in size_table:
            if model_scale == scale:
                # Hand out a fresh list on every call.
                return list(stages)
        raise ValueError('Unsupported model size.')

    def build_model(self):
        """Create the encoder/decoder ops inside ``self.var_scope``.

        Returns:
            A pair ``([pred1, pred2, pred3, pred4], bottleneck)`` where
            ``pred1`` comes from the last (highest-resolution) decoder stage
            and ``pred4`` from the first one that predicts, and ``bottleneck``
            is the output of the encoder's final 1x1 convolution.
        """
        group = self.shuffle_group
        stage_specs = self.channel_sizes[:-1]
        final_width = self.channel_sizes[-1][0]

        with tf.variable_scope(self.var_scope):
            with slim.arg_scope([slim.batch_norm], is_training=self.is_training):
                # skips[0]: stem conv, skips[1]: stem pool,
                # skips[2:-1]: shuffle stages, skips[-1]: final 1x1 conv.
                skips = []

                with tf.variable_scope('encoding'):
                    with tf.variable_scope('init_block'):
                        stem = conv_bn_relu(self.input, self.first_conv_channel, 3, 2)
                        skips.append(stem)
                        net = slim.max_pool2d(stem, 3, 2, padding='SAME')
                        skips.append(net)

                    for stage_idx, (width, repeats) in enumerate(stage_specs):
                        with tf.variable_scope('shuffle_block_{}'.format(stage_idx)):
                            # The first unit of a stage downsamples (stride 2) ...
                            net = shufflenet_v2_block(net, width, 3, 2, shuffle_group=group)
                            # ... the remaining units keep the resolution.
                            for _ in range(repeats - 1):
                                net = shufflenet_v2_block(net, width, 3, shuffle_group=group)
                            skips.append(net)

                    with tf.variable_scope('end_block'):
                        net = conv_bn_relu(net, final_width, 1)
                        skips.append(net)

                with tf.variable_scope('decoding'):
                    bottleneck = skips[5]

                    # Nominal resolution H/32: the bottleneck is upsampled and
                    # then resized to match the last shuffle stage.
                    up6 = resize_like(upconv(bottleneck, 512, 3, 2), skips[4])
                    fused6 = conv(tf.concat([up6, skips[4]], 3), 512, 3, 1)

                    # Nominal resolution H/16.
                    up5 = resize_like(upconv(fused6, 256, 3, 2), skips[3])
                    fused5 = conv(tf.concat([up5, skips[3]], 3), 256, 3, 1)

                    # Nominal resolution H/8: first stage that predicts.
                    up4 = resize_like(upconv(fused5, 128, 3, 2), skips[2])
                    fused4 = conv(tf.concat([up4, skips[2]], 3), 128, 3, 1)
                    pred4 = get_pred(fused4)
                    pred4_up = upsample_nn(pred4, 2)

                    # Nominal resolution H/4: the upsampled coarser prediction
                    # is concatenated with the features and the skip.
                    up3 = upconv(fused4, 64, 3, 2)
                    fused3 = conv(tf.concat([up3, skips[1], pred4_up], 3), 64, 3, 1)
                    pred3 = get_pred(fused3)
                    pred3_up = upsample_nn(pred3, 2)

                    # Nominal resolution H/2.
                    up2 = upconv(fused3, 32, 3, 2)
                    fused2 = conv(tf.concat([up2, skips[0], pred3_up], 3), 32, 3, 1)
                    pred2 = get_pred(fused2)
                    pred2_up = upsample_nn(pred2, 2)

                    # Nominal full resolution H: no encoder skip at this level.
                    up1 = upconv(fused2, 16, 3, 2)
                    fused1 = conv(tf.concat([up1, pred2_up], 3), 16, 3, 1)
                    pred1 = get_pred(fused1)

                    return [pred1, pred2, pred3, pred4], bottleneck

                # with tf.variable_scope('prediction'):
                #     out = global_avg_pool2D(out)
                #     out = slim.conv2d(out, self.cls, 1, activation_fn=None, biases_initializer=None)
                #     out = tf.reshape(out, shape=[-1, self.cls])
                #     out = tf.identity(out, name='cls_prediction')
                #     self.output = out

def shuffle_unit(x, groups):
    """Shuffle the channels of ``x`` across ``groups`` groups.

    The last axis (size ``c``) is viewed as ``(groups, c // groups)``, the two
    sub-axes are swapped and the result is flattened back to ``c`` channels.
    Height, width and channel count are read from the static shape; only the
    batch size is taken dynamically.

    Args:
        x: 4-D tensor whose static height, width and channel sizes are known.
        groups: number of groups the channel axis is divided into.

    Returns:
        A tensor with the same shape as ``x`` and permuted channels.
    """
    with tf.variable_scope('shuffle_unit'):
        _, height, width, channels = x.get_shape().as_list()
        per_group = channels // groups

        grouped_shape = tf.convert_to_tensor(
            [tf.shape(x)[0], height, width, groups, per_group])
        grouped = tf.reshape(x, shape=grouped_shape)

        # Swap the (groups, per_group) axes.
        swapped = tf.transpose(grouped, tf.convert_to_tensor([0, 1, 2, 4, 3]))

        flat_shape = tf.convert_to_tensor(
            [tf.shape(swapped)[0], height, width, channels])
        shuffled = tf.reshape(swapped, shape=flat_shape)
    return shuffled

def conv_bn_relu(x, out_channel, kernel_size, stride=1, dilation=1):
    """Bias-free ``slim.conv2d`` followed by non-fused batch norm with ReLU.

    Args:
        x: input tensor.
        out_channel: number of output channels of the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        dilation: dilation rate (passed as ``rate``).

    Returns:
        The batch-normalised, ReLU-activated convolution output.
    """
    with tf.variable_scope(None, 'conv_bn_relu'):
        pre_norm = slim.conv2d(
            x, out_channel, kernel_size, stride,
            rate=dilation,
            biases_initializer=None,
            activation_fn=None,
        )
        activated = slim.batch_norm(pre_norm, activation_fn=tf.nn.relu, fused=False)
    return activated

def conv_bn(x, out_channel, kernel_size, stride=1, dilation=1):
    """Bias-free ``slim.conv2d`` followed by non-fused batch norm, no activation.

    Args:
        x: input tensor.
        out_channel: number of output channels of the convolution.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        dilation: dilation rate (passed as ``rate``).

    Returns:
        The batch-normalised convolution output.
    """
    with tf.variable_scope(None, 'conv_bn'):
        pre_norm = slim.conv2d(
            x, out_channel, kernel_size, stride,
            rate=dilation,
            biases_initializer=None,
            activation_fn=None,
        )
        normalised = slim.batch_norm(pre_norm, activation_fn=None, fused=False)
    return normalised

def depthwise_conv_bn(x, kernel_size, stride=1, dilation=1):
    """Depthwise convolution followed by non-fused batch norm, no activation.

    ``slim.separable_conv2d`` is called with ``num_outputs=None`` and
    ``depth_multiplier=1``, without bias and without activation.

    Args:
        x: input tensor.
        kernel_size: depthwise kernel size.
        stride: convolution stride.
        dilation: dilation rate (passed as ``rate``).

    Returns:
        The batch-normalised depthwise convolution output.
    """
    with tf.variable_scope(None, 'depthwise_conv_bn'):
        depthwise = slim.separable_conv2d(
            x, None, kernel_size,
            depth_multiplier=1,
            stride=stride,
            rate=dilation,
            activation_fn=None,
            biases_initializer=None,
        )
        normalised = slim.batch_norm(depthwise, activation_fn=None, fused=False)
    return normalised

def resolve_shape(x):
    """Return the spatial size ``[h, w]`` of a 4-D tensor.

    When both static dimensions are known a plain Python list is returned;
    otherwise a tensor built from the dynamic shape is returned instead.
    """
    with tf.variable_scope(None, 'resolve_shape'):
        _, height, width, _ = x.get_shape().as_list()
        fully_static = height is not None and width is not None
        if fully_static:
            spatial = [height, width]
        else:
            # Fall back to run-time values for axes 1 and 2.
            spatial = tf.convert_to_tensor([tf.shape(x)[1], tf.shape(x)[2]])
    return spatial

def global_avg_pool2D(x):
    """Average-pool ``x`` with a window equal to its full spatial size.

    The window comes from ``resolve_shape``; afterwards the static shape is
    pinned to ``[None, 1, 1, None]``.
    """
    with tf.variable_scope(None, 'global_pool2D'):
        window = resolve_shape(x)
        pooled = slim.avg_pool2d(x, window, stride=1)
        # Record that the two spatial axes have collapsed to size 1.
        pooled.set_shape([None, 1, 1, None])
    return pooled

def se_unit(x, bottleneck=2):
    """Re-weight the channels of ``x`` with learned per-channel gates.

    ``x`` is average-pooled over its full spatial extent, passed through two
    bias-free fully connected layers (ReLU then sigmoid) and the resulting
    gates are multiplied back onto ``x``.

    Args:
        x: 4-D tensor with a statically known channel count.
        bottleneck: number of units in the hidden fully connected layer
            (passed directly as its output size).

    Returns:
        ``x`` scaled channel-wise by the gates.
    """
    with tf.variable_scope(None, 'SE_module'):
        batch, _, _, channels = x.get_shape().as_list()

        window = resolve_shape(x)
        squeezed = slim.avg_pool2d(x, window, stride=1)
        squeezed = tf.reshape(squeezed, shape=[-1, channels])

        hidden = slim.fully_connected(
            squeezed, bottleneck,
            activation_fn=tf.nn.relu, biases_initializer=None)
        gates = slim.fully_connected(
            hidden, channels,
            activation_fn=tf.nn.sigmoid, biases_initializer=None)

        if batch is not None:
            gate_shape = [batch, 1, 1, channels]
        else:
            # Batch size only known at run time.
            gate_shape = tf.convert_to_tensor([tf.shape(x)[0], 1, 1, channels])
        gates = tf.reshape(gates, shape=gate_shape)

        scaled = tf.multiply(x, gates)
    return scaled

def shufflenet_v2_block(x, out_channel, kernel_size, stride=1, dilation=1, shuffle_group=2):
    """Build one ShuffleNetV2 unit.

    With ``stride == 1`` the input is split in two along the channel axis:
    one half goes through 1x1 conv -> depthwise conv -> 1x1 conv, the other
    half is passed through untouched.  For any other stride both branches
    consume the full input, and the second branch is depthwise conv -> 1x1
    conv.  In both cases the branches are concatenated on the channel axis and
    channel-shuffled.

    Args:
        x: input tensor.
        out_channel: nominal output width; each convolved branch produces
            ``out_channel // 2`` channels.
        kernel_size: depthwise kernel size.
        stride: stride of the depthwise convolutions.
        dilation: dilation rate of the depthwise convolutions.
        shuffle_group: group count for the final channel shuffle.

    Returns:
        The shuffled concatenation of the two branches.
    """
    with tf.variable_scope(None, 'shuffle_v2_block'):
        branch_width = out_channel // 2

        if stride == 1:
            main, passthrough = tf.split(x, num_or_size_splits=2, axis=3)

            main = conv_bn_relu(main, branch_width, 1)
            main = depthwise_conv_bn(main, kernel_size, stride, dilation)
            main = conv_bn_relu(main, branch_width, 1)

            branches = [main, passthrough]
        else:
            main = conv_bn_relu(x, branch_width, 1)
            main = depthwise_conv_bn(main, kernel_size, stride, dilation)
            main = conv_bn_relu(main, branch_width, 1)

            side = depthwise_conv_bn(x, kernel_size, stride, dilation)
            side = conv_bn_relu(side, branch_width, 1)

            branches = [main, side]

        merged = tf.concat(branches, axis=3)
        return shuffle_unit(merged, shuffle_group)
    

def resize_like(inputs, ref):
    """Nearest-neighbour resize ``inputs`` to the height/width of ``ref``.

    Args:
        inputs: 4-D tensor to resize.
        ref: 4-D tensor whose axes 1 and 2 give the target size.

    Returns:
        ``inputs`` itself when both static spatial sizes are known and already
        equal; otherwise the resized tensor.

    When the height or width of ``ref`` is not statically known, the target
    size is taken from ``tf.shape(ref)`` at run time.  Previously
    ``[None, None]`` was handed to the resize op in that case, which fails.
    """
    in_shape = inputs.get_shape()
    ref_shape = ref.get_shape()
    in_h, in_w = in_shape[1].value, in_shape[2].value
    ref_h, ref_w = ref_shape[1].value, ref_shape[2].value

    if ref_h is None or ref_w is None:
        # Target only known at run time: use the dynamic spatial shape.
        return tf.image.resize_nearest_neighbor(inputs, tf.shape(ref)[1:3])

    if in_h == ref_h and in_w == ref_w:
        # Already the right size; avoid adding a no-op resize to the graph.
        return inputs

    return tf.image.resize_nearest_neighbor(inputs, [ref_h, ref_w])

def upconv(x, num_out_layers, kernel_size, scale):
    """Upsample ``x`` by ``scale`` (nearest neighbour), then apply ``conv``.

    Args:
        x: input tensor.
        num_out_layers: number of output channels of the convolution.
        kernel_size: convolution kernel size.
        scale: upsampling factor handed to ``upsample_nn``.

    Returns:
        The stride-1 convolution of the upsampled input.
    """
    return conv(upsample_nn(x, scale), num_out_layers, kernel_size, 1)

def upsample_nn(x, ratio):
    """Nearest-neighbour upsample the spatial axes of ``x`` by ``ratio``.

    Args:
        x: 4-D tensor; axes 1 and 2 are resized.
        ratio: multiplicative factor applied to both height and width.

    Returns:
        The resized tensor.

    When the static height or width is unknown, the target size is computed
    from ``tf.shape(x)`` at run time.  Previously ``None * ratio`` raised a
    ``TypeError`` in that case.
    """
    static_shape = x.get_shape()
    height, width = static_shape[1].value, static_shape[2].value

    if height is None or width is None:
        # Spatial size only known at run time.
        target_size = tf.shape(x)[1:3] * ratio
    else:
        target_size = [height * ratio, width * ratio]

    return tf.image.resize_nearest_neighbor(x, target_size)

def conv(x, num_out_layers, kernel_size, stride, activation_fn=tf.nn.elu, normalizer_fn=slim.batch_norm):
    """Reflect-pad ``x`` and apply a ``'VALID'`` ``slim.conv2d``.

    Axes 1 and 2 are padded on each side by ``floor((kernel_size - 1) / 2)``
    in ``REFLECT`` mode before the convolution.

    Args:
        x: 4-D input tensor.
        num_out_layers: number of output channels.
        kernel_size: convolution kernel size (scalar).
        stride: convolution stride.
        activation_fn: activation passed to ``slim.conv2d``.
        normalizer_fn: normaliser passed to ``slim.conv2d``.

    Returns:
        The convolution output.
    """
    pad = int(np.floor((kernel_size - 1) / 2))
    border = [pad, pad]
    padded = tf.pad(x, [[0, 0], border, border, [0, 0]], mode='REFLECT')
    return slim.conv2d(
        padded, num_out_layers, kernel_size, stride, 'VALID',
        activation_fn=activation_fn,
        normalizer_fn=normalizer_fn,
    )

def maxpool(x, kernel_size):
    """Reflect-pad ``x`` and apply ``slim.max_pool2d`` with its default stride/padding.

    Axes 1 and 2 are padded on each side by ``floor((kernel_size - 1) / 2)``
    in ``REFLECT`` mode before pooling.

    Args:
        x: 4-D input tensor.
        kernel_size: pooling window size (scalar).

    Returns:
        The pooled tensor.
    """
    pad = int(np.floor((kernel_size - 1) / 2))
    border = [pad, pad]
    padded = tf.pad(x, [[0, 0], border, border, [0, 0]], mode='REFLECT')
    return slim.max_pool2d(padded, kernel_size)

def get_pred(x):
    """Map features ``x`` to a single-channel prediction in ``(0.01, 5.01)``.

    A 3x3, stride-1 ``conv`` with one output channel, sigmoid activation and
    no normaliser produces a value in ``(0, 1)``, which is then scaled by 5
    and offset by 0.01.
    """
    squashed = conv(x, 1, 3, 1, activation_fn=tf.nn.sigmoid, normalizer_fn=None)
    return 5 * squashed + 0.01

In [None]:
# One separate graph per sub-network (depth / pose / flow) so that each one
# can be measured on its own by the cells below.
Gd=tf.Graph()
Gp=tf.Graph()
Gf=tf.Graph()

# ---- Depth network graph ----
with Gd.as_default():
    # Input pipeline; the loader and its arguments are project code.
    loader = DataLoader(dataset_dir='../datasets/kitti_3frames_128_416',
                                img_height=128,
                                img_width=416,
                                batch_size=8,
                                num_scales=4,
                                num_source=2,
                                ext='jpg',
                                mode='train_dp')
    
    image_stack, image_stack_norm, proj_cam2pix, proj_pix2cam = loader.load_train_batch()
    # The stack is sliced along the last axis: 0:3 -> source 0, 3:6 -> target,
    # 6:9 -> source 1 (presumably three 3-channel frames -- confirm in DataLoader).
    tgt_image = image_stack[:, :, :, 3:6]
    src0_image = image_stack[:, :, :, 0:3]
    src1_image = image_stack[:, :, :, 6:9]
    src_image_stack = tf.concat([src0_image, src1_image], axis=3)

    # Same slicing for the normalised stack returned by the loader.
    tgt_image_norm = image_stack_norm[:, :, :, 3:6]
    src0_image_norm = image_stack_norm[:, :, :, 0:3]
    src1_image_norm = image_stack_norm[:, :, :, 6:9]
    src_image_stack_norm = tf.concat([src0_image_norm, src1_image_norm], axis=3)
    
    #Depth
    # Alternative depth network from nets.depth_net, currently disabled:
#     tgt_pred_disp, tgt_disp_bottlenecks = D_Net(tgt_image_norm, weight_reg=0.05, is_training=True, reuse=False)
    D_Model = ShuffleNetV2(input_holder=tgt_image_norm, 
                           var_scope='depth_net', 
                           model_scale=1.0, 
                           shuffle_group=2, 
                           is_training=True)
    # build_model returns ([pred1..pred4], encoder bottleneck).
    tgt_pred_disp, tgt_disp_bottlenecks = D_Model.build_model()
    
    #Get layers
    # Trainable variables of this graph, filtered by a regex on the variable
    # name: the whole depth net, then only the 'encoding' / 'decoding' scopes.
    var_depth = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*(depth_net|feature_net_disp).*")))
    var_enc = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*encoding.*")))
    var_dec = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*decoding.*")))
    #Get param
    # Parameter count = sum over variables of the product of their shape.
    # These are tensors; they are evaluated in a session in a later cell.
    pc_depth = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_depth])
    pc_enc = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_enc])
    pc_dec = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_dec])
    
# ---- Pose network graph ----
with Gp.as_default():    
    # Input pipeline; the loader and its arguments are project code.
    loader = DataLoader(dataset_dir='../datasets/kitti_3frames_128_416',
                                img_height=128,
                                img_width=416,
                                batch_size=8,
                                num_scales=4,
                                num_source=2,
                                ext='jpg',
                                mode='train_dp')
    
    image_stack, image_stack_norm, proj_cam2pix, proj_pix2cam = loader.load_train_batch()
    # The stack is sliced along the last axis: 0:3 -> source 0, 3:6 -> target,
    # 6:9 -> source 1 (presumably three 3-channel frames -- confirm in DataLoader).
    tgt_image = image_stack[:, :, :, 3:6]
    src0_image = image_stack[:, :, :, 0:3]
    src1_image = image_stack[:, :, :, 6:9]
    src_image_stack = tf.concat([src0_image, src1_image], axis=3)

    # Same slicing for the normalised stack returned by the loader.
    tgt_image_norm = image_stack_norm[:, :, :, 3:6]
    src0_image_norm = image_stack_norm[:, :, :, 0:3]
    src1_image_norm = image_stack_norm[:, :, :, 6:9]
    src_image_stack_norm = tf.concat([src0_image_norm, src1_image_norm], axis=3)
    
    
    #Pose
    # Pose input is source 0, target, source 1 concatenated on the last axis.
    pose_inputs = tf.concat([src_image_stack_norm[:,:,:,0:3], tgt_image_norm, src_image_stack_norm[:,:,:,3:6]], axis=3)
    pred_poses = P_Net3(pose_inputs, None, False, 0.05)
    # Parameter count = sum over pose variables of the product of their shape.
    var_pose = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*pose_net.*")))
    pc_pose = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_pose])
    
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    # Use the session as a context manager so it is closed (and its resources
    # released) as soon as the count has been evaluated. It was previously
    # left open for the lifetime of the kernel.
    with tf.Session(graph=Gp, config=config) as sessp:
        pc_p = sessp.run(pc_pose)
    
    
# ---- Flow network graph ----
with Gf.as_default():  
    # Input pipeline; the loader and its arguments are project code.
    loader = DataLoader(dataset_dir='../datasets/kitti_3frames_128_416',
                                img_height=128,
                                img_width=416,
                                batch_size=8,
                                num_scales=4,
                                num_source=2,
                                ext='jpg',
                                mode='train_dp')
    
    image_stack, image_stack_norm, proj_cam2pix, proj_pix2cam = loader.load_train_batch()
    # The stack is sliced along the last axis: 0:3 -> source 0, 3:6 -> target,
    # 6:9 -> source 1 (presumably three 3-channel frames -- confirm in DataLoader).
    tgt_image = image_stack[:, :, :, 3:6]
    src0_image = image_stack[:, :, :, 0:3]
    src1_image = image_stack[:, :, :, 6:9]
    src_image_stack = tf.concat([src0_image, src1_image], axis=3)

    # Same slicing for the normalised stack returned by the loader.
    tgt_image_norm = image_stack_norm[:, :, :, 3:6]
    src0_image_norm = image_stack_norm[:, :, :, 0:3]
    src1_image_norm = image_stack_norm[:, :, :, 6:9]
    src_image_stack_norm = tf.concat([src0_image_norm, src1_image_norm], axis=3)
    #Flow
    # The feature pyramid is built for the target first (reuse=False) and then
    # for source 0 with reuse=True; only the source-0/target flow is built here.
    feature_tgt_flow = feature_pyramid_flow(tgt_image_norm, reuse=False)
    feature_src0_flow = feature_pyramid_flow(src_image_stack_norm[:,:,:,0:3], reuse=True)
    flow_fw0 = construct_model_pwc_full(src_image_stack_norm[:,:,:,0:3], tgt_image_norm, feature_src0_flow, feature_tgt_flow)
    
    # Parameter count = sum over flow variables of the product of their shape.
    # This is a tensor; it is evaluated in a session in a later cell.
    var_flow = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*(flow_net|feature_net_flow).*")))
    pc_flow = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_flow])

In [None]:
# Evaluate the parameter-count tensors built above, one session per graph,
# and report them in millions of parameters.
with tf.Session(graph=Gd) as sess_d:
    print("[Info]  depth size: {:.5f}M".format(sess_d.run(pc_depth)/1000000.0))
    print("[Info] encode size: {:.5f}M".format(sess_d.run(pc_enc)/1000000.0))
    print("[Info] decode size: {:.5f}M".format(sess_d.run(pc_dec)/1000000.0))
    
    
with tf.Session(graph=Gp) as sess_p:
    print("[Info] pose size: {:.5f}M".format(sess_p.run(pc_pose)/1000000.0))
    
with tf.Session(graph=Gf) as sess_f:
    print("[Info] flow size: {:.5f}M".format(sess_f.run(pc_flow)/1000000.0))

In [None]:
# Print profiler FLOPs and trainable-parameter totals for the depth, pose and
# flow graphs, in that order.
stats_graph(Gd)
stats_graph(Gp)
stats_graph(Gf)

## Test other parameters: ResNet-50 encoder

In [5]:
DISP_SCALING_RESNET50 = 5
def build_resnet50(inputs, get_pred, is_training, var_scope, weight_reg=0.0001, reuse=False):
    """Build the ResNet-50-style encoder and return its fourth-stage output.

    Only the encoder is built; the decoder is kept below as commented-out
    reference code, so ``get_pred`` is currently unused.  The fifth residual
    stage is still created (its variables exist in the graph) even though its
    output is not returned.

    Args:
        inputs: input image tensor.
        get_pred: prediction head for the (disabled) decoder; unused here.
        is_training: forwarded to batch norm through ``normalizer_params``.
        var_scope: name of the outermost variable scope.
        weight_reg: L2 weight-regularisation scale for the slim conv layers.
        reuse: when true, variables of ``var_scope`` are reused.

    Returns:
        The output of the fourth encoder stage (``skip5``).
    """
    with tf.variable_scope(var_scope) as scope:
        if reuse:
            scope.reuse_variables()

        # Defaults applied to every slim conv / transposed conv in this scope.
        conv_defaults = dict(
            normalizer_fn=slim.batch_norm,
            normalizer_params={'is_training': is_training},
            weights_regularizer=slim.l2_regularizer(weight_reg),
            activation_fn=tf.nn.elu,
        )
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], **conv_defaults):
            with tf.variable_scope('encoding'):
                conv1 = conv(inputs, 64, 7, 2)      # H/2  -   64D
                pool1 = maxpool(conv1,           3) # H/4  -   64D
                conv2 = resblock(pool1,      64, 3) # H/8  -  256D
                conv3 = resblock(conv2,     128, 4) # H/16 -  512D
                conv4 = resblock(conv3,     256, 6) # H/32 - 1024D
                conv5 = resblock(conv4,     512, 3) # H/64 - 2048D

                # Skip connections for the (disabled) decoder below.
                skip1, skip2, skip3, skip4, skip5 = conv1, pool1, conv2, conv3, conv4
            
#             # DECODING
#             with tf.variable_scope('decoding'):
#                 upconv6 = upconv(conv5,   512, 3, 2) #H/32
#                 upconv6 = resize_like(upconv6, skip5)
#                 concat6 = tf.concat([upconv6, skip5], 3)
#                 iconv6  = conv(concat6,   512, 3, 1)

#                 upconv5 = upconv(iconv6, 256, 3, 2) #H/16
#                 upconv5 = resize_like(upconv5, skip4)
#                 concat5 = tf.concat([upconv5, skip4], 3)
#                 iconv5  = conv(concat5,   256, 3, 1)

#                 upconv4 = upconv(iconv5,  128, 3, 2) #H/8
#                 upconv4 = resize_like(upconv4, skip3)
#                 concat4 = tf.concat([upconv4, skip3], 3)
#                 iconv4  = conv(concat4,   128, 3, 1)
#                 pred4 = get_pred(iconv4)
#                 upred4  = upsample_nn(pred4, 2)

#                 upconv3 = upconv(iconv4,   64, 3, 2) #H/4
#                 concat3 = tf.concat([upconv3, skip2, upred4], 3)
#                 iconv3  = conv(concat3,    64, 3, 1)
#                 pred3 = get_pred(iconv3)
#                 upred3  = upsample_nn(pred3, 2)

#                 upconv2 = upconv(iconv3,   32, 3, 2) #H/2
#                 concat2 = tf.concat([upconv2, skip1, upred3], 3)
#                 iconv2  = conv(concat2,    32, 3, 1)
#                 pred2 = get_pred(iconv2)
#                 upred2  = upsample_nn(pred2, 2)

#                 upconv1 = upconv(iconv2,  16, 3, 2) #H
#                 concat1 = tf.concat([upconv1, upred2], 3)
#                 iconv1  = conv(concat1,   16, 3, 1)
#                 pred1 = get_pred(iconv1)

            return skip5

def get_disp_resnet50(x):
    """Map features ``x`` to a single-channel disparity prediction.

    A 3x3, stride-1 ``conv`` with one output channel, sigmoid activation and
    no normaliser is scaled by ``DISP_SCALING_RESNET50`` and offset by 0.01.
    """
    squashed = conv(x, 1, 3, 1, activation_fn=tf.nn.sigmoid, normalizer_fn=None)
    return DISP_SCALING_RESNET50 * squashed + 0.01

def resblock(x, num_layers, num_blocks):
    """Chain ``num_blocks`` residual units; only the last one has stride 2.

    Args:
        x: input tensor.
        num_layers: base width handed to every ``resconv``.
        num_blocks: total number of residual units in the stage.

    Returns:
        The output of the final (stride-2) residual unit.
    """
    strides = [1] * (num_blocks - 1) + [2]
    out = x
    for unit_stride in strides:
        out = resconv(out, num_layers, unit_stride)
    return out

def resconv(x, num_layers, stride):
    """Bottleneck residual unit: 1x1 -> 3x3 (strided) -> 1x1 plus a shortcut.

    Args:
        x: input tensor.
        num_layers: width of the first two convolutions; the unit outputs
            ``4 * num_layers`` channels.
        stride: stride of the 3x3 convolution and of the projection shortcut.

    Returns:
        ``elu(residual + shortcut)``.
    """
    # Known quirk, kept on purpose: `tf.shape(x)[3] != num_layers` is always
    # true, so the projection shortcut is always taken. It is preserved for
    # consistency with Godard's implementation.
    needs_projection = tf.shape(x)[3] != num_layers or stride == 2
    expanded_width = 4 * num_layers

    reduced = conv(x, num_layers, 1, 1)
    spatial = conv(reduced, num_layers, 3, stride)
    # Last conv of the residual branch has no activation.
    residual = conv(spatial, expanded_width, 1, 1, None)

    shortcut = conv(x, expanded_width, 1, stride, None) if needs_projection else x
    return tf.nn.elu(residual + shortcut)

In [6]:
# Separate graph for measuring the ResNet-50 encoder defined above.
Gtest=tf.Graph()
with Gtest.as_default():
    # Input pipeline; the loader and its arguments are project code.
    loader = DataLoader(dataset_dir='../datasets/kitti_3frames_128_416',
                                img_height=128,
                                img_width=416,
                                batch_size=8,
                                num_scales=4,
                                num_source=2,
                                ext='jpg',
                                mode='train_dp')
    
    image_stack, image_stack_norm, proj_cam2pix, proj_pix2cam = loader.load_train_batch()
    # The stack is sliced along the last axis: 0:3 -> source 0, 3:6 -> target,
    # 6:9 -> source 1 (presumably three 3-channel frames -- confirm in DataLoader).
    tgt_image = image_stack[:, :, :, 3:6]
    src0_image = image_stack[:, :, :, 0:3]
    src1_image = image_stack[:, :, :, 6:9]
    src_image_stack = tf.concat([src0_image, src1_image], axis=3)

    # Same slicing for the normalised stack returned by the loader.
    tgt_image_norm = image_stack_norm[:, :, :, 3:6]
    src0_image_norm = image_stack_norm[:, :, :, 0:3]
    src1_image_norm = image_stack_norm[:, :, :, 6:9]
    src_image_stack_norm = tf.concat([src0_image_norm, src1_image_norm], axis=3)
    
    #Depth
    # Alternatives (D_Net above, ShuffleNetV2 below) are disabled; only the
    # ResNet-50 encoder is built in this graph.
#     tgt_pred_disp, tgt_disp_bottlenecks = D_Net(tgt_image_norm, weight_reg=0.05, is_training=True, reuse=False)
    skip = build_resnet50(tgt_image_norm, get_disp_resnet50, is_training=True, var_scope='depth_net', reuse=False)
#     D_Model = ShuffleNetV2(input_holder=tgt_image_norm, 
#                            var_scope='depth_net', 
#                            model_scale=1.0, 
#                            shuffle_group=2, 
#                            is_training=True)
#     tgt_pred_disp, tgt_disp_bottlenecks = D_Model.build_model()
    
    #Get layers
    # Trainable variables filtered by a regex on the variable name. The
    # decoder of build_resnet50 is commented out, so var_dec is empty and the
    # decode count below evaluates to 0.
    var_depth = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*(depth_net|feature_net_disp).*")))
    var_enc = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*encoding.*")))
    var_dec = list(set(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=".*decoding.*")))
    #Get param
    # Parameter count = sum over variables of the product of their shape.
    pc_depth = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_depth])
    pc_enc = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_enc])
    pc_dec = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in var_dec])

In [7]:
# Evaluate the parameter-count tensors of the test graph (reported in millions
# of parameters), then print its profiler FLOPs / trainable-parameter totals.
with tf.Session(graph=Gtest) as sess_d:
    print("[Info]  depth size: {:.5f}M".format(sess_d.run(pc_depth)/1000000.0))
    print("[Info] encode size: {:.5f}M".format(sess_d.run(pc_enc)/1000000.0))
    print("[Info] decode size: {:.5f}M".format(sess_d.run(pc_dec)/1000000.0))
stats_graph(Gtest)

[Info]  depth size: 38.04173M
[Info] encode size: 38.04173M
[Info] decode size: 0.00000M


4 ops no flops stats due to incomplete shapes.
4 ops no flops stats due to incomplete shapes.


FLOPs: 88737443928; Trainable params: 38041728
