In [1]:
import numpy as np
import tensorflow as tf

<h3>Kinetics-i3d video action classifier</h3>

See https://github.com/deepmind/kinetics-i3d

In [3]:
# after i3d.py

pad_same = "SAME"
pad_valid = "VALID"

class Unit3D(tf.Module):
    #Basic unit containing Conv3D + BatchNorm + non-linearity.

    def __init__(self, output_channels,
                   kernel_shape = [1,1,1],
                   stride=[1, 1, 1],
                   activation_fn=tf.nn.relu,
                   use_batch_norm=True,
                   use_bias=False,
                   name='unit_3d'):
        super(Unit3D, self).__init__()
        self._output_channels = output_channels
        self._kernel_shape = kernel_shape
        self._padding = pad_same
        self._stride = [1] + stride + [1]
        self._use_batch_norm = use_batch_norm
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self._name = name

        # vardict refers to a global dictionary of tf.Variables loaded from the file containing the weights
        if self._use_batch_norm:
            self.bn_beta = vardict[self._name + "/batch_norm/beta"]
            self.bn_moving_mean = vardict[self._name + "/batch_norm/moving_mean"]
            self.bn_moving_variance = vardict[self._name + "/batch_norm/moving_variance"]
        self.conv_w = vardict[self._name + "/conv_3d/w"]
        if self._use_bias:
            self.conv_b = vardict[self._name+"/conv_3d/b"]
        
    def __call__(self, inputs, is_training):
         # input shape is [batch, depth, height, width, channels]
        net = tf.nn.conv3d(inputs, filters=self.conv_w, strides=self._stride, padding=self._padding)
        if self._use_bias:
            net = tf.nn.bias_add(net, self.conv_b)
        if self._use_batch_norm:
            net = tf.nn.batch_normalization(net, 
                                            self.bn_moving_mean, 
                                            self.bn_moving_variance, 
                                            self.bn_beta, 
                                            scale=1, 
                                            variance_epsilon=0.01)
        if self._activation_fn is not None:
            net = self._activation_fn(net)
        return net

class InceptionI3d(tf.Module):
#  """Inception-v1 I3D architecture.

#  The model is introduced in:

#    Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
#    Joao Carreira, Andrew Zisserman
#    https://arxiv.org/pdf/1705.07750v1.pdf.

#  See also the Inception architecture, introduced in:

#    Going deeper with convolutions
#    Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
#    Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
#    http://arxiv.org/pdf/1409.4842v1.pdf.
#  """

  # Endpoints of the model in order. During construction, all the endpoints up
  # to a designated `final_endpoint` are returned in a dictionary as the
  # second return value.
    VALID_ENDPOINTS = (
      'Conv3d_1a_7x7',
      'MaxPool3d_2a_3x3',
      'Conv3d_2b_1x1',
      'Conv3d_2c_3x3',
      'MaxPool3d_3a_3x3',
      'Mixed_3b',
      'Mixed_3c',
      'MaxPool3d_4a_3x3',
      'Mixed_4b',
      'Mixed_4c',
      'Mixed_4d',
      'Mixed_4e',
      'Mixed_4f',
      'MaxPool3d_5a_2x2',
      'Mixed_5b',
      'Mixed_5c',
      'Logits',
      'Predictions',
  )
    
    # In the paper referenced above, notations are made of the receptive field after each pooling layer, i.e. the
    # size of the input data that each of its outputs depends on. They are (time is the first dim listed):
    #   MaxPool3d_2a_3x3   7x11x11
    #   MaxPool3d_3a_3x3   11x27x27
    #   MaxPool3d_4a_3x3   23x75x75
    #   MaxPool3d_5a_2x2   59x219x219
    #   AvgPool3d_2x7x7    99x539x539
    # The AvgPool layer is immediately prior to the logits (which are linear combinations of its outputs).
    # Since the net uses only convolutional layers the dimensions of its input are not fixed (they can even vary
    # between calls, since the net calls tf.nn.conv3d directly rather than keras.layers.Conv3D).
    

    def __init__(self, var_prefix='RGB', num_classes=400, spatial_squeeze=True,
               final_endpoint='Logits', name='inception_i3d'):
#    """Initializes I3D model instance.

#    Args:
#      num_classes: The number of outputs in the logit layer (default 400, which
#          matches the Kinetics dataset).
#      spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
#          before returning (default True).
#      final_endpoint: The model contains many possible endpoints.
#          `final_endpoint` specifies the last endpoint for the model to be built
#          up to. In addition to the output at `final_endpoint`, all the outputs
#          at endpoints up to `final_endpoint` will also be returned, in a
#          dictionary. `final_endpoint` must be one of
#          InceptionI3d.VALID_ENDPOINTS (default 'Logits').
#      name: A string (optional). The name of this module.
#    Raises:
#      ValueError: if `final_endpoint` is not recognized.
#    """

        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % final_endpoint)

        super(InceptionI3d, self).__init__(name=name)
        self._num_classes = num_classes
        self._spatial_squeeze = spatial_squeeze
        self._final_endpoint = final_endpoint
        self._var_prefix = var_prefix

        # except for the first and last entries here all this (output channels and kernel shape) is already implicit 
        # in the weights passed to the modules. the important part is the correspondence between modules and the 
        # names of variables contained in the checkpoint data
        arg_dict = {'Conv3d_1a_7x7' : {'output_channels': 64, 'kernel_shape' : [7,7,7], 'stride' : [2,2,2]}, 
                     'Conv3d_2b_1x1' : {'output_channels' : 64, 'kernel_shape' : [1, 1, 1]},
                     'Conv3d_2c_3x3' : {'output_channels' : 192, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_3b/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 64, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_3b/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 96, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_3b/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 128, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_3b/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 16, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_3b/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 32, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_3b/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 32, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_3c/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 128, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_3c/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 128, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_3c/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 192, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_3c/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 32, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_3c/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 96, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_3c/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 64, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4b/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 192, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4b/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 96, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4b/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 208, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4b/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 16, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4b/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 48, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4b/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 64, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4c/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 160, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4c/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 112, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4c/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 224, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4c/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 24, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4c/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 64, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4c/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 64, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4d/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 128, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4d/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 128, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4d/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 256, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4d/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 24, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4d/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 64, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4d/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 64, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4e/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 112, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4e/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 144, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4e/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 288, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4e/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 32, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4e/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 64, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4e/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 64, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4f/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 256, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4f/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 160, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4f/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 320, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4f/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 32, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_4f/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 128, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_4f/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 128, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_5b/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 256, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_5b/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 160, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_5b/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 320, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_5b/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 32, 'kernel_shape' : [1, 1, 1]},
                    # typo here: in other modules the name is Branch2/Conv3d_0b_3x3 !
                     'Mixed_5b/Branch_2/Conv3d_0a_3x3' : {'output_channels' : 128, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_5b/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 128, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_5c/Branch_0/Conv3d_0a_1x1' : {'output_channels' : 384, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_5c/Branch_1/Conv3d_0a_1x1' : {'output_channels' : 192, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_5c/Branch_1/Conv3d_0b_3x3' : {'output_channels' : 384, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_5c/Branch_2/Conv3d_0a_1x1' : {'output_channels' : 48, 'kernel_shape' : [1, 1, 1]},
                     'Mixed_5c/Branch_2/Conv3d_0b_3x3' : {'output_channels' : 128, 'kernel_shape' : [3, 3, 3]},
                     'Mixed_5c/Branch_3/Conv3d_0b_1x1' : {'output_channels' : 128, 'kernel_shape' : [1, 1, 1]},
                     'Logits/Conv3d_0c_1x1' : {'output_channels' : self._num_classes, 'kernel_shape' : [1, 1, 1],
                                              'activation_fn' : None, 'use_batch_norm' : False, 'use_bias' : True}
                    }
        
        self.module_dict = {}
        model_prefix = self._var_prefix + "/" + self.name + "/" 
        for module_name in list(arg_dict.keys()):
            self.module_dict[module_name] = Unit3D(**arg_dict[module_name],
                                              name = model_prefix+module_name)

    def __call__(self, inputs, is_training, dropout_prob=0.0):
#    """Connects the model to inputs.

#    Args:
#      inputs: Inputs to the model, which should have dimensions
#          `batch_size` x `num_frames` x 224 x 224 x `num_channels`.
#      is_training: whether to use training mode for snt.BatchNorm (boolean).
#      dropout_prob: Probability for the tf.nn.dropout layer (float in
#          [0, 1)).

#    Returns:
#      A tuple consisting of:
#        1. Network output at location `self._final_endpoint`.
#        2. Dictionary containing all endpoints up to `self._final_endpoint`,
#           indexed by endpoint name.

#    Raises:
#      ValueError: if `self._final_endpoint` is not recognized.
#    """
        if self._final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % self._final_endpoint)

        net = inputs
        end_points = {}
        end_point = 'Conv3d_1a_7x7'        
        net = self.module_dict[end_point](net, is_training=is_training)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points
        
        end_point = 'MaxPool3d_2a_3x3'
        net = tf.nn.max_pool3d(net, ksize=[1, 1, 3, 3, 1], strides=[1, 1, 2, 2, 1],
                               padding=pad_same, name=end_point)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points
        end_point = 'Conv3d_2b_1x1'
        net = self.module_dict[end_point](net, is_training=is_training)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points
        end_point = 'Conv3d_2c_3x3'
        net = self.module_dict[end_point](net, is_training=is_training)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points
        end_point = 'MaxPool3d_3a_3x3'
        net = tf.nn.max_pool3d(net, ksize=[1, 1, 3, 3, 1], strides=[1, 1, 2, 2, 1],
                               padding=pad_same, name=end_point)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_3b'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)

        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_3c'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'MaxPool3d_4a_3x3'
        net = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1], strides=[1, 2, 2, 2, 1],
                               padding=pad_same, name=end_point)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_4b'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_4c'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_4d'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_4e'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_4f'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'MaxPool3d_5a_2x2'
        net = tf.nn.max_pool3d(net, ksize=[1, 2, 2, 2, 1], strides=[1, 2, 2, 2, 1],
                               padding=pad_same, name=end_point)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_5b'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        # typo here: in other modules the name is Branch2/Conv3d_0b_3x3 !
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Mixed_5c'
        branch_0 = self.module_dict[end_point+'/Branch_0/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_1 = self.module_dict[end_point+'/Branch_1/Conv3d_0b_3x3'](branch_1, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0a_1x1'](net, is_training=is_training)
        branch_2 = self.module_dict[end_point+'/Branch_2/Conv3d_0b_3x3'](branch_2, is_training=is_training)
        branch_3 = tf.nn.max_pool3d(net, ksize=[1, 3, 3, 3, 1],
                                            strides=[1, 1, 1, 1, 1], padding=pad_same,
                                            name='MaxPool3d_0a_3x3')
        branch_3 = self.module_dict[end_point+'/Branch_3/Conv3d_0b_1x1'](branch_3, is_training=is_training)
        net = tf.concat([branch_0, branch_1, branch_2, branch_3], 4)
        end_points[end_point] = net
        if self._final_endpoint == end_point: return net, end_points

        end_point = 'Logits'
        net = tf.nn.avg_pool3d(net, ksize=[1, 2, 7, 7, 1],
                                 strides=[1, 1, 1, 1, 1], padding=pad_valid)
        net = tf.nn.dropout(net, dropout_prob)
        logits = self.module_dict[end_point+'/Conv3d_0c_1x1'](net, is_training=is_training)
        if self._spatial_squeeze:
            logits = tf.squeeze(logits, [2, 3], name='SpatialSqueeze')
        averaged_logits = tf.reduce_mean(logits, axis=1)
        end_points[end_point] = averaged_logits
        if self._final_endpoint == end_point: return averaged_logits, end_points

        end_point = 'Predictions'
        predictions = tf.nn.softmax(averaged_logits)
        end_points[end_point] = predictions
        return predictions, end_points

In [4]:
# after evaluate_sample.py

_SAMPLE_VIDEO_FRAMES = 79
_SAMPLE_PATHS = {
    'rgb': 'i3d/data/v_CricketShot_g04_c01_rgb.npy',
    'flow': 'i3d/data/v_CricketShot_g04_c01_flow.npy',
}

_CHECKPOINT_PATHS = {
    'rgb': 'i3d/data/checkpoints/rgb_scratch/model.ckpt',
    'rgb600': 'i3d/data/checkpoints/rgb_scratch_kin600/model.ckpt',
    'flow': 'i3d/data/checkpoints/flow_scratch/model.ckpt',
    'rgb_imagenet': 'i3d/data/checkpoints/rgb_imagenet/model.ckpt',
    'flow_imagenet': 'i3d/data/checkpoints/flow_imagenet/model.ckpt',
}

_LABEL_MAP_PATH = 'i3d/data/label_map.txt'
_LABEL_MAP_PATH_600 = 'i3d/data/label_map_600.txt'
kinetics_classes = [x.strip() for x in open(_LABEL_MAP_PATH)]

# if you turn this on, tf will treat all the weights of this network (about 25 million floats altogether) as trainable
fine_tuning = False

In [5]:
# this list has the names of all the weights in the Flow net and their shapes
flow_varlist = tf.train.list_variables(_CHECKPOINT_PATHS['flow_imagenet'])
flow_varlist

[('Flow/inception_i3d/Conv3d_1a_7x7/batch_norm/beta', [64]),
 ('Flow/inception_i3d/Conv3d_1a_7x7/batch_norm/moving_mean', [64]),
 ('Flow/inception_i3d/Conv3d_1a_7x7/batch_norm/moving_variance', [64]),
 ('Flow/inception_i3d/Conv3d_1a_7x7/conv_3d/w', [7, 7, 7, 2, 64]),
 ('Flow/inception_i3d/Conv3d_2b_1x1/batch_norm/beta', [64]),
 ('Flow/inception_i3d/Conv3d_2b_1x1/batch_norm/moving_mean', [64]),
 ('Flow/inception_i3d/Conv3d_2b_1x1/batch_norm/moving_variance', [64]),
 ('Flow/inception_i3d/Conv3d_2b_1x1/conv_3d/w', [1, 1, 1, 64, 64]),
 ('Flow/inception_i3d/Conv3d_2c_3x3/batch_norm/beta', [192]),
 ('Flow/inception_i3d/Conv3d_2c_3x3/batch_norm/moving_mean', [192]),
 ('Flow/inception_i3d/Conv3d_2c_3x3/batch_norm/moving_variance', [192]),
 ('Flow/inception_i3d/Conv3d_2c_3x3/conv_3d/w', [3, 3, 3, 64, 192]),
 ('Flow/inception_i3d/Logits/Conv3d_0c_1x1/conv_3d/b', [400]),
 ('Flow/inception_i3d/Logits/Conv3d_0c_1x1/conv_3d/w', [1, 1, 1, 1024, 400]),
 ('Flow/inception_i3d/Mixed_3b/Branch_0/Conv3d_0a

In [6]:
flow_vardict = {}
# make variables to load the saved weights into
for variable in flow_varlist:
    flow_vardict[variable[0]] = tf.Variable(initial_value = np.zeros(variable[1], dtype=np.float32),
                                            shape=tf.TensorShape(variable[1]),
                                            trainable=fine_tuning,
                                            name=variable[0])
    
flow_saver = tf.compat.v1.train.Saver(var_list=flow_vardict)
flow_saver.restore(sess=None, save_path=_CHECKPOINT_PATHS['flow_imagenet'])
# the warning message here is irrelevant

INFO:tensorflow:Restoring parameters from i3d/data/checkpoints/flow_imagenet/model.ckpt


In [7]:
rgb_varlist = tf.train.list_variables("i3d/data/checkpoints/rgb_imagenet/model.ckpt")
rgb_varlist

[('RGB/inception_i3d/Conv3d_1a_7x7/batch_norm/beta', [64]),
 ('RGB/inception_i3d/Conv3d_1a_7x7/batch_norm/moving_mean', [64]),
 ('RGB/inception_i3d/Conv3d_1a_7x7/batch_norm/moving_variance', [64]),
 ('RGB/inception_i3d/Conv3d_1a_7x7/conv_3d/w', [7, 7, 7, 3, 64]),
 ('RGB/inception_i3d/Conv3d_2b_1x1/batch_norm/beta', [64]),
 ('RGB/inception_i3d/Conv3d_2b_1x1/batch_norm/moving_mean', [64]),
 ('RGB/inception_i3d/Conv3d_2b_1x1/batch_norm/moving_variance', [64]),
 ('RGB/inception_i3d/Conv3d_2b_1x1/conv_3d/w', [1, 1, 1, 64, 64]),
 ('RGB/inception_i3d/Conv3d_2c_3x3/batch_norm/beta', [192]),
 ('RGB/inception_i3d/Conv3d_2c_3x3/batch_norm/moving_mean', [192]),
 ('RGB/inception_i3d/Conv3d_2c_3x3/batch_norm/moving_variance', [192]),
 ('RGB/inception_i3d/Conv3d_2c_3x3/conv_3d/w', [3, 3, 3, 64, 192]),
 ('RGB/inception_i3d/Logits/Conv3d_0c_1x1/conv_3d/b', [400]),
 ('RGB/inception_i3d/Logits/Conv3d_0c_1x1/conv_3d/w', [1, 1, 1, 1024, 400]),
 ('RGB/inception_i3d/Mixed_3b/Branch_0/Conv3d_0a_1x1/batch_norm

In [8]:
rgb_vardict = {}
for variable in rgb_varlist:
    rgb_vardict[variable[0]] = tf.Variable(initial_value = np.zeros(variable[1], dtype=np.float32),
                                           shape=tf.TensorShape(variable[1]), 
                                           trainable=fine_tuning,
                                           name=variable[0])
rgb_saver = tf.compat.v1.train.Saver(var_list=rgb_vardict, reshape=True)
rgb_saver.restore(sess=None, save_path=_CHECKPOINT_PATHS['rgb_imagenet'])

INFO:tensorflow:Restoring parameters from i3d/data/checkpoints/rgb_imagenet/model.ckpt


In [9]:
# now vardict will contain all the weights
vardict = {}
vardict.update(rgb_vardict)
vardict.update(flow_vardict)
vardict

{'RGB/inception_i3d/Conv3d_1a_7x7/batch_norm/beta': <tf.Variable 'RGB/inception_i3d/Conv3d_1a_7x7/batch_norm/beta:0' shape=(64,) dtype=float32, numpy=
 array([ 1.8795129 , -1.3898429 ,  1.8669838 ,  0.28919575,  0.27221254,
         1.6464163 ,  2.087254  ,  2.117959  , -0.8038748 ,  2.4320054 ,
         1.759102  ,  1.7645339 ,  1.465036  ,  1.3237163 ,  1.9335492 ,
        -1.3406926 ,  1.78503   ,  2.4907627 ,  2.5260837 ,  2.207292  ,
         0.27849734,  2.168709  , -0.8655198 ,  2.1681845 ,  2.3320682 ,
         1.4199437 ,  1.9736975 ,  0.37483513, -1.1506578 ,  0.368478  ,
         2.23737   , -1.1911454 , -0.07817524,  2.0920424 ,  4.7544007 ,
        -0.08812667, -0.7181662 , -0.39324987, -0.8849777 ,  2.2050202 ,
        -0.9242949 ,  5.6755657 ,  2.2218184 , -0.6854658 , -0.4371273 ,
         1.0771866 ,  1.609359  ,  1.3407111 , -0.8547968 , -1.3461387 ,
         0.29010373, -0.57551354,  2.100426  ,  2.03961   ,  2.1551318 ,
         2.4727721 ,  2.235383  , -0.77025265,

In [10]:
_NUM_CLASSES = 400 # size of the output ('logits') layer
rgb_model = InceptionI3d(var_prefix='RGB', num_classes=_NUM_CLASSES, spatial_squeeze=True, final_endpoint='Logits')
flow_model = InceptionI3d(var_prefix='Flow', num_classes=_NUM_CLASSES, spatial_squeeze=True, final_endpoint='Logits')

In [11]:
# the provided sample data to test the network. the expected output is recorded in i3d/out. as you'll see it varies
# slightly from what I obtain here. it's possible some part of the network is incorrectly configured.
cricket_sample_rgb = np.load("i3d/data/v_CricketShot_g04_c01_rgb.npy")
cricket_sample_flow = np.load("i3d/data/v_CricketShot_g04_c01_flow.npy")
results_rgb = rgb_model(cricket_sample_rgb, False)
results_flow = flow_model(cricket_sample_flow, False)

In [13]:
rgb_logits = results_rgb[0][0] 
flow_logits = results_flow[0][0]
# the ensemble is defined by averaging the logits of the two nets
joint_logits = rgb_logits+flow_logits
rgb_predictions= tf.nn.softmax(rgb_logits)
flow_predictions = tf.nn.softmax(flow_logits)
joint_predictions = tf.nn.softmax(joint_logits)

In [22]:
print('RGB logits norm: ', np.linalg.norm(rgb_logits))
sorted_indices = np.argsort(rgb_predictions)[::-1]
print('\nTop classes and probabilities')
for index in sorted_indices[:20]:
    print(rgb_predictions[index].numpy(), rgb_logits[index].numpy(), kinetics_classes[index])

RGB logits norm:  82.22748

Top classes and probabilities
0.99999356 24.439657 playing cricket
1.867471e-06 11.248738 playing kickball
1.0207867e-06 10.644727 catching or throwing baseball
7.317671e-07 10.31186 shooting goal (soccer)
3.890843e-07 9.680194 catching or throwing softball
2.9659031e-07 9.40875 throwing discus
2.812762e-07 9.355735 golf putting
2.2379594e-07 9.127132 javelin throw
1.8174018e-07 8.918976 hitting baseball
1.6635353e-07 8.830513 jogging
1.5847925e-07 8.7820215 triple jump
9.767191e-08 8.298012 hurling (sport)
7.101792e-08 7.97933 skateboarding
5.3876825e-08 7.703098 playing tennis
5.086984e-08 7.645668 golf driving
4.8518956e-08 7.598353 breakdancing
4.6196377e-08 7.5492992 hurdling
4.388894e-08 7.49806 shot put
4.2333035e-08 7.4619656 hammer throw
4.1440746e-08 7.440663 headbutting


In [24]:
print('Flow logits norm: ', np.linalg.norm(flow_logits))
sorted_indices = np.argsort(flow_predictions)[::-1]
print('\nTop classes and probabilities')
for index in sorted_indices[:20]:
    print(flow_predictions[index].numpy(), flow_logits[index].numpy(), kinetics_classes[index])

Flow logits norm:  60.69956

Top classes and probabilities
0.93996334 15.352985 playing cricket
0.047061183 12.358593 hurling (sport)
0.00417177 9.935485 playing tennis
0.002486601 9.418061 playing squash or racquetball
0.0018447473 9.119487 hitting baseball
0.0010174264 8.524421 catching or throwing baseball
0.0008785112 8.377618 sword fighting
0.00086664734 8.364021 catching or throwing softball
0.00031376272 7.3480268 hammer throw
0.00023573413 7.0620937 playing badminton
8.246747e-05 6.0117936 tai chi
8.0938684e-05 5.993081 pumping fist
5.786033e-05 5.657421 training dog
5.1758932e-05 5.545986 golf putting
5.0176906e-05 5.5149436 catching or throwing frisbee
4.6635956e-05 5.441761 shot put
4.3011554e-05 5.360858 celebrating
4.06844e-05 5.305234 throwing ball
3.935577e-05 5.2720323 pole vault
3.292958e-05 5.0937605 shooting goal (soccer)


In [25]:
print('Joint logits norm: ', np.linalg.norm(joint_logits))
sorted_indices = np.argsort(joint_predictions)[::-1]
print('\nTop classes and probabilities')
for index in sorted_indices[:20]:
    print(joint_predictions[index].numpy(), joint_logits[index].numpy(), kinetics_classes[index])

Joint logits norm:  131.71538

Top classes and probabilities
1.0 39.79264 playing cricket
4.8901843e-09 20.656605 hurling (sport)
1.1049197e-09 19.169147 catching or throwing baseball
3.5873954e-10 18.044216 catching or throwing softball
3.566818e-10 18.038464 hitting baseball
2.391193e-10 17.638582 playing tennis
5.1048447e-11 16.094395 playing kickball
3.053473e-11 15.580484 playing squash or racquetball
2.5636087e-11 15.405621 shooting goal (soccer)
1.548857e-11 14.901721 golf putting
1.4131018e-11 14.809992 hammer throw
6.6411768e-12 14.054908 throwing discus
6.531621e-12 14.038275 javelin throw
2.7994696e-12 13.19105 pumping fist
2.1775526e-12 12.939821 shot put
1.7227076e-12 12.705517 celebrating
1.4000173e-12 12.498103 applauding
7.030067e-13 11.80923 throwing ball
5.100975e-13 11.488466 dodgeball
4.9124886e-13 11.450815 breakdancing
