### This Jupyter notebook compares the performance of a naive convolution (using Python loops), a convolution built from numpy's einsum with stride tricks, and TensorFlow's convolution.

In [3]:
import numpy as np
import tensorflow as tf

# TF 1.x must be switched into eager mode explicitly, once, at program start.
# TF 2.x is eager by default and no longer exposes tf.enable_eager_execution
# at the top level, so only enable it when it is not already active.
if not tf.executing_eagerly():
    tf.compat.v1.enable_eager_execution()

##### 1. Start with a small input X for the convolutional layer. X has shape (batch_size, input_height, input_width, input_channel)

In [4]:
# Small random input laid out as (batch_size, input_height, input_width, input_channel).
X = np.random.normal(size=(2, 12, 12, 3))

In [5]:
X

array([[[[-8.42369238e-02, -8.39172567e-02, -2.06087593e+00],
         [-1.27462006e+00, -3.91472839e-01, -3.90692634e-01],
         [-8.99637890e-01, -2.97561931e-02,  3.01073527e+00],
         [-2.21441019e-02,  3.82271977e-01,  7.93727604e-01],
         [-7.52311861e-01, -1.15706901e+00,  1.57118770e-01],
         [ 6.86956964e-01, -4.78831352e-01,  4.84364239e-02],
         [ 9.76102260e-01, -5.18100850e-02,  4.66845023e-01],
         [-3.11858427e-01,  4.02836370e-01, -3.56317523e-01],
         [-8.98159085e-01,  3.43734062e-01,  3.14688252e-01],
         [-7.07130431e-01,  1.63266961e+00,  1.53487439e+00],
         [-5.08881310e-01,  1.52238242e-01, -7.85785830e-01],
         [-1.91721209e+00,  6.15122984e-01, -2.01891347e+00]],

        [[ 1.54992872e+00, -1.36809721e+00,  1.05062608e+00],
         [ 1.50536314e-01, -7.94081962e-01,  2.80683782e-01],
         [-9.06299933e-01,  2.61347914e+00, -7.14962048e-01],
         [ 1.02651045e+00,  8.99745467e-01, -2.60619929e-01],
      

#### 2. The kernel has shape (kernel_height, kernel_width, input_channel, output_channel). We assume stride = 1 and padding = "VALID".

In [6]:
# Convolution hyper-parameters shared by the cells below.
kernel_height, kernel_width = 3, 3  # spatial extent of each filter
output_channel = 16                 # number of output channels of the kernel
# `stride` is read by einsum_conv; `padding` is not referenced by any function
# visible in this notebook (tf_conv passes the literal "VALID").
stride, padding = 1, "VALID"

In [7]:
# Random kernel of shape (kernel_height, kernel_width, input_channel, output_channel).
# The input-channel count is taken from X instead of being hard-coded, so the
# kernel stays compatible if the channel dimension of X is changed.
kernel = np.random.normal(size=(kernel_height, kernel_width, X.shape[-1], output_channel))

#### 3. Define a function that splits X into sliding windows for use with numpy's einsum

What does this function do? It computes the height and width of the output based on the stride (with "VALID" padding) and transforms the 4D input tensor of shape `(batch_size, input_height, input_width, input_channel)` into a 6D tensor of shape `(batch_size, output_height, output_width, filter_height, filter_width, input_channel)`. This transformation is what makes it possible to express the convolution as a single einsum.

In [8]:
def _split_X(X, filter_size, stride):
    """
    Preprocess input X to avoid for-loop.
    """
    m, iW, iH, iC = X.shape
    fW, fH = filter_size
    oW = int((iW - fW)/stride + 1)
    oH = int((iH - fH)/stride + 1)
    batch_strides, width_strides, height_strides, channel_strides = X.strides
    view_shape = (m, oW, oH, fW, fH, iC)
    X = np.lib.stride_tricks.as_strided(X, shape=view_shape, strides=(batch_strides, stride*width_strides, 
                                                                      stride*height_strides, width_strides, 
                                                                      height_strides, channel_strides), writeable=False)
    return X

#### 4. Intuitive approach by using for loop

In [9]:
def naive_conv(X, kernel, stride=1):
    """
    Reference "VALID" convolution written with plain Python loops.

    Parameters
    ----------
    X : np.ndarray
        4-D input laid out as (batch, height, width, in_channel).
    kernel : np.ndarray
        4-D kernel laid out as (filter_height, filter_width, in_channel,
        out_channel).
    stride : int, optional
        Step between consecutive windows along both spatial axes.
        Defaults to 1, which reproduces the original behaviour.

    Returns
    -------
    np.ndarray
        Array of shape (batch, out_height, out_width, out_channel) where
        out = (in - filter) // stride + 1.

    Raises
    ------
    ValueError
        If the channel counts of X and kernel differ, if stride < 1, or if
        the filter is larger than the input along a spatial axis.
    """
    m, iH, iW, iC = X.shape
    fH, fW, kC, fC = kernel.shape
    # Without this check a channel count of 1 on either side would broadcast
    # silently and produce a wrong result instead of an error.
    if iC != kC:
        raise ValueError(
            "input has {} channels but kernel expects {}".format(iC, kC))
    if stride < 1:
        raise ValueError("stride must be >= 1, got {}".format(stride))
    if fH > iH or fW > iW:
        raise ValueError(
            "filter size {} does not fit into input of spatial size {}".format(
                (fH, fW), (iH, iW)))

    oH = (iH - fH) // stride + 1
    oW = (iW - fW) // stride + 1
    out = np.zeros(shape=(m, oH, oW, fC))
    # Deliberately kept as explicit loops: this is the slow baseline.
    for f in range(fC):
        for i in range(m):
            for j in range(oH):
                top = j * stride
                for k in range(oW):
                    left = k * stride
                    window = X[i, top:top + fH, left:left + fW, :]
                    out[i, j, k, f] = np.sum(window * kernel[:, :, :, f])
    return out

#### 5. Convolution with einsum of numpy

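[np.einsum](https://docs.scipy.org/doc/numpy/reference/generated/numpy.einsum.html) is a powerful function that numpy has implemented for us. In the subscripts `"bwhijk,ijkl->bwhl"` used below, `b` is the batch index, `w`/`h` are the output positions, `i`/`j` are the offsets inside the filter window, `k` is the input channel and `l` is the output channel; the indices `i`, `j`, `k` do not appear in the output, so they are summed over.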

In [10]:
def einsum_conv(X, kernel, stride=1):
    """
    "VALID" convolution expressed as a single einsum over sliding windows.

    Parameters
    ----------
    X : np.ndarray
        4-D input laid out as (batch, height, width, in_channel).
    kernel : np.ndarray
        4-D kernel laid out as (filter_height, filter_width, in_channel,
        out_channel).
    stride : int, optional
        Step between consecutive windows. Defaults to 1, the value the
        notebook assumes for every implementation being compared.

    Returns
    -------
    np.ndarray
        Array of shape (batch, out_height, out_width, out_channel).
    """
    # The window size comes from the kernel that was actually passed in rather
    # than from notebook-level globals, so the function cannot silently
    # disagree with its own argument.
    windows = _split_X(X, kernel.shape[:2], stride)
    # i, j (offset inside the window) and k (input channel) are summed out.
    return np.einsum("bwhijk,ijkl->bwhl", windows, kernel)

In [11]:
def tf_conv(X, kernel):
    """
    Run TensorFlow's conv2d on the CPU with stride 1 and "VALID" padding.

    X and kernel are forwarded unchanged to tf.nn.conv2d, and its result is
    returned as-is.
    """
    # Pin the op to the CPU device for the duration of the call.
    with tf.device("/cpu:0"):
        result = tf.nn.conv2d(X, kernel, strides=1, padding="VALID")
    return result

In [12]:
%timeit naive_conv(X, kernel)

16 ms ± 83.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%timeit einsum_conv(X, kernel)

172 µs ± 607 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [14]:
%timeit tf_conv(X, kernel)

143 µs ± 24.6 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Correctness check: the loop-based result must agree numerically with TensorFlow's conv2d.
np.allclose(naive_conv(X, kernel), tf_conv(X, kernel))

True

In [16]:
# Correctness check: the einsum-based result must agree numerically with TensorFlow's conv2d.
np.allclose(einsum_conv(X, kernel), tf_conv(X, kernel))

True

### Try for a larger X

In [17]:
# Larger input: batch of 32, spatial size 28x28, 3 channels. `kernel` is not
# redefined here, so the timings below reuse the kernel created earlier.
X = np.random.normal(size=(32, 28, 28, 3))

In [18]:
%timeit naive_conv(X, kernel)

1.75 s ± 34.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%timeit einsum_conv(X, kernel)

18.1 ms ± 90.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
%timeit tf_conv(X, kernel)

1.74 ms ± 47.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


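##### We can see that the einsum convolution is ~100 times faster than the naive convolution written with pure Python loops (18.1 ms vs 1.75 s), while TensorFlow's convolution is roughly another 10 times faster (1.74 ms) on this larger input.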