# Speed comparison of new vs old conv2d implementation

In [1]:
import numpy as np

class Conv2D_new:
    """Stride-1, unpadded 2-D sliding-window layer computed as one gather plus one matmul.

    The constructor precomputes, for every output position, the flat indices of
    the input pixels that the kernel window covers. A call then gathers all
    windows at once and multiplies them with the kernel reshaped to a matrix.
    The kernel is applied without flipping (cross-correlation). Every window is
    materialised in memory, which is what buys the speed.

    Attributes:
        kernel: kernel reshaped to (k1*k2*c_in, c_out).
        bias: value added to the output.
        img_inds: integer array (h_out*w_out, k1*k2*c_in) of flat per-image indices.
        output_shape: (-1, h_out, w_out, c_out), used to reshape the result.
    """

    def __init__(self, kernel, bias, input_shape):
        """Precompute the gather indices for a fixed input geometry.

        Args:
            kernel: array of shape (k1, k2, c_in, c_out).
            bias: added to the output; must broadcast against
                (n, h_out, w_out, c_out).
            input_shape: (n, h, w, c) of the images that will be passed in.
                Only h, w and c are used; the batch size may vary per call.

        Raises:
            ValueError: if the channel counts disagree or the kernel window
                is larger than the image.
        """
        k1, k2, c_in, c_out = kernel.shape
        _, h, w, c = input_shape
        if c != c_in:
            raise ValueError(
                f"input has {c} channels but kernel expects {c_in}")
        if k1 > h or k2 > w:
            raise ValueError(
                f"kernel window {k1}x{k2} does not fit in a {h}x{w} image")
        h_out, w_out = h - k1 + 1, w - k2 + 1

        self.bias = bias

        # Template holding, at each pixel, that pixel's index in the flattened image.
        template = np.arange(h * w * c).reshape(h, w, c)
        # One window of indices per output position, rows first and then
        # columns (columns are bounded by the image *width*).
        windows = [template[y:y + k1, x:x + k2, :]
                   for y in range(h_out)
                   for x in range(w_out)]
        self.img_inds = np.asarray(windows).reshape(-1, k1 * k2 * c_in)
        # Same (k1, k2, c_in) flattening order as the windows above.
        self.kernel = kernel.reshape(k1 * k2 * c_in, c_out)
        self.output_shape = (-1, h_out, w_out, c_out)
        self._image_size = h * w * c

    def __call__(self, img):
        """Apply the layer to a batch of images.

        Args:
            img: array whose size is a multiple of h*w*c, normally shaped
                (n, h, w, c) with the h, w, c given to the constructor.

        Returns:
            Array of shape (n, h_out, w_out, c_out).
        """
        # Keep the batch axis so the per-image indices are applied to every image.
        flat = img.reshape(-1, self._image_size)
        windows = flat[:, self.img_inds]  # (n, h_out*w_out, k1*k2*c_in)
        out = windows @ self.kernel       # (n, h_out*w_out, c_out)
        return out.reshape(self.output_shape) + self.bias

class Conv2D_old:
    """Reference stride-1, unpadded 2-D sliding-window layer using explicit Python loops.

    Every output element is computed separately as the sum of the elementwise
    product between one image window and one output-channel slice of the
    kernel (no kernel flip). Deliberately left unvectorised: it is the slow
    baseline the faster implementation is compared against.
    """

    def __init__(self, kernel, bias):
        """Store the parameters.

        Args:
            kernel: array of shape (k1, k2, c_in, c_out).
            bias: added to the output; must broadcast against
                (n, h_out, w_out, c_out).
        """
        self.kernel = kernel
        self.bias = bias

    def __call__(self, img):
        """Apply the layer to a batch of images.

        Args:
            img: array of shape (n, h, w, c) with c equal to the kernel's c_in.

        Returns:
            Array of shape (n, h - k1 + 1, w - k2 + 1, c_out).
        """
        kernel = self.kernel
        n, h, w, _ = img.shape
        # Unpack both spatial kernel sizes separately so non-square kernels work.
        k1, k2, _, c_o = kernel.shape
        h_out, w_out = h - k1 + 1, w - k2 + 1
        res = np.zeros((n, h_out, w_out, c_o))
        for n_ in range(n):
            for c_out in range(c_o):
                for y in range(h_out):
                    # Columns are bounded by the image width, not its height.
                    for x in range(w_out):
                        snip = img[n_, y:y + k1, x:x + k2, :]
                        res[n_, y, x, c_out] = np.sum(snip * kernel[:, :, :, c_out])
        return res + self.bias

In [2]:
# Shared benchmark fixture: a batch of one 128x128 image with 3 channels and a
# 5x5 kernel mapping 3 input channels to 32 output channels.
img = np.random.rand(1,128,128,3)
kernel = np.random.rand(5,5,3,32)
input_shape = img.shape
# Bias is 0 for both layers. The new layer also needs the input shape up front
# because it builds its gather indices in the constructor.
conv2d_new = Conv2D_new(kernel, 0 , input_shape)
conv2d_old = Conv2D_old(kernel, 0)

In [3]:
%%timeit
# Time one forward pass of the precomputed-index implementation.
conv2d_new(img)

7.45 ms ± 346 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
%%timeit
# Time one forward pass of the loop-based baseline on the same image.
conv2d_old(img)

3.27 s ± 73.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


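The new implementation is substantially faster (about 7.5 ms versus about 3.3 s per call, roughly 440×). It is expected to use more memory, because it gathers every kernel-sized window of the image into one array before the matrix multiply; the cells below measure the memory use of both versions.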

In [5]:
# Requires the memory_profiler package; if it is missing, install it once with:
#   %pip install memory_profiler
# Loading the extension provides the %memit magic used below.
%load_ext memory_profiler

In [6]:
# Remove the timing fixtures from the notebook namespace; presumably so the
# memory measurements below are not affected by arrays left over from above.
del img, conv2d_new, conv2d_old, kernel, input_shape

In [9]:
def test_new():
    """Build a random 1x128x128x3 image and 5x5x3x32 kernel, then run Conv2D_new once.

    Everything is created inside the function so a memory profiler sees the
    full cost of constructing and calling the layer. Returns nothing.
    """
    image = np.random.rand(1, 128, 128, 3)
    weights = np.random.rand(5, 5, 3, 32)
    layer = Conv2D_new(weights, 0, image.shape)
    layer(image)
def test_old():
    """Build a random 1x128x128x3 image and 5x5x3x32 kernel, then run Conv2D_old once.

    Everything is created inside the function so a memory profiler sees the
    full cost of constructing and calling the layer. Returns nothing.
    """
    image = np.random.rand(1, 128, 128, 3)
    weights = np.random.rand(5, 5, 3, 32)
    layer = Conv2D_old(weights, 0)
    layer(image)

Old function memory usage:
peak memory: 103.09 MiB, increment: 0.00 MiB

New function memory usage:
peak memory: 103.04 MiB, increment: 0.00 MiB


In [11]:

# Measure peak memory for one full run of the loop-based baseline.
print('Old function memory usage:')
%memit test_old()

Old function memory usage:
peak memory: 88.95 MiB, increment: 7.18 MiB


In [17]:
# Measure peak memory for one full run of the precomputed-index implementation.
print('\nNew function memory usage:')
%memit test_new()


New function memory usage:
peak memory: 81.82 MiB, increment: 0.09 MiB
