Author: Ethan Herron 3-4-2020

This is my jupyter notebook for the book Deep Learning from Scratch by Seth Weidman. I will be following along with all of the code, and adding insights or questions I have along the way in these markdown cells.

In [2]:
#import libraries
import numpy as np
from numpy import ndarray

### Helper objects

In [3]:
def assert_same_shape(output: ndarray, output_grad: ndarray):
    
    assert output.shape == output_grad.shape, \
    '''
    Two ndarray should have the same shape
    '''.format(tuple(output_grad.shape), tuple(output.shape))
    return None

In [4]:
def assert_dim(t: ndarray, dim: ndarray):
    
    assert len(t.shape) == dim, \
    '''
    Tensor expected to have dimension{0}, instead has dimension {1}
    '''.format(dim, len(t.shape))
    return None    

# 1D Convolution
1 input, 1 output

## Padding

In [5]:
input_1d = np.array([1,2,3,4,5])
param_1d = np.array([1,1,1])

In [6]:
def _pad_1d(inp: ndarray,
            num: int) -> ndarray:
    z = np.array([0])
    z = np.repeat(z, num)
    return np.concatenate([z, inp, z])

In [7]:
_pad_1d(input_1d, 1)

array([0, 1, 2, 3, 4, 5, 0])

## Forward

In [8]:
def conv_1d(inp: ndarray,
            param: ndarray) -> ndarray:
    
    #assert correct dimensions
    assert_dim(inp, 1)
    assert_dim(param, 1)
    
    #pad the input
    param_len = param.shape[0]
    param_mid = param_len // 2
    inp_pad = _pad_1d(inp, param_mid)
    
    #initialize the output
    out = np.zeros(inp.shape)
    
    #perform the 1d convolution
    for o in range(out.shape[0]):
        for p in range(param_len):
            out[o] += param[p] * inp_pad[o+p]
            
    #ensure shapes didn't change
    assert_same_shape(inp, out)
    
    return out    

In [9]:
def conv_1d_sum(inp: ndarray,
                param: ndarray) -> ndarray:
    
    out = conv_1d(inp, param)
    return np.sum(out)

In [10]:
conv_1d_sum(input_1d, param_1d)

39.0

## Testing gradients

In [11]:
np.random.seed(190220)
print(np.random.randint(0, input_1d.shape[0]))
print(np.random.randint(0, param_1d.shape[0]))

4
0


In [12]:
input_1d_2 = np.array([1,2,3,4,6])
param_1d = np.array([1,1,1])

In [13]:
print(conv_1d_sum(input_1d_2, param_1d) - conv_1d_sum(input_1d_2, param_1d))

0.0


In [14]:
input_1d = np.array([1,2,3,4,5])
param_1d_2 = np.array([2,1,1])

print(conv_1d_sum(input_1d, param_1d_2) - conv_1d_sum(input_1d, param_1d))

10.0


## Gradients

In [15]:
def _param_grad_1d(inp: ndarray,
                   param: ndarray,
                   output_grad: ndarray = None) -> ndarray:
    
    param_len = param.shape[0]
    param_mid = param_len // 2
    input_pad = _pad_1d(inp, param_mid)
    
    if output_grad is None:
        output_grad = np.ones_like(inp)
    else:
        assert_same_shape(inp, output_grad)
        
    #Zero padded 1 dim convul
    param_grad = np.zeros_like(param)
    input_grad = np.zeros_like(inp)
    
    for o in range(inp.shape[0]):
        for p in range(param.shape[0]):
            param_grad[p] += input_pad[o+p] * output_grad[o]
            
    assert_same_shape(param_grad, param)
    
    return param_grad

In [16]:
def _input_grad_1d(inp: ndarray,
                  param: ndarray,
                  output_grad: ndarray = None) -> ndarray:
    
    param_len = param.shape[0]
    param_mid = param_len // 2
    inp_pad = _pad_1d(inp, param_mid)
    
    if output_grad is None:
        output_grad = np.ones_like(inp)
    else:
        assert_same_shape(inp, output_grad)
        
    output_pad = _pad_1d(output_grad, param_mid)
    
    #Zero padded 1 dim convol
    param_grad = np.zeros_like(param)
    input_grad = np.zeros_like(inp)
    
    for o in range(inp.shape[0]):
        for f in range(param.shape[0]):
            input_grad[o] += output_pad[o+param_len-f-1] * param[f]
    
    assert_same_shape(param_grad, param)
    
    return input_grad

In [17]:
_input_grad_1d(input_1d, param_1d)

array([2, 3, 3, 3, 2])

In [18]:
_param_grad_1d(input_1d, param_1d)

array([10, 15, 14])

## Batch size of 2
### Pad

In [19]:
input_1d_batch = np.array([[0,1,2,3,4,5,6],
                           [1,2,3,4,5,6,7]])

In [20]:
def _pad_1d(inp: ndarray,
            num: int) -> ndarray:
    z = np.array([0])
    z = np.repeat(z, num)
    return np.concatenate([z, inp, z])

In [21]:
def _pad_1d_batch(inp: ndarray,
                  num: int) -> ndarray:
    outs = [_pad_1d(obs, num) for obs in inp]
    return np.stack(outs)

In [22]:
_pad_1d_batch(input_1d_batch, 1)

array([[0, 0, 1, 2, 3, 4, 5, 6, 0],
       [0, 1, 2, 3, 4, 5, 6, 7, 0]])

### Forward

In [23]:
def conv_1d_batch(inp: ndarray,
                  param: ndarray) -> ndarray:
    
    outs = [conv_1d(obs, param) for obs in inp]
    return np.stack(outs)

In [24]:
conv_1d_batch(input_1d_batch, param_1d)

array([[ 1.,  3.,  6.,  9., 12., 15., 11.],
       [ 3.,  6.,  9., 12., 15., 18., 13.]])

### Gradient

In [25]:
def input_grad_1d_batch(inp: ndarray,
                        param: ndarray) -> ndarray:
    
    out = conv_1d_batch(inp, param)
    
    out_grad = np.ones_like(out)
    
    batch_size = out_grad.shape[0]
    
    grads = [_input_grad_1d(inp[i], param, out_grad[i]) for i in range(batch_size)]
    
    return np.stack(grads)

In [36]:
def param_grad_1d_batch(inp: ndarray,
                        param: ndarray) -> ndarray:
    
    output_grad = np.ones_like(inp)
    
    inp_pad = _pad_1d_batch(inp, 1)
    out_pad = _pad_1d_batch(inp, 1)
    
    param_grad = np.zeros_like(param)
    
    for i in range(inp.shape[0]):
        for o in range(inp.shape[1]):
            for p in range(param.shape[0]):
                param_grad[p] += inp_pad[i][o+p] * output_grad[i][o]
                
    return param_grad

## Checking gradients for conv_1d_batch

In [27]:
def conv_1d_batch_sum(inp: ndarray,
                      fil: ndarray) -> ndarray:
    out = conv_1d_batch(inp, fil)
    return np.sum(out)

In [28]:
conv_1d_batch_sum(input_1d_batch, param_1d)

133.0

In [29]:
print(np.random.randint(0, input_1d_batch.shape[0]))
print(np.random.randint(0, input_1d_batch.shape[1]))

0
2


In [30]:
input_1d_batch_2 = input_1d_batch.copy()
input_1d_batch_2[0][2] += 1
conv_1d_batch_sum(input_1d_batch_2, param_1d) - conv_1d_batch_sum(input_1d_batch, param_1d)

3.0

In [31]:
input_grad_1d_batch(input_1d_batch, param_1d)

array([[2, 3, 3, 3, 3, 3, 2],
       [2, 3, 3, 3, 3, 3, 2]])

In [33]:
print(np.random.randint(0, param_1d.shape[0]))

2


In [34]:
param_1d_2 = param_1d.copy()
param_1d_2[2] += 1
conv_1d_batch_sum(input_1d_batch, param_1d_2) - conv_1d_batch_sum(input_1d_batch, param_1d)

48.0

In [37]:
param_grad_1d_batch(input_1d_batch, param_1d)

array([36, 49, 48])

# 2D Convolutions

In [38]:
imgs_2d_batch = np.random.randn(3, 28, 28)

In [39]:
param_2d = np.random.randn(3, 3)

## Padding

In [40]:
def _pad_2d(inp: ndarray,
            num: int):
    '''
    input is a 3-D tensor, first dimension is batch size
    '''
    outs = [_pad_2d_obs(obs, num) for obs in inp]
    
    return np.stack(outs)

In [41]:
def _pad_2d_obs(inp: ndarray,
                num: int):
    '''
    input is a 2-D, square tensor
    '''
    inp_pad = _pad_1d_batch(inp, num)
    
    other = np.zeros((num, inp.shape[0] + num * 2))
    
    return np.concatenate([other, inp_pad, other])

In [42]:
_pad_2d(imgs_2d_batch, 1).shape

(3, 30, 30)

## Compute output

In [43]:
def _compute_output_obs_2d(obs: ndarray,
                           param: ndarray):
    '''
    obs is a 2d tensor, as is param
    '''
    param_mid = param.shape[0] // 2
    
    obs_pad = _pad_2d_obs(obs, param_mid)
    
    out = np.zeros_like(obs)
    
    for o_w in range(out.shape[0]):
        for o_h in range(out.shape[1]):
            for p_w in range(param.shape[0]):
                for p_h in range(param.shape[1]):
                    out[o_w][o_h] += param[p_w][p_h] * obs_pad[o_w+p_w][o_h+p_h]
    
    return out

In [44]:
def _compute_output_2d(img_batch: ndarray,
                       param: ndarray):
    
    assert_dim(img_batch, 3)
    
    outs = [_compute_output_obs_2d(obs, param) for obs in img_batch]
    
    return np.stack(outs)

In [45]:
_compute_output_2d(imgs_2d_batch, param_2d).shape

(3, 28, 28)

## Param grads

In [46]:
def _compute_grads_obs_2d(input_obs: ndarray,
                          output_grad_obs: ndarray,
                          param: ndarray) -> ndarray:
    '''
    input_obs: 2d tensor representing the input observation
    output_grad_obs: 2d tensor representing the output gradient
    param: 2d filter
    '''
    
    param_size = param.shape[0]
    output_obs_pad = _pad_2d_obs(output_grad_obs, param_size // 2)
    input_grad = np.zeros_like(input_obs)
    
    for i_w in range(input_obs.shape[0]):
        for i_h in range(input_obs.shape[1]):
            for p_w in range(param_size):
                for p_h in range(param_size):
                    input_grad[i_w][i_h] += output_obs_pad[i_w+param_size-p_w-1][i_h+param_size-p_h-1] \
                    * param[p_w][p_h]
                    
    return input_grad

def _compute_grads_2d(inp: ndarray,
                      output_grad: ndarray,
                      param: ndarray) -> ndarray:
    grads = [_compute_grads_obs_2d(inp[i], output_grad[i], param) for i in range(output_grad.shape[0])]
    
    return np.stack(grads)

def _param_grad_2d(inp: ndarray,
                   output_grad: ndarray,
                   param: ndarray) -> ndarray:
    
    param_size = param.shape[0]
    inp_pad = _pad_2d(inp, param_size // 2)
    
    param_grad = np.zeros_like(param)
    img_shape = output_grad.shape[1:]
    
    for i in range(inp.shape[0]):
        for o_w in range(img_shape[0]):
            for o_h in range(img_shape[1]):
                for p_w in range(param_size):
                    for p_h in range(param_size):
                        param_grad[p_w][p_h] += inp_pad[i][o_w+p_w][o_h+p_h]\
                        * output_grad[i][o_w][o_h]
    return param_grad

In [47]:
img_grads = _compute_grads_2d(imgs_2d_batch,
                              np.ones_like(imgs_2d_batch),
                              param_2d)

In [48]:
img_grads.shape

(3, 28, 28)

In [49]:
param_grad = _param_grad_2d(imgs_2d_batch,
                            np.ones_like(imgs_2d_batch),
                            param_2d)
param_grad.shape

(3, 3)

## Testing gradients
### input

In [50]:
print(np.random.randint(0, imgs_2d_batch.shape[0]))
print(np.random.randint(0, imgs_2d_batch.shape[1]))
print(np.random.randint(0, imgs_2d_batch.shape[2]))

0
6
18


In [51]:
imgs_2d_batch_2 = imgs_2d_batch.copy()
imgs_2d_batch_2[0][6][18] += 1

In [52]:
def _compute_output_2d_sum(img_batch: ndarray,
                           param: ndarray):
    
    out = _compute_output_2d(img_batch, param)
    
    return out.sum()

In [53]:
_compute_output_2d_sum(imgs_2d_batch_2, param_2d) - \
_compute_output_2d_sum(imgs_2d_batch, param_2d)

-3.1843477398599163

In [54]:
img_grads[0][6][18]

-3.184347739859924

### Param

In [56]:
print(np.random.randint(0, param_2d.shape[0]))
print(np.random.randint(0, param_2d.shape[1]))

0
2


In [58]:
param_2d_2 = param_2d.copy()
param_2d_2[0][2] += 1

In [59]:
_compute_output_2d_sum(imgs_2d_batch, param_2d_2) - _compute_output_2d_sum(imgs_2d_batch, param_2d)

5.53349015923007

In [60]:
param_grad[0][2]

5.533490159230001

## Channels + batch size

### Helper

In [61]:
def _pad_2d_channel(inp: ndarray,
                    num: int):
    '''
    inp has dimension [num_channels, image_width, image_height]
    '''
    return np.stack([_pad_2d_obs(channel, num) for channel in inp])

def _pad_conv_input(inp: ndarray,
                    num: int):
    '''
    inp has dimension [batch_size, num_channels, image_width, image_height]
    '''
    return np.stack([_pad_2d_channel(obs, num) for obs in inp])

### Forward

In [62]:
def _compute_output_obs(obs: ndarray,
                        param: ndarray):
    '''
    obs: [channels, img_width, img_height]
    param: [in_channels, out_channels, fil_width, fil_height]
    '''
    assert_dim(obs, 3)
    assert_dim(param, 4)
    
    param_size = param.shape[0]
    param_mid = param_size // 2
    obs_pad = _pad_2d_channel(obs, param_mid)
    
    in_channels = param.shape[0]
    out_channels = param.shape[1]
    img_size = obs.shape[1]
    
    out = np.zeros((out_channels,) + obs.shape[1:])
    for c_in in range(in_channels):
        for c_out in range(out_channels):
            for o_w in range(img_size):
                for o_h in range(img_size):
                    for p_w in range(param_size):
                        for p_h in range(param_size):
                            out[c_out][o_w][o_h] += \
                            param[c_in][c_out][p_w][p_h] * obs_pad[c_in][o_w+p_w][o_h+p_h]
    
    return out

def _output(inp: ndarray,
            param: ndarray) -> ndarray:
    '''
    obs: [batch_size, channels, img_width, img_height]
    fil: [in_channels, out_channels, fil_width, fil_height]
    '''
    outs = [_compute_output_obs(obs, param) for obs in inp]
    
    return np.stack(outs)

### Backward

In [63]:
def _compute_grads_obs(input_obs: ndarray,
                       output_grad_obs: ndarray,
                       param: ndarray) -> ndarray:
    '''
    input_obs - [in_channels, img_width, img_height]
    output_grad_obs - [out_channel, img_width, img_height]
    param - [in_channels, out_channels, img_width, img_height]
    '''
    input_grad = np.zeros_like(input_obs)
    param_size = param.shape[2]
    param_mid = param_size // 2
    img_size = input_obs.shape[1]
    in_channels = input_obs.shape[0]
    out_channels = param.shape[1]
    output_obs_pad = _pad_2d_channel(output_grad_obs, param_mid)
    
    for c_in in range(in_channels):
        for c_out in range(out_channels):
            for i_w in range(input_obs.shape[1]):
                for i_h in range(input_obs.shape[2]):
                    for p_w in range(param_size):
                        for p_h in range(param_size):
                            input_grad[c_in][i_w][i_h] += \
                            output_obs_pad[c_out][i_w+param_size-p_w-1][i_h+param_size-p_h-1] \
                            * param[c_in][c_out][p_w][p_h]
                            
    return input_grad

def _input_grad(inp: ndarray,
                output_grad: ndarray,
                param: ndarray) -> ndarray:
    
    grads = [_compute_grads_obs(inp[i], output_grad[i], param) for i in range(output_grad.shape[0])]
    
    return np.stack(grads)

In [64]:
def _param_grad(inp: ndarray,
                output_grad: ndarray,
                param: ndarray) -> ndarray:
    '''
    inp: [in_channels, img_width, img_height]
    output_grad_obs: [out_channels, img_width, img_height]
    param: [in_channels, out_channels, img_width, img_height]
    '''
    param_grad = np.zeros_like(param)
    param_size = param.shape[2]
    param_mid = param_size // 2
    img_size = inp.shape[2]
    in_channels = inp.shape[1]
    out_channels = output_grad.shape[1]
    
    inp_pad = _pad_conv_input(inp, param_mid)
    img_shape = output_grad.shape[2:]
    
    for i in range(inp.shape[0]):
        for c_in in range(in_channels):
            for c_out in range(out_channels):
                for o_w in range(img_shape[0]):
                    for o_h in range(img_shape[1]):
                        for p_w in range(param_size):
                            for p_h in range(param_size):
                                param_grad[c_in][c_out][p_w][p_h] += \
                                inp_pad[i][c_in][o_w+p_w][o_h+p_h] \
                                * output_grad[i][c_out][o_w][o_h]
                                
    return param_grad

## Testing gradients

In [65]:
cifar_imgs = np.random.randn(10, 3, 32, 32)
cifar_param = np.random.randn(3, 16, 5, 5)

In [66]:
print(np.random.randint(0, cifar_imgs.shape[0]))
print(np.random.randint(0, cifar_imgs.shape[1]))
print(np.random.randint(0, cifar_imgs.shape[2]))
print(np.random.randint(0, cifar_imgs.shape[3]))
print()
print(np.random.randint(0, cifar_param.shape[0]))
print(np.random.randint(0, cifar_param.shape[1]))
print(np.random.randint(0, cifar_param.shape[2]))
print(np.random.randint(0, cifar_param.shape[3]))

3
1
2
19

0
8
0
2


In [None]:
def _compute_output_sum(imgs: ndarray,
                        param: ndarray):
    return _output(imgs, param).sum()

### input gradients

In [69]:
cifar_imgs_2 = cifar_imgs.copy()
cifar_imgs_2[3][1][2][19] += 1

In [73]:
_compute_output_sum(cifar_imgs_2, cifar_param) - _compute_output_sum(cifar_imgs, cifar_param)

-4.9421924733578635

In [71]:
_input_grad(cifar_imgs,
            np.ones((10, 16, 32, 32)),
            cifar_param)[3][1][2][19]

2.3452987587074423

### Param grad

In [74]:
cifar_param_2 = cifar_param.copy()
cifar_param_2[0][8][0][2] += 1

In [75]:
_compute_output_sum(cifar_imgs, cifar_param_2) - _compute_output_sum(cifar_imgs, cifar_param)

11.019642398145606

In [76]:
_param_grad(cifar_imgs,
            np.ones((10, 16, 32, 32)),
            cifar_param)[0][8][0][2]

-47.0912312415532

Very important note.
something is wrong with the _compute_output_sum() object. I am going to leave it for now, but will revisit and correct when I get a chance. 
I will update the readme file once I have found the error.