# In [None]:
import numpy as np

class conv_layer:
    """2-D convolution layer with no padding and stride 1, trained by SGD.

    Every kernel has shape ``(conv_size[0], conv_size[1], in_dim[2])`` and
    produces one channel of the output.  The layer has no bias term.
    """

    def __init__(self, in_dim, conv_size=(3,3), kernel_num=4, debug = False):
        """
        Creates the kernels and derives the output shape.

        - in_dim is the shape of the input; indices 0, 1 and 2 are read
          (spatial height, spatial width, channels).
        - conv_size is the spatial size of every kernel.
        - kernel_num is the number of kernels, i.e. the number of output channels.
        - debug selects the deterministic kernels built by debug_conv instead of
          random ones.
        """
        self.kernel_num = kernel_num
        self.conv_size = conv_size
        self.in_dim = in_dim

        kernel_shape = (conv_size[0], conv_size[1], in_dim[2])
        # np.empty would leave the weights as whatever bytes happen to be in
        # memory (possibly NaN/inf), so draw them from a scaled normal
        # distribution instead.  The sqrt(2 / fan_in) scale is the usual choice
        # for layers followed by a ReLU.
        fan_in = max(kernel_shape[0] * kernel_shape[1] * kernel_shape[2], 1)
        scale = np.sqrt(2.0 / fan_in)

        self.conv_kernels = [None] * self.kernel_num
        for i in range(self.kernel_num):
            if debug:
                self.conv_kernels[i] = self.debug_conv(kernel_shape)
            else:
                self.conv_kernels[i] = np.random.standard_normal(kernel_shape) * scale

        self.out_dim = (
            self.in_dim[0] - (self.conv_size[0] - 1),
            self.in_dim[1] - (self.conv_size[1] - 1),
            self.kernel_num,
        )

    def forward(self, img):
        """
        Slides every kernel over img and returns an array of shape out_dim.

        out_img[h, w, k] is the sum of the element-wise product of kernel k
        with the input patch whose top-left corner is (h, w).  img is stored
        on the instance because backward needs it.
        """
        self.input = img  # saved for the backward pass
        k_rows, k_cols = self.conv_size[0], self.conv_size[1]
        out_img = np.zeros(self.out_dim)
        for h in range(self.out_dim[0]):
            for w in range(self.out_dim[1]):
                # The patch is shared by all kernels, so slice it only once.
                patch = img[h:h + k_rows, w:w + k_cols, :]
                for k in range(self.kernel_num):
                    out_img[h, w, k] = np.sum(patch * self.conv_kernels[k])
        return out_img

    def backward(self, grad_output, learning_rate):
        """
        Computes the gradient with respect to the input and updates the kernels
        using the gradient descent update rule.

        - grad_output is the gradient of the loss with respect to the output of
          the conv_layer (indexed as grad_output[h, w, k]).
        - learning_rate is the step size used for the kernel update.
        - grad_input is initialized as an array of zeros of shape in_dim.
        - For each kernel, every output location (h, w) adds
          grad_output[h, w, k] * kernel to the matching patch of grad_input and
          adds grad_output[h, w, k] * input_patch to that kernel's gradient.
        - The kernel is updated once, after its whole gradient has been
          accumulated, so that grad_input is computed from the same weights
          that produced the forward output.
        - Returns grad_input, the gradient of the loss with respect to the
          input of the conv_layer.

        forward must have been called first, since the saved input is used.
        """
        k_rows, k_cols = self.conv_size[0], self.conv_size[1]
        grad_input = np.zeros(self.in_dim)
        for k in range(self.kernel_num):
            kernel = self.conv_kernels[k]
            kernel_grad = np.zeros(kernel.shape)
            for h in range(self.out_dim[0]):
                for w in range(self.out_dim[1]):
                    g = grad_output[h, w, k]
                    grad_input[h:h + k_rows, w:w + k_cols, :] += g * kernel
                    kernel_grad += g * self.input[h:h + k_rows, w:w + k_cols, :]
            # Single in-place step with the fully accumulated gradient.
            self.conv_kernels[k] -= learning_rate * kernel_grad
        return grad_input

    def get_out_dim(self):
        """Returns the shape of the array produced by forward."""
        return self.out_dim

    def debug_conv(self, size):
        """
        Builds a deterministic kernel of the given shape for debugging:
        every second row along the first axis (0, 2, 4, ...) is 1, the rest 0.
        """
        kernel = np.zeros(size)
        for i in range(0, size[0], 2):
            kernel[i, :, :] = 1
        return kernel

class max_pooling_layer:
    """Max pooling with stride 1, applied to each channel independently."""

    def __init__(self, in_dim, pooling_size=(3,3)):
        """
        Stores the window size and input shape and derives the output shape:
        the first two dimensions shrink by (window size - 1), the third is kept.
        """
        self.pooling_size = pooling_size
        self.in_dim = in_dim
        shrink_0 = self.pooling_size[0] - 1
        shrink_1 = self.pooling_size[1] - 1
        self.out_dim = (self.in_dim[0] - shrink_0, self.in_dim[1] - shrink_1, self.in_dim[2])

    def _windows(self):
        """
        Yields (output_index, input_window_index) for every output cell,
        channel by channel, then along the first axis, then along the second.
        """
        span_0, span_1 = self.pooling_size[0], self.pooling_size[1]
        for channel in range(0, self.out_dim[2]):
            for row in range(0, self.out_dim[0]):
                for col in range(0, self.out_dim[1]):
                    window = (slice(row, row + span_0), slice(col, col + span_1), channel)
                    yield (row, col, channel), window

    def forward(self, img):
        """
        Returns an array of shape out_dim in which every cell holds the maximum
        of the corresponding window of img.  img is kept for backward.
        """
        self.input = img
        pooled = np.empty(self.out_dim)
        for cell, window in self._windows():
            pooled[cell] = np.max(img[window])
        return pooled

    def backward(self, grad_output):
        """
        Returns the gradient with respect to the input (shape in_dim).

        The gradient of each output cell is added to every position of its
        window that equals the window maximum; overlapping windows accumulate.
        """
        grad_input = np.zeros(self.in_dim)
        for cell, window in self._windows():
            region = self.input[window]
            is_max = (region == np.max(region))
            grad_input[window] += is_max * grad_output[cell]
        return grad_input

    def get_out_dim(self):
        """Returns the shape of the array produced by forward."""
        return self.out_dim

class activation_layer:
    """Element-wise ReLU activation layer."""

    def __init__(self, in_dim):
        """Stores in_dim, the shape of the input; the output has the same shape."""
        self.in_dim = in_dim

    def forward(self, img):
        """
        Returns max(0, x) for every element of img.

        img is kept on the instance because backward needs to know which
        elements were positive.
        """
        self.input = img
        # np.maximum already works element-wise on whole arrays, so no
        # np.vectorize / np.stack round trip is needed.
        return self.relu(img)

    def backward(self, grad_output):
        """
        Returns the gradient with respect to the input: grad_output is passed
        through where the saved input was positive and zeroed elsewhere.

        forward must have been called first.
        """
        return grad_output * self.relu_prime(self.input)

    def relu(self, el):
        """Returns max(0, el); el may be a scalar or an array."""
        return np.maximum(0, el)

    def relu_prime(self, el):
        """
        Returns the derivative of relu at el: 1.0 where el > 0, otherwise 0.0.
        The derivative at exactly 0 is taken to be 0.
        """
        return np.greater(el, 0).astype(float)

class fully_connected_layer:
    """Dense layer without bias that flattens its input, trained by SGD."""

    def __init__(self, in_dim, out_dim):
        """
        - in_dim is the shape of the input (a tuple of dimensions, or a single
          integer for an already flat input).
        - out_dim is the number of output units.

        The weight matrix has one row per output unit and one column per
        flattened input element, and starts out as all ones.
        """
        self.in_dim = in_dim
        self.out_dim = out_dim
        # np.prod accepts both a tuple of dimensions and a plain integer.
        w_dim = int(np.prod(in_dim))
        self.weights = np.ones((self.out_dim, w_dim))

    def forward(self, img):
        """
        Returns the vector weights @ img.flatten() of length out_dim.
        img is kept on the instance for the backward pass.
        """
        self.input = img
        img_vec = img.flatten()
        return self.weights @ img_vec

    def backward(self, grad_output, learning_rate):
        """
        Computes the gradient with respect to the input and updates the weights
        using the gradient descent update rule.

        - grad_output is the gradient of the loss with respect to the output
          vector (length out_dim).
        - learning_rate is the step size used for the weight update.
        - Returns the gradient with respect to the input, reshaped to in_dim.

        forward must have been called first, since the saved input is used.
        """
        grad_vec = np.asarray(grad_output, dtype=float).reshape(-1)
        # The input gradient is computed from the weights as they were in the
        # forward pass, i.e. before the update below.
        grad_flat = grad_vec @ self.weights
        self.weights -= learning_rate * np.outer(grad_vec, self.input.flatten())
        # The weights are stored flat, so the gradient has to be reshaped back
        # to the input shape; adding it into np.zeros(in_dim) directly does not
        # broadcast for multi-dimensional inputs.
        return grad_flat.reshape(self.in_dim)