Before you submit this notebook, make sure everything runs as expected in the local test cases.
Please paste your solution into the designated cell and do not change anything else.

Also, please enter your first and last names below.

In [None]:
# Student identification: fill in both strings before submitting the notebook.
FirstName = ""
LastName = ""

---

In [None]:
import numpy as np

In [None]:
class Module(object):
    """
    Basically, you can think of a module as a black box
    which can process `input` data and produce `output` data.
    This is like applying a function which is called `forward`:

        output = module.forward(input)

    The module should be able to perform a backward pass: to differentiate the `forward` function.
    Moreover, it should be able to differentiate it if it is a part of a chain (chain rule).
    The latter implies there is a gradient from the previous step of the chain rule.

        gradInput = module.backward(input, gradOutput)
    """
    # YOUR CODE HERE
    # NOTE: the statement below sits directly in the class body, so running this
    # cell raises NotImplementedError until the placeholder is replaced.
    raise NotImplementedError()

### Test 0: Initialization (0.01 points)

In [None]:
# do not change this cell
import sys

import numpy
import numpy as np
import unittest

import collections
import pickle
import io


# Expose numpy.var under the additional attribute name numpy.variance.
# NOTE(review): presumably required by code or fixtures that refer to
# `numpy.variance` -- confirm before removing.
numpy.variance = numpy.var


### Test 1: Linear layer (0.04 points)

In [None]:
# Dimensions used to construct the layer; `batch_size` is not used below.
batch_size, n_in, n_out = 2, 3, 4
for case in data['test_Linear']:
    # Fresh layer whose parameters are overwritten with the recorded values.
    linear = Linear(n_in, n_out)
    linear.W = case['init_w']
    linear.b = case['init_w_b']

    x = case['input']
    upstream_grad = case['next_layer_grad']

    # 1. the forward pass must reproduce the recorded output
    forward_result = linear.updateOutput(x)
    assert np.allclose(case['output'], forward_result, atol=1e-6)

    # 2. the gradient w.r.t. the input must reproduce the recorded gradient
    input_grad = linear.updateGradInput(x, upstream_grad)
    assert np.allclose(case['grad_output'], input_grad, atol=1e-6)

    # 3. parameter gradients are read back after accGradParameters
    linear.accGradParameters(x, upstream_grad)
    assert np.allclose(case['w_grad'], linear.gradW, atol=1e-6)
    assert np.allclose(case['b_grad'], linear.gradb, atol=1e-6)

### Test 2: Softmax (0.1 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 2, 4
for case in data['test_SoftMax']:
    softmax = SoftMax()

    x = case['input']
    upstream_grad = case['next_layer_grad']

    # 1. forward pass against the recorded output
    forward_result = softmax.updateOutput(x)
    assert np.allclose(case['output'], forward_result, atol=1e-5)

    # 2. input gradient against the recorded gradient
    input_grad = softmax.updateGradInput(x, upstream_grad)
    assert np.allclose(case['grad_output'], input_grad, atol=1e-5)

### Test 3: LogSoftMax (0.1 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 2, 4
for d in data['test_LogSoftMax']:
    # layers initialization
    custom_layer = LogSoftMax()

    layer_input = d['input']
    next_layer_grad = d['next_layer_grad']

    # 1. check layer output
    # The comparison result must be asserted; a bare np.allclose(...) call
    # discards its boolean and the check can never fail.
    custom_layer_output = custom_layer.updateOutput(layer_input)
    gt_output = d['output']
    assert np.allclose(gt_output, custom_layer_output, atol=1e-6)

    # 2. check layer input grad
    custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
    gt_grad = d['grad_output']
    assert np.allclose(gt_grad, custom_layer_grad, atol=1e-6)

### Test 4: BatchNormalization (0.1 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 32, 16
for d in data['test_BatchNormalization']:
    # layers initialization
    alpha = 0.9
    custom_layer = BatchNormalization(alpha)
    custom_layer.train()
    init_moving_mean = d['init_moving_mean']
    init_moving_variance = d['init_moving_variance']
    custom_layer.moving_mean = init_moving_mean
    custom_layer.moving_variance = init_moving_variance

    layer_input = d['input']
    next_layer_grad = d['next_layer_grad']

    # Every comparison below is asserted; a bare np.allclose(...) call
    # discards its boolean and the check can never fail.

    # 1. check layer output
    custom_layer_output = custom_layer.updateOutput(layer_input)
    gt_output = d['output']
    assert np.allclose(gt_output, custom_layer_output, atol=1e-6)

    # 2. check layer input grad
    custom_layer_grad = custom_layer.updateGradInput(layer_input, next_layer_grad)
    gt_grad = d['grad_output']
    # please, don't increase `atol` parameter, it's guaranteed that you can implement batch norm layer
    # with tolerance 1e-5
    assert np.allclose(gt_grad, custom_layer_grad, atol=1e-5)

    # 3. check moving mean
    gt_running_mean = d['gt_moving_mean']
    gt_running_var = d['gt_moving_var']
    assert np.allclose(custom_layer.moving_mean, gt_running_mean)
    # we don't check moving_variance because pytorch uses slightly different formula for it:
    # it computes moving average for unbiased variance (i.e var*N/(N-1))

    # 4. check evaluation mode
    # The reference moving variance is installed first, so the evaluation-mode
    # output is computed from the reference statistics.
    custom_layer.moving_variance = gt_running_var
    custom_layer.evaluate()
    custom_layer_output = custom_layer.updateOutput(layer_input)
    eval_output = d['eval_output']
    assert np.allclose(eval_output, custom_layer_output, atol=1e-6)

### Test 5: Sequential (0.1 points)

In [None]:
np.random.seed(42)

# `n_in` sizes the scaling layer; `batch_size` is not used below.
batch_size, n_in = 2, 4
for case in data['test_Sequential']:
    alpha = 0.9

    # Container first, then the batch-norm stage with recorded running statistics.
    model = Sequential()
    norm_stage = BatchNormalization(alpha)
    norm_stage.moving_mean = case['init_moving_mean']
    norm_stage.moving_variance = case['init_moving_var']
    model.add(norm_stage)

    # Scaling stage with recorded gamma / beta.
    scale_stage = ChannelwiseScaling(n_in)
    scale_stage.gamma = case['init_gamma']
    scale_stage.beta = case['init_beta']
    model.add(scale_stage)

    model.train()

    x = case['input']
    upstream_grad = case['next_layer_grad']

    # 1. forward pass through the whole container
    forward_result = model.updateOutput(x)
    assert np.allclose(case['gt_output'], forward_result, atol=1e-5)

    # 2. full backward pass; its return value is compared with the recorded input gradient
    input_grad = model.backward(x, upstream_grad)
    assert np.allclose(case['gt_grad'], input_grad, atol=1e-5)

    # 3. parameter gradients of the second added module (index 1)
    gamma_grad, beta_grad = model.getGradParameters()[1]
    assert np.allclose(case['gt_weight_grad'], gamma_grad, atol=1e-5)
    assert np.allclose(case['gt_bias_grad'], beta_grad, atol=1e-5)

### Test 6: Dropout (0.075 points)

In [None]:
np.random.seed(42)

# Shape of the inputs used by checks 1-4.
batch_size, n_in = 2, 4
# Shape of the tall single-column input used by check 5. It has its own names
# on purpose: re-binding `batch_size, n_in` inside the loop would make every
# iteration after the first run checks 1-4 on a (1000, 1) input instead of
# the (2, 4) one declared above.
column_rows, column_cols = 1000, 1
for _ in range(100):
    # layers initialization
    p = np.random.uniform(0.3, 0.7)
    layer = Dropout(p)
    layer.train()

    layer_input = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)
    next_layer_grad = np.random.uniform(-5, 5, (batch_size, n_in)).astype(np.float32)

    # 1. check layer output: every element is either zero or the input divided by (1 - p)
    layer_output = layer.updateOutput(layer_input)
    assert np.all(np.logical_or(np.isclose(layer_output, 0),
                                np.isclose(layer_output*(1.-p), layer_input)))

    # 2. check layer input grad: same property for the gradient
    layer_grad = layer.updateGradInput(layer_input, next_layer_grad)
    assert np.all(np.logical_or(np.isclose(layer_grad, 0),
                                np.isclose(layer_grad*(1.-p), next_layer_grad)))

    # 3. check evaluation mode: the layer must pass the input through unchanged
    layer.evaluate()
    layer_output = layer.updateOutput(layer_input)
    assert np.allclose(layer_output, layer_input)

    # 4. check mask
    # With p = 0 the training-mode output must equal the input.
    p = 0.0
    layer = Dropout(p)
    layer.train()
    layer_output = layer.updateOutput(layer_input)
    assert np.allclose(layer_output, layer_input)

    # Inputs and gradients are drawn from [5, 10), so a zero can only come
    # from the mask; forward and backward must zero the same positions.
    p = 0.5
    layer = Dropout(p)
    layer.train()
    layer_input = np.random.uniform(5, 10, (batch_size, n_in)).astype(np.float32)
    next_layer_grad = np.random.uniform(5, 10, (batch_size, n_in)).astype(np.float32)
    layer_output = layer.updateOutput(layer_input)
    zeroed_elem_mask = np.isclose(layer_output, 0)
    layer_grad = layer.updateGradInput(layer_input, next_layer_grad)
    assert np.all(zeroed_elem_mask == np.isclose(layer_grad, 0))

    # 5. dropout mask should be generated independently for every input matrix element, not for row/column
    p = 0.8
    layer = Dropout(p)
    layer.train()

    layer_input = np.random.uniform(5, 10, (column_rows, column_cols)).astype(np.float32)
    layer_output = layer.updateOutput(layer_input)
    assert np.sum(np.isclose(layer_output, 0)) != layer_input.size

    # Same data as a single row: still must not be zeroed out entirely.
    layer_input = layer_input.T
    layer_output = layer.updateOutput(layer_input)
    assert np.sum(np.isclose(layer_output, 0)) != layer_input.size

### Test 7: LeakyReLU (0.05 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 2, 4
for case in data['test_LeakyReLU']:
    # The slope is part of each recorded case.
    slope = case['slope']
    activation = LeakyReLU(slope)

    x = case['layer_input']
    upstream_grad = case['next_layer_grad']

    # 1. forward pass against the recorded output
    forward_result = activation.updateOutput(x)
    assert np.allclose(case['gt_output'], forward_result, atol=1e-6)

    # 2. input gradient against the recorded gradient
    input_grad = activation.updateGradInput(x, upstream_grad)
    assert np.allclose(case['gt_grad'], input_grad, atol=1e-6)

### Test 8: ELU (0.075 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 2, 4
for case in data['test_ELU']:
    # Every case is run with the same alpha.
    alpha = 1.0
    activation = ELU(alpha)

    x = case['layer_input']
    upstream_grad = case['next_layer_grad']

    # 1. forward pass against the recorded output
    forward_result = activation.updateOutput(x)
    assert np.allclose(case['gt_output'], forward_result, atol=1e-6)

    # 2. input gradient against the recorded gradient
    input_grad = activation.updateGradInput(x, upstream_grad)
    assert np.allclose(case['gt_grad'], input_grad, atol=1e-6)

### Test 9: SoftPlus (0.1 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 2, 4
for case in data['test_SoftPlus']:
    activation = SoftPlus()

    x = case['layer_input']
    upstream_grad = case['next_layer_grad']

    # 1. forward pass against the recorded output
    forward_result = activation.updateOutput(x)
    assert np.allclose(case['gt_output'], forward_result, atol=1e-6)

    # 2. input gradient against the recorded gradient
    input_grad = activation.updateGradInput(x, upstream_grad)
    assert np.allclose(case['gt_grad'], input_grad, atol=1e-6)


### Test 10: ClassNLLCriterionUnstable (0.1 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 2, 4
for case in data['test_ClassNLLCriterionUnstable']:
    criterion = ClassNLLCriterionUnstable()

    x = case['layer_input']
    target = case['target']

    # 1. criterion value against the recorded one
    loss_value = criterion.updateOutput(x, target)
    assert np.allclose(case['gt_output'], loss_value, atol=1e-6)

    # 2. gradient w.r.t. the criterion input against the recorded one
    loss_grad = criterion.updateGradInput(x, target)
    assert np.allclose(case['gt_grad'], loss_grad, atol=1e-6)

### Test 11: ClassNLLCriterion (0.05 points)

In [None]:
np.random.seed(42)

# `batch_size` and `n_in` are defined for reference; the loop does not read them.
batch_size, n_in = 2, 4
for case in data['test_ClassNLLCriterion']:
    criterion = ClassNLLCriterion()

    x = case['layer_input']
    target = case['target']

    # 1. criterion value against the recorded one
    loss_value = criterion.updateOutput(x, target)
    assert np.allclose(case['gt_output'], loss_value, atol=1e-6)

    # 2. gradient w.r.t. the criterion input against the recorded one
    loss_grad = criterion.updateGradInput(x, target)
    assert np.allclose(case['gt_grad'], loss_grad, atol=1e-6)

### Test 12: Adam (0.1 points)

In [None]:
# Two consecutive optimizer steps on one 10-element parameter whose gradient
# is the fixed vector 0..9; `state` starts empty and is filled by the optimizer.
state = {}
config = {'learning_rate': 1e-3, 'beta1': 0.9, 'beta2': 0.999, 'epsilon': 1e-8}
variables = [[np.arange(10, dtype=np.float64)]]
gradients = [[np.arange(10, dtype=np.float64)]]

# Expected (state['m'][0], state['v'][0], variables[0][0]) after step 1 and step 2.
expected_per_step = [
    (
        np.array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
        np.array([0., 0.001, 0.004, 0.009, 0.016, 0.025,
                  0.036, 0.049, 0.064, 0.081]),
        np.array([0., 0.999, 1.999, 2.999, 3.999, 4.999,
                  5.999, 6.999, 7.999, 8.999]),
    ),
    (
        np.array([0., 0.19, 0.38, 0.57, 0.76, 0.95, 1.14, 1.33, 1.52, 1.71]),
        np.array([0., 0.001999, 0.007996, 0.017991,
                  0.031984, 0.049975, 0.071964, 0.097951,
                  0.127936, 0.161919]),
        np.array([0., 0.998, 1.998, 2.998, 3.998, 4.998,
                  5.998, 6.998, 7.998, 8.998]),
    ),
]

for step_no, (m_ref, v_ref, var_ref) in enumerate(expected_per_step, start=1):
    adam_optimizer(variables, gradients, config, state)
    assert np.allclose(state['m'][0], m_ref)
    assert np.allclose(state['v'][0], v_ref)
    # The step counter must advance by one per call.
    assert state['t'] == step_no
    # The parameter array is expected to be updated in place.
    assert np.allclose(variables[0][0], var_ref)

### Test 13: Conv2d (0.35 points)

In [None]:
np.random.seed(42)

# `n_in`, `n_out` and `kern_size` configure the layer; `batch_size`, `h` and `w`
# are defined for reference and are not read below.
batch_size, n_in, n_out = 2, 3, 4
h, w = 5, 6
kern_size = 3
for case in data['test_Conv2d']:
    # Fresh layer whose parameters are overwritten with the recorded values.
    conv = Conv2d(n_in, n_out, kern_size)
    conv.W = case['init_w']
    conv.b = case['init_b']

    x = case['input']
    upstream_grad = case['next_layer_grad']

    # 1. forward pass against the recorded output
    forward_result = conv.updateOutput(x)
    assert np.allclose(case['gt_output'], forward_result, atol=1e-6)

    # 2. input gradient against the recorded gradient
    input_grad = conv.updateGradInput(x, upstream_grad)
    assert np.allclose(case['gt_grad'], input_grad, atol=1e-6)

    # 3. parameter gradients are read back after accGradParameters
    conv.accGradParameters(x, upstream_grad)
    kernel_grad = conv.gradW
    offset_grad = conv.gradb
    assert np.allclose(case['gt_weight_grad'], kernel_grad, atol=1e-6)
    assert np.allclose(case['gt_bias_grad'], offset_grad, atol=1e-6)


### Test 14: MaxPool2d (0.15 points)

In [None]:
np.random.seed(42)

# `kern_size` configures the layer; `batch_size`, `n_in`, `h` and `w` are
# defined for reference and are not read below.
batch_size, n_in = 2, 3
h, w = 4, 6
kern_size = 2
for case in data['test_MaxPool2d']:
    pool = MaxPool2d(kern_size)

    x = case['input']
    upstream_grad = case['next_layer_grad']

    # 1. forward pass against the recorded output
    forward_result = pool.updateOutput(x)
    assert np.allclose(case['gt_output'], forward_result, atol=1e-6)

    # 2. input gradient against the recorded gradient
    input_grad = pool.updateGradInput(x, upstream_grad)
    assert np.allclose(case['gt_grad'], input_grad, atol=1e-6)