# Task 2
Implement basic backward pass using only numpy:
 - for your last week's Single Layer Perceptron.
 - for all activation functions
 - loss functions

Perform forward pass and backward pass, and use the gradient check function to verify your implementation

In [81]:
import numpy as np
from utils import Module

## Linear Layer

In your previous task you defined `forward(input)` pass for your Linear class. Now we continue in creation of your own framework a little further with defining the `backward(dNet)` function. In this little framework is activation and linear unit separated. This separation is benefit in backward propagation and optimization. (If you want to know why, take a look on implementation of forward and backward propagation in class Model.)

Note, that you are implementing backward pass for the whole dataset of `m` samples.

In [82]:
#------------------------------------------------------------------------------
#   Linear class
#------------------------------------------------------------------------------
class Linear(Module):
    def __init__(self, in_features, out_features):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.W = np.random.randn(out_features, in_features)
        self.dW = np.zeros_like(self.W)
        self.b = np.zeros((out_features, 1)) # Watch-out for the shape
        self.db = np.zeros_like(self.b)

    def forward(self, input: np.ndarray) -> np.ndarray:
        self.fw_inputs = input
        self.m = self.fw_inputs.shape[1]
        net = np.matmul(self.W, input) + self.b #z
        return net

    def backward(self, dz: np.ndarray) -> np.ndarray:
        # >>>>>>>>> add here
        self.db = 1/self.m * np.sum(dz,axis=1, keepdims=1)
        self.dW = 1/self.m * np.matmul(dz, self.fw_inputs.T)
        print("aaaaaaaa")
        print(np.shape(self.W.T))
        print(np.shape(dz))
        matmul = np.matmul(self.W.T, dz)
        print(np.shape(matmul))
        return matmul
        # <<<<<<<<<

## Activations
Implement backward pass for Sigmoid, Tanh and ReLU activation functions.

In [83]:
#------------------------------------------------------------------------------
#   SigmoidActivationFunction class
#------------------------------------------------------------------------------
class Sigmoid(Module):
    def __init__(self):
        super(Sigmoid, self).__init__()

    def forward(self, input: np.ndarray) -> np.ndarray:
        self.fw_input = input
        return 1.0 / (1.0 + np.exp(-input))

    def backward(self, da) -> np.ndarray:
        # >>>>>>>>> add here
        f = self.fw_input
        return da * f * (1 - f)
        # <<<<<<<<<

#------------------------------------------------------------------------------
#   HyperbolicTangentActivationFunction class
#------------------------------------------------------------------------------
def htan(x):
    return (np.exp(x) - np.exp(-x))/(np.exp(x) + np.exp(-x))
class Tanh(Module):
    def __init__(self):
        super(Tanh, self).__init__()

    def forward(self, input: np.ndarray) -> np.ndarray:
        self.fw_input = input
        return (np.exp(2 * input) - 1) / (np.exp(2 * input) + 1)


    def backward(self, da) -> np.ndarray:
        # >>>>>>>>> add here
        x = self.fw_input
        return da * ( 1 - htan(x) * htan(x))
        pass
        # <<<<<<<<<

#------------------------------------------------------------------------------
#   RELUActivationFunction class
#------------------------------------------------------------------------------
class ReLU(Module):
    def __init__(self):
        super(ReLU, self).__init__()

    def forward(self, input: np.ndarray) -> np.ndarray:
        self.fw_input = input
        return np.maximum(input, 0)

    def backward(self, da) -> np.ndarray:
        # >>>>>>>>> add here
        x =self.fw_input
        return da * ((x > 0) * 1)
        # <<<<<<<<<

## Loss functions
For successful backward pass, the computation and derivation of Loss function is necessary.
The most common Loss functions are **Mean Square Error** _(MSE, L2)_ **Mean Absolute Error** _(MAE, L1)_ and **Binary Cross Entropy** _(BCE, Log Loss)_ and their modifications according to what is better for the current dataset.

Implement MSE and BCE Loss functions as Modules of our little framework.

Remember the difference between Loss and Cost.

In [84]:
#------------------------------------------------------------------------------
#   MeanSquareErrorLossFunction class
#------------------------------------------------------------------------------
class MSELoss(Module):
    def __init__(self):
        super(MSELoss, self).__init__()

    def forward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        # >>>>>>>>> add here
        return np.square(np.subtract(target,input)).mean()
        # <<<<<<<<<

    def backward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        # >>>>>>>>> add here
        return 2*(input-target)/input.shape[1]
        # <<<<<<<<<

#------------------------------------------------------------------------------
#   BinaryCrossEntropyLossFunction class
#------------------------------------------------------------------------------
class BCELoss(Module):
    def __init__(self):
        super(BCELoss, self).__init__()

    def forward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        # >>>>>>>>> add here
        y = target
        a= input
        return -(y*np.log(a) + (1-y)* log(1-a))

        y_pred = input
        y_true = target
        y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
        term_0 = (1-y_true) * np.log(1-y_pred + 1e-7)
        term_1 = y_true * np.log(y_pred + 1e-7)
        return -np.mean(term_0+term_1, axis=0)
        # <<<<<<<<<

    def backward(self, input: np.ndarray, target: np.ndarray) -> np.ndarray:
        # >>>>>>>>> add here
        y = target
        a= input
        return -y / a + (1-y) / (1-a)

        # return np.asscalar((-1/len(Z)) * (np.dot(Y, np.log(Z + (1.e-10))) + np.dot((1 - Y), np.log(1 - Z + (1.e-10)))))

    pass
        # <<<<<<<<<

## Model
As in previous task, use `Model` class to encapsulate all layers of your MLP and define backward pass.
Iterate over its modules stored in parameter OrderedDict `modules` -> `self.modules` in the correct order.

Use call `.add_module(...)` to add layers of your MLP (network). Define MLP that could classify data from Circles dataset `dataset_Circles(...)`.


In [85]:
#------------------------------------------------------------------------------
#   Model class
#------------------------------------------------------------------------------
class Model(Module):
    def __init__(self):
        super(Model, self).__init__()

    def forward(self, input) -> np.ndarray:
        for name, module in self.modules.items():
            print(f'Layer fw:{name}, a.shape = {input.shape} \n')
            input = module.forward(input)
            print(f'Layer fw:{name} OUT, a.shape = {input.shape} \n')

            # print(f'z.shape = {input.shape} \n{input}')
        return input

    def backward(self, z: np.ndarray):
        # >>>>>>>>> add here
        for name, module in reversed(self.modules.items()):
            print(f'Layer bw:{name}, a.shape = {z.shape} \n')
            input = module.backward(z)
            print(f'Layer bw:{name} OUT, a.shape = {z.shape} \n')

            # print(f'z.shape = {input.shape} \n{input}')
        return input
        pass
        # <<<<<<<<<

## Main Processing Cell

 1. Initialize dataset (`dataset_Circles`). [x]
 2. Declare a simple model (at least 3 hidden layer). [x]
 3. Perform forward pass through the network. [x]
 4. Compute loss. []
 5. Backward prop loss. []
 6. Backward pass MLP. []
 7. Check your computation of gradients via [`gradient_check`](https://datascience-enthusiast.com/DL/Improving_DeepNeural_Networks_Gradient_Checking.html) []
 8. Start crying. []
 9. Repeat until correct ;) []
 10. ... (if error founds -> blame lecturer) []

In [86]:
import pandas
from dataset import dataset_Circles
from utils import gradient_check


In [87]:
dataset_features_X, dataset_labels_Y = dataset_Circles(m=128, radius=0.7, noise=0.0)

mlp = Model()
mlp.add_module(Linear(2, 3), 'Dense_1')
mlp.add_module(Tanh(), 'Tanh_1')
mlp.add_module(Linear(3, 4), 'Dense_2')
mlp.add_module(Tanh(), 'Tanh_2')
mlp.add_module(Linear(4, 5), 'Dense_3')
mlp.add_module(Tanh(), 'Tanh_3')
mlp.add_module(Linear(5, 1), 'Dense_4_out')
mlp.add_module(Sigmoid(), 'Sigmoid')

predicted_Y_hat = mlp.forward(dataset_features_X) # Be careful with the shape - Loss vs Cost

Layer fw:Dense_1, a.shape = (2, 128) 

Layer fw:Dense_1 OUT, a.shape = (3, 128) 

Layer fw:Tanh_1, a.shape = (3, 128) 

Layer fw:Tanh_1 OUT, a.shape = (3, 128) 

Layer fw:Dense_2, a.shape = (3, 128) 

Layer fw:Dense_2 OUT, a.shape = (4, 128) 

Layer fw:Tanh_2, a.shape = (4, 128) 

Layer fw:Tanh_2 OUT, a.shape = (4, 128) 

Layer fw:Dense_3, a.shape = (4, 128) 

Layer fw:Dense_3 OUT, a.shape = (5, 128) 

Layer fw:Tanh_3, a.shape = (5, 128) 

Layer fw:Tanh_3 OUT, a.shape = (5, 128) 

Layer fw:Dense_4_out, a.shape = (5, 128) 

Layer fw:Dense_4_out OUT, a.shape = (1, 128) 

Layer fw:Sigmoid, a.shape = (1, 128) 

Layer fw:Sigmoid OUT, a.shape = (1, 128) 



In [88]:
###>>> start of solution

# TODO add loss function!!!
loss_fn = MSELoss()
loss_fn.forward( predicted_Y_hat,dataset_labels_Y)
mlp.backward(loss_fn.backward( predicted_Y_hat,dataset_labels_Y))
###<<< end of solution

Layer bw:Sigmoid, a.shape = (1, 128) 

Layer bw:Sigmoid OUT, a.shape = (1, 128) 

Layer bw:Dense_4_out, a.shape = (1, 128) 

aaaaaaaa
(5, 1)
(1, 128)
(5, 128)
Layer bw:Dense_4_out OUT, a.shape = (1, 128) 

Layer bw:Tanh_3, a.shape = (1, 128) 

Layer bw:Tanh_3 OUT, a.shape = (1, 128) 

Layer bw:Dense_3, a.shape = (1, 128) 

aaaaaaaa
(4, 5)
(1, 128)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 1 is different from 5)

In [None]:
# Verify your solution!
gradient_check(mlp, loss_fn, dataset_features_X, dataset_labels_Y)
