# MLP 多层感知机

- 在utils中定义了sigmoid，ReLU激活函数。
- 二分类使用NLL损失，即使用模型预测概率计算负对数似然
  - $$\mathcal L = -\sum y\log p$$
  - 损失函数对第$i$个预测概率$p_i$的梯度为$$\frac{\partial \mathcal L}{\partial p_i} = - \frac{y_i}{p_i}$$
  - 在二分类中，正类和负类的预测概率分别为inputs和1-inputs，对应的标签分别为labels和1-labels。

In [1]:
import utils
import numpy as np

# 定义损失函数及其梯度
class NLLLoss(object):
    """Negative log-likelihood loss for binary classification.

    For a predicted probability ``p`` and a label ``y`` the per-element loss is
    ``-(y * log(p) + (1 - y) * log(1 - p))``.

    ``reduction`` selects how ``forward`` aggregates the per-element losses:
    ``'mean'`` returns their mean, ``'sum'`` returns their sum, and any other
    value (including the default ``None``) returns the unreduced array.

    NOTE: ``backward`` always returns the gradient of the *per-element* loss
    scaled by ``prev_grad``; it does not divide by the number of elements when
    ``reduction == 'mean'``. Callers that want the gradient of the mean must
    fold that factor into ``prev_grad``.
    """

    def __init__(self, reduction: str = None):
        self.reduction = reduction

        # Context saved by forward() and consumed by backward().
        self.ctx_inputs = None
        self.ctx_labels = None

    def forward(self, inputs, labels):
        """Compute the loss and remember the operands for ``backward``.

        Args:
            inputs: array of predicted probabilities; every value must lie
                strictly between 0 and 1.
            labels: array of targets with the same number of samples (first
                dimension) as ``inputs``. Flat labels are given a trailing
                axis when ``inputs`` is 2-D so both arrays line up.

        Returns:
            The mean or the sum of the per-element losses when ``reduction``
            is ``'mean'`` or ``'sum'``; otherwise the per-element loss array.

        Raises:
            AssertionError: if the sample counts differ, a probability is
                outside the open interval (0, 1), or the shapes still differ
                after aligning the labels.
        """
        assert inputs.shape[0] == labels.shape[0], (
            "inputs and labels should be in same number of samples "
            f"but get {inputs.shape[0]} inputs and {labels.shape[0]} labels"
        )
        assert np.min(inputs) > 0 and np.max(inputs) < 1.0, (
            "inputs should be between 0 and 1, "
            f"but get value from {np.min(inputs):.4f} to {np.max(inputs):.4f}"
        )

        # Only flat labels need the extra axis; labels that already come as a
        # column would otherwise become 3-D and fail the shape check below.
        if inputs.ndim == 2 and labels.ndim == 1:
            labels = labels[:, None]
        assert inputs.shape == labels.shape, (
            f"inputs in {inputs.shape} and labels in {labels.shape} "
            "should have the same shape"
        )

        # Per-element log-likelihood of the observed labels.
        likelihood = labels * np.log(inputs) + (1 - labels) * np.log(1 - inputs)
        loss = -likelihood

        # Keep the operands so backward() can differentiate the loss.
        self.ctx_inputs = inputs
        self.ctx_labels = labels

        if self.reduction == 'mean':
            return np.mean(loss)
        if self.reduction == 'sum':
            return np.sum(loss)
        return loss

    def backward(self, prev_grad):
        """Return the gradient of the loss with respect to ``inputs``.

        Args:
            prev_grad: upstream gradient (scalar or array broadcastable to the
                shape of the saved inputs) that the local gradient is
                multiplied by.

        Returns:
            ``-(y / p - (1 - y) / (1 - p)) * prev_grad`` evaluated on the
            operands saved by the last ``forward`` call.

        Raises:
            AssertionError: if ``forward`` has not been called yet.
        """
        assert self.ctx_inputs is not None and self.ctx_labels is not None, \
            "forward() must be called before backward()"

        probs = self.ctx_inputs
        targets = self.ctx_labels

        # Derivative of the log-likelihood with respect to the probabilities.
        grad = targets * np.reciprocal(probs) \
            - (1 - targets) * np.reciprocal(1 - probs)

        # The loss is the negated likelihood, hence the sign flip.
        return -grad * prev_grad

对于mlp中的每一层,定义一层感知器.
包括正向传播,反向传播和梯度更新.

其中正向传播公式为
$$y = \sigma(z) = \sigma(w^Tx + b)$$
其中z为输入特征的加权和,保存在ctx_hidden变量中用于计算残差

反向传播过程中,首先计算残差
$$\frac{\partial \mathcal L}{\partial z} = \frac{\partial \mathcal L}{\partial y} \frac{\partial y}{\partial z}$$

由残差可以得到损失函数对权重的梯度$$\frac{\partial \mathcal L}{\partial w} = x^T \frac{\partial \mathcal L}{\partial z}$$,保存在上下文中用于梯度更新.
同时需要返回损失函数对前一层输出的梯度,以便前一层的梯度计算.

In [None]:
class LinearLayer:
    """One fully connected layer of an MLP followed by an activation.

    The layer computes ``activation(inputs @ weight)``. When ``bias`` is
    enabled, a column of ones is appended to the inputs and the weight matrix
    carries one extra row, so the bias is learned as part of ``weight``.

    Args:
        c_in: number of input features.
        c_out: number of output features.
        init_mean: mean of the normal distribution used to draw the weights.
        init_var: passed to ``np.random.normal`` as its ``scale`` argument,
            i.e. it is used as the standard deviation of the initial weights.
        bias: whether to learn a bias term.
        activation: activation name resolved through ``_get_activation``
            (defined outside this class). ``backward`` only supports
            callables whose ``__name__`` is ``'relu'`` or ``'sigmoid'``.
    """

    def __init__(
        self,
        c_in: int, c_out: int,
        init_mean: float, init_var: float,
        bias:bool = False,
        activation: str = 'relu'
    ) -> None:

        self.bias = bias
        # The bias occupies one additional (last) row of the weight matrix.
        weight_rows = c_in + 1 if self.bias else c_in

        # Gradient of the loss w.r.t. ``weight``; filled in by backward().
        self.grad = None
        self.weight = np.random.normal(
            init_mean, init_var,
            size=(weight_rows, c_out)
        )

        self.activation = _get_activation(activation)

        # Context saved by forward() and consumed by backward().
        self.ctx_inputs = None
        self.ctx_hidden = None
        self.ctx_outputs = None

    def forward(self, inputs):
        """Run the layer on a batch and remember the intermediate values.

        Args:
            inputs: 2-D array whose first axis is the batch and whose second
                axis holds the ``c_in`` features.

        Returns:
            The activated outputs, also stored in ``ctx_outputs``.
        """
        batch_size = inputs.shape[0]

        if self.bias:
            # Append a constant 1 feature so the last weight row acts as bias.
            self.ctx_inputs = np.concatenate(
                [inputs, np.ones(shape=(batch_size, 1))],
                axis=1,
            )
        else:
            self.ctx_inputs = inputs

        self.ctx_hidden = np.matmul(self.ctx_inputs, self.weight)
        self.ctx_outputs = self.activation(self.ctx_hidden)

        return self.ctx_outputs

    def backward(self, prev_grad):
        """Back-propagate ``prev_grad`` through the layer.

        Stores the gradient of the loss with respect to ``weight`` in
        ``self.grad`` for a later ``update`` call.

        Args:
            prev_grad: gradient of the loss with respect to this layer's
                outputs; must have the same shape as the saved pre-activation.

        Returns:
            Gradient of the loss with respect to the layer's original inputs
            (the column belonging to the appended bias feature is dropped).

        Raises:
            AssertionError: if ``forward`` has not been called or the shapes
                do not match.
            NotImplementedError: if the activation is neither relu nor sigmoid.
        """
        assert self.ctx_hidden is not None, \
            "forward() must be called before backward()"
        assert self.ctx_hidden.shape == prev_grad.shape, (
            "expected same shape of ctx_hidden and prev_grad, "
            f"but get ctx_hidden in {self.ctx_hidden.shape} and "
            f"prev_grad in {prev_grad.shape}"
        )

        # The derivative of the activation is selected by the callable's name.
        act_name = getattr(self.activation, '__name__', repr(self.activation))
        if act_name == 'relu':
            # Pre-activations that are >= 0 pass the gradient through.
            residual = prev_grad * np.where(self.ctx_hidden >= 0, 1, 0)
        elif act_name == 'sigmoid':
            # sigmoid'(z) = y * (1 - y), expressed with the saved outputs.
            residual = prev_grad * (self.ctx_outputs * (1 - self.ctx_outputs))
        else:
            raise NotImplementedError(
                f"activation {act_name!r} not supported"
            )

        self.grad = np.matmul(self.ctx_inputs.T, residual)

        input_grad = np.matmul(residual, self.weight.T)
        if self.bias:
            # The last column belongs to the constant bias feature, which is
            # not an input of the previous layer.
            return input_grad[:, :-1]
        return input_grad

    def update(self, learning_rate):
        """Apply one gradient-descent step and clear the stored gradient.

        Args:
            learning_rate: step size multiplying the stored gradient.

        Raises:
            AssertionError: if no gradient has been computed by ``backward``.
        """
        assert self.grad is not None, \
            "backward() must be called before update()"

        self.weight = self.weight - learning_rate * self.grad
        self.grad = None