<a href="https://colab.research.google.com/github/frankausberlin/notebook-collection/blob/main/backpropagationnotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<table><tr><td width=800 align=center>

# <font size='+3'/><b>Backpropagation of error

</b>is an algorithm that implements the<br><br>
<font size='+3'/><b>extended delta rule
</td></tr></table>

<br>
<table width=800>
<tr></tr>
<tr><td align=center>


<table><tr></tr><tr><td align=center width=100>
<font size='+2'/>Terms</td><td><table>
<tr><td><p>$w$; $b$</td><td><p>weights; bias</td><td><p>$\}$ model parameter</td><td></td></tr>
<tr><td><p>$x$</td><td><p>signals, inputs</td><td><p>$\}$ features, independent variables</td></tr>
<tr><td><p>$y$ or $t$</td><td><p>actual, expected, target</td><td width=50 rowspan=2><p>$\Huge \}$ labels, dependent variables</td></tr>
<tr><td><p>$\hat y$</td><td><p>forecast, predicted</td></tr>
</table> </td></tr></table>

</td></tr>
</table>
<br>


<table width=800>
<tr></tr>
<tr><td align=center>

<table>
<tr><td colspan=5 align=center><font size='+1'/>History</td></tr>
<tr></tr>
<tr><td><b>1949</td><td><b>1957</td><td><b>1960</td><td><b>1969</td><td><b>1986</td></tr>
<tr><td><br>Hebb-rule<br><br>Donald O. Hebb</td><td><br>Perceptron-rule<br><br>Frank Rosenblat</td><td><br>Delta-rule (single layer)<br><br>Widrow & Hoff</td><td><br>Bashing Delta-rule<br><br>Minsky & Papert</td><td><br>Backpropagation (multi layer)<br><br>McClelland & Rumelhart</td></tr>
</table>

</td></tr>
</table>


<table>
<tr><td colspan=3 align='center'>

##<font size='+3' color='purple'><b>Extended delta rule formula</td></tr><tr><td colspan=3 allign='center'>

perceptron_simple.svg
</td></tr><tr></tr><tr><td>

$$\large net = x \bullet w + b$$<br>
$$\large \hat y = \varphi (net)$$<br>
$$\large loss = \mathcal L (y, \hat y)$$
</td><td>

$$\quad \quad \quad \left| \begin{array}{ll}
\quad net = \text{transfered net input}\\
\quad x = \text{input signals / stimulation}\\
\quad w, b = \text{parameter: weights, bias}\\ 
\quad \varphi = \text {activation function}\\
\quad \mathcal L = \text {error function}\\
\end{array}\right.$$
</td><td>

$$\quad \left| \begin{array}{ll}
\quad y = \text {expected output}\\
\quad \hat y = \text{predicted output}\\
\quad \eta = \text{learningrate eta}\\
\quad \Delta = \text {change value}\\
\quad \small{\left[w_{jk}^{new} = w_{jk}^{old}+\Delta_{jk}\right]}\\
\end{array}\right.$$
</td></tr><tr><td colspan=3 align=center>

<table>
<tr><td>
<p>$$\color{purple}{\Huge\Delta_{jk} = -\eta \;\; \delta_j \;\; \hat y_k \qquad}$$
</td><td>
<p>$\left| 
\color{purple}{\begin{array}{ll}
\huge \qquad \delta_j = \large \begin{cases}
\varphi\,'(net_j)\cdot \mathcal L\,' (y, \hat y) &\color{black}{: \small \text{output layer}}\\
\varphi\,'(net_j)\cdot \sum_i{\delta_i w_{ij}} &\color{black}{: \small \text{hidden layer}}
\end{cases}\\
\qquad  \rule{12cm}{0.4pt}\\
\Large \qquad \color{purple}{\Delta_j ^b = - \eta \delta_j b_j} \color{black}{\small \qquad \text{for bias}}
\end{array}}
\right.$
</td></tr>
</table>

</td></tr>
</table>

## Exemplary implementation with numpy



In [169]:
from numpy            import array, sum, mean, exp, arange
from numpy.random     import randn, seed 

seed     (42)
act_h    = sigmoid = lambda x: 1 / (1 + exp(-x))               # activation hidden layer: sigmoid
act_o    = sigmoid                                             # activation output layer: sigmoid
loss     = lambda t, y: mean (1/2*(t-y)**2)                    # error function: mse (mean squared error)
loss_    = lambda t, y: y - t                                  # derivation error function
act_h_   = lambda x: sigmoid(x) * (1 - sigmoid(x))             # derivation activation hidden
act_o_   = lambda x: sigmoid(x) * (1 - sigmoid(x))             # derivation activation output

nrH, nrO = 3, 2                                                # nr hiddens / outputs perceptrons
eta, bs  = 0.5, 1                                              # learning rate, batch size

inp      = array([0.1, 0.2, 0.3, 0.4, 0.5]*bs).reshape (-1,5)  # x : 5 input signals              # mini batch
tar      = array([1.0, 0.0]*bs).reshape (-1,2)                 # y : 2 expected outputs           # with bs entries

wei_h    = randn (nrH, inp.shape[-1])                          # 
bia_h    = randn (nrH)                                         # hidden layer                     # forward:
net_h    = inp @ wei_h.T + bia_h                               # with 3 perceptrons               # hidden -> output
out_h    = act_h (net_h)                                       #

wei_o    = randn (nrO, out_h.shape[-1])                        # 
bia_o    = randn (nrO)                                         # output layer
net_o    = out_h @ wei_o.T + bia_o                             # with 2 perceptrons
out_o    = act_o (net_o)                                       # 

error    = loss (tar, out_o)                                   # calculate error
#######################################################################################################################
del_o    = act_o_ (net_o) * loss_ (tar, out_o)                 # deltas output layer                                  #
J, I     = range (wei_h.shape[0]), range (wei_o.shape[0])      # J / I ranges hidden / output     # backward:         #
del_h    = act_h_ (net_h) * \
           array ([sum(del_o * wei_o[:,j])       for j in J ]) # deltas hidden layer              # output -> hidden  #
                                                                                                                      #
cha_hw   = array ([-eta * delta_h[j] * inp       for j in J ]) #                                                      #
cha_hb   = array ([-eta * delta_h[j] * bias_h[j] for j in J ]) # change values for                                    #
cha_ow   = array ([-eta * delta_o[i] * out_h     for i in I])  # every data in mini batch                             #
cha_ob   = array ([-eta * delta_o[i] * bias_o[i] for i in I])  #                                                      #
                                                                                                                      #
cha_hw   = sum (cha_hw, axis=1)                                # stochastic step: sum up                              #
cha_ow   = sum (cha_ow, axis=1)                                # change values for mini batch                         #
########################## BACKPROPAGATION OF ERROR IMPLEMENTS THE EXTENDED DELTA-RULE ################################
wei_h    += cha_hw                                             #
bia_h    += cha_hb                                             # update weights
wei_o    += cha_ow                                             # and bias
bia_o    += cha_ob                                             #

err_new  = loss ( tar, act_o (act_h (inp @ wei_h.T + bia_h) @ wei_o.T + bia_o) )

print ((err_new < error), error.round(4), '<', err_new.round(4))

True 0.2014 < 0.1997


# Formulas
<br>

<table>
<tr><td colspan=2><font size='+2'><b>Exponential functions</b></td><td colspan=3 align='center'>$\Large{f(x)=a^x}$</td></tr>
<tr><td><font size='+1'>$$a^{-x} = \frac{1}{a^x}\;\;\;\Bigg|$$</td><td><font size='+1'>$$a^{x+y}=a^x \cdot a^y\;\;\;\Bigg|$$</td><td><font size='+1'>$$a^{x-y}=\frac{a^x}{a^y}\;\;\;\Bigg|$$</td><td><font size='+1'>$$a^0=1\;\;\;\Bigg|$$</td><td><font size='+1'>$a^x=a^y <=> x=y$</td></tr>
</table>

<br><br>

<table width=800>
<tr><td colspan=1><font size='+2'><b>Logarithm</b></td><td colspan=3 align='center'>$\Large y=\log_a(x) <=> x=a^y$</td><td colspan=2 align='left'>$\log_a(x)$: logartihm for <b>x</b> with base <b>a</b><br>(with base <b>10</b>: $\log(x)$, with base <b>e</b>: $\ln(x)$)<br>in Numpy: np.log ->ln / np.log10 ->log</td></tr>
<tr> <td colspan=6>

$\log_a(1)=0\;\;\;\Bigg|\;\;\;\log_a(a)=1\;\;\;\Bigg|\;\;\;\log_a(a^x)=x\;\;\;\Bigg|\;\;\;a^{log_a(x)}=x$

$\log_a(bc)=\log_a(b)+\log_a(c)\;\;\;\Bigg|\;\;\;\log_a(\frac b c)=\log_a(b)-\log_a(c)\;\;\;\Bigg|\;\;\;\log_a(x^d)=d\cdot\log_a(x)$

</td> </tr>
</table>
<br>

<table align='left'>
<tr><td colspan=3>

<font size='+2'><b>Derivation
</td></tr>
<tr></tr>
<tr><td align='center' width=120>
<font size='+1'><b>Rule</b>
</td><td align='center' width=220>
<font size='+1'><b>Function</b>
</td><td align='center' width=400>
<font size='+1'><b>Derivation</b>
</td></tr>

<tr><td>

_Notation_
</td><td>
$\Large{\color{red}f(\color{blue}x)}$<br><br><br>
$\Large{\color{red}f(\color{blue}{x},\color{green}{y})}$
</td><td>

$\text{derivation:}\;\;\large{\frac{d}{d\color{blue}x}\color{red}f(\color{blue}x)=\frac{d\color{red}f(\color{blue}x)}{d\color{blue}x}=\frac{d\color{red}f}{d\color{blue}x}= \color{red}{f'}}$<br>

$\text{partial derivation:}\;\;\large{\frac{\partial \color{red}f}{\partial \color{blue}x}\;\Big | \; \frac{\partial \color{red}f}{\partial \color{green}y}}$

</td></tr>
<tr><td>

_Constant_
</td><td>
$\Large{f(x)=\color{red}c}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}0}$
</td></tr>
<tr><td>

_Identity_
</td><td>
$\Large{f(x)=\color{red}x}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}1}$
</td></tr>
<tr><td>

_Constant\
Multiple_
</td><td>
$\Large{f(x) = \color{red}c\, x}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}c}$
</td></tr>
<tr><td>

_Factor_
</td><td>
$\Large{f(x) = \color{red}c\, g(x)}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}c\,\frac{dg}{dx}}$
</td></tr>
<tr><td>

_Power_
</td><td>
$\Large{f(x)=x^{\color{red}n}}$
</td><td>
$\Large{\frac{df}{dx} = \color{red}n x^{\color{red}{n -1}}}$
</td></tr>
<tr><td>

_Summation_$\qquad$
</td><td>
$\Large{f(x)=\color{blue}g(x) \color{red}\pm \color{green}h(x)}$
</td><td>
$\Large{\frac{df}{dx} = \frac{d\color{blue}g}{dx} \color{red}\pm \frac{d\color{green}h}{dx}}$
</td></tr>
<tr><td>

_Product_
</td><td>
$\large{f(x)=\color{blue}g(x) \color{red}\cdot \color{green}h(x) \qquad}$
</td><td>
$\large{\frac{df}{dx} = \frac{d\color{green}h}{dx} \color{red}\cdot \color{blue}g(x) \color{red}+ \color{green}h(x) \color{red}\cdot \frac{d\color{blue}g}{dx}}$ 
</td></tr>
<tr><td>

_Quotient_
</td><td>
$\Large{f(x)=\color{red}{\frac{ \color{blue}g\color{black}{(x)} }{ \color{green}h\color{black}{(x)} }}}$

</td><td>
$\Large{ 
\frac{df}{dx} = \color{red}{
\frac{ \color{black}{\frac{d\color{blue}g}{dx} \color{red}\cdot \color{green}h(x)} - \color{black}{\color{blue}g(x) \color{red}\cdot \frac{d\color{green}h}{dx} }}
{ \color{green}h\color{black}{(x)}^2} }}$
</td></tr>
<tr><td>

_Chain_
</td><td>
$\Large{f(x)=\color{green}h\color{red}(\color{blue}g(x)\color{red})}$
</td><td>
$\Large{\frac{df}{dx} = \frac{d\color{green}h}{d\color{blue}g} \color{red}\cdot \frac{d\color{blue}g}{dx}}$
</td></tr>
<tr><td>

_Partial_
</td><td>
$\Large{f(\color{red}{x,y})=\color{green}h(\color{red}x)+\color{blue}g(\color{red}y)}$

</td><td>
$\Large{\left. \frac{\partial f}{\partial \color{red}x} = \frac{d\color{green}h}{d\color{red}x}\quad
\right |\quad
\frac{\partial f}{\partial \color{red}y} = \frac{d\color{blue}g}{d\color{red}y}}$
</td></tr>

</table>

<br><br>


