<a href="https://colab.research.google.com/github/frankausberlin/notebook-collection/blob/main/backpropagationnotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro

Backpropagation of error

1949: Hebb-rule - Donald O. Hebb  <br>
1957: Perceptron-rule - Frank Rosenblat<br>
1960: Delta-rule (single layer) - Widrow & Hoff<br>
1969: Bashing Delta-rule - Minsky & Papert<br>
1986: Backpropagation: extended Delta-rule (muti layer) - McClelland & Rumelhart's<br>

<table><tr></tr><tr><td>

<font size='+2'>Terms</td><td><table>
<tr><td><p>$w$; $b$</td><td><p>weights; bias</td><td><p>$\}$ model parameter</td><td></td></tr>
<tr><td><p>$x$</td><td><p>signals, inputs</td><td><p>$\}$ features, independent variables</td></tr>
<tr><td><p>$y$ or $t$</td><td><p>actual, expected, target</td><td width=50 rowspan=2><p>$\Huge \}$ labels, dependent variables</td></tr>
<tr><td><p>$\hat y$</td><td><p>forecast, predicted</td></tr>
</table> </td></tr></table>

<table>
<tr><td colspan=3 align='center'>

#<font size='+4' color='purple'><b>Backpropagation (extended delta rule)</td></tr><tr><td colspan=3 allign='center'>

perceptron_simple.svg
</td></tr><tr></tr><tr><td>

$$\large net = x \bullet w + b$$<br>
$$\large \hat y = \varphi (net)$$<br>
$$\large loss = \mathcal L (y, \hat y)$$
</td><td>

$$\quad \quad \quad \left| \begin{array}{ll}
\quad net = \text{transfered net input}\\
\quad x = \text{input signals / stimulation}\\
\quad w, b = \text{weights, bias: parameter}\\ 
\quad \varphi = \text {activation function}\\
\quad \mathcal L = \text {error function}\\
\end{array}\right.$$
</td><td>

$$\quad \left| \begin{array}{ll}
\quad y = \text {expected output}\\
\quad \hat y = \text{predicted output}\\
\quad \eta = \text{learningrate eta}\\
\quad \Delta = \text {change value}\\
\quad \small{\left[w_{jk}^{new} = w_{jk}^{old}+\Delta_{jk}\right]}\\
\end{array}\right.$$
</td></tr><tr><td colspan=3>

$\color{purple}{\Huge\quad \quad \Delta_{jk} = -\eta \;\; \delta_j \;\; \hat y_k \quad \quad \quad \quad |}$
$\color{purple}{\begin{array}{ll}
\huge \delta_j = \large \begin{cases}
\varphi\,'(net_j)\cdot \mathcal L\,' (y, \hat y) &\color{black}{: \small \text{output layer}}\\
\varphi\,'(net_j)\cdot \sum_i{\delta_i w_{ij}} &\color{black}{: \small \text{hidden layer}}
\end{cases}
\end{array}}$
</td></tr>
</table>

In [10]:
#@title code
import numpy as np
from numpy import array

np.random.seed (42)
eta           = 0.5                                           # learningrate
activation_h  = sigmoid = lambda x: 1 / (1 + np.exp(-x))      # activation hidden layer: sigmoid
activation_o  = sigmoid                                       # activation output layer: sigmoid
loss          = lambda y: np.mean (1/2*(expected-y)**2)       # error function: mse (mean squared error)
loss_         = lambda y: y-expected                          # derivation error function
activation_h_ = lambda x: sigmoid(x) * (1 - sigmoid(x))       # derivation activation h
activation_o_ = lambda x: sigmoid(x) * (1 - sigmoid(x))       # derivation activation o

input        = array ( [0.1, 0.2, 0.3, 0.4, 0.5] )            # x
expected     = array ( [1.0, 0.0] )                           # y

weights_h    = np.random.randn (3,5)                          #
bias_h       = np.random.randn (3)                            # hidden layer
net_h        = weights_h.dot (input) + bias_h                 #  
input_new    = activation_h (net_h)                           #

weights_o    = np.random.randn (2,3)                          #
bias_o       = np.random.randn (2)                            # output layer
net_o        = weights_o.dot (input_new) + bias_o             # 
output       = activation_o (net_o)                           # forward: hidden -> output

error_old    = loss (output)
############################ BACKPROPAGATION OF ERROR IMPLEMENTS THE EXTENDED DELTA-RULE #################################
delta_o      = activation_o_ (net_o) * loss_ (output)      # backward: output -> hidden                                  #
delta_h      = activation_h_ (net_h) *\
               array ([ delta_o[0]*weights_o[0][0] + delta_o[1]*weights_o[1][0],   # sum_i(d_i * w_ij)  | for            #
                        delta_o[0]*weights_o[0][1] + delta_o[1]*weights_o[1][1],   # sum_i(d_i * w_ij)  | every          #  
                        delta_o[0]*weights_o[0][2] + delta_o[1]*weights_o[1][2] ]) # sum_i(d_i * w_ij)  | j              #
                                                                                                                         #
changes_hw   = array ( [-eta * delta_h[0] * input,     -eta * delta_h[1] * input,     -eta * delta_h[2] * input    ] )   #
changes_hb   = array ( [-eta * delta_h[0] * bias_h[0], -eta * delta_h[1] * bias_h[1], -eta * delta_h[2] * bias_h[2]] )   #
changes_ow   = array ( [-eta * delta_o[0] * input_new, -eta * delta_o[1] * input_new] )                                  #
changes_ob   = array ( [-eta * delta_o[0] * bias_o[0], -eta * delta_o[1] * bias_o[1]] )                                  #
##########################################################################################################################
weights_h    += changes_hw
bias_h       += changes_hb
weights_o    += changes_ow
bias_o       += changes_ob

error_new    = loss ( activation_o ( weights_o.dot ( activation_h ( weights_h.dot (input) + bias_h ) ) + bias_o ) )

(error_new < error_old), error_old.round(4), error_new.round(4)


(True, 0.2014, 0.1995)

# Formulas
<br>

<table>
<tr><td colspan=2><font size='+2'><b>Exponential functions</b></td><td colspan=3 align='center'>$\Large{f(x)=a^x}$</td></tr>
<tr><td><font size='+1'>$$a^{-x} = \frac{1}{a^x}\;\;\;\Bigg|$$</td><td><font size='+1'>$$a^{x+y}=a^x \cdot a^y\;\;\;\Bigg|$$</td><td><font size='+1'>$$a^{x-y}=\frac{a^x}{a^y}\;\;\;\Bigg|$$</td><td><font size='+1'>$$a^0=1\;\;\;\Bigg|$$</td><td><font size='+1'>$a^x=a^y <=> x=y$</td></tr>
</table>

<br><br>

<table width=800>
<tr><td colspan=1><font size='+2'><b>Logarithm</b></td><td colspan=3 align='center'>$\Large y=\log_a(x) <=> x=a^y$</td><td colspan=2 align='left'>$\log_a(x)$: logartihm for <b>x</b> with base <b>a</b><br>(with base <b>10</b>: $\log(x)$, with base <b>e</b>: $\ln(x)$)<br>in Numpy: np.log ->ln / np.log10 ->log</td></tr>
<tr> <td colspan=6>

$\log_a(1)=0\;\;\;\Bigg|\;\;\;\log_a(a)=1\;\;\;\Bigg|\;\;\;\log_a(a^x)=x\;\;\;\Bigg|\;\;\;a^{log_a(x)}=x$

$\log_a(bc)=\log_a(b)+\log_a(c)\;\;\;\Bigg|\;\;\;\log_a(\frac b c)=\log_a(b)-\log_a(c)\;\;\;\Bigg|\;\;\;\log_a(x^d)=d\cdot\log_a(x)$

</td> </tr>
</table>
<br>

<table align='left'>
<tr><td colspan=3>

<font size='+2'><b>Derivation
</td></tr>
<tr></tr>
<tr><td align='center' width=120>
<font size='+1'><b>Rule</b>
</td><td align='center' width=220>
<font size='+1'><b>Function</b>
</td><td align='center' width=400>
<font size='+1'><b>Derivation</b>
</td></tr>

<tr><td>

_Notation_
</td><td>
$\Large{\color{red}f(\color{blue}x)}$<br><br><br>
$\Large{\color{red}f(\color{blue}{x},\color{green}{y})}$
</td><td>

$\text{derivation:}\;\;\large{\frac{d}{d\color{blue}x}\color{red}f(\color{blue}x)=\frac{d\color{red}f(\color{blue}x)}{d\color{blue}x}=\frac{d\color{red}f}{d\color{blue}x}= \color{red}{f'}}$<br>

$\text{partial derivation:}\;\;\large{\frac{\partial \color{red}f}{\partial \color{blue}x}\;\Big | \; \frac{\partial \color{red}f}{\partial \color{green}y}}$

</td></tr>
<tr><td>

_Constant_
</td><td>
$\Large{f(x)=\color{red}c}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}0}$
</td></tr>
<tr><td>

_Identity_
</td><td>
$\Large{f(x)=\color{red}x}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}1}$
</td></tr>
<tr><td>

_Constant\
Multiple_
</td><td>
$\Large{f(x) = \color{red}c\, x}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}c}$
</td></tr>
<tr><td>

_Factor_
</td><td>
$\Large{f(x) = \color{red}c\, g(x)}$
</td><td>
$\Large{\frac{df}{dx}=\color{red}c\,\frac{dg}{dx}}$
</td></tr>
<tr><td>

_Power_
</td><td>
$\Large{f(x)=x^{\color{red}n}}$
</td><td>
$\Large{\frac{df}{dx} = \color{red}n x^{\color{red}{n -1}}}$
</td></tr>
<tr><td>

_Summation_$\qquad$
</td><td>
$\Large{f(x)=\color{blue}g(x) \color{red}\pm \color{green}h(x)}$
</td><td>
$\Large{\frac{df}{dx} = \frac{d\color{blue}g}{dx} \color{red}\pm \frac{d\color{green}h}{dx}}$
</td></tr>
<tr><td>

_Product_
</td><td>
$\large{f(x)=\color{blue}g(x) \color{red}\cdot \color{green}h(x) \qquad}$
</td><td>
$\large{\frac{df}{dx} = \frac{d\color{green}h}{dx} \color{red}\cdot \color{blue}g(x) \color{red}+ \color{green}h(x) \color{red}\cdot \frac{d\color{blue}g}{dx}}$ 
</td></tr>
<tr><td>

_Quotient_
</td><td>
$\Large{f(x)=\color{red}{\frac{ \color{blue}g\color{black}{(x)} }{ \color{green}h\color{black}{(x)} }}}$

</td><td>
$\Large{ 
\frac{df}{dx} = \color{red}{
\frac{ \color{black}{\frac{d\color{blue}g}{dx} \color{red}\cdot \color{green}h(x)} - \color{black}{\color{blue}g(x) \color{red}\cdot \frac{d\color{green}h}{dx} }}
{ \color{green}h\color{black}{(x)}^2} }}$
</td></tr>
<tr><td>

_Chain_
</td><td>
$\Large{f(x)=\color{green}h\color{red}(\color{blue}g(x)\color{red})}$
</td><td>
$\Large{\frac{df}{dx} = \frac{d\color{green}h}{d\color{blue}g} \color{red}\cdot \frac{d\color{blue}g}{dx}}$
</td></tr>
<tr><td>

_Partial_
</td><td>
$\Large{f(\color{red}{x,y})=\color{green}h(\color{red}x)+\color{blue}g(\color{red}y)}$

</td><td>
$\Large{\left. \frac{\partial f}{\partial \color{red}x} = \frac{d\color{green}h}{d\color{red}x}\quad
\right |\quad
\frac{\partial f}{\partial \color{red}y} = \frac{d\color{blue}g}{d\color{red}y}}$
</td></tr>

</table>

<br><br>


