# Backpropagation learning for a multilayer perceptron

In [None]:
using MNIST, Plots               

In [None]:
"""
    f(x)

Logistic (sigmoid) activation `1 / (1 + exp(-x))`, applied elementwise, so `x` may be a
number or an array. Replace this definition to use your own activation function.
"""
f(x) = 1 ./ (1 .+ exp.(-x))     # explicit dots: no ambiguous `1./` literal, no implicit array math
"""
    df(y)

Derivative of the logistic activation written as a function of its output `y`
(i.e. the derivative composed with the inverse of the activation): `y * (1 - y)`.
Applied elementwise, so `y` may be a number or an array.
"""
df(y) = y .* (1 .- y)          # `1 .- y` broadcasts; plain `1 - y` is not defined for arrays

# ---- training set ----
m = 60_000                      # number of examples in the training set

# ---- layer widths: input, hidden, output ----
n0, n1, n2 = 784, 25, 10

# ---- learning hyperparameters ----
eta = 0.1                       # learning rate
epsinit = 0.01                  # scale of the random initial weights and biases

# ---- parameters: two fully connected layers, small Gaussian initial values ----
W1 = randn(n1, n0) * epsinit    # first-layer weights, n1 x n0
W2 = randn(n2, n1) * epsinit    # second-layer weights, n2 x n1
b1 = randn(n1, 1) * epsinit     # first-layer biases, stored as an n1 x 1 matrix
b2 = randn(n2, 1) * epsinit     # second-layer biases, stored as an n2 x 1 matrix

# ---- schedule and per-step error logs ----
tmax = 600_000                  # maximum number of learning updates
tshow = 1_000                   # visualize once every tshow updates
errsq = zeros(Float64, tmax)    # squared error recorded at each update
errcl = zeros(Float64, tmax);   # classification error recorded at each update

In [None]:
# Select the GR backend for Plots with a 600x600 figure and no legend.
# You may need to change the size numbers to fit your screen.
gr(size = (600, 600), legend = :none)
# Online backpropagation: every step draws one random training example, runs a forward
# pass, records the squared and classification errors, backpropagates the error and
# applies a gradient step (learning rate eta) to both weight matrices and both biases.
# Every tshow steps the current example, activations, running errors and first-layer
# weights are redrawn.
for t = 1:tmax
    # choose randomly from the training set; rand(1:m) is uniform over valid indices,
    # whereas ceil(Int, m*rand()) yields the invalid index 0 when rand() returns 0.0
    i = rand(1:m)
    x0 = trainfeatures(i)/255   # presumably rescales 0-255 pixel intensities into [0,1]
    digit = convert(Int, trainlabel(i))
    # use label to create desired output in one-hot representation;
    # digit 0 occupies slot 10, digits 1-9 occupy slots 1-9
    y = zeros(n2,1)
    y[digit == 0 ? 10 : digit] = 1
    # forward pass
    x1 = f(W1*x0+b1)
    x2 = f(W2*x1+b2)
    # error computation
    errsq[t] = sum((y-x2).^2)
    # rem(..., 10) maps output slot 10 back to digit 0 before comparing with the label
    errcl[t] = Float64(rem(indmax(x2),10) != digit)
    # backward pass: output-layer delta first, then propagate through W2
    delta2 = (y-x2).*df(x2)
    delta1 = (W2'*delta2).*df(x1)
    # learning updates
    W2 += eta*delta2*x1'
    W1 += eta*delta1*x0'
    b2 += eta*delta2
    b1 += eta*delta1
    if rem(t,tshow) == 0    # visualization every tshow steps
        # running averages of both error measures over steps 1..t
        avgerrsq = cumsum(errsq[1:t])./(1:t)
        avgerrcl = cumsum(errcl[1:t])./(1:t)
        IJulia.clear_output(true)
        plot(
            heatmap(
                reshape(x0,28,28),
                axis = nothing,
                yflip = true, color = :grays,
                title = @sprintf("x0 at t=%d", t)
            ),
            bar(x1, ylabel = "x1"),
            bar(x2, ylabel = "x2"),
            plot(
                [avgerrsq avgerrcl],
                ylabel = "error",
                # per-series attributes must be a row (1 x 2) matrix, one entry per column
                label = ["squared" "class"],
                legend = :topright, ylim = (0,1)
                ),
            plot(
                # k (not i) so the comprehension does not shadow the example index
                [heatmap(reshape(W1',28,28,n1)[:,:,k],
                axis = nothing,
                color = :grays,
                yflip = true
                ) for k = 1:n1]...),   # first layer weight vectors
            layout = @layout [a b c d; e{0.75h}]
            ) |> display
        sleep(0.01)
    end
end