# Multilayer Perceptron (MLP) 
(c) Deniz Yuret, 2019

* Objectives: See the effect of nonlinearities, learn about regularization and dropout to combat overfitting.
* Prerequisites: Linear models (lin.ipynb), AutoGrad, Param, KnetArray, gpu, nll, zeroone
* Knet: zeroone, progress, sgd, load, save, gc (used by trainresults)
* Knet: xavier, param, param0 (defined and explained)
* Knet: Param, KnetArray, gpu (used by param, param0)
* Knet: Data, nll, relu, training (used in model definitions)

In [None]:
# Set display width, load packages, import symbols
ENV["COLUMNS"]=72
using Pkg; for p in ("Knet","Plots"); haskey(Pkg.installed(),p) || Pkg.add(p); end
using Knet: Knet, dir, zeroone, progress, sgd, load, save, gc, Param, KnetArray, gpu, Data, nll, relu, training # param, param0, xavier
using Statistics: mean
using Base.Iterators: flatten

In [None]:
# Load data (see 02.mnist.ipynb)
# The included script is expected to define `mnistdata`, which is called on the next line.
include(Knet.dir("data","mnist.jl"))  # Load data
# xsize=(784,:) presumably flattens each image into a 784-element column -- confirm in mnist.jl
dtrn,dtst = mnistdata(xsize=(784,:)); # dtrn and dtst = [ (x1,y1), (x2,y2), ... ] where xi,yi are minibatches of 100

In [None]:
# For running experiments
"""
    trainresults(file, model; o...)

Ask on standard input whether to train `model` from scratch. If the reply starts
with `y`, train with `sgd` on 100 repetitions of the global `dtrn`, record a snapshot
once every `length(dtrn)` iterations (starting with the first), and save the snapshots
to `file`. Otherwise load previously saved results from `file`, downloading the file
first if it does not exist locally.

Returns a 4×N `Float32` matrix whose rows are, in order: `model(dtrn)`, `model(dtst)`,
`zeroone(model,dtrn)`, `zeroone(model,dtst)`. The row-wise minimum is printed before
returning. The keyword arguments `o...` are accepted but currently unused.
"""
function trainresults(file, model; o...)
    print("Train from scratch? ")
    # Use `startswith` rather than indexing `[1]`: an empty reply (plain Enter or EOF)
    # now means "no" instead of raising a BoundsError.
    if startswith(readline(), "y")
        # Keep items 1, n+1, 2n+1, ...  Written as (i-1) % n == 0 so that n == 1 keeps
        # every item (with `i % n == 1` nothing would be kept when n == 1).
        takeevery(n, itr) = (x for (i, x) in enumerate(itr) if (i - 1) % n == 0)
        steps = progress(sgd(model, repeat(dtrn, 100)))
        snapshots = ((model(dtrn), model(dtst), zeroone(model, dtrn), zeroone(model, dtst))
                     for _ in takeevery(length(dtrn), steps))
        # Flatten the 4-tuples and lay them out as one column per snapshot.
        r = reshape(collect(Float32, flatten(snapshots)), (4, :))
        Knet.save(file, "results", r)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file", file)
        r = Knet.load(file, "results")
    end
    println(minimum(r, dims=2))
    return r
end

In [None]:
# Some utilities to make model definitions easier:
# param(d...) builds an array of size d with `init`, converts it with `atype`,
# and wraps the result in a Param.
function param(d...; init=xavier, atype=atype())
    values = init(d...)
    return Param(atype(values))
end
# param0(d...) is param with all-zero initialization.
function param0(d...; atype=atype())
    return param(d...; init=zeros, atype=atype)
end
# xavier(o,i) returns an o×i matrix of uniform random values in [-s, s), s = sqrt(2/(i+o)).
function xavier(o, i)
    s = sqrt(2 / (i + o))
    return 2s .* rand(o, i) .- s
end
# atype() picks the array type: KnetArray{Float32} when gpu() >= 0, otherwise Array{Float32}.
function atype()
    if gpu() >= 0
        return KnetArray{Float32}
    end
    return Array{Float32}
end

## A generic multilayer model

In [None]:
# A Chain stores its layers as a tuple and applies them in order.
struct Chain
    layers
    Chain(layers...) = new(layers)
end

# Prediction: feed x through each layer, first to last (returns x unchanged if there are no layers).
(c::Chain)(x) = foldl((h, layer) -> layer(h), c.layers; init=x)
# Loss for one minibatch: nll of the predictions for x against the labels y.
function (c::Chain)(x, y)
    scores = c(x)
    return nll(scores, y)
end

# Loss for a dataset: mean of the minibatch losses.
function (c::Chain)(d::Data)
    return mean(c(x, y) for (x, y) in d)
end

## Multiple linear layers do not improve over a single linear layer

In [None]:
# A linear layer (see lin.ipynb) with a weight field w and a bias field b.
struct Layer0
    w
    b
end

# Build a layer taking i inputs to o outputs: w comes from param(o,i), b from param0(o).
function Layer0(i::Int, o::Int)
    return Layer0(param(o, i), param0(o))
end

# Forward pass: affine map of the input.
function (l::Layer0)(x)
    return l.w * x .+ l.b
end

In [None]:
# An example model made of two linear layers: 784 -> 64 -> 10
model = Chain(Layer0(784, 64), Layer0(64, 10))
# Print the summaries of each layer's weight and bias
for l in model.layers
    println(summary.((l.w, l.b)))
end

In [None]:
# Train the two layer model (or load saved results); mlp1 holds the 4-row results matrix
# Reference run: 52s [0.240726; 0.281965; 0.0691833; 0.0794]
# (the bracketed values are the row minimums printed by trainresults)
mlp1 = trainresults("mlp113a.jld2", model);

In [None]:
# A single layer (linear) model: 784 -> 10
model = Chain(Layer0(784, 10))
# Print the summaries of the layer's weight and bias
for l in model.layers
    println(summary.((l.w, l.b)))
end

In [None]:
# Train the single layer (linear) model (or load saved results); lin1 holds the 4-row results matrix
# Reference run: 43s [0.242353; 0.267041; 0.0669667; 0.0749]
# (the bracketed values are the row minimums printed by trainresults)
lin1 = trainresults("mlp113b.jld2", model);

In [None]:
using Plots; default(fmt=:png,ls=:auto)

In [None]:
# The multilinear model converges to a solution similar to the linear model's;
# it is not identical because the problem is non-convex.
# Rows 1 and 2 of the results hold the train and test loss.
plot([lin1[1,:], lin1[2,:], mlp1[1,:], mlp1[2,:]], ylim=(0.0,0.4),
    labels=[:trnLin :tstLin :trnMulti :tstMulti],xlabel="Epochs",ylabel="Loss")  

In [None]:
# The error results are also close to those of the linear model.
# Rows 3 and 4 of the results hold the train and test error.
# labels is a 1×4 row matrix (one label per series), matching the other plots in this file.
plot([lin1[3,:], lin1[4,:], mlp1[3,:], mlp1[4,:]], ylim=(0.0,0.1),
    labels=[:trnLin :tstLin :trnMulti :tstMulti], xlabel="Epochs", ylabel="Error")

## Multiple linear layers are useless because they are equivalent to a single linear layer
If we write down the computation and do some algebra, we can show that the result is still an affine function of the input, i.e. stacking multiple linear layers does not increase the representational capacity of the model:

\begin{align*}
\hat{p} &= \mbox{soft}(W_2 (W_1 x + b_1) + b_2) \\
&= \mbox{soft}((W_2 W_1)\, x + W_2 b_1 + b_2) \\
&= \mbox{soft}(W x + b)
\end{align*}

## Multilayer Perceptron (MLP) adds non-linearities between layers

In [None]:
# Using nonlinearities between layers results in a model with higher capacity and helps with underfitting.
# relu(x)=max(0,x) is a popular function used for this purpose; it replaces all negative values with zeros.
struct Layer1
    w
    b
    f
end

# Build a layer taking i inputs to o outputs with activation f (identity by default).
function Layer1(i::Int, o::Int, f=identity)
    return Layer1(param(o, i), param0(o), f)
end

# Forward pass: affine map followed by the elementwise activation.
function (l::Layer1)(x)
    return l.f.(l.w * x .+ l.b)
end

In [None]:
# We add a nonlinear activation function to all but the last layer
# (the second Layer1 uses the default activation, identity)
model = Chain(Layer1(784,64,relu), Layer1(64,10))
# Reference run: 54s [0.00612065; 0.0864965; 0.00055; 0.0244]
# (the bracketed values are the row minimums printed by trainresults)
mlp2 = trainresults("mlp113c.jld2", model);

## MLP solves underfitting but still has an overfitting problem

In [None]:
# MLP solves the underfitting problem!  A more serious overfitting problem remains.
# Rows 1 and 2 of the results hold the train and test loss.
plot([lin1[1,:], lin1[2,:], mlp2[1,:], mlp2[2,:]], ylim=(0.0,0.4),
     labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel="Epochs",ylabel="Loss")

In [None]:
# Test error improves from 7.5% to 2.5%!
# Rows 3 and 4 of the results hold the train and test error.
# labels is a 1×4 row matrix (one label per series), matching the other plots in this file.
plot([lin1[3,:], lin1[4,:], mlp2[3,:], mlp2[4,:]], ylim=(0.0,0.1),
    labels=[:trnLin :tstLin :trnMLP :tstMLP], xlabel="Epochs", ylabel="Error")

## MLP with L1/L2 regularization

In [None]:
# Chain2 adds two fields to the layer chain: λ1 for L1 and λ2 for L2 regularization
# (both default to 0).
struct Chain2
    layers
    λ1
    λ2
    Chain2(layers...; λ1=0, λ2=0) = new(layers, λ1, λ2)
end

# The prediction and average loss do not change
(c::Chain2)(x) = foldl((h, layer) -> layer(h), c.layers; init=x)
# Loss for a dataset: mean of the minibatch losses.
function (c::Chain2)(d::Data)
    return mean(c(x, y) for (x, y) in d)
end

In [None]:
# Minibatch loss: nll of the predictions, plus L1 and/or L2 penalties on the parameters during training
function (c::Chain2)(x, y)
    loss = nll(c(x), y)
    # Regularization applies only during training, and only to weights, not biases.
    training() || return loss
    if c.λ1 != 0
        loss += c.λ1 * sum(sum(abs, l.w) for l in c.layers)
    end
    if c.λ2 != 0
        loss += c.λ2 * sum(sum(abs2, l.w) for l in c.layers)
    end
    return loss
end

In [None]:
# The same two-layer MLP, now wrapped in Chain2 with L1 regularization (λ1=4f-5, λ2 left at 0)
model = Chain2(Layer1(784,64,relu), Layer1(64,10); λ1=4f-5)
# Reference run: 61s [0.0259648; 0.0722113; 0.00625; 0.0212]
# (the bracketed values are the row minimums printed by trainresults)
mlp3 = trainresults("mlp113d.jld2", model);

In [None]:
# Less overfitting with L1 regularization: test loss improves from 0.0865 to 0.0722
# Rows 1 and 2 of the results hold the train and test loss.
plot([mlp2[1,:], mlp2[2,:], mlp3[1,:], mlp3[2,:]], ylim=(0.0,0.15),
     labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel="Epochs",ylabel="Loss")

In [None]:
# Test error also improves: 0.0244 -> 0.0212
# Rows 3 and 4 of the results hold the train and test error.
plot([mlp2[3,:], mlp2[4,:], mlp3[3,:], mlp3[4,:]], ylim=(0.0,0.04),
     labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel="Epochs",ylabel="Error")

## MLP with dropout

In [None]:
using Knet: dropout
@doc dropout

In [None]:
# Dropout is another way to address overfitting.
# Layer2 is Layer1 plus a pdrop field that is passed to dropout on the layer's input.
struct Layer2
    w
    b
    f
    pdrop
end

# Build a layer taking i inputs to o outputs with activation f (identity by default)
# and dropout setting pdrop (0 by default).
function Layer2(i::Int, o::Int, f=identity; pdrop=0)
    return Layer2(param(o, i), param0(o), f, pdrop)
end

# Forward pass: dropout on the input, then the affine map, then the elementwise activation.
function (l::Layer2)(x)
    return l.f.(l.w * dropout(x, l.pdrop) .+ l.b)
end

In [None]:
# Two-layer MLP with pdrop=0.2 on the first layer; the second layer keeps the default pdrop=0
model = Chain(Layer2(784,64,relu,pdrop=0.2), Layer2(64,10))
# Reference run: 55s [0.0134416; 0.0672397; 0.00371667; 0.0193]
# (the bracketed values are the row minimums printed by trainresults)
mlp4 = trainresults("mlp113e.jld2", model);

In [None]:
# Less overfitting with dropout: test loss improves 0.0865 -> 0.0672
# Rows 1 and 2 of the results hold the train and test loss.
plot([mlp2[1,:], mlp2[2,:], mlp4[1,:], mlp4[2,:]], ylim=(0.0,0.15),
     labels=[:trnMLP :tstMLP :trnDrop :tstDrop],xlabel="Epochs",ylabel="Loss")

In [None]:
# This time the test error also improves: 0.0244 -> 0.0193
# Rows 3 and 4 of the results hold the train and test error.
plot([mlp2[3,:], mlp2[4,:], mlp4[3,:], mlp4[4,:]], ylim=(0.0,0.04),
     labels=[:trnMLP :tstMLP :trnDrop :tstDrop],xlabel="Epochs",ylabel="Error")

In [None]:
# Minimum of row 4 (test error) for the plain MLP, the L1-regularized MLP and the dropout MLP
(mlperr=minimum(mlp2[4,:]),L1err=minimum(mlp3[4,:]),dropouterr=minimum(mlp4[4,:]))

In [None]:
# Minimum of row 2 (test loss) for the plain MLP, the L1-regularized MLP and the dropout MLP
(mlploss=minimum(mlp2[2,:]),L1loss=minimum(mlp3[2,:]),dropoutloss=minimum(mlp4[2,:]))

## MLP with larger hidden layer and dropout

In [None]:
# The current trend is to use models with higher capacity tempered with dropout
# Here the hidden layer grows from 64 to 256 units; pdrop stays at 0.2 on the first layer
model = Chain(Layer2(784,256,relu,pdrop=0.2), Layer2(256,10))
# Reference run: 56s [0.00393102; 0.0491462; 0.0004; 0.0154]
# (the bracketed values are the row minimums printed by trainresults)
mlp = trainresults("mlp113f.jld2", model);

In [None]:
# Both train and test loss are better with the larger model
# Rows 1 and 2 of the results hold the train and test loss.
plot([mlp4[1,:], mlp4[2,:], mlp[1,:], mlp[2,:]],ylim=(0,0.15),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Loss")

In [None]:
# We are down to 1.5% error.
# Rows 3 and 4 of the results hold the train and test error.
plot([mlp4[3,:], mlp4[4,:], mlp[3,:], mlp[4,:]],ylim=(0,0.04),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Error")