In [None]:
# Notebook setup: packages, the shared MNIST helpers, and the saved linear-model curves.
using Knet, Plots, JLD, NBInclude
nbinclude("mnist.ipynb")  # loads MNIST, defines dtrn,dtst,Atype,train,softmax,zeroone
lin = load("lin.jld")     # loads linear model results for comparison (indexed below by "trnloss","tstloss","trnerr","tsterr")
ENV["COLUMNS"]=80         # column width for array printing
plotlyjs();               # for interactive plots

## Multiple linear layers do not improve over linear model

In [None]:
# Chain several linear (affine) layers with nothing in between.
# `w` is the flat parameter list [W1,b1,W2,b2,...] (see `winit`): odd slots are weight
# matrices, even slots the matching biases. `x` is the input; Knet's `mat` turns the
# current activation into a 2-D matrix before each multiplication.
function multilinear(w, x)
    h = x
    for layer in 1:2:length(w)
        h = w[layer] * mat(h) .+ w[layer+1]
    end
    return h
end;

In [None]:
# Build the initial parameters of a multi-layer model from a list of layer sizes.
# Call as winit(x,h1,h2,...,hn,y): input size, n hidden sizes, output size.
# Returns [w0,b0,w1,b1,...,wn,bn]; each wi is a xavier-initialised (out x in) matrix and
# each bi an (out x 1) column of zeros, all converted with `Atype`.
function winit(h...)
    params = Any[]
    # Walk over consecutive (input size, output size) pairs, one pair per layer.
    for (nin, nout) in zip(h[1:end-1], h[2:end])
        push!(params, xavier(nout, nin), zeros(nout, 1))
    end
    return map(Atype, params)
end;

In [4]:
# Sizes are (input=784, hidden=64, output=10); the result holds [W1,b1,W2,b2].
w64=winit(784,64,10) # gives weights and biases for a multi layer model with a single hidden layer of size 64

4-element Array{Knet.KnetArray{Float32,2},1}:
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x00000081052e0000, 200704, 0, nothing), (64, 784))
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x00000081053e0000, 256, 0, nothing), (64, 1))     
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x00000081054e0000, 2560, 0, nothing), (10, 64))   
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x00000081053e0200, 40, 0, nothing), (10, 1))      

In [5]:
# Sanity check: loss of the untrained model on the first (x,y) item of the test data.
# With 10 output classes an uninformed model should score about log(10) ≈ 2.30.
(x,y) = first(dtst)
softmax(w64,x,y,multilinear)

2.3143754f0

In [6]:
# Train the stacked-linear model, then score every saved parameter set on both data sets.
# `weights` is iterated as a collection of parameter snapshots (presumably one per epoch,
# given the "Epochs" axis of the plots -- confirm in mnist.ipynb).
# Trailing comments are timings/results from a recorded run.
setseed(1)  # fixed seed so the run is repeatable
@time weights=train(winit(784,64,10),dtrn,multilinear,lr=0.1)       # 33.9s
@time trnlossML = [ softmax(w,dtrn,multilinear) for w in weights ]  # 22.2s
@time tstlossML = [ softmax(w,dtst,multilinear) for w in weights ]  # 3.73s
@time trnerrML =  [ zeroone(w,dtrn,multilinear) for w in weights ]  # 22.8s
@time tsterrML =  [ zeroone(w,dtst,multilinear) for w in weights ]  # 3.84s
minimum(tstlossML),minimum(tsterrML)  # 0.2856, 0.0795

 36.005824 seconds (32.83 M allocations: 18.971 GiB, 4.87% gc time)
 23.275724 seconds (8.01 M allocations: 18.040 GiB, 5.73% gc time)
  3.857019 seconds (1.33 M allocations: 3.007 GiB, 5.94% gc time)
 22.546905 seconds (8.05 M allocations: 18.575 GiB, 6.25% gc time)
  3.726330 seconds (1.28 M allocations: 3.093 GiB, 6.39% gc time)


(0.2855976f0, 0.07950000000000002)

In [None]:
# Loss curves: saved linear model vs. the stacked-linear model (one column per curve).
plot([lin["trnloss"] lin["tstloss"] trnlossML tstlossML],ylim=(0.2,0.4),
    labels=[:trnLin :tstLin :trnMulti :tstMulti],xlabel="Epochs",ylabel="Loss")  
# multilinear converges to a similar solution, not identical because problem is non-convex

In [None]:
# Error curves: saved linear model vs. the stacked-linear model.
plot([lin["trnerr"] lin["tsterr"] trnerrML tsterrML],ylim=(0.06,0.12),
    labels=[:trnLin :tstLin :trnMulti :tstMulti],xlabel="Epochs",ylabel="Error")  
# error results also close to the linear model

## Multiple linear layers are useless because they are equivalent to a single linear layer
If we write down what is being computed and do some algebra, we can show that the result is still an affine function of the input, i.e. stacking multiple linear layers does not increase the representational capacity of the model:

\begin{align*}
\hat{p} &= \mbox{soft}(W_2 (W_1 x + b_1) + b_2) \\
&= \mbox{soft}((W_2 W_1)\, x + W_2 b_1 + b_2) \\
&= \mbox{soft}(W x + b)
\end{align*}

## Multi Layer Perceptron (MLP) adds non-linearities between layers

In [None]:
# Multi-layer perceptron: the same affine layers as `multilinear`, but with an element-wise
# relu after every layer except the last. The non-linearity gives the model higher
# capacity, which helps with underfitting.
# `w` is the flat list [W1,b1,W2,b2,...]; `x` is the input.
function mlp(w, x)
    lastlayer = length(w) - 1          # slot of the final weight matrix
    act = x
    for k in 1:2:length(w)
        act = w[k] * mat(act) .+ w[k+1]
        # relu is the only difference from multilinear; the output layer stays linear
        k < lastlayer && (act = relu.(act))
    end
    return act
end;

In [10]:
# Fresh random parameters with the same sizes as before: (input=784, hidden=64, output=10).
w64=winit(784,64,10) # gives weights and biases for an MLP with a single hidden layer of size 64

4-element Array{Knet.KnetArray{Float32,2},1}:
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x0000008107d11000, 200704, 0, nothing), (64, 784))
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x00000081053e3600, 256, 0, nothing), (64, 1))     
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x00000081054ec800, 2560, 0, nothing), (10, 64))   
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Nothing} @0x00000081053e3400, 40, 0, nothing), (10, 1))      

In [11]:
# Loss of the untrained MLP on the (x,y) pair taken from dtst earlier.
softmax(w64,x,y,mlp)

2.3136413f0

In [12]:
# Train the MLP with the same sizes, seed and learning rate as the stacked-linear run,
# then score every saved parameter set on both data sets.
# Trailing comments are timings/results from a recorded run.
setseed(1)  # fixed seed so the run is repeatable
@time weights=train(winit(784,64,10),dtrn,mlp,lr=0.1)        # 35.4s
@time trnlossMLP = [ softmax(w,dtrn,mlp) for w in weights ]  # 23.7s
@time tstlossMLP = [ softmax(w,dtst,mlp) for w in weights ]  # 3.99s
@time trnerrMLP =  [ zeroone(w,dtrn,mlp) for w in weights ]  # 23.3s
@time tsterrMLP =  [ zeroone(w,dtst,mlp) for w in weights ]  # 3.91s
minimum(tstlossMLP),minimum(tsterrMLP)  # 0.0808, 0.0235

 36.553602 seconds (33.81 M allocations: 18.993 GiB, 6.72% gc time)
 23.957431 seconds (8.54 M allocations: 18.050 GiB, 7.71% gc time)
  4.211379 seconds (1.43 M allocations: 3.008 GiB, 7.85% gc time)
 23.094740 seconds (8.20 M allocations: 18.565 GiB, 7.87% gc time)
  3.891167 seconds (1.37 M allocations: 3.095 GiB, 8.11% gc time)


(0.080823354f0, 0.023499999999999965)

## MLP solves underfitting but still has an overfitting problem

In [None]:
# Loss curves: saved linear model vs. MLP.
plot([lin["trnloss"] lin["tstloss"] trnlossMLP tstlossMLP],ylim=(0.0,0.4),
    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel="Epochs",ylabel="Loss")  
# Solves the underfitting problem!
# A more serious overfitting problem remains.

In [None]:
# Error curves: saved linear model vs. MLP.
plot([lin["trnerr"] lin["tsterr"] trnerrMLP tsterrMLP],ylim=(0,0.1),
    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel="Epochs",ylabel="Error")  
# test error improves from 7.5% to 2.3%!

## MLP with L1 regularization

In [None]:
# Softmax loss with optional weight penalties.
# Adds keyword parameters `l1` and `l2` to the loss: a non-zero value adds that multiple of
# the summed absolute (l1) or squared (l2) entries of the weight matrices. Only the odd
# slots of `w` (matrices) are penalised, never the biases. Any other keyword arguments
# are forwarded to `predict`.
function softmax(w, x, y, predict; l1=0, l2=0, o...)
    loss = nll(predict(w, x; o...), y)
    # Sum f over every entry of every weight matrix.
    penalty(f) = sum(sum(f, wi) for wi in w[1:2:end])
    l1 == 0 || (loss += Float32(l1) * penalty(abs))
    l2 == 0 || (loss += Float32(l2) * penalty(abs2))
    return loss
end;

In [16]:
# We still have overfitting, let's try L1 regularization
# Same model, seed and learning rate as the plain MLP run; the l1 keyword is passed only to
# `train`, so the curves computed below are plain (unpenalised) losses and errors.
# Trailing comments are timings/results from a recorded run.
setseed(1)  # same seeding call as the other runs (srand no longer exists in Julia 1.0)
@time weights=train(winit(784,64,10),dtrn,mlp;lr=0.1,l1=0.00004)  # 47.3s
@time trnlossL1= [ softmax(w,dtrn,mlp) for w in weights ]  # 24.8s
@time tstlossL1= [ softmax(w,dtst,mlp) for w in weights ]  # 4.17s
@time trnerrL1=  [ zeroone(w,dtrn,mlp) for w in weights ]  # 23.7s
@time tsterrL1=  [ zeroone(w,dtst,mlp) for w in weights ]  # 3.95s
minimum(tstlossL1),minimum(tsterrL1)  # 0.0759, 0.0220

 48.586289 seconds (47.57 M allocations: 19.531 GiB, 5.61% gc time)
 24.767664 seconds (8.54 M allocations: 18.049 GiB, 7.51% gc time)
  8.536473 seconds (1.42 M allocations: 3.008 GiB, 6.98% gc time)
 49.367339 seconds (8.19 M allocations: 18.565 GiB, 8.28% gc time)
  8.269202 seconds (1.37 M allocations: 3.095 GiB, 7.96% gc time)


(0.07594314f0, 0.02200000000000002)

In [None]:
# Loss curves: plain MLP vs. MLP trained with the L1 penalty.
plot([trnlossMLP tstlossMLP trnlossL1 tstlossL1],ylim=(0,0.15),
    labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel="Epochs", ylabel="Loss")  
# overfitting less, test loss improves from 0.0808 to 0.0759

In [None]:
# Error curves: plain MLP vs. MLP trained with the L1 penalty.
plot([trnerrMLP tsterrMLP trnerrL1 tsterrL1],ylim=(0,0.04),
    labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel="Epochs", ylabel="Error")    
# however test error does not change significantly: 0.0235 -> 0.0220

## MLP with dropout

In [None]:
# Dropout is another way to address overfitting
# Same network as `mlp`, with dropout applied to the input of every layer.
#   w     : flat parameter list [W1,b1,W2,b2,...]
#   x     : input
#   pdrop : pair of drop probabilities; pdrop[1] is used for the network input (first
#           layer), pdrop[2] for the input of every later layer.
# With the default pdrop=(0,0) dropout returns its input unchanged, so calling mlpdrop
# without the keyword evaluates the network with no dropout.
function mlpdrop(w,x; pdrop=(0,0))
    for i=1:2:length(w)
        # The ternary needs spaces around ? and : (mandatory since Julia 1.0)
        x = dropout(x, pdrop[i == 1 ? 1 : 2])  # apply one of two dropout rates
        x = w[i]*mat(x) .+ w[i+1]
        if i < length(w)-1; x = relu.(x); end  # no relu after the output layer
    end
    return x
end;

In [20]:
# Display the documentation attached to `dropout`
@doc dropout

```
dropout(x, p)
```

Given an array `x` and probability `0<=p<=1`, just return `x` if `p==0`, or return an array `y` in which each element is 0 with probability `p` or `x[i]/(1-p)` with probability `1-p`.  Use `seed::Number` to set the random number seed for reproducible results. See [(Srivastava et al. 2014)](http://www.jmlr.org/papers/v15/srivastava14a.html) for a reference.


In [21]:
# Train the 64-unit MLP with dropout on the input only (pdrop=(0.2,0)).
# pdrop is passed only to `train`; the evaluations below call mlpdrop with its default
# pdrop=(0,0), i.e. without dropout.
# Trailing comments are timings/results from a recorded run.
setseed(1)  # fixed seed so the run is repeatable
@time weights=train(winit(784,64,10),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 38.9s
@time trnlossDR = [ softmax(w,dtrn,mlpdrop) for w in weights ]     # 25.7s
@time tstlossDR = [ softmax(w,dtst,mlpdrop) for w in weights ]     # 4.25s
@time trnerrDR =  [ zeroone(w,dtrn,mlpdrop) for w in weights ]     # 24.3s
@time tsterrDR =  [ zeroone(w,dtst,mlpdrop) for w in weights ]     # 4.11s
minimum(tstlossDR),minimum(tsterrDR)  # 0.0639, 0.0188

 68.963259 seconds (37.80 M allocations: 19.173 GiB, 6.43% gc time)
 24.454021 seconds (8.57 M allocations: 18.051 GiB, 7.76% gc time)
  4.008742 seconds (1.42 M allocations: 3.008 GiB, 7.85% gc time)
 23.039907 seconds (8.20 M allocations: 18.565 GiB, 8.04% gc time)
  3.870376 seconds (1.37 M allocations: 3.095 GiB, 8.27% gc time)


(0.06392153f0, 0.01880000000000004)

In [None]:
# Loss curves: plain MLP vs. MLP trained with dropout.
plot([trnlossMLP tstlossMLP trnlossDR tstlossDR],ylim=(0,0.15),
    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel="Epochs", ylabel="Loss")
# overfitting less, loss results improve 0.0808 -> 0.0639

In [None]:
# Error curves: plain MLP vs. MLP trained with dropout.
plot([trnerrMLP tsterrMLP trnerrDR tsterrDR],ylim=(0,0.04),
    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel="Epochs", ylabel="Error")  
# this time error also improves 0.0235 -> 0.0188

In [24]:
# Best (minimum over snapshots) test error of the three 64-unit models, as a labelled tuple
:mlperr,minimum(tsterrMLP),:L1err,minimum(tsterrL1),:dropouterr,minimum(tsterrDR)

(:mlperr, 0.023499999999999965, :L1err, 0.02200000000000002, :dropouterr, 0.01880000000000004)

In [25]:
# Best (minimum over snapshots) test loss of the three 64-unit models, as a labelled tuple
:mlploss,minimum(tstlossMLP),:L1loss,minimum(tstlossL1),:dropoutloss,minimum(tstlossDR)

(:mlploss, 0.080823354f0, :L1loss, 0.07594314f0, :dropoutloss, 0.06392153f0)

## MLP with larger hidden layer

In [26]:
# The current trend is to use models with higher capacity tempered with dropout
# Train a 256-unit hidden layer with input dropout once and cache the four curves in
# mlp.jld; when the file already exists the curves are reloaded instead of retraining.
# Trailing comments are timings/results from a recorded run.
if !isfile("mlp.jld")
    setseed(1)
    @time weights=train(winit(784,256,10),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 34.6s
    @time trnloss = [ softmax(w,dtrn,mlpdrop) for w in weights ] # 21.2s
    @time tstloss = [ softmax(w,dtst,mlpdrop) for w in weights ] # 3.61s
    @time trnerr =  [ zeroone(w,dtrn,mlpdrop) for w in weights ] # 21.7s
    @time tsterr =  [ zeroone(w,dtst,mlpdrop) for w in weights ] # 3.63s
    @save "mlp.jld" trnloss tstloss trnerr tsterr
else
    # Name the variables explicitly. The bare form `@load "mlp.jld"` opens the file while
    # the macro is expanded, which happens before the isfile test above is evaluated, so
    # the cell would fail whenever the cache file is missing.
    @load "mlp.jld" trnloss tstloss trnerr tsterr
end
minimum(tstloss),minimum(tsterr)  # 0.0494, 0.0148

(0.04944369f0, 0.014800000000000035)

In [None]:
# Loss curves: dropout MLP with 64 hidden units vs. 256 hidden units.
plot([trnlossDR tstlossDR trnloss tstloss],ylim=(0,0.15),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Loss")

In [None]:
# Error curves: dropout MLP with 64 hidden units vs. 256 hidden units.
plot([trnerrDR tsterrDR trnerr tsterr],ylim=(0,0.04),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Error")
# We are down to 0.015 error.