In [None]:
# Knet supplies the model primitives used below; Plots draws the curves; JLD saves/loads results; NBInclude runs another notebook
using Knet, Plots, JLD, NBInclude
nbinclude("mnist.ipynb")  # loads MNIST, defines dtrn,dtst,Atype,train,softmax,zeroone
lin = load("lin.jld")     # loads linear model results for comparison (read below via "trnloss", "tstloss", "trnerr", "tsterr")
ENV["COLUMNS"]=80         # column width for array printing
plotlyjs();               # for interactive plots

## Multiple linear layers do not improve over linear model

In [2]:
# Let us try stacking multiple linear layers with nothing in between.
# w is the flat list [w,b,w,b,...] built by winit; every consecutive (weight, bias) pair is one layer.
function multilinear(w,x)
    for i in 1:2:length(w)
        weight, bias = w[i], w[i+1]
        x = weight*mat(x) .+ bias
    end
    return x
end;

In [3]:
# Weight initialization for multiple layers.
# Call as winit(x,h1,h2,...,hn,y) for a model with n hidden layers: x is the input size, y the output size.
# Returns the array [w0,b0,w1,b1,...,wn,bn]: for each layer a xavier-initialized weight matrix of size
# (outputs,inputs) and a zero bias column of size (outputs,1), every entry converted with Atype.
function winit(h...)
    params = Any[]
    for (nin,nout) in zip(h, h[2:end])   # consecutive size pairs (h[1],h[2]), (h[2],h[3]), ...
        push!(params, xavier(nout,nin), zeros(nout,1))
    end
    return map(Atype, params)
end;

In [4]:
# One hidden layer gives four arrays: (64,784) weights, (64,1) bias, (10,64) weights, (10,1) bias
w64=winit(784,64,10) # gives weights and biases for a multi layer model with a single hidden layer of size 64

4-element Array{Knet.KnetArray{Float32,2},1}:
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081057e0000, 200704, 0, nothing), (64, 784))
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081053eca00, 256, 0, nothing), (64, 1))     
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081058e0000, 2560, 0, nothing), (10, 64))   
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081053ecc00, 40, 0, nothing), (10, 1))      

In [5]:
# Take the first minibatch of the test set and evaluate the untrained multilinear model on it.
# NOTE(review): if softmax is the average negative log likelihood (defined in mnist.ipynb), a value near
# log(10) ≈ 2.30 is what a uniform guess over 10 classes would score.
(x,y) = first(dtst)
softmax(w64,x,y,multilinear)

2.3366258f0

In [30]:
# Train the multilinear model and cache everything in mlp1.jld; if the cache exists, reload it instead.
# The trailing "# ..s" notes are timings recorded by the author.
if !isfile("mlp1.jld")
    setseed(1)
    @time weightsML=train(winit(784,64,10),dtrn,multilinear,lr=0.1)       # 33.9s
    # Evaluate loss and error of every weight snapshot returned by train, on both datasets
    @time trnlossML = [ softmax(w,dtrn,multilinear) for w in weightsML ]  # 22.2s
    @time tstlossML = [ softmax(w,dtst,multilinear) for w in weightsML ]  # 3.73s
    @time trnerrML =  [ zeroone(w,dtrn,multilinear) for w in weightsML ]  # 22.8s
    @time tsterrML =  [ zeroone(w,dtst,multilinear) for w in weightsML ]  # 3.84s
    @save "mlp1.jld" weightsML trnlossML tstlossML trnerrML tsterrML
else
    # @eval delays the @load until this branch actually runs — presumably so the macro does not
    # look for the file when it is still missing (TODO confirm against the JLD docs)
    @eval (@load "mlp1.jld")
end
minimum(tstlossML),minimum(tsterrML)  # 0.2856, 0.0795

(0.2855976f0, 0.07950000000000002)

In [None]:
# Loss curves: linear baseline (from lin.jld) against the multilinear model, train and test
plot([lin["trnloss"] lin["tstloss"] trnlossML tstlossML],ylim=(0.2,0.4),
    labels=[:trnLin :tstLin :trnMulti :tstMulti],xlabel="Epochs",ylabel="Loss")  
# multilinear converges to a similar solution, not identical because problem is non-convex

In [None]:
# Error curves: linear baseline (from lin.jld) against the multilinear model, train and test
plot([lin["trnerr"] lin["tsterr"] trnerrML tsterrML],ylim=(0.06,0.12),
    labels=[:trnLin :tstLin :trnMulti :tstMulti],xlabel="Epochs",ylabel="Error")  
# error results also close to the linear model

In [56]:
# Drop the reference to the multilinear weights before collecting
weightsML = nothing; knetgc() # to save gpu memory

## Multiple linear layers are useless because they are equivalent to a single linear layer
If we write down the computation and do some algebra, we can show that the result is still an affine function of the input, i.e. stacking multiple linear layers does not increase the representational capacity of the model:

\begin{align*}
\hat{p} &= \mbox{soft}(W_2 (W_1 x + b_1) + b_2) \\
&= \mbox{soft}((W_2 W_1)\, x + W_2 b_1 + b_2) \\
&= \mbox{soft}(W x + b)
\end{align*}

## Multi Layer Perceptron (MLP) adds non-linearities between layers

In [33]:
# Using nonlinearities (relu) results in a model with higher capacity which helps with underfitting.
# w is the flat list [w,b,w,b,...] built by winit; the relu is the only difference from multilinear.
function mlp(w,x)
    final = length(w)-1                  # position of the last layer's weight matrix
    for i in 1:2:length(w)
        x = w[i]*mat(x) .+ w[i+1]
        x = i < final ? relu.(x) : x     # element-wise non-linearity after every layer except the last
    end
    return x
end;

In [34]:
# Fresh random weights with the same layer sizes as before; this rebinds the earlier w64
w64=winit(784,64,10) # gives weights and biases for an MLP with a single hidden layer of size 64

4-element Array{Knet.KnetArray{Float32,2},1}:
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x0000008105842000, 200704, 0, nothing), (64, 784))
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081053ee600, 256, 0, nothing), (64, 1))     
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081058e4400, 2560, 0, nothing), (10, 64))   
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081053ee800, 40, 0, nothing), (10, 1))      

In [35]:
# Loss of the untrained MLP on the (x,y) test minibatch taken earlier
softmax(w64,x,y,mlp)

2.3136413f0

In [36]:
# Train the MLP (one hidden layer of 64 units) and cache everything in mlp2.jld; reload the cache if it exists.
# The trailing "# ..s" notes are timings recorded by the author.
if !isfile("mlp2.jld")
    setseed(1)
    @time weightsMLP=train(winit(784,64,10),dtrn,mlp,lr=0.1)        # 35.4s
    # Evaluate loss and error of every weight snapshot returned by train, on both datasets
    @time trnlossMLP = [ softmax(w,dtrn,mlp) for w in weightsMLP ]  # 23.7s
    @time tstlossMLP = [ softmax(w,dtst,mlp) for w in weightsMLP ]  # 3.99s
    @time trnerrMLP =  [ zeroone(w,dtrn,mlp) for w in weightsMLP ]  # 23.3s
    @time tsterrMLP =  [ zeroone(w,dtst,mlp) for w in weightsMLP ]  # 3.91s
    @save "mlp2.jld" weightsMLP trnlossMLP tstlossMLP trnerrMLP tsterrMLP
else
    # @eval delays the @load until this branch actually runs (TODO confirm why JLD needs this)
    @eval (@load "mlp2.jld")
end
minimum(tstlossMLP),minimum(tsterrMLP)  # 0.0808, 0.0235

 38.562059 seconds (33.81 M allocations: 18.993 GiB, 6.97% gc time)
 25.646538 seconds (8.54 M allocations: 18.050 GiB, 8.03% gc time)
  4.297011 seconds (1.42 M allocations: 3.008 GiB, 8.11% gc time)
 24.938192 seconds (8.20 M allocations: 18.565 GiB, 8.52% gc time)
  4.159445 seconds (1.37 M allocations: 3.095 GiB, 8.53% gc time)


(0.080823354f0, 0.023499999999999965)

## MLP solves underfitting but still has an overfitting problem

In [None]:
# Loss curves: linear baseline (from lin.jld) against the MLP, train and test
plot([lin["trnloss"] lin["tstloss"] trnlossMLP tstlossMLP],ylim=(0.0,0.4),
    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel="Epochs",ylabel="Loss")  
# Solves the underfitting problem!
# A more serious overfitting problem remains.

In [None]:
# Error curves: linear baseline (from lin.jld) against the MLP, train and test
plot([lin["trnerr"] lin["tsterr"] trnerrMLP tsterrMLP],ylim=(0,0.1),
    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel="Epochs",ylabel="Error")  
# test error improves from 7.5% to 2.3%!

In [57]:
# Drop the reference to the MLP weights; the loss/error curves are kept for the comparison plots below
weightsMLP = nothing; knetgc() # to save gpu memory

## MLP with L1 regularization

In [39]:
# Redefine the softmax loss so that it accepts keyword parameters l1 and l2 for regularization.
# A non-zero l1 (l2) adds l1 * sum|w| (l2 * sum w^2) over the weight matrices only, never the biases.
# Any other keyword arguments are forwarded to predict.
function softmax(w,x,y,predict;l1=0,l2=0,o...)
    J = nll(predict(w,x;o...),y)
    matrices = 1:2:length(w)   # odd positions hold weight matrices, even positions hold biases
    if l1 != 0
        J += Float32(l1) * sum(sum(abs,w[i]) for i in matrices)
    end
    if l2 != 0
        J += Float32(l2) * sum(sum(abs2,w[i]) for i in matrices)
    end
    return J
end;

In [40]:
# We still have overfitting, let's try L1 regularization
# Train the 64-unit MLP with l1=0.00004 and cache everything in mlp3.jld; reload the cache if it exists.
# The evaluation calls below pass no l1, so the curves are comparable with the unregularized MLP.
# The trailing "# ..s" notes are timings recorded by the author.
if !isfile("mlp3.jld")
    setseed(1)  # same seeding call as the other experiment cells
    @time weightsL1=train(winit(784,64,10),dtrn,mlp;lr=0.1,l1=0.00004)  # 47.3s
    # Evaluate loss and error of every weight snapshot returned by train, on both datasets
    @time trnlossL1= [ softmax(w,dtrn,mlp) for w in weightsL1 ]  # 24.8s
    @time tstlossL1= [ softmax(w,dtst,mlp) for w in weightsL1 ]  # 4.17s
    @time trnerrL1=  [ zeroone(w,dtrn,mlp) for w in weightsL1 ]  # 23.7s
    @time tsterrL1=  [ zeroone(w,dtst,mlp) for w in weightsL1 ]  # 3.95s
    @save "mlp3.jld" weightsL1 trnlossL1 tstlossL1 trnerrL1 tsterrL1
else
    # @eval delays the @load until this branch actually runs (TODO confirm why JLD needs this)
    @eval (@load "mlp3.jld")
end
minimum(tstlossL1),minimum(tsterrL1)  # 0.0759, 0.0220

 51.794006 seconds (48.03 M allocations: 19.536 GiB, 5.74% gc time)
 26.723961 seconds (8.53 M allocations: 18.049 GiB, 8.10% gc time)
  4.462811 seconds (1.42 M allocations: 3.008 GiB, 8.19% gc time)
 25.088010 seconds (8.19 M allocations: 18.565 GiB, 8.77% gc time)
  4.169910 seconds (1.37 M allocations: 3.095 GiB, 8.77% gc time)


(0.07594314f0, 0.02200000000000002)

In [None]:
# Loss curves: plain MLP against the L1-regularized MLP, train and test
plot([trnlossMLP tstlossMLP trnlossL1 tstlossL1],ylim=(0,0.15),
    labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel="Epochs", ylabel="Loss")  
# overfitting less, test loss improves from 0.0808 to 0.0759

In [None]:
# Error curves: plain MLP against the L1-regularized MLP, train and test
plot([trnerrMLP tsterrMLP trnerrL1 tsterrL1],ylim=(0,0.04),
    labels=[:trnMLP :tstMLP :trnL1 :tstL1],xlabel="Epochs", ylabel="Error")    
# however test error does not change significantly: 0.0235 -> 0.0220

In [58]:
# Drop the reference to the L1 weights; the loss/error curves are kept for the summaries below
weightsL1 = nothing; knetgc() # to save gpu memory

## MLP with dropout

In [43]:
# Dropout is another way to address overfitting.
# Same computation as mlp, with dropout applied to the input of every layer.
# pdrop is a pair of rates: pdrop[1] is used for the first layer's input, pdrop[2] for every later layer.
# With the default pdrop=(0,0) both rates are zero.
function mlpdrop(w,x; pdrop=(0,0))
    for i=1:2:length(w)
        x = dropout(x, pdrop[i==1 ? 1 : 2])  # apply one of two dropout rates
        x = w[i]*mat(x) .+ w[i+1]
        if i < length(w)-1; x = relu.(x); end  # non-linearity after every layer except the last
    end
    return x
end;

In [44]:
# Display the documentation attached to dropout
@doc dropout

```
dropout(x, p)
```

Given an array `x` and probability `0<=p<=1`, just return `x` if `p==0`, or return an array `y` in which each element is 0 with probability `p` or `x[i]/(1-p)` with probability `1-p`.  Use `seed::Number` to set the random number seed for reproducible results. See [(Srivastava et al. 2014)](http://www.jmlr.org/papers/v15/srivastava14a.html) for a reference.


In [45]:
# Train the 64-unit MLP with dropout rate 0.2 on the input and 0 on the hidden layer, caching results in mlp4.jld.
# No pdrop is passed in the evaluation calls, so mlpdrop's default pdrop=(0,0) applies there.
# The trailing "# ..s" notes are timings recorded by the author.
if !isfile("mlp4.jld")
    setseed(1)
    @time weightsDR=train(winit(784,64,10),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 38.9s
    # Evaluate loss and error of every weight snapshot returned by train, on both datasets
    @time trnlossDR = [ softmax(w,dtrn,mlpdrop) for w in weightsDR ]     # 25.7s
    @time tstlossDR = [ softmax(w,dtst,mlpdrop) for w in weightsDR ]     # 4.25s
    @time trnerrDR =  [ zeroone(w,dtrn,mlpdrop) for w in weightsDR ]     # 24.3s
    @time tsterrDR =  [ zeroone(w,dtst,mlpdrop) for w in weightsDR ]     # 4.11s
    @save "mlp4.jld" weightsDR trnlossDR tstlossDR trnerrDR tsterrDR
else
    # @eval delays the @load until this branch actually runs (TODO confirm why JLD needs this)
    @eval (@load "mlp4.jld")
end
minimum(tstlossDR),minimum(tsterrDR)  # 0.0639, 0.0188

 42.472094 seconds (37.79 M allocations: 19.173 GiB, 7.05% gc time)
 25.777243 seconds (8.57 M allocations: 18.051 GiB, 8.15% gc time)
  4.311885 seconds (1.43 M allocations: 3.008 GiB, 8.21% gc time)
 24.608715 seconds (8.20 M allocations: 18.565 GiB, 8.29% gc time)
  4.139163 seconds (1.37 M allocations: 3.095 GiB, 8.30% gc time)


(0.06392153f0, 0.01880000000000004)

In [None]:
# Loss curves: plain MLP against the dropout MLP, train and test
plot([trnlossMLP tstlossMLP trnlossDR tstlossDR],ylim=(0,0.15),
    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel="Epochs", ylabel="Loss")
# overfitting less, loss results improve 0.0808 -> 0.0639

In [None]:
# Error curves: plain MLP against the dropout MLP, train and test
plot([trnerrMLP tsterrMLP trnerrDR tsterrDR],ylim=(0,0.04),
    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel="Epochs", ylabel="Error")  
# this time error also improves 0.0235 -> 0.0188

In [48]:
# Best test error of the three 64-unit models: plain MLP, L1 regularization, dropout
:mlperr,minimum(tsterrMLP),:L1err,minimum(tsterrL1),:dropouterr,minimum(tsterrDR)

(:mlperr, 0.023499999999999965, :L1err, 0.02200000000000002, :dropouterr, 0.01880000000000004)

In [49]:
# Best test loss of the three 64-unit models: plain MLP, L1 regularization, dropout
:mlploss,minimum(tstlossMLP),:L1loss,minimum(tstlossL1),:dropoutloss,minimum(tstlossDR)

(:mlploss, 0.080823354f0, :L1loss, 0.07594314f0, :dropoutloss, 0.06392153f0)

In [59]:
# Drop the reference to the dropout weights; the loss/error curves are kept for the plots below
weightsDR = nothing; knetgc() # to save gpu memory

## MLP with larger hidden layer

In [50]:
# The current trend is to use models with higher capacity tempered with dropout
# Same dropout setup as the previous experiment but with 256 hidden units; results are cached in mlp.jld.
# weights is kept after this cell because the weight visualization at the end reads weights[end].
# The trailing "# ..s" notes are timings recorded by the author.
if !isfile("mlp.jld")
    setseed(1)
    @time weights=train(winit(784,256,10),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 34.6s
    # Evaluate loss and error of every weight snapshot returned by train, on both datasets
    @time trnloss = [ softmax(w,dtrn,mlpdrop) for w in weights ] # 21.2s
    @time tstloss = [ softmax(w,dtst,mlpdrop) for w in weights ] # 3.61s
    @time trnerr =  [ zeroone(w,dtrn,mlpdrop) for w in weights ] # 21.7s
    @time tsterr =  [ zeroone(w,dtst,mlpdrop) for w in weights ] # 3.63s
    @save "mlp.jld" weights trnloss tstloss trnerr tsterr
else
    # @eval delays the @load until this branch actually runs (TODO confirm why JLD needs this)
    @eval (@load "mlp.jld")
end
minimum(tstloss),minimum(tsterr)  # 0.0494, 0.0148

 39.956995 seconds (37.86 M allocations: 19.173 GiB, 6.96% gc time)
 24.695883 seconds (8.50 M allocations: 18.047 GiB, 8.38% gc time)
  4.158595 seconds (1.42 M allocations: 3.008 GiB, 8.37% gc time)
 24.756995 seconds (8.19 M allocations: 18.565 GiB, 8.41% gc time)
  4.204890 seconds (1.37 M allocations: 3.095 GiB, 8.32% gc time)


(0.04944369f0, 0.014800000000000035)

In [None]:
# Loss curves: dropout MLP with 64 hidden units against 256 hidden units, train and test
plot([trnlossDR tstlossDR trnloss tstloss],ylim=(0,0.15),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Loss")

In [None]:
# Error curves: dropout MLP with 64 hidden units against 256 hidden units, train and test
plot([trnerrDR tsterrDR trnerr tsterr],ylim=(0,0.04),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Error")
# We are down to 0.015 error.

## Visualizing hidden weights

In [None]:
# Show the first-layer weights of the 256-unit model as a grid of 256 tiles, 16 per row.
ENV["COLUMNS"]=120
w = weights[end] # last weight snapshot of the 256-unit model
# w[1] is the (256,784) first-layer matrix; after the transpose each hidden unit's 784 input weights
# form one column, which reshape turns into one 28x28 slice
w1 = reshape(Array(w[1])', (28,28,1,256))
w2 = clamp.(2.5.*w1.+0.5,0,1) # scale by 2.5, shift so zero maps to 0.5, then clip to [0,1]
IJulia.clear_output(true)
# mnistview is defined outside this notebook — presumably it renders slice i as an image (TODO confirm)
display(hvcat(16, [mnistview(w2,i) for i=1:256]...))