# Underfitting, overfitting, regularization, dropout

## Load and minibatch MNIST data

In [None]:
# Fix for juliabox: when HOME is /mnt/juliabox, make Pkg.dir resolve paths under
# /home/jrun/.julia/v0.6. get() with a default avoids a KeyError when the HOME
# environment variable is not set at all.
if get(ENV, "HOME", "") == "/mnt/juliabox"; Pkg.dir(path...)=joinpath("/home/jrun/.julia/v0.6",path...); end

In [None]:
using Knet, Plots, Images
plotly()    # use the plotly backend for Plots
Knet.gpu()  # should return device-id >= 0 if there is a gpu

In [None]:
# Load the MNIST data and print a one-line summary of each of the four arrays
include(Knet.dir("data","mnist.jl"))
xtrn,ytrn,xtst,ytst = mnist()
foreach(a -> println(summary(a)), (xtrn,ytrn,xtst,ytst))

In [None]:
# Display the first three test images, followed by their labels
foreach(i -> display(mnistview(xtst,i)), 1:3)
ytst[1:3]

In [None]:
# Minibatch data
# Pick the array type: KnetArray when gpu() reports a device id >= 0, plain Array otherwise
if gpu() >= 0
    Atype = KnetArray{Float32}
else
    Atype = Array{Float32}
end
# Each dataset becomes [ (x1,y1), (x2,y2), ... ] where xi,yi are minibatches of 100
dtst = minibatch(xtst, ytst, 100; xtype=Atype)
dtrn = minibatch(xtrn, ytrn, 100; xtype=Atype)
(length(dtrn), length(dtst))

In [None]:
# Here is the first minibatch:
#   x is a 4-D Float32 array with X,Y,C,N
#   y is a 1-D integer array
x, y = first(dtst)
foreach(a -> println(summary(a)), (x, y))

## Define linear model

In [None]:
# Linear model: multiply mat(x) by the weight matrix w[1] and add the bias w[2] by broadcasting
linear(w,x) = w[1]*mat(x) .+ w[2]

# Initial parameters for the linear model, converted to Atype:
# a 10x784 weight matrix of Gaussian noise scaled by std, and a 10x1 bias of zeros
function winit1(; std=0.01)
    weight = std * randn(10, 784)
    bias = zeros(10, 1)
    return map(Atype, [weight, bias])
end

In [None]:
# Seed the random number generator so the initial weights are reproducible
setseed(9)
w = winit1()  # random weight matrix and a zero bias vector

In [None]:
# Score the first test minibatch with the untrained weights
ypred = linear(w,x)
summary(ypred) # predictions are given as a 10xN score matrix

In [None]:
display(y')   # correct answers are given as an array of integers

## Measuring error rate

In [None]:
# The weights are still the random initial ones, so expect roughly chance-level accuracy here
accuracy(ypred,y)  # 2-arg version: accuracy on this batch of 100 with initial w

In [None]:
accuracy(w,dtst,linear)  # 3-arg version: accuracy on the whole test dataset

In [None]:
# Zero-one loss, i.e. the error rate: whatever accuracy leaves out of 1
function zeroone(w,data,model)
    acc = accuracy(w,data,model)
    return 1 - acc
end
zeroone(w,dtst,linear)

## Measuring loss

In [None]:
# Calculate cross entropy loss of a model with weights w for one minibatch (x,y)
# Use the predict function to get model output: ypred = predict(w,x;o...)
# Use non-zero l1 or l2 for regularization (only on matrices not biases)
# Arguments:
#   w       : parameters laid out as [W1,b1,W2,b2,...]; odd positions are treated as weight matrices
#   x, y    : one minibatch of inputs and the matching correct answers
#   predict : model function, called as predict(w,x;o...)
#   l1, l2  : regularization coefficients; 0 (the default) skips the corresponding penalty
#   o...    : any other keyword arguments are forwarded to predict
# Returns the loss J for the minibatch plus any requested penalties.
function softloss(w,x,y,predict;l1=0,l2=0,o...)
    J = nll(predict(w,x;o...),y)
    # w[1:2:end] takes every other entry starting from the first, which skips the biases
    if l1 != 0; J += Float32(l1) * sum(sum(abs,wi)  for wi in w[1:2:end]); end
    if l2 != 0; J += Float32(l2) * sum(sum(abs2,wi) for wi in w[1:2:end]); end
    return J
end

In [None]:
# No l1/l2 keywords are passed, so both penalties keep their default of 0
softloss(w,x,y,linear)  # per-instance average softloss for the first test minibatch

In [None]:
# Average loss for the whole dataset: the mean of softloss over every minibatch in data
function avgloss(w,data,predict)
    total = 0
    nbatch = 0
    for (xb,yb) in data
        total += softloss(w,xb,yb,predict)
        nbatch += 1
    end
    return total/nbatch
end

In [None]:
# w still holds the initial weights, so this is the loss before any training
avgloss(w,dtst,linear)  # per-instance average softloss for the whole test set, should be close to -log(1/10)=2.3

In [None]:
# Manual loss calculation: softmax, negative log, pick the entry of the correct class, average
ypred=linear(w,x)
yp1 = exp.(ypred)
yp2 = yp1 ./ sum(yp1,1)   # normalize every column so that it sums to 1
yp3 = -log.(yp2)
# One-hot matrix of the correct answers. Its size is given explicitly so it always matches
# yp3, even when the largest class label does not occur in this minibatch, and the batch
# size is read from y instead of being hard-coded as 100.
yc1 = full(sparse(convert(Vector{Int},y),1:length(y),1f0,size(yp3,1),length(y)))
sum(Array(yp3).*yc1) / length(y)

## Calculating gradient

In [None]:
# Manually defined gradient for softloss (without regularization).
# The formulas dJ/dW = dJ/dy * x' and dJ/db = sum of dJ/dy over instances treat the
# scores as w[1]*x .+ w[2], so this is meant to be used with the linear model.
# Returns Any[dJdw, dJdb].
function softgrad_manual(w,x,y,predict)
    xmat = mat(x)
    scores = predict(w,xmat)
    shifted = scores .- maximum(scores,1)  # for numerical stability
    expscores = exp.(shifted)
    probs = expscores ./ sum(expscores,1)  # softmax over each column
    nclass, ninst = size(probs,1), length(y)
    # one-hot encoding of the correct answers, converted to the same array type as probs
    onehot = oftype(probs, sparse(convert(Vector{Int},y),1:ninst,1,nclass,ninst))
    dJdy = (probs - onehot) / size(xmat,2)
    return Any[dJdy * xmat', sum(dJdy,2)]
end

In [None]:
# Automatically defined gradient for softloss
# softgrad is called with the same arguments as softloss; presumably it differentiates
# with respect to the first argument (w) -- confirm against the AutoGrad documentation
softgrad = grad(softloss)  # Knet/AutoGrad makes life easier :)

In [None]:
setseed(9)
w1 = winit1(std=0.1)  # use a larger std to get a larger gradient for this example

In [None]:
# Gradient from the hand-written formula
g1 = softgrad_manual(w1,x,y,linear)

In [None]:
# Gradient from automatic differentiation, same weights and minibatch
g2 = softgrad(w1,x,y,linear)

In [None]:
# The two weight-matrix gradients should agree up to floating point error
isapprox(g1[1],g2[1])

In [None]:
# ... and so should the two bias gradients
isapprox(g1[2],g2[2])

## Checking the gradient

In [None]:
# g1[2] is the gradient with respect to the bias; transposed to display it as a row
display(g1[2]')  
# Meaning of gradient:
# If I move the last entry of w[2] by epsilon, loss will go up by 0.345075 epsilon!

In [None]:
# Bias values before the perturbation (winit1 initializes them to zero)
display(w1[2]')

In [None]:
# Loss before the perturbation
softloss(w1,x,y,linear)

In [None]:
# The bias starts at zero, so assigning 0.1 is the same as moving it by +0.1
w1[2][10] = 0.1   # to numerically check the gradient let's move the last entry by +0.1.
display(w1[2]')

In [None]:
# Loss after the perturbation; the expected change is about 0.1 * 0.345 = 0.03
softloss(w1,x,y,linear)  
# We see that the loss moves by +0.03 as expected.
# You should check all/most entries in your gradients this way to make sure they are correct.

## Training loop (SGD)

In [None]:
# Train model(w) with SGD and return a list containing w for every epoch.
# The list starts with a copy of the initial w, so it has epochs+1 entries.
# Note that w itself is updated in place; extra keyword arguments go to softgrad.
function train(w,data,predict; epochs=100,lr=0.1,o...)
    snapshots = Any[deepcopy(w)]
    for ep = 1:epochs
        for (xb,yb) in data
            update!(w, softgrad(w,xb,yb,predict;o...); lr=lr)  # w[i] = w[i] - lr * g[i]
        end
        push!(snapshots, deepcopy(w))
    end
    return snapshots
end

## Training the linear model and underfitting

In [None]:
# Train the linear model (default 100 epochs), then evaluate loss and error on the
# training and test sets for every saved weight snapshot.
# The trailing numbers are presumably timings/results from the author's run.
setseed(1)
@time trn1=train(winit1(),dtrn,linear,lr=0.1)  # 31.1s
@time trnloss1 = [ avgloss(w,dtrn,linear) for w in trn1 ]  # 22.2s
@time tstloss1 = [ avgloss(w,dtst,linear) for w in trn1 ]  # 3.7s
@time trnerr1 = [ zeroone(w,dtrn,linear) for w in trn1 ]   # 20.6s
@time tsterr1 = [ zeroone(w,dtst,linear) for w in trn1 ]   # 3.4s
minimum(tstloss1),minimum(tsterr1)  # 0.2667, 0.0744

In [None]:
# Demonstrates underfitting: training loss not close to 0
# Also slight overfitting: test loss higher than train
plot(hcat(trnloss1, tstloss1);
     ylim = (.2,.36),
     labels = [:trnloss :tstloss],
     xlabel = "Epochs", ylabel = "Loss")

In [None]:
# this is the error plot, we get to about 7.5% test error, i.e. 92.5% accuracy
plot(hcat(trnerr1, tsterr1);
     ylim = (.06,.10),
     labels = [:trnerr :tsterr],
     xlabel = "Epochs", ylabel = "Error")

## Multi-layer linear model does not improve results

In [None]:
# Let us try to concatenate multiple linear layers:
# w holds [W1,b1,W2,b2,...] and each pair is applied in turn, with nothing in between
function multilinear(w,x)
    out = x
    for k in 1:2:length(w)
        W, b = w[k], w[k+1]
        out = W*mat(out) .+ b
    end
    return out
end

In [None]:
# Weight initialization for multiple layers: x=input size, y=output size, h=hidden layer sizes
# Use winit(h1,h2,...,hn) for a model with n hidden layers.
# Output is an array [w0,b0,w1,b1,...,wn,bn] where wi,bi is the weight matrix and bias vector
# for the i'th layer; weights are Gaussian noise scaled by std, biases are zero, all as Atype.
function winit(h...; std=0.01, x=784, y=10)
    sizes = [x, h..., y]
    w = Any[]
    # walk over consecutive (input size, output size) pairs
    for (nin, nout) in zip(sizes[1:end-1], sizes[2:end])
        push!(w, std*randn(nout,nin))
        push!(w, zeros(nout,1))
    end
    return map(Atype, w)
end

In [None]:
w1m=winit(64) # gives weights and biases for a multi layer model with a single hidden layer of size 64

In [None]:
# Loss of the untrained multi-layer linear model on the first test minibatch
softloss(w1m,x,y,multilinear)

In [None]:
# Same experiment as for the linear model, now with one hidden layer of size 64
# and no non-linearity between the layers
setseed(1)
@time trn1m=train(winit(64),dtrn,multilinear,lr=0.1)  # 33.9s
@time trnloss1m = [ avgloss(w,dtrn,multilinear) for w in trn1m ]  # 22.2s
@time tstloss1m = [ avgloss(w,dtst,multilinear) for w in trn1m ]  # 3.73s
@time trnerr1m = [ zeroone(w,dtrn,multilinear) for w in trn1m ]   # 22.8s
@time tsterr1m = [ zeroone(w,dtst,multilinear) for w in trn1m ]   # 3.84s
minimum(tstloss1m),minimum(tsterr1m)  # 0.285, 0.0797

In [None]:
# multilinear converges to a similar solution, not identical because the problem is non-convex
plot(hcat(trnloss1, tstloss1, trnloss1m, tstloss1m);
     ylim = (0.2,0.4),
     labels = [:trnLin :tstLin :trnMulti :tstMulti],
     xlabel = "Epochs", ylabel = "Loss")

In [None]:
# error results are also close to the linear model
plot(hcat(trnerr1, tsterr1, trnerr1m, tsterr1m);
     ylim = (0.06,0.12),
     labels = [:trnLin :tstLin :trnMulti :tstMulti],
     xlabel = "Epochs", ylabel = "Error")

## Multiple linear layers are useless because they are equivalent to a single linear layer
If we write out the computation and do some algebra, we can show that the result is still an affine function of the input, i.e. stacking multiple linear layers does not increase the representational capacity of the model:
\begin{align*}
\hat{p} &= \mbox{soft}(W_2 (W_1 x + b_1) + b_2) \\
&= \mbox{soft}((W_2 W_1)\, x + W_2 b_1 + b_2) \\
&= \mbox{soft}(W x + b)
\end{align*}

## Multi Layer Perceptron (MLP) adds non-linearities between layers, shows overfitting

In [None]:
# Using nonlinearities (relu) results in a model with higher capacity, which reduces underfitting
# w holds [W1,b1,W2,b2,...]; relu is the only difference between this and multilinear
function mlp(w,x)
    h = x
    for k in 1:2:length(w)
        h = w[k]*mat(h) .+ w[k+1]
        k >= length(w)-1 && continue  # no non-linearity after the last layer
        h = relu.(h)                  # element-wise non-linearity after every other layer
    end
    return h
end

In [None]:
wmlp=winit(64) # gives weights and biases for an MLP with a single hidden layer of size 64

In [None]:
# Loss of the untrained MLP on the first test minibatch
softloss(wmlp,x,y,mlp)

In [None]:
# Same experiment with the MLP: one hidden layer of size 64 followed by relu
setseed(1)
@time trn3=train(winit(64),dtrn,mlp,lr=0.1)  # 35.4s
@time trnloss3 = [ avgloss(w,dtrn,mlp) for w in trn3 ]  # 23.7s
@time tstloss3 = [ avgloss(w,dtst,mlp) for w in trn3 ]  # 3.99s
@time trnerr3 = [ zeroone(w,dtrn,mlp) for w in trn3 ]   # 23.3s
@time tsterr3 = [ zeroone(w,dtst,mlp) for w in trn3 ]   # 3.91s
minimum(tstloss3),minimum(tsterr3)  # 0.0887, 0.0234

In [None]:
# Solves the underfitting problem!
# A more serious overfitting problem remains.
plot(hcat(trnloss1, tstloss1, trnloss3, tstloss3);
     ylim = (0.0,0.4),
     labels = [:trnLin :tstLin :trnMLP :tstMLP],
     xlabel = "Epochs", ylabel = "Loss")

In [None]:
# error improves from 7.5% to 2.3%!
plot(hcat(trnerr1, tsterr1, trnerr3, tsterr3);
     ylim = (0,0.1),
     labels = [:trnLin :tstLin :trnMLP :tstMLP],
     xlabel = "Epochs", ylabel = "Error")

## MLP with L1 regularization

In [None]:
# We still have overfitting, let's try L1 regularization
# The l1 keyword is forwarded by train to softgrad/softloss; evaluation below uses the
# unregularized loss. Seeding uses setseed, like every other experiment in this notebook.
setseed(1)
@time trn4=train(winit(64),dtrn,mlp;lr=0.1,l1=0.00004)  # 47.3s
@time trnloss4 = [ avgloss(w,dtrn,mlp) for w in trn4 ]  # 24.8s
@time tstloss4 = [ avgloss(w,dtst,mlp) for w in trn4 ]  # 4.17s
@time trnerr4 = [ zeroone(w,dtrn,mlp) for w in trn4 ]   # 23.7s
@time tsterr4 = [ zeroone(w,dtst,mlp) for w in trn4 ]   # 3.95s
minimum(tstloss4),minimum(tsterr4)  # 0.0791, 0.0228

In [None]:
# overfitting less, test loss improves from 0.0887 to 0.0791
plot(hcat(trnloss3, tstloss3, trnloss4, tstloss4);
     ylim = (0,0.15),
     labels = [:trnMLP :tstMLP :trnMLP_L1 :tstMLP_L1],
     xlabel = "Epochs", ylabel = "Loss")

In [None]:
# however test error does not change significantly: 0.0234 -> 0.0228
plot(hcat(trnerr3, tsterr3, trnerr4, tsterr4);
     ylim = (0,0.04),
     labels = [:trnMLP :tstMLP :trnMLP_L1 :tstMLP_L1],
     xlabel = "Epochs", ylabel = "Error")

In [None]:
# Best test error so far: plain MLP vs. MLP with L1 regularization
(:mlperr, minimum(tsterr3), :l1err, minimum(tsterr4))

## MLP with dropout

In [None]:
# Dropout is another way to address overfitting
# Same layers as mlp, with dropout applied to the input of every layer.
# pdrop is a pair of dropout rates: pdrop[1] is used before the first layer and
# pdrop[2] before every later layer. The default (0,0) is what avgloss and zeroone
# get, because they call the model without a pdrop keyword.
function mlpdrop(w,x; pdrop=(0,0))
    for i=1:2:length(w)
        # spaces around ? and : are required by Julia 0.7 and later
        x = dropout(x, pdrop[i == 1 ? 1 : 2])  # apply one of two dropout rates
        x = w[i]*mat(x) .+ w[i+1]
        if i < length(w)-1; x = relu.(x); end  # no non-linearity after the last layer
    end
    return x
end

In [None]:
# Train with a dropout rate of 0.2 on the input and 0 on the hidden layer; the
# evaluation calls below do not pass pdrop, so they use its default of (0,0)
setseed(1)
@time trn5=train(winit(64),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 38.9s
@time trnloss5 = [ avgloss(w,dtrn,mlpdrop) for w in trn5 ]     # 25.7s
@time tstloss5 = [ avgloss(w,dtst,mlpdrop) for w in trn5 ]     # 4.25s
@time trnerr5 = [ zeroone(w,dtrn,mlpdrop) for w in trn5 ]      # 24.3s
@time tsterr5 = [ zeroone(w,dtst,mlpdrop) for w in trn5 ]      # 4.11s
minimum(tstloss5),minimum(tsterr5)  # 0.0645, 0.0186

In [None]:
# overfitting less, loss results improve 0.0887 -> 0.0645
plot(hcat(trnloss3, tstloss3, trnloss5, tstloss5);
     ylim = (0,0.15),
     labels = [:trnMLP :tstMLP :trnDropout :tstDropout],
     xlabel = "Epochs", ylabel = "Loss")

In [None]:
# this time error also improves 0.0234 -> 0.0186
plot(hcat(trnerr3, tsterr3, trnerr5, tsterr5);
     ylim = (0,0.04),
     labels = [:trnMLP :tstMLP :trnDropout :tstDropout],
     xlabel = "Epochs", ylabel = "Error")

In [None]:
# Best test error of each variant: plain MLP, L1 regularization, dropout
(:mlperr, minimum(tsterr3), :l1err, minimum(tsterr4), :dropouterr, minimum(tsterr5))

In [None]:
# Best test loss of each variant: plain MLP, L1 regularization, dropout
(:mlploss, minimum(tstloss3), :l1loss, minimum(tstloss4), :dropoutloss, minimum(tstloss5))

## MLP with larger hidden layer

In [None]:
# The current trend is to use models with higher capacity tempered with dropout
# Same dropout setting as before, but the hidden layer grows from 64 to 256 units
setseed(1)
@time trn6=train(winit(256),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 34.6s
@time trnloss6 = [ avgloss(w,dtrn,mlpdrop) for w in trn6 ]      # 21.2s
@time tstloss6 = [ avgloss(w,dtst,mlpdrop) for w in trn6 ]      # 3.61s
@time trnerr6 = [ zeroone(w,dtrn,mlpdrop) for w in trn6 ]       # 21.7s
@time tsterr6 = [ zeroone(w,dtst,mlpdrop) for w in trn6 ]       # 3.63s
minimum(tstloss6),minimum(tsterr6)  # 0.0473, 0.0147

In [None]:
# Loss curves with dropout: hidden layer of size 64 vs. 256
plot(hcat(trnloss5, tstloss5, trnloss6, tstloss6);
     ylim = (0,0.15),
     labels = [:trn64 :tst64 :trn256 :tst256],
     xlabel = "Epochs", ylabel = "Loss")

In [None]:
# We are down to 0.015 error.
plot(hcat(trnerr5, tsterr5, trnerr6, tsterr6);
     ylim = (0,0.04),
     labels = [:trn64 :tst64 :trn256 :tst256],
     xlabel = "Epochs", ylabel = "Error")