# Underfitting, overfitting, regularization, dropout

## Load and minibatch MNIST data

In [None]:
using Knet, Plots

In [None]:
# Pull in the MNIST helper module bundled with Knet's examples and load the dataset.
include(Knet.dir("examples","mnist.jl"))
MNIST.loaddata()
using MNIST: xtrn,ytrn,xtst,ytst,minibatch
# Array type used for the data here and for the weights created later in this notebook.
# NOTE(review): presumably gpu() returns a negative value when no GPU is available,
# so this falls back to CPU arrays -- confirm against the Knet docs.
Atype = gpu() >= 0 ? KnetArray{Float32} : Array{Float32}
dtst = minibatch(xtst,ytst,100;atype=Atype) # [ (x1,y1), (x2,y2), ... ] where xi,yi are minibatches of 100
dtrn = minibatch(xtrn,ytrn,100;atype=Atype) # [ (x1,y1), (x2,y2), ... ] where xi,yi are minibatches of 100
length(dtrn),length(dtst)  # number of minibatches in the training and test sets

In [None]:
(x,y)=dtst[1]  # first test minibatch; reused by the cells below as a small fixed example

In [None]:
display(x)  # x part of the minibatch (built from xtst)

In [None]:
display(y)  # y part of the minibatch (built from ytst)

## Define linear model

In [None]:
# Linear (affine) model: multiplies the input x by the weight matrix w[1]
# and adds the bias w[2], which is broadcast across the columns of the product.
function linear(w,x)
    weight, bias = w[1], w[2]
    return weight*x .+ bias
end

# Initial parameters for the linear model: a 10x784 weight matrix drawn from
# a normal distribution scaled by 0.1 and a 10x1 zero bias, both converted to
# the notebook-wide array type Atype.
function winit1()
    weight = 0.1*randn(10,784)
    bias = zeros(10,1)
    return map(Atype, [ weight, bias ])
end

In [None]:
srand(9)  # fix the random seed so that the initial weights are reproducible
w = winit1()  # random weight matrix and a zero bias vector

In [None]:
ypred=linear(w,x)  # output of the untrained linear model on the first test minibatch
display(ypred)

In [None]:
display(y)  # shown again for comparison with ypred

## Measuring error rate

In [None]:
# Marks the entries of ypred that equal their column maximum, masks them with y and sums.
# NOTE(review): this is a count of correct answers only if y is one-hot per column -- confirm against minibatch.
sum(y .* (ypred .== maximum(ypred,1)))  # number of correct answers in this batch of 100 with initial w

In [None]:
# Zero-one error of model(w) over data: one minus the fraction of hits.
#   w     -- weights handed to model
#   data  -- iterable of (x,y) minibatches; size(x,2) is taken as the number of instances
#   model -- function called as model(w,x)
# An output entry counts as a hit when it equals its column maximum and y is
# non-zero there (y is multiplied in as a mask; presumably one-hot -- confirm).
function zeroone(w,data,model)
    hits = 0
    total = 0
    for (inputs,targets) in data
        scores = model(w,inputs)
        ismax = scores .== maximum(scores,1)
        hits += sum(targets .* ismax)
        total += size(inputs,2)
    end
    return 1 - hits/total
end

In [None]:
zeroone(w,dtst,linear)  # error rate (fraction of misclassified instances, not a count) on the whole test dataset

## Measuring loss

In [None]:
# Calculate cross entropy loss of a model with weights w for one minibatch (x,p)
# Use non-zero l1 or l2 for regularization
#
# Arguments:
#   w     -- model weights; passed to model unchanged
#   x     -- minibatch inputs; size(x,2) is used as the number of instances
#   p     -- targets, multiplied elementwise with the log-probabilities
#            (presumably one-hot columns -- confirm against minibatch)
#   model -- function called as model(w,x;o...) returning unnormalized scores
#   l1,l2 -- regularization coefficients; each term is added only when non-zero
#   o...  -- any other keyword arguments are forwarded to model
# Returns the average loss per instance plus the regularization terms.
# Only the odd-indexed entries w[1], w[3], ... are regularized; with the
# [weight, bias, weight, bias, ...] layout produced by winit1 and winit these
# are the weight matrices, so the biases are not penalized.
function softloss(w,x,p,model;l1=0,l2=0,o...)  
    y = model(w,x;o...)
    y = y .- maximum(y,1) # for numerical stability
    expy = exp.(y)
    logphat = y .- log.(sum(expy,1))  # log of the softmax taken along dimension 1
    J = -sum(p .* logphat) / size(x,2)  # dividing by number of instances for per-instance average
    if l1 != 0; J += l1 * sum(sum(abs,wi)  for wi in w[1:2:end]); end  # L1 penalty: sum of absolute values
    if l2 != 0; J += l2 * sum(sum(abs2,wi) for wi in w[1:2:end]); end  # L2 penalty: sum of squares
    return J
end

In [None]:
softloss(w,x,y,linear)  # per-instance average softloss for the first test minibatch (l1 = l2 = 0 by default, so no regularization term)

In [None]:
# Per-instance average softloss of model(w) over a whole dataset.
#   w     -- weights handed to softloss/model
#   data  -- iterable of (x,y) minibatches; size(x,2) is taken as the number of instances
#   model -- function called by softloss as model(w,x)
# softloss returns a per-instance average for one minibatch, so each minibatch
# is weighted by its instance count before dividing by the total. This keeps the
# result a true per-instance average even when minibatch sizes differ, and
# matches how zeroone counts instances. No regularization term is included
# because softloss is called with its default l1 = l2 = 0.
function avgloss(w,data,model) # average loss for the whole dataset
    total = 0   # running sum of per-instance losses (named to avoid shadowing Base.sum)
    ninst = 0   # running instance count
    for (x,y) in data
        n = size(x,2)
        total += softloss(w,x,y,model) * n
        ninst += n
    end
    return total/ninst
end

In [None]:
# Baseline before any training: w still holds the initial weights from winit1().
avgloss(w,dtst,linear)  # per-instance average softloss for the whole test set

## Manually defined gradient for softloss

In [None]:
# Hand-derived gradient of the (unregularized) softmax cross entropy loss.
#   w, x, model -- weights, minibatch inputs and model, used as model(w,x)
#   p           -- targets, same shape as the model output
# Returns Any[dJdw, dJdb]: the gradients with respect to w[1] and w[2].
# NOTE: the last two formulas assume the model has the form w[1]*x .+ w[2]
# (as `linear` does); for any other model the result is not its gradient.
function grad1(w,x,p,model)
    scores = model(w,x)
    shifted = scores .- maximum(scores,1) # for numerical stability
    unnorm = exp.(shifted)
    q = unnorm ./ sum(unnorm,1)           # softmax probabilities along dimension 1
    dJdy = (q - p) / size(x,2)            # gradient with respect to the model output
    dJdw = dJdy * x'
    dJdb = sum(dJdy,2)
    return Any[dJdw,dJdb]
end

## Automatically defined gradient for softloss

In [None]:
# grad turns softloss into a function that is called with the same arguments;
# its result is indexed like w (see g2[1] and g2[2] below).
softgrad = grad(softloss)  # Knet/AutoGrad makes life easier :)

In [None]:
g1 = grad1(w,x,y,linear)  # hand-derived gradient

In [None]:
g2 = softgrad(w,x,y,linear)  # automatically derived gradient

In [None]:
isapprox(g1[1],g2[1])  # the two gradients for the weight matrix w[1] should agree

In [None]:
isapprox(g1[2],g2[2])  # ... and likewise for the bias w[2]

## Checking the gradient

In [None]:
display(g1[2])  
# Meaning of gradient:
# If I move the last entry of w[2] by epsilon, loss will go up by 0.345075 epsilon!
# (This is a first-order approximation, so it holds for small epsilon.)

In [None]:
display(w[2])  # bias before the perturbation

In [None]:
softloss(w,x,y,linear)  # loss before the perturbation

In [None]:
w[2][10] += 0.1   # to numerically check the gradient let's move the last entry by +0.1.
# NOTE: this changes w in place; the bias stays shifted for any later use of w.

In [None]:
softloss(w,x,y,linear)  
# We see that the loss moves by about +0.03 (roughly 0.1 * 0.345), as expected.
# You should check all/most entries in your gradients this way to make sure they are correct.

## Training loop (SGD)

In [None]:
# Train model(w) with plain SGD and return the weights recorded after every epoch.
#   w      -- initial weights; updated in place (each entry is replaced by a new array)
#   data   -- iterable of (x,y) minibatches
#   model  -- model function, handed to softgrad
#   epochs -- number of passes over data
#   lr     -- learning rate
#   o...   -- remaining keyword arguments are forwarded to softgrad
# The result has epochs+1 entries: a copy of the initial weights followed by
# one copy per epoch. copy(w) is shallow, which is sufficient here because the
# update assigns new arrays to w[k] instead of modifying the old ones.
function train(w,data,model; epochs=100,lr=0.5,o...)
    snapshots = Any[copy(w)]
    for epoch in 1:epochs
        for (xb,yb) in data
            g = softgrad(w,xb,yb,model;o...)
            for k in eachindex(w)
                w[k] -= lr * g[k]
            end
        end
        push!(snapshots,copy(w))
    end
    return snapshots
end

## Training the linear model

In [None]:
srand(1)  # reseed so that this run is reproducible
@time trn1=train(winit1(),dtrn,linear)  # weight snapshots: initial weights plus one per epoch
# Evaluate every snapshot on both datasets to obtain the learning curves.
@time trnloss1 = [ avgloss(w,dtrn,linear) for w in trn1 ]
@time tstloss1 = [ avgloss(w,dtst,linear) for w in trn1 ]
@time trnerr1 = [ zeroone(w,dtrn,linear) for w in trn1 ]
@time tsterr1 = [ zeroone(w,dtst,linear) for w in trn1 ]
minimum(tstloss1),minimum(tsterr1)  # best test loss and best test error over all snapshots

In [None]:
length(trn1)  # epochs + 1 snapshots, i.e. 101 with the default epochs=100

## Overfitting and underfitting

In [None]:
# Training and test loss of the linear model, one point per weight snapshot.
plot([trnloss1 tstloss1],ylim=(.2,.36),labels=[:trnloss :tstloss],xlabel="Epochs",ylabel="Loss") 
# Demonstrates both overfitting and underfitting
# Overfitting: test loss is higher than training loss and getting worse
# Underfitting: training loss not close to 0

In [None]:
# Same comparison for the zero-one error rate.
plot([trnerr1 tsterr1],ylim=(.06,.10),labels=[:trnerr :tsterr],xlabel="Epochs",ylabel="Error")  
# this is the error plot, we get to about 8% error, i.e. 92% accuracy

## Linear model with L2 regularization

In [None]:
# Let us try L2 regularization to address overfitting
# The l2 keyword is passed by train to softgrad (the gradient of softloss) on every update.
# avgloss below calls softloss without l2, so the reported losses exclude the penalty term.
srand(1)
@time trn2=train(winit1(),dtrn,linear;l2=0.00004)
@time trnloss2 = [avgloss(w,dtrn,linear) for w in trn2]
@time tstloss2 = [avgloss(w,dtst,linear) for w in trn2]
@time trnerr2 = [zeroone(w,dtrn,linear) for w in trn2]
@time tsterr2 = [zeroone(w,dtst,linear) for w in trn2]
minimum(tstloss2)  # best test loss with L2; compare with minimum(tstloss1)

In [None]:
# Loss curves with and without L2 regularization.
plot([trnloss1 tstloss1 trnloss2 tstloss2],ylim=(0.2,0.36),
    labels=[:trnloss :tstloss :trnlossL2 :tstlossL2],xlabel="Epochs",ylabel="Loss") 
# overfitting less but results do not improve much

In [None]:
# Error curves with and without L2 regularization.
plot([trnerr1 tsterr1 trnerr2 tsterr2],ylim=(0.06,0.10),
    labels=[:trnerr :tsterr :trnerrL2 :tsterrL2],xlabel="Epochs",ylabel="Error")

## Multi-layer Perceptron

In [None]:
# Multi-layer perceptron; a model with higher capacity helps with underfitting.
# w is laid out as [weight1, bias1, weight2, bias2, ...]. Every pair except the
# last is applied as an affine map followed by relu; the last pair is applied
# as a plain affine map and its result is returned.
function mlp(w,x)
    h = x
    for k in 1:2:length(w)-2
        h = relu.(w[k]*h .+ w[k+1])
    end
    return w[end-1]*h .+ w[end]
end

In [None]:
# Random initial parameters for an MLP; use winit(h1,h2,...,hn) for n hidden layers.
#   h...  -- sizes of the hidden layers (none gives a single x -> y layer)
#   std   -- scale applied to the normally distributed weight entries
#   x, y  -- input and output sizes
#   atype -- array type each parameter is converted to
# Returns [weight1, bias1, weight2, bias2, ...] where each weight is (out x in)
# and each bias is an (out x 1) array of zeros.
function winit(h...; std=0.01, x=784, y=10,  # use winit(h1,h2,...,hn) for n hidden layer mlp
               atype=gpu()>=0 ? KnetArray{Float32} : Array{Float32})
    sizes = [x, h..., y]
    params = Any[]
    for (nin, nout) in zip(sizes[1:end-1], sizes[2:end])
        push!(params, std*randn(nout,nin))
        push!(params, zeros(nout,1))
    end
    return map(atype, params)
end

In [None]:
w2=winit(64) # gives weights and biases for an MLP with a single hidden layer of size 64

In [None]:
softloss(w2,x,y,mlp)  # loss of the untrained MLP on the first test minibatch

In [None]:
srand(1)  # reseed so that this run is reproducible
@time trn3=train(winit(64),dtrn,mlp)  # default epochs=100 and lr=0.5
# Evaluate every weight snapshot on both datasets.
@time trnloss3 = [ avgloss(w,dtrn,mlp) for w in trn3 ]
@time tstloss3 = [ avgloss(w,dtst,mlp) for w in trn3 ]
@time trnerr3 = [ zeroone(w,dtrn,mlp) for w in trn3 ]
@time tsterr3 = [ zeroone(w,dtst,mlp) for w in trn3 ]
minimum(tstloss3)  # best test loss over all snapshots

In [None]:
# Loss curves: linear model versus MLP.
plot([trnloss1 tstloss1 trnloss3 tstloss3],ylim=(0.0,0.4),
    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel="Epochs",ylabel="Loss")  
# solves the underfitting problem!

In [None]:
# Error curves: linear model versus MLP.
plot([trnerr1 tsterr1 trnerr3 tsterr3],ylim=(0,0.1),
    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel="Epochs",ylabel="Error")  
# error improves from 8% to 2%!

## MLP with L1 regularization

In [None]:
# We still have overfitting, let's try L1 regularization and a lower learning rate
# lr=0.1 overrides train's default of 0.5; l1 is passed on to softgrad on every update.
# avgloss below calls softloss without l1, so the reported losses exclude the penalty term.
srand(1)
@time trn4=train(winit(64),dtrn,mlp;lr=0.1,l1=4e-5)
@time trnloss4 = [ avgloss(w,dtrn,mlp) for w in trn4 ]
@time tstloss4 = [ avgloss(w,dtst,mlp) for w in trn4 ]
@time trnerr4 = [ zeroone(w,dtrn,mlp) for w in trn4 ]
@time tsterr4 = [ zeroone(w,dtst,mlp) for w in trn4 ]
minimum(tstloss4)  # best test loss with L1; compare with minimum(tstloss3)

In [None]:
# Loss curves: plain MLP versus MLP with L1 regularization.
plot([trnloss3 tstloss3 trnloss4 tstloss4],ylim=(0,0.15),
    labels=[:trnMLP :tstMLP :trnMLP_L1 :tstMLP_L1],xlabel="Epochs", ylabel="Loss")  
# overfitting less, loss results improve

In [None]:
# Error curves: plain MLP versus MLP with L1 regularization.
plot([trnerr3 tsterr3 trnerr4 tsterr4],ylim=(0,0.04),
    labels=[:trnMLP :tstMLP :trnMLP_L1 :tstMLP_L1],xlabel="Epochs", ylabel="Error")    
# however error results do not improve! 

In [None]:
minimum(tsterr3),minimum(tsterr4)  # best test error without and with L1

## MLP with dropout

In [None]:
# Dropout is another way to address overfitting.
# For p > 0 each entry of x is kept when a fresh uniform random draw exceeds p
# and zeroed otherwise; the result is divided by (1-p). For any other p the
# input is returned unchanged.
function dropout(x,p)
    p > 0 || return x
    return x .* (rand!(similar(x)) .> p) ./ (1-p)
end

# Multi-layer perceptron with optional dropout. It has the same positional
# signature as the earlier mlp(w,x) and therefore replaces that definition.
#   pdrop[1] -- dropout probability applied to the input x
#   pdrop[2] -- dropout probability applied after each hidden (relu) layer
# With the default pdrop=(0,0) dropout returns its input unchanged, so no
# units are dropped.
function mlp(w,x; pdrop=(0,0))
    h = dropout(x,pdrop[1])
    for k in 1:2:length(w)-2
        h = dropout(relu.(w[k]*h .+ w[k+1]),pdrop[2])
    end
    return w[end-1]*h .+ w[end]
end

In [None]:
srand(1)  # reseed so that this run is reproducible
# pdrop=(0.2,0): dropout probability 0.2 on the input, none on the hidden layer.
# The evaluation calls below invoke mlp without pdrop, so dropout is off when measuring loss and error.
@time trn5=train(winit(64),dtrn,mlp;lr=0.1,pdrop=(0.2,0))
@time trnloss5 = [ avgloss(w,dtrn,mlp) for w in trn5 ]
@time tstloss5 = [ avgloss(w,dtst,mlp) for w in trn5 ]
@time trnerr5 = [ zeroone(w,dtrn,mlp) for w in trn5 ]
@time tsterr5 = [ zeroone(w,dtst,mlp) for w in trn5 ]
minimum(tstloss5),minimum(tsterr5)  # best test loss and best test error with dropout

In [None]:
# Loss curves: plain MLP versus MLP trained with dropout.
plot([trnloss3 tstloss3 trnloss5 tstloss5],ylim=(0,0.15),
    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel="Epochs", ylabel="Loss")  
# overfitting less, loss results improve

In [None]:
# Error curves: plain MLP versus MLP trained with dropout.
plot([trnerr3 tsterr3 trnerr5 tsterr5],ylim=(0,0.04),
    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel="Epochs", ylabel="Error")  
# this time error also improves slightly

In [None]:
minimum(tsterr3),minimum(tsterr4),minimum(tsterr5)  # best test error: plain, L1, dropout

In [None]:
minimum(tstloss3),minimum(tstloss4),minimum(tstloss5)  # best test loss: plain, L1, dropout

## MLP with larger hidden layer

In [None]:
# The current trend is to use models with higher capacity tempered with dropout
# Same settings as the previous dropout run, but with a hidden layer of 256 units instead of 64.
srand(1)
@time trn6=train(winit(256),dtrn,mlp;lr=0.1,pdrop=(0.2,0))
@time trnloss6 = [ avgloss(w,dtrn,mlp) for w in trn6 ]
@time tstloss6 = [ avgloss(w,dtst,mlp) for w in trn6 ]
@time trnerr6 = [ zeroone(w,dtrn,mlp) for w in trn6 ]
@time tsterr6 = [ zeroone(w,dtst,mlp) for w in trn6 ]
minimum(tstloss6),minimum(tsterr6)  # best test loss and best test error with 256 hidden units

In [None]:
# Loss curves: 64 versus 256 hidden units, both trained with dropout.
plot([trnloss5 tstloss5 trnloss6 tstloss6],ylim=(0,0.15),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Loss")

In [None]:
# Error curves: 64 versus 256 hidden units, both trained with dropout.
plot([trnerr5 tsterr5 trnerr6 tsterr6],ylim=(0,0.04),
    labels=[:trn64 :tst64 :trn256 :tst256],xlabel="Epochs",ylabel="Error")
# We are down to 0.015 error.