# Linear models, loss functions, gradients, SGD

* Objectives: Define, train and visualize a simple model; understand gradients and SGD; learn to use the GPU.
* Prerequisites: [Callable objects](https://docs.julialang.org/en/v1/manual/methods/#Function-like-objects-1)
* AutoGrad: Param, @diff, gradient, value (used and explained)
* Knet: accuracy, zeroone, train! (defined and explained)
* Knet: nll, gpu, KnetArray (used and explained)

In [None]:
import Knet: Knet, Data, minibatch, nll, gpu, KnetArray

In [None]:
include(Knet.dir("data","mnist.jl"))  # Load data (see 02.mnist.ipynb)
dtrn,dtst = mnistdata(xsize=(784,:),xtype=Array{Float32});

## Define linear model

In [None]:
# We will use a callable object to define our model
# (see https://docs.julialang.org/en/v1/manual/methods/#Function-like-objects-1)
# A linear model has two components: w=weightMatrix, b=biasVector.
# The field types are parameters of the struct, so each field is concretely typed for
# whatever array type is passed in; `Linear(w, b)` is constructed exactly as before.
struct Linear{W,B}
    w::W
    b::B
end

# When we use a Linear object like a function it gives us a prediction:
# the scores `w * x .+ b`, one column of scores per column of `x`.
(f::Linear)(x) = f.w * x .+ f.b

## Prediction and accuracy

In [None]:
# Let's take the first minibatch from the test set
x,y = first(dtst)
summary.((x,y))

In [None]:
# Initialize a random Linear model
f = Linear(randn(10,784)*0.01, zeros(10))

In [None]:
# Display its prediction on the first minibatch
ENV["COLUMNS"]=92
ypred = f(x)          # predictions are given as a 10xN score matrix         

In [None]:
y'                    # correct answers are given as an array of integers

In [None]:
# We can calculate the accuracy of our model for the first minibatch
using Statistics
# Fraction of the columns of the score matrix `ypred` whose highest-scoring row
# is equal to the corresponding label in `y`.
function accuracy(ypred, y)
    best = argmax(Array(ypred), dims=1)   # 1xN matrix holding the index of each column's top score
    guesses = getindex.(best, 1)          # keep only the row (class) coordinate
    return mean(y' .== guesses)
end
accuracy(ypred,y)

In [None]:
# Accuracy of the model over a whole dataset (e.g. the test set):
# the mean of the accuracies measured on its minibatches.
function accuracy(f, d::Data)
    batchaccuracy((x, y)) = accuracy(f(x), y)
    return mean(batchaccuracy, d)
end
accuracy(f,dtst)

In [None]:
# ZeroOne loss (or error) is the complement of accuracy.
# All arguments are forwarded unchanged to `accuracy`.
function zeroone(x...)
    acc = accuracy(x...)
    return 1 - acc
end
zeroone(f,dtst)

## Negative log likelihood

In [None]:
# Calculate negative log likelihood (aka cross entropy, softmax loss) of our model for the first minibatch
nll(f(x),y)

In [None]:
# Here is what the `nll` function does behind the scenes:
using SparseArrays
ypred=f(x)
yp1 = exp.(ypred)                 # unnormalized probabilities
yp2 = yp1 ./ sum(yp1,dims=1)      # normalize each column so that it sums to 1
yp3 = -log.(yp2)                  # negative log probability of every class
# One-hot matrix of the correct answers. Its dimensions are given explicitly so that it
# always matches ypred, even if the largest class label does not occur in this minibatch,
# and the batch size is taken from y instead of being hard-coded.
yc1 = Array(sparse(y,1:length(y),1f0,size(ypred,1),length(y)))
sum(Array(yp3).*yc1) / length(y)  # average over the instances of the minibatch

In [None]:
# Per-instance average negative log likelihood for a whole dataset (e.g. the test set),
# computed as the mean of the nll values of its minibatches.
function nll(f, d::Data)
    batchloss((x, y)) = nll(f(x), y)
    return mean(batchloss, d)
end
nll(f,dtst)

## Calculating the gradient using AutoGrad

In [None]:
using AutoGrad
@doc AutoGrad

In [None]:
using Random
Random.seed!(9);

In [None]:
# To compute gradients we need to mark fields of f as Params:
f = Linear(Param(randn(10,784)), Param(zeros(10)))

In [None]:
# We can still do predictions with f and calculate loss:
nll(f(x),y)

In [None]:
# And we can do the same loss calculation also computing gradients:
J = @diff nll(f(x),y)

In [None]:
# To get the actual loss value from J:
value(J)

In [None]:
# To get the gradient of a parameter from J:
∇w = gradient(J,f.w)

In [None]:
# Note that each gradient has the same size and shape as the corresponding parameter:
∇b = gradient(J,f.b)

## Checking the gradient using numerical approximation

In [None]:
# Meaning of gradient: If I move the last entry of f.b by epsilon, loss will go up by 0.792576 epsilon!
@show ∇b;

In [None]:
@show f.b;

In [None]:
nll(f(x),y)     # loss for the first minibatch with the original parameters

In [None]:
f.b[10] = 0.1   # to numerically check the gradient let's move the last entry of f.b by +0.1.
@show f.b;

In [None]:
nll(f(x),y)     # We see that the loss moves by ≈ +0.79*0.1 as expected.

In [None]:
f.b[10] = 0

## Checking the gradient using manual implementation

In [None]:
# Without AutoGrad we would have to define the gradients manually.
# Given a model with fields w and b, inputs x (one instance per column) and integer labels y,
# return the pair (dJdw, dJdb) for the softmax loss averaged over the minibatch.
function nllgrad(f,x,y)
    scores = f.w * x .+ f.b
    shifted = scores .- maximum(scores,dims=1)   # for numerical stability
    expscores = exp.(shifted)
    probs = expscores ./ sum(expscores,dims=1)   # softmax over each column
    ninstances = length(y)
    labels = convert(Vector{Int},y)
    # one-hot matrix of the correct answers, converted to the same array type as probs
    onehot = oftype(probs, sparse(labels,1:ninstances,1,size(probs,1),ninstances))
    dJdscores = (probs - onehot) / size(x,2)
    dJdw = dJdscores * x'
    dJdb = vec(sum(dJdscores,dims=2))
    return dJdw,dJdb
end;

In [None]:
∇w2,∇b2 = nllgrad(f,x,y)

In [None]:
∇w2 ≈ ∇w

In [None]:
∇b2 ≈ ∇b

## Training with Stochastic Gradient Descent (SGD)

In [None]:
using LinearAlgebra: axpy!

# One pass of stochastic gradient descent over `data`: for every minibatch, differentiate the
# loss and move the model's w and b a step of size `lr` against their gradients, in place.
function train!(model, data; lr=0.1)
    for (x,y) in data
        J = @diff nll(model(x),y)
        foreach((model.w, model.b)) do p
            axpy!(-lr, gradient(J,p), value(p))   # value(p) .+= -lr * gradient
        end
    end
end

In [None]:
# Let's try a randomly initialized model for 10 epochs
model = Linear(Param(randn(10,784)*0.01), Param(zeros(10)))
@show nll(model,dtst)
@time for i=1:10; train!(model,dtrn); end
@show nll(model,dtst)

In [None]:
# To work on the GPU, all we have to do is convert our Arrays to KnetArrays:
# (this cell does nothing when the GPU check below fails)
if Knet.gpu() >= 0   # NOTE(review): presumably a non-negative value means a GPU is active -- confirm
    # Switch both data iterators to KnetArray{Float32}; presumably this is the type of the x
    # minibatches they yield (cf. the xtype argument of mnistdata). The change is permanent:
    # the cells that follow see the modified dtrn and dtst.
    dtrn.xtype = dtst.xtype = KnetArray{Float32}
    # Same initialization as the CPU model, but with the parameters stored as KnetArrays.
    model = Linear(Param(KnetArray{Float32}(randn(10,784)*0.01)), Param(KnetArray{Float32}(zeros(10))))
    @show nll(model,dtst)                       # test loss before training
    @time for i=1:10; train!(model,dtrn); end   # 10 epochs, timed
    @show nll(model,dtst)                       # test loss after training
end

In [None]:
# Let's collect some data to draw training curves and visualize weights.
# Training only runs when the results file does not exist yet. Note that `models` lives only
# in memory (it is not written to lin.jld2), so it is defined only in a session that trained here.
using FileIO
if !isfile("lin.jld2")
    # Pick the array type from GPU availability instead of assuming a GPU is present, and make
    # the data iterators use the same array type as the model parameters.
    atype = Knet.gpu() >= 0 ? KnetArray{Float32} : Array{Float32}
    dtrn.xtype = dtst.xtype = atype
    models = Linear[]
    trnloss = Float64[]; tstloss = Float64[]; trnerr = Float64[]; tsterr = Float64[]
    model = Linear(Param(atype(randn(10,784)*0.01)), Param(atype(zeros(10))))
    @time while true
        # Record a snapshot before each epoch, so the first entry is the untrained model.
        push!(models, deepcopy(model))
        push!(trnloss, nll(model,dtrn))
        push!(tstloss, nll(model,dtst))
        push!(trnerr, zeroone(model,dtrn))
        push!(tsterr, zeroone(model,dtst))
        length(tsterr) == 100 && break   # stop after 100 snapshots (99 training epochs)
        train!(model,dtrn)
    end
    save("lin.jld2","trnloss",trnloss,"tstloss",tstloss,"trnerr",trnerr,"tsterr",tsterr)
end

In [None]:
lin = load("lin.jld2")
minimum(lin["tstloss"]), minimum(lin["tsterr"])

## Linear model shows underfitting

In [None]:
using Plots
plot([lin["trnloss"], lin["tstloss"]],ylim=(.0,.4),labels=[:trnloss :tstloss],xlabel="Epochs",ylabel="Loss") 
# Demonstrates underfitting: training loss not close to 0
# Also slight overfitting: test loss higher than train

In [None]:
plot([lin["trnerr"], lin["tsterr"]],ylim=(.0,.12),labels=[:trnerr :tsterr],xlabel="Epochs",ylabel="Error")  
# this is the error plot, we get to about 7.5% test error, i.e. 92.5% accuracy

## Visualizing the learned weights

In [None]:
# Let us visualize the evolution of the weight matrix as images below
# Each row is turned into a 28x28 image with positive numbers light and negative numbers dark gray
using Images, ImageMagick
mnistview(x,i)=colorview(Gray,permutedims(x[:,:,1,i],(2,1)))
# `models` is only created when the training loop ran in this session, so check for it
# instead of failing with an UndefVarError.
if @isdefined(models)
    # Step through 10 log-spaced snapshot indices between the first and the last one.
    for t in 10 .^ range(0,stop=log10(length(models)),length=10)
        epoch = floor(Int,t)
        snapshot = models[epoch]    # not named `f`, so the global model `f` is left untouched
        w1 = reshape(Array(value(snapshot.w))', (28,28,1,10))
        w2 = clamp.(w1.+0.5,0,1)    # shift so that zero weights are mid-gray, then clip to [0,1]
        IJulia.clear_output(true)
        display(hcat([mnistview(w2,k) for k=1:10]...))
        display("Epoch $epoch")
        sleep(1)
    end
else
    @warn "`models` is not defined: the weight snapshots are only kept in memory while training, so rerun the training cell (without an existing lin.jld2) to see this animation."
end