In [237]:
#Classifying MNIST digits with a softmax classifier
#Name: Ege Ersü

In this assignment, you will implement a softmax classifier to predict the digit presented in a given image. We will use the MNIST dataset for this task. Please first skim through the notebook. Then complete the following steps mentioned in the main function:

1. minibatch
2. init_params
3. forward and backward propagation
    * softmax_forw
    * softmax_cost
4. grad_check
5. train
    

In [238]:
using Pkg; for p in ["Knet"]; haskey(Pkg.installed(),p) || Pkg.add(p); end #Knet installation to use the MNIST dataset
using Knet, Printf, Random

include(Knet.dir("data", "mnist.jl"))

In [239]:
 function minibatch2(X, Y, bs=100)
    # Split raw input `X` and gold labels `Y` into consecutive minibatches.
    #
    # Arguments:
    #   X  - input matrix, one instance per column
    #   Y  - label matrix, one instance per column (same column count as X)
    #   bs - maximum number of columns per batch (default 100)
    #
    # Returns a list of `(x, y)` tuples. Every batch holds `bs` columns except
    # possibly the last one, which holds whatever columns remain. An input
    # with zero columns yields an empty list.
    N = size(Y, 2)
    data = Any[]
    for first_col in 1:bs:N
        # clamp the upper bound so the final batch never runs past column N
        last_col = min(first_col + bs - 1, N)
        push!(data, (X[:, first_col:last_col], Y[:, first_col:last_col]))
    end
    return data
end

minibatch2 (generic function with 2 methods)

#here is a little toy example to see what a minibatch looks like
#NOTE(review): xtrn and ytrn are not defined in the visible cells; presumably
#they were loaded earlier as notebook globals - confirm before running

#take the first 10 data points only
little_xtrn = xtrn[:,1:10]
little_ytrn = ytrn[:,1:10]

#call the minibatch function, it will give you a list of batches (batches are tuples)
#with 10 columns and a batch size of 4 this gives batches of 4, 4 and 2 columns
mini_batch1 = minibatch2(little_xtrn, little_ytrn, 4)

#take the first batch (a tuple) from the list of batches
batch1 = mini_batch1[1]

#you can simply get X and Y components of the batch from the tuple
batch1_X = batch1[1]
batch1_Y = batch1[2]

#once you have the batch, you can get a single y vector from the array if you want
Y1 = batch1_Y[:,1]


#print the Y component (second tuple element) of the second batch
@show mini_batch1[2][2]

In [240]:
function init_params(ninputs, noutputs)
    # Create the initial softmax parameters.
    #
    # Arguments:
    #   ninputs  - number of input features
    #   noutputs - number of output classes
    #
    # Returns `(W, b)`: W is a (noutputs, ninputs) matrix of standard-normal
    # draws scaled by 0.001, b is a Float64 zero vector of length noutputs.
    weights = randn(noutputs, ninputs) .* 0.001
    biases = zeros(Float64, noutputs)
    return weights, biases
end

init_params (generic function with 1 method)

In [241]:
function softmax_forw(W, b, data)
    # Apply the affine transformation `W * data .+ b` followed by a
    # column-wise softmax.
    #
    # Arguments:
    #   W    - weight matrix (classes x features)
    #   b    - bias, broadcast across the columns of the scores
    #   data - input matrix, one instance per column
    #
    # Returns the predicted probabilities; every column sums to 1.
    scores = W * data .+ b
    # Subtracting each column's maximum leaves the softmax value unchanged
    # but keeps exp() from overflowing to Inf (which would give Inf/Inf = NaN).
    shifted = scores .- maximum(scores, dims=1)
    expscores = exp.(shifted)
    return expscores ./ sum(expscores, dims=1)
end

softmax_forw (generic function with 1 method)

In [242]:
using ForwardDiff, AutoGrad, Knet

function softmax_cost(W, b, data, labels)
    # Compute the soft loss and its gradients for one batch.
    #
    # Arguments:
    #   W, b   - softmax parameters
    #   data   - batch of inputs, one instance per column, e.g. (784, 100)
    #   labels - batch of one-hot labels, one instance per column, e.g. (10, 100)
    #
    # Returns `(loss, dw, db)`: the mean negative log-probability of the
    # correct classes, the gradient with respect to W, and the gradient with
    # respect to b (summed over the batch with dims=2).
    nsamples = size(labels)[2]

    # predicted probability distribution for every column
    probs = softmax_forw(W, b, data)

    # position of the correct class in every column
    gold = argmax(labels, dims=1)

    # indexing copies the selected probabilities, so the loss is computed
    # before `probs` is modified below
    loss = sum(-log.(probs[gold])) / nsamples

    # gradient w.r.t. the scores: probabilities minus the one-hot targets,
    # averaged over the batch
    probs[gold] .-= 1
    dscores = probs / nsamples

    dw = (data * dscores')'
    db = sum(dscores, dims=2)
    return loss, dw, db
end

softmax_cost (generic function with 1 method)

In [210]:
function grad_check(W, b, data, labels)
    # Compare the analytic gradients returned by `softmax_cost` with
    # central-difference numeric gradients and print the result.
    #
    # Arguments:
    #   W, b   - softmax parameters (perturbed in place during the check and
    #            restored to their exact original values afterwards)
    #   data   - batch of inputs, one instance per column
    #   labels - batch of one-hot labels, one instance per column
    #
    # Prints the Euclidean distance between both gradients and whether it is
    # below the 1e-7 threshold.

    # loss only (no gradients), evaluated at the perturbed parameters
    function numeric_loss(W, b, data, labels)
        batch_size = size(labels)[2]
        prediction = softmax_forw(W, b, data)
        correct_labels = argmax(labels, dims=1)
        correct_probs = prediction[correct_labels]
        return sum((x->-log(x)).(correct_probs))/batch_size
    end

    function numeric_gradient()
        epsilon = 0.0001

        gw = zeros(size(W))
        gb = zeros(size(b))

        #start of step 4
        # The same central-difference sweep serves W and b; iterating with
        # eachindex makes the check work for any parameter shape.
        for (params, grads) in ((W, gw), (b, gb))
            for i in eachindex(params)
                original = params[i]
                params[i] = original + epsilon
                lossplus = numeric_loss(W, b, data, labels)
                params[i] = original - epsilon
                lossminus = numeric_loss(W, b, data, labels)
                # write the saved value back so no rounding error accumulates
                params[i] = original
                grads[i] = (lossplus - lossminus)/(2*epsilon)
            end
        end
        #end of step 4
        return gw, gb
    end

    _,gradW,gradB = softmax_cost(W, b, data, labels)
    gw, gb = numeric_gradient()
    diff = sqrt(sum((gradW - gw) .^ 2) + sum((gradB - gb) .^ 2))
    println("Diff: $diff")
    if diff < 1e-7
        println("Gradient Checking Passed")
    else
        println("Diff must be < 1e-7")
    end
end

#Stand-alone gradient check on the first 100 columns of xtrn / ytrn.
#NOTE(review): W, b, xtrn and ytrn are not defined in the visible cells;
#presumably they exist as notebook globals from an earlier run - confirm
debug = true #Turn this parameter off, after gradient checking passed
if debug
    grad_check(W, b, xtrn[:, 1:100], ytrn[:, 1:100])
end

Diff: 1.8315191059140225e-9
Gradient Checking Passed


In [243]:
## STEP 5: Training
#  The train function takes model parameters and the data
#  Trains the model over minibatches
#  For each minibatch, first cost and gradients are calculated then model parameters are updated
#  train function returns the average cost per instance
    
function train(W, b, data, lr=0.15)
    # Run one pass of minibatch gradient descent over `data`.
    #
    # Arguments:
    #   W, b - current softmax parameters
    #   data - iterable of `(x, y)` minibatches, one instance per column
    #   lr   - learning rate (default 0.15)
    #
    # Returns `(avgcost, W, b)`: the average cost per instance over the pass
    # and the updated parameters. The updates rebind the local W and b, so
    # callers must use the returned values.
    totalcost = 0.0
    numins = 0
    for (x, y) in data
        #start of step 5
        batchsize = size(y, 2)
        batch_cost, batch_dw, batch_db = softmax_cost(W, b, x, y)
        # softmax_cost returns the mean cost of the batch; weight it by the
        # batch size so a smaller final batch does not skew the average
        totalcost += batch_cost * batchsize
        numins += batchsize
        W -= lr * batch_dw
        b -= lr * batch_db
        #end of step 5
    end

    avgcost = totalcost / numins

    return avgcost, W, b
end

train (generic function with 2 methods)

In [244]:
function accuracy2(ygold, yhat)
    # Fraction of columns whose highest-scoring row in `yhat` matches the
    # highest-scoring row in `ygold`.
    #
    # Arguments:
    #   ygold - gold labels, one instance per column
    #   yhat  - predicted scores, one instance per column
    #
    # Returns the number of matching columns divided by the column count of
    # `ygold`, as a floating-point number.
    ncols = size(ygold, 2)
    hits = count(1:ncols) do col
        argmax(view(ygold, :, col)) == argmax(view(yhat, :, col))
    end
    return hits / ncols
end

accuracy2 (generic function with 1 method)

In [245]:
function main()
    # Train and evaluate a softmax classifier on MNIST: load and reshape the
    # data, one-hot encode the labels, build minibatches, initialise the
    # parameters, optionally run the gradient check, then train for 50 epochs
    # and print the loss and train/test accuracy after every epoch.
    Random.seed!(12345)
    
    # Size of input vector (MNIST images are 28x28)
    ninputs = 28 * 28
    
    # Number of classes (MNIST images fall into 10 classes)
    noutputs = 10
    
    ## Data loading & preprocessing
    #
    #  In this section, we load the input and output data,
    #  prepare data to feed into softmax model.
    #  For softmax regression on MNIST pixels,
    #  the input data is the images, and
    #  the output data is the labels.
    #  Size of xtrn: (28,28,1,60000)
    #  Size of xtrn must be: (784, 60000)
    #  Size of xtst must be: (784, 10000)
    #  Size of ytrn must be: (10, 60000)
    #  Size of ytst must be: (10, 10000)
    
    xtrn, ytrn, xtst, ytst = mnist() # loading the data
    # flatten every image into one column; `:` lets reshape infer the
    # number of instances instead of hard-coding it
    xtrn = reshape(xtrn, ninputs, :)
    xtst = reshape(xtst, ninputs, :)
    
    # Build the (noutputs, N) one-hot matrix in a single allocation rather
    # than splatting one small array per label into hcat.
    function to_onehot(labels)
        onehot = zeros(noutputs, length(labels))
        for (col, label) in enumerate(labels)
            onehot[label, col] = 1.0
        end
        return onehot
    end
    
    ytrn = to_onehot(ytrn)
    ytst = to_onehot(ytst)
    
    ## STEP 1: Create minibatches
    # Complete the minibatch function
    # It takes the input matrix (X) and gold labels (Y)
    # returns list of tuples contain minibatched input and labels (x, y)
    # NOTE(review): `minibatch` is not defined in this notebook (the local
    # implementation is named `minibatch2`); presumably this resolves to the
    # function exported by Knet - confirm.
    bs = 100
    trn_data = minibatch(xtrn, ytrn, bs)
    
    ## STEP 2: Initialize parameters
    #  Complete init_params function
    #  It takes number of inputs and number of outputs(number of classes)
    #  It returns randomly generated W matrix and bias vector
    #  Sample from N(0, 0.001)
    
    W, b = init_params(ninputs, noutputs)
    
    ## STEP 3: Implement softmax_forw and softmax_cost
    #  softmax_forw function takes W, b, and data
    #  calculates predicted probabilities
    #
    #  softmax_cost function obtains probabilities by calling softmax_forw
    #  then calculates soft loss and
    #  gradient of W and gradient of b
    
    ## STEP 4: Gradient checking
    #  Skip this part for the lab session.
    #  As with any learning algorithm, you should always check that your
    #  gradients are correct before learning the parameters.
    
    debug = true #Turn this parameter off, after gradient checking passed
    if debug
        grad_check(W, b, xtrn[:, 1:100], ytrn[:, 1:100])
    end
    
    lr = 0.15
    
    ## STEP 5: Training
    #  The train function takes model parameters and the data
    #  Trains the model over minibatches
    #  For each minibatch, first cost and gradients are calculated then model parameters are updated
    #  train function returns the average cost per instance
    
    for i=1:50   
        cost, W, b = train(W, b, trn_data, lr)
        pred = softmax_forw(W, b, xtrn)
        trnacc = accuracy2(ytrn, pred)
        pred = softmax_forw(W, b, xtst)
        tstacc = accuracy2(ytst, pred)
        @printf("epoch: %d softloss: %g trn accuracy: %g tst accuracy: %g\n", i, cost, trnacc, tstacc)
    end
end

main (generic function with 1 method)

In [246]:
main()

Diff: 1.830923770917749e-9
Gradient Checking Passed
epoch: 1 softloss: 0.481559 trn accuracy: 0.896983 tst accuracy: 0.9064
epoch: 2 softloss: 0.339105 trn accuracy: 0.907617 tst accuracy: 0.9119
epoch: 3 softloss: 0.31604 trn accuracy: 0.912017 tst accuracy: 0.9142
epoch: 4 softloss: 0.303876 trn accuracy: 0.914783 tst accuracy: 0.9156
epoch: 5 softloss: 0.29597 trn accuracy: 0.916567 tst accuracy: 0.9172
epoch: 6 softloss: 0.290259 trn accuracy: 0.918033 tst accuracy: 0.9187
epoch: 7 softloss: 0.285858 trn accuracy: 0.919233 tst accuracy: 0.9198
epoch: 8 softloss: 0.282317 trn accuracy: 0.920083 tst accuracy: 0.92
epoch: 9 softloss: 0.279378 trn accuracy: 0.9209 tst accuracy: 0.9204
epoch: 10 softloss: 0.276879 trn accuracy: 0.921717 tst accuracy: 0.9211
epoch: 11 softloss: 0.274716 trn accuracy: 0.92225 tst accuracy: 0.9207
epoch: 12 softloss: 0.272816 trn accuracy: 0.92305 tst accuracy: 0.9214
epoch: 13 softloss: 0.271127 trn accuracy: 0.923667 tst accuracy: 0.9214
epoch: 14 softlo

In [None]:
#= Example Output
Diff: 1.8292339049184216e-9
Gradient Checking Passed
epoch: 1 softloss: 0.481559 trn accuracy: 0.896983 tst accuracy: 0.9064
epoch: 2 softloss: 0.339105 trn accuracy: 0.907617 tst accuracy: 0.9119
epoch: 3 softloss: 0.31604 trn accuracy: 0.912017 tst accuracy: 0.9142
epoch: 4 softloss: 0.303876 trn accuracy: 0.914783 tst accuracy: 0.9156
epoch: 5 softloss: 0.29597 trn accuracy: 0.916567 tst accuracy: 0.9172
epoch: 6 softloss: 0.290259 trn accuracy: 0.918033 tst accuracy: 0.9187
epoch: 7 softloss: 0.285858 trn accuracy: 0.919233 tst accuracy: 0.9198
epoch: 8 softloss: 0.282317 trn accuracy: 0.920083 tst accuracy: 0.92
epoch: 9 softloss: 0.279378 trn accuracy: 0.9209 tst accuracy: 0.9204
epoch: 10 softloss: 0.276879 trn accuracy: 0.921717 tst accuracy: 0.9211
epoch: 11 softloss: 0.274716 trn accuracy: 0.92225 tst accuracy: 0.9207
epoch: 12 softloss: 0.272816 trn accuracy: 0.92305 tst accuracy: 0.9214
epoch: 13 softloss: 0.271127 trn accuracy: 0.923667 tst accuracy: 0.9214
epoch: 14 softloss: 0.269609 trn accuracy: 0.924133 tst accuracy: 0.9215
epoch: 15 softloss: 0.268235 trn accuracy: 0.924417 tst accuracy: 0.922
epoch: 16 softloss: 0.26698 trn accuracy: 0.9247 tst accuracy: 0.9219
epoch: 17 softloss: 0.265828 trn accuracy: 0.924933 tst accuracy: 0.9218
epoch: 18 softloss: 0.264764 trn accuracy: 0.92505 tst accuracy: 0.922
epoch: 19 softloss: 0.263777 trn accuracy: 0.925367 tst accuracy: 0.9223
epoch: 20 softloss: 0.262856 trn accuracy: 0.92575 tst accuracy: 0.9225
epoch: 21 softloss: 0.261995 trn accuracy: 0.9263 tst accuracy: 0.9227
epoch: 22 softloss: 0.261186 trn accuracy: 0.926567 tst accuracy: 0.9226
epoch: 23 softloss: 0.260424 trn accuracy: 0.9269 tst accuracy: 0.9229
epoch: 24 softloss: 0.259704 trn accuracy: 0.92715 tst accuracy: 0.9227
epoch: 25 softloss: 0.259022 trn accuracy: 0.927367 tst accuracy: 0.9227
epoch: 26 softloss: 0.258374 trn accuracy: 0.9275 tst accuracy: 0.9229
epoch: 27 softloss: 0.257758 trn accuracy: 0.927767 tst accuracy: 0.923
epoch: 28 softloss: 0.257171 trn accuracy: 0.928083 tst accuracy: 0.9229
epoch: 29 softloss: 0.25661 trn accuracy: 0.92825 tst accuracy: 0.9231
epoch: 30 softloss: 0.256073 trn accuracy: 0.92835 tst accuracy: 0.9229
epoch: 31 softloss: 0.255558 trn accuracy: 0.928517 tst accuracy: 0.923
epoch: 32 softloss: 0.255064 trn accuracy: 0.928783 tst accuracy: 0.9228
epoch: 33 softloss: 0.254589 trn accuracy: 0.92895 tst accuracy: 0.9229
epoch: 34 softloss: 0.254133 trn accuracy: 0.9291 tst accuracy: 0.9227
epoch: 35 softloss: 0.253692 trn accuracy: 0.929167 tst accuracy: 0.9228
epoch: 36 softloss: 0.253268 trn accuracy: 0.92925 tst accuracy: 0.9227
epoch: 37 softloss: 0.252858 trn accuracy: 0.929417 tst accuracy: 0.923
epoch: 38 softloss: 0.252462 trn accuracy: 0.929567 tst accuracy: 0.9229
epoch: 39 softloss: 0.252078 trn accuracy: 0.929667 tst accuracy: 0.9228
epoch: 40 softloss: 0.251707 trn accuracy: 0.929783 tst accuracy: 0.9229
epoch: 41 softloss: 0.251347 trn accuracy: 0.929867 tst accuracy: 0.9231
epoch: 42 softloss: 0.250998 trn accuracy: 0.930067 tst accuracy: 0.9235
epoch: 43 softloss: 0.25066 trn accuracy: 0.9301 tst accuracy: 0.9235
epoch: 44 softloss: 0.250331 trn accuracy: 0.930233 tst accuracy: 0.9235
epoch: 45 softloss: 0.250011 trn accuracy: 0.930333 tst accuracy: 0.9235
epoch: 46 softloss: 0.2497 trn accuracy: 0.9305 tst accuracy: 0.9237
epoch: 47 softloss: 0.249397 trn accuracy: 0.930583 tst accuracy: 0.9238
epoch: 48 softloss: 0.249102 trn accuracy: 0.9307 tst accuracy: 0.9239
epoch: 49 softloss: 0.248815 trn accuracy: 0.93085 tst accuracy: 0.9242
epoch: 50 softloss: 0.248535 trn accuracy: 0.930933 tst accuracy: 0.9243
=#