In [2]:
#  Please refer to http://ufldl.stanford.edu/tutorial/supervised/SoftmaxRegression/ for a more detailed explanation.
# Install any third-party package that is not present yet.
# DataFrames is listed as well because the `using` statement that follows
# loads it; without this entry a fresh environment fails on that line.
# `===` is used because `nothing` is a singleton and identity is the
# reliable way to test for it.
for p in ("Compat", "GZip", "DataFrames")
    Pkg.installed(p) === nothing && Pkg.add(p)
end
using Compat, GZip, DataFrames

In [3]:
function loaddata()
    # Fetches the four MNIST archives through `gzload` and strips the leading
    # bytes of each decompressed buffer: the first 16 bytes of the image files
    # and the first 8 bytes of the label files (presumably the IDX headers --
    # confirm against the file format description).
    #
    # Returns the tuple (xtrn, ytrn, xtst, ytst) of raw byte vectors.
    info("Loading MNIST...")

    # Decompress `file` and drop its first `nskip` bytes.
    payload(file, nskip) = gzload(file)[(nskip + 1):end]

    train_images = payload("train-images-idx3-ubyte.gz", 16)
    test_images = payload("t10k-images-idx3-ubyte.gz", 16)
    train_labels = payload("train-labels-idx1-ubyte.gz", 8)
    test_labels = payload("t10k-labels-idx1-ubyte.gz", 8)

    return (train_images, train_labels, test_images, test_labels)
end

loaddata (generic function with 1 method)

In [4]:
function gzload(file; path="$file", url="http://yann.lecun.com/exdb/mnist/$file")
    # Returns the decompressed contents of a gzip archive.
    #
    # file : archive name; also used to build the default `path` and `url`
    # path : local location of the archive (downloaded there when missing)
    # url  : where to download the archive from when `path` does not exist
    isfile(path) || download(url, path)
    f = gzopen(path)
    try
        return @compat read(f)
    finally
        # Release the stream even when reading fails part-way through.
        close(f)
    end
end

gzload (generic function with 1 method)

In [5]:
function minibatch(X, Y, bs=100)
    # Splits the columns of X (inputs) and Y (gold labels) into consecutive
    # groups of at most `bs` columns; the last group may be shorter.
    #
    # Returns a Vector{Any} of (x, y) tuples, one per group, where x and y
    # are copies of the matching column ranges of X and Y.
    ncols = size(X, 2)
    column_ranges = [first:min(first + bs - 1, ncols) for first in 1:bs:ncols]
    return Any[(X[:, r], Y[:, r]) for r in column_ranges]
end

minibatch (generic function with 2 methods)

In [6]:
function init_params(ninputs, noutputs)
    # Builds the initial softmax parameters.
    #
    # ninputs  : number of input features
    # noutputs : number of classes
    #
    # Returns (w, b): w is a noutputs x ninputs matrix of standard normal
    # draws scaled by 0.001, b is a noutputs x 1 matrix of zeros.
    weights = 0.001 * randn(noutputs, ninputs)
    bias = zeros(noutputs, 1)
    return weights, bias
end

init_params (generic function with 1 method)

In [7]:
function softmax_forw(W, b, data)
    # Applies the affine map W * data .+ b and a column-wise softmax.
    #
    # W    : weight matrix (classes x features)
    # b    : bias, broadcast across the columns of W * data
    # data : input matrix with one instance per column
    #
    # Returns a Float64 matrix with the same shape as W * data whose columns
    # hold the predicted class probabilities of the corresponding instance.
    scores = W * data .+ b
    p = zeros(size(scores))
    for i = 1:size(scores, 2)
        col = scores[:, i]
        # Subtracting the column maximum leaves the softmax value unchanged
        # but keeps exp() from overflowing to Inf (which would give NaN).
        shifted = exp.(col .- maximum(col))
        p[:, i] = shifted ./ sum(shifted)
    end
    return p
end

softmax_forw (generic function with 1 method)

In [8]:
function softmax_cost(W, b, data, labels)
    # Computes the average cross-entropy loss of the softmax model together
    # with the parameter update directions.
    #
    # W, b   : model parameters
    # data   : inputs, one instance per column
    # labels : target matrix with the same shape as the predictions
    #          (one-hot columns in this file)
    #
    # Returns (loss, grad_w, grad_b).
    # NOTE: grad_w and grad_b are built from (labels - p), i.e. they are the
    # NEGATED derivatives of the loss. `train` relies on this by ADDING
    # lr * grad to the parameters, so the sign must not be flipped here alone.
    p = softmax_forw(W, b, data)
    n = size(data, 2)

    grad_y = labels - p
    grad_w = (grad_y * data') ./ n
    grad_b = sum(grad_y, 2) ./ n

    # Accumulate only the terms with a non-zero label. Multiplying a zero
    # label by log(0) = -Inf would otherwise turn the whole loss into NaN
    # as soon as any non-target probability underflows to zero.
    total = 0.0
    for k in eachindex(labels, p)
        if labels[k] != 0
            total -= labels[k] * log(p[k])
        end
    end
    loss = total / n

    return (loss, grad_w, grad_b)
end

softmax_cost (generic function with 1 method)

In [9]:
function accuracy(ygold, yhat)
    # Fraction of columns whose largest entry sits in the same row in both
    # matrices.
    #
    # ygold : gold label matrix, one column per instance
    # yhat  : predicted score/probability matrix, same column layout
    #
    # Returns a Float64 in [0, 1] (NaN when ygold has no columns).
    ncols = size(ygold, 2)
    correct = 0.0
    for i = 1:ncols
        # findmax(v)[2] is the index of the maximum; unlike `indmax` it is
        # available on both old and current Julia versions.
        if findmax(ygold[:, i])[2] == findmax(yhat[:, i])[2]
            correct += 1.0
        end
    end
    return correct / ncols
end

accuracy (generic function with 1 method)

In [12]:
function grad_check(W, b, data, labels)
    # Compares the analytic values returned by `softmax_cost` with a
    # numerical estimate of the loss derivatives and prints the result.
    #
    # W, b   : model parameters; each entry is perturbed temporarily and
    #          restored to its exact original value afterwards
    # data   : inputs, one instance per column
    # labels : targets matching `data`
    #
    # Prints the Euclidean norm of the mismatch and whether it is below 1e-7.

    # Central-difference estimate of d(loss)/dW and d(loss)/db.
    function numeric_gradient()
        epsilon = 0.0001
        gw = zeros(size(W))
        gb = zeros(size(b))
        println("weight size", size(gw))
        println("bias size", size(gb))

        for idx in eachindex(W)
            saved = W[idx]
            W[idx] = saved + epsilon
            loss_plus = softmax_cost(W, b, data, labels)[1]
            W[idx] = saved - epsilon
            loss_minus = softmax_cost(W, b, data, labels)[1]
            # Restore from the saved value so no rounding error accumulates.
            W[idx] = saved
            gw[idx] = (loss_plus - loss_minus) / (2 * epsilon)
        end

        for idx in eachindex(b)
            saved = b[idx]
            b[idx] = saved + epsilon
            loss_plus = softmax_cost(W, b, data, labels)[1]
            b[idx] = saved - epsilon
            loss_minus = softmax_cost(W, b, data, labels)[1]
            b[idx] = saved
            gb[idx] = (loss_plus - loss_minus) / (2 * epsilon)
        end

        return gw, gb
    end

    _, gradW, gradB = softmax_cost(W, b, data, labels)
    gw, gb = numeric_gradient()

    # softmax_cost builds its outputs from (labels - p), which is the negated
    # derivative of the loss, so the analytic and numeric values must cancel
    # when added.
    diff = sqrt(sum((gradW + gw) .^ 2) + sum((gradB + gb) .^ 2))
    println("Diff: $diff")
    if diff < 1e-7
        println("Gradient Checking Passed")
    else
        println("Diff must be < 1e-7")
    end
    flush(STDOUT)
end

grad_check (generic function with 1 method)

In [13]:
function train(W, b, data, lr=0.15)
    # Runs one pass over the minibatches in `data`, updating W and b in place.
    #
    # W, b : model parameters (mutated)
    # data : iterable of (x, y) minibatch tuples
    # lr   : step size
    #
    # Returns the instance-weighted average of the per-batch costs.
    cost_sum = 0.0
    seen = 0
    for batch in data
        x, y = batch
        batch_cost, step_w, step_b = softmax_cost(W, b, x, y)

        # The values from softmax_cost are added, scaled by lr, to the
        # current parameters.
        W .= W + lr * step_w
        b .= b + lr * step_b

        batch_size = size(x, 2)
        cost_sum += batch_cost * batch_size
        seen += batch_size
    end
    return cost_sum / seen
end

train (generic function with 2 methods)

In [14]:
# Divide the raw pixel bytes by 255 and arrange them as an
# (ninputs x ninstances) Float32 matrix, one image per column.
function _prepare_images(raw, ninputs)
    return convert(Array{Float32}, reshape(raw ./ 255, ninputs, div(length(raw), ninputs)))
end

# One-hot encode raw labels as a (noutputs x ninstances) Float32 matrix.
# Label 0 is rewritten in place to `noutputs` so that it maps to a valid
# (1-based) row; note that this mutates `raw`.
function _onehot_labels(raw, noutputs)
    raw[raw .== 0] = noutputs
    ninstances = length(raw)
    onehot = sparse(convert(Vector{Int}, raw), 1:ninstances, one(eltype(raw)), noutputs, ninstances)
    return convert(Array{Float32}, onehot)
end

function main()
    # Entry point: loads MNIST, prepares the data, initialises the softmax
    # parameters, optionally runs the gradient check, then trains for 50
    # epochs while printing loss and train/test accuracy after each epoch.
    srand(12345)
    # Size of input vector (MNIST images are 28x28)
    ninputs = 28 * 28

    # Number of classes (MNIST images fall into 10 classes)
    noutputs = 10

    ## Data loading & preprocessing
    #  The inputs are the images and the outputs are the labels.
    #  After preprocessing:
    #    size of xtrn is (784, 60000), size of xtst is (784, 10000)
    #    ytrn / ytst hold one one-hot column per instance
    xtrnraw, ytrnraw, xtstraw, ytstraw = loaddata()
    info("data loaded");flush(STDOUT)
    xtrn = _prepare_images(xtrnraw, ninputs)
    ytrn = _onehot_labels(ytrnraw, noutputs)

    xtst = _prepare_images(xtstraw, ninputs)
    ytst = _onehot_labels(ytstraw, noutputs)
    info("size(xtrn):",size(xtrn)," size(ytrn):",size(ytrn))
    info("size(xtst):",size(xtst)," size(ytst):",size(ytst))
    flush(STDOUT)

    ## STEP 1: Create minibatches
    #  minibatch takes the input matrix (X) and gold labels (Y) and
    #  returns a list of (x, y) tuples holding the batched columns.
    bs = 100
    trn_data = minibatch(xtrn, ytrn, bs)
    info("minibatches ready");flush(STDOUT)
    info("length trn_data:",length(trn_data))
    info("size(trn_data[1][1]):",size(trn_data[1][1])," size(trn_data[1][2]):",size(trn_data[1][2]))
    flush(STDOUT)

    ## STEP 2: Initialize parameters
    #  init_params returns a randomly generated W matrix
    #  (mean 0, standard deviation 0.001) and a zero bias.
    W, b = init_params(ninputs, noutputs)
    info("parameters are set");flush(STDOUT)

    ## STEP 3: softmax_forw and softmax_cost
    #  softmax_forw computes the predicted probabilities from W, b and data;
    #  softmax_cost calls it and returns the loss plus the update terms
    #  for W and b.

    ## STEP 4: Gradient checking
    #  Always verify the gradients before learning the parameters.
    debug = false # set to true to run the gradient check before training

    if debug
        info("debugging mode");flush(STDOUT)
        grad_check(W, b, xtrn[:, 1:100], ytrn[:, 1:100])
    end

    lr = 0.15

    ## STEP 5: Training
    #  train updates the parameters over all minibatches and returns the
    #  average cost of the epoch.
    for i=1:50
        cost = train(W, b, trn_data, lr)
        pred = softmax_forw(W, b, xtrn)
        trnacc = accuracy(ytrn, pred)
        pred = softmax_forw(W, b, xtst)
        tstacc = accuracy(ytst, pred)
        @printf("epoch: %d softloss: %g trn accuracy: %g tst accuracy: %g\n", i, cost, trnacc, tstacc)
        flush(STDOUT)
    end
end

main (generic function with 1 method)

In [15]:
main()

[1m[36mINFO: [39m[22m[36mLoading MNIST...
[39m[1m[36mINFO: [39m[22m[36mdata loaded
[39m[1m[36mINFO: [39m[22m[36msize(xtrn):(784, 60000) size(ytrn):(10, 60000)
[39m[1m[36mINFO: [39m[22m[36msize(xttst):(784, 10000)size(ytst):(10, 10000)
[39m[1m[36mINFO: [39m[22m[36mminibatches ready
[39m[1m[36mINFO: [39m[22m[36mlength trn_data:600
[39m[1m[36mINFO: [39m[22m[36msize(trn_data[1][1]):(784, 100) size(trn_data[1][2]):(10, 100)
[39m[1m[36mINFO: [39m[22m[36mparameters are set
[39m

epoch: 1 softloss: 0.481559 trn accuracy: 0.896983 tst accuracy: 0.9064
epoch: 2 softloss: 0.339105 trn accuracy: 0.907617 tst accuracy: 0.9119
epoch: 3 softloss: 0.31604 trn accuracy: 0.912017 tst accuracy: 0.9142
epoch: 4 softloss: 0.303876 trn accuracy: 0.914783 tst accuracy: 0.9156
epoch: 5 softloss: 0.29597 trn accuracy: 0.916567 tst accuracy: 0.9172
epoch: 6 softloss: 0.290259 trn accuracy: 0.918033 tst accuracy: 0.9187
epoch: 7 softloss: 0.285858 trn accuracy: 0.919233 tst accuracy: 0.9198
epoch: 8 softloss: 0.282317 trn accuracy: 0.920083 tst accuracy: 0.92


LoadError: [91mInterruptException:[39m

In [None]:
#Example experiment log:
#===========================
Diff: 1.8302437625092545e-9
Gradient Checking Passed
epoch: 1 softloss: 0.481559 trn accuracy: 0.896983 tst accuracy: 0.9064
epoch: 2 softloss: 0.339105 trn accuracy: 0.907617 tst accuracy: 0.9119
epoch: 3 softloss: 0.31604 trn accuracy: 0.912017 tst accuracy: 0.9142
epoch: 4 softloss: 0.303876 trn accuracy: 0.914783 tst accuracy: 0.9156
epoch: 5 softloss: 0.29597 trn accuracy: 0.916567 tst accuracy: 0.9172
epoch: 6 softloss: 0.290259 trn accuracy: 0.918033 tst accuracy: 0.9187
epoch: 7 softloss: 0.285858 trn accuracy: 0.919233 tst accuracy: 0.9198
epoch: 8 softloss: 0.282317 trn accuracy: 0.920083 tst accuracy: 0.92
epoch: 9 softloss: 0.279378 trn accuracy: 0.9209 tst accuracy: 0.9204
epoch: 10 softloss: 0.276879 trn accuracy: 0.921717 tst accuracy: 0.9211
epoch: 11 softloss: 0.274716 trn accuracy: 0.92225 tst accuracy: 0.9207
epoch: 12 softloss: 0.272816 trn accuracy: 0.92305 tst accuracy: 0.9214
epoch: 13 softloss: 0.271127 trn accuracy: 0.923667 tst accuracy: 0.9214
epoch: 14 softloss: 0.269609 trn accuracy: 0.924133 tst accuracy: 0.9215
epoch: 15 softloss: 0.268235 trn accuracy: 0.924417 tst accuracy: 0.922
epoch: 16 softloss: 0.26698 trn accuracy: 0.9247 tst accuracy: 0.9219
epoch: 17 softloss: 0.265828 trn accuracy: 0.924933 tst accuracy: 0.9218
epoch: 18 softloss: 0.264764 trn accuracy: 0.92505 tst accuracy: 0.922
epoch: 19 softloss: 0.263777 trn accuracy: 0.925367 tst accuracy: 0.9223
epoch: 20 softloss: 0.262856 trn accuracy: 0.92575 tst accuracy: 0.9225
epoch: 21 softloss: 0.261995 trn accuracy: 0.9263 tst accuracy: 0.9227
epoch: 22 softloss: 0.261186 trn accuracy: 0.926567 tst accuracy: 0.9226
epoch: 23 softloss: 0.260424 trn accuracy: 0.9269 tst accuracy: 0.9229
epoch: 24 softloss: 0.259704 trn accuracy: 0.92715 tst accuracy: 0.9227
epoch: 25 softloss: 0.259022 trn accuracy: 0.927367 tst accuracy: 0.9227
epoch: 26 softloss: 0.258374 trn accuracy: 0.9275 tst accuracy: 0.9229
epoch: 27 softloss: 0.257758 trn accuracy: 0.927767 tst accuracy: 0.923
epoch: 28 softloss: 0.257171 trn accuracy: 0.928083 tst accuracy: 0.9229
epoch: 29 softloss: 0.25661 trn accuracy: 0.92825 tst accuracy: 0.9231
epoch: 30 softloss: 0.256073 trn accuracy: 0.92835 tst accuracy: 0.9229
epoch: 31 softloss: 0.255558 trn accuracy: 0.928517 tst accuracy: 0.923
epoch: 32 softloss: 0.255064 trn accuracy: 0.928783 tst accuracy: 0.9228
epoch: 33 softloss: 0.254589 trn accuracy: 0.92895 tst accuracy: 0.9229
epoch: 34 softloss: 0.254133 trn accuracy: 0.9291 tst accuracy: 0.9227
epoch: 35 softloss: 0.253692 trn accuracy: 0.929167 tst accuracy: 0.9228
epoch: 36 softloss: 0.253268 trn accuracy: 0.92925 tst accuracy: 0.9227
epoch: 37 softloss: 0.252858 trn accuracy: 0.929417 tst accuracy: 0.923
epoch: 38 softloss: 0.252462 trn accuracy: 0.929567 tst accuracy: 0.9229
epoch: 39 softloss: 0.252078 trn accuracy: 0.929667 tst accuracy: 0.9228
epoch: 40 softloss: 0.251707 trn accuracy: 0.929783 tst accuracy: 0.9229
epoch: 41 softloss: 0.251347 trn accuracy: 0.929867 tst accuracy: 0.9231
epoch: 42 softloss: 0.250998 trn accuracy: 0.930067 tst accuracy: 0.9235
epoch: 43 softloss: 0.25066 trn accuracy: 0.9301 tst accuracy: 0.9235
epoch: 44 softloss: 0.250331 trn accuracy: 0.930233 tst accuracy: 0.9235
epoch: 45 softloss: 0.250011 trn accuracy: 0.930333 tst accuracy: 0.9235
epoch: 46 softloss: 0.2497 trn accuracy: 0.9305 tst accuracy: 0.9237
epoch: 47 softloss: 0.249397 trn accuracy: 0.930583 tst accuracy: 0.9238
epoch: 48 softloss: 0.249102 trn accuracy: 0.9307 tst accuracy: 0.9239
epoch: 49 softloss: 0.248815 trn accuracy: 0.93085 tst accuracy: 0.9242
epoch: 50 softloss: 0.248535 trn accuracy: 0.930933 tst accuracy: 0.9243
============================#
