# Deep Learning frameworks 

This is a brief description and comparison of two deep learning frameworks, Knet and Flux, both of which are implemented in the Julia programming language.

Knet supports GPU operation and automatic differentiation using dynamic computational graphs for models defined in plain Julia. The foundation of Knet is AutoGrad, an automatic differentiation package for Julia that follows the popular Python Autograd package. Flux is a framework in which programmers write ordinary Julia code; Flux can then run that code on TensorFlow or another graph-based deep learning framework as a backend.

Knet is roughly equivalent to TensorFlow, while Flux is more like Keras in Python. This means that Flux is easier to use than Knet, and the programmer has fewer things to deal with when using Flux. On the other hand, Knet is a framework implemented entirely in Julia; it does not rely on any other framework as a backend, which can have a positive impact on the time required for training.

One interesting feature of Flux, which makes it easy to build different models, is the ability to define layers with different shapes and different activation functions. This is easily done with the `Dense` function; a list of layers can then be passed to `Chain`, which stacks them up to give the model the required shape.

**Optimisers**

<table style="width:60%">
  <tr >
    <th style="text-align: left;"><b>Optimiser</b></th>
    <th style="text-align: left;"><b>Knet</b></th>
    <th style="text-align: left;"><b>Flux</b></th> 
  </tr>
  <tr>
    <td style="text-align: left;">Stochastic gradient descent</td>
    <td style="text-align: left;">Yes</td>
    <td style="text-align: left;">Yes</td> 
  </tr>
  <tr>
    <td style="text-align: left;">Momentum</td>
    <td style="text-align: left;">Yes</td>
    <td style="text-align: left;">Yes</td> 
  </tr>
  <tr>
    <td style="text-align: left;">Nesterov's momentum</td>
    <td style="text-align: left;">Yes</td>
    <td style="text-align: left;">Yes</td> 
  </tr>
  <tr>
    <td style="text-align: left;">Adagrad</td>
    <td style="text-align: left;">Yes</td>
    <td style="text-align: left;">No</td> 
  </tr>
  <tr>
    <td style="text-align: left;">Adadelta</td>
    <td style="text-align: left;">Yes</td>
    <td style="text-align: left;">No</td> 
  </tr>
  <tr>
    <td style="text-align: left;">Rmsprop</td>
    <td style="text-align: left;">Yes</td>
    <td style="text-align: left;">No</td> 
  </tr>
  <tr>
    <td style="text-align: left;">Adam</td>
    <td style="text-align: left;">Yes</td>
    <td style="text-align: left;">Yes</td> 
  </tr>
  <tr>
    <td style="text-align: left;">Nadam</td>
    <td style="text-align: left;">No</td>
    <td style="text-align: left;">No</td> 
  </tr>
</table>

# Sample Code

In [36]:
using Knet
using Flux, Flux.Data.MNIST
using Flux.Tracker
using Flux: onehotbatch, argmax, crossentropy, throttle
using Base.Iterators: repeated, partition

# Linear Regression (Housing data)

**Knet**

In [1]:
# Load the `housing.jl` helper script from Knet's "data" directory, then call
# `housing()` to obtain the inputs `x` and targets `y` for the regression below.
include(Knet.dir("data","housing.jl"))
x,y = housing();

In [2]:
# Linear model: `w[1]` holds the weights and `w[2]` the bias.
predict(w, x) = w[1] * x .+ w[2]

# Mean squared error between the targets `y` and the model output.
function loss(w, x, y)
    residual = y - predict(w, x)
    return mean(abs2, residual)
end

# `grad(loss)` yields a function that returns the gradient of `loss` with
# respect to its first argument, `w`.
lossgradient = grad(loss)

"""
    train(w, data; lr=.1)

Run one pass of plain gradient descent over `data`, updating the entries of `w`.

`data` is iterated as `(x, y)` pairs. For each pair the gradient `dw` is obtained
from the global `lossgradient`, and every entry `w[i]` is moved by `lr * dw[i]`
against the gradient.

# Keywords
- `lr`: step size applied to each gradient entry; defaults to `.1`.

# Returns
The same container `w`, with its entries updated.
"""
function train(w, data; lr=.1)
    for (x, y) in data
        dw = lossgradient(w, x, y)
        # `dw` is indexed in parallel with `w`: one gradient per parameter.
        for i in eachindex(w)
            w[i] -= lr * dw[i]
        end
    end
    return w
end

# Start from small random weights (1×13) and a zero bias, then take ten
# full-batch gradient steps, printing the loss after each one.
w = Any[0.1 * randn(1, 13), 0.0]
for step in 1:10
    train(w, [(x, y)])
    println(loss(w, x, y))
end

367.9544143509537
242.50852102455386
163.46664060246212
113.09234753743877
80.91832375406112
60.33689670453079
47.148583748042704
38.68009965771645
33.227860638866304
29.705222335336902


**Flux**

In [4]:
# Trainable parameters wrapped with `param`: a 1×13 weight matrix and a
# one-element bias vector, both initialised with uniform random values.
W = param(rand(1, 13))
b = param(rand(1))

# Linear model built on the global parameters `W` and `b`.
predict(x) = W * x .+ b

# Mean squared error between the model output and the targets `y`.
function loss(x, y)
    residual = predict(x) .- y
    return mean(residual .^ 2)
end
# Alternative formulation: loss(x, y) = mean(abs2, y - predict(x))

# A single manual backward pass would be: l = loss(x, y); back!(l)

"""
    update(; η=0.1)

Apply one gradient-descent step to the global parameters `W` and `b`.

Each parameter's `data` is moved by `η` times its stored `grad`, after which the
gradient is reset to zero so that the next backward pass starts from a clean
slate.

# Keywords
- `η`: learning rate; defaults to `0.1`.

# Returns
`nothing`.
"""
function update(; η=0.1)
    for p in (W, b)
        p.data .-= η .* p.grad  # apply the update
        p.grad .= 0             # clear the gradient
    end
    return nothing
end

# Ten full-batch iterations: evaluate the loss, back-propagate it into the
# parameter gradients, apply the update, and print the tracked loss value.
for step in 1:10
    l = loss(x, y)
    back!(l)
    update()
    println(l)
end

param(624.648)
param(367.281)
param(241.562)
param(163.271)
param(113.356)
param(81.4281)
param(60.9682)
param(47.8293)
param(39.3691)
param(33.9022)


# Logistic Regression (MNIST Data)

**Knet**

In [19]:
using Knet
# Load the `mnist.jl` helper script from Knet's "data" directory and fetch the
# training and test arrays.
include(Knet.dir("data","mnist.jl"))
xtrn, ytrn, xtst, ytst = mnist()
# Group each set into minibatches (third argument: 100) for the training and
# accuracy loops below.
dtrn = minibatch(xtrn, ytrn, 100)
dtst = minibatch(xtst, ytst, 100);

In [8]:
# Linear classifier: pass the input batch through `mat`, then apply the
# weights `w[1]` and the bias `w[2]`.
predict(w, x) = w[1] * mat(x) .+ w[2]

# Loss of the predicted scores against the gold labels, computed with `nll`.
function loss(w, x, ygold)
    scores = predict(w, x)
    return nll(scores, ygold)
end

# Gradient of `loss` with respect to its first argument, `w`.
lossgradient = grad(loss)

# 10×784 Float32 weights drawn from scaled Gaussian noise, plus a zero bias column.
w = Any[0.1f0 * randn(Float32, 10, 784), zeros(Float32, 10, 1)]

# Epoch 0 reports the untrained accuracy; epochs 1–10 each run one training
# pass (learning rate 0.5) before reporting.
for epoch in 0:10
    epoch > 0 && train(w, dtrn; lr=0.5)
    println((:epoch, epoch, :trn, accuracy(w, dtrn, predict), :tst, accuracy(w, dtst, predict)))
end

(:epoch, 0, :trn, 0.11666666666666667, :tst, 0.1221)
(:epoch, 1, :trn, 0.8997, :tst, 0.9033)
(:epoch, 2, :trn, 0.9072333333333333, :tst, 0.9096)
(:epoch, 3, :trn, 0.9111166666666667, :tst, 0.9117)
(:epoch, 4, :trn, 0.9133833333333333, :tst, 0.9129)
(:epoch, 5, :trn, 0.9149833333333334, :tst, 0.9142)
(:epoch, 6, :trn, 0.9160833333333334, :tst, 0.9147)
(:epoch, 7, :trn, 0.9170166666666667, :tst, 0.9144)
(:epoch, 8, :trn, 0.9178333333333333, :tst, 0.9148)
(:epoch, 9, :trn, 0.9184666666666667, :tst, 0.9152)
(:epoch, 10, :trn, 0.9193166666666667, :tst, 0.916)


**Flux**

In [9]:
# Flatten every image into a vector, convert it to floating point, and
# concatenate the vectors as the columns of `X`.
imgs = MNIST.images()
X = hcat(float.(reshape.(imgs, :))...)
# One-hot encode the digit labels over the classes 0:9.
labels = MNIST.labels()
Y = onehotbatch(labels, 0:9);

In [10]:
# Single dense layer (784 inputs, 10 outputs) followed by softmax.
m = Chain(Dense(784, 10), softmax)

# Cross-entropy between the model output and the one-hot targets.
loss(x, y) = crossentropy(m(x), y)

# Mean agreement between the predicted classes and the target classes.
function accuracy1(x, y)
    predicted = argmax(m(x))
    actual = argmax(y)
    return mean(predicted .== actual)
end

# Present the full data set 200 times; the callback shows the loss and is
# rate-limited through `throttle(evalcb, 10)`.
dataset = repeated((X, Y), 200)
evalcb() = @show loss(X, Y)
opt = SGD(params(m))

Flux.train!(loss, dataset, opt; cb=throttle(evalcb, 10))

# Training set accuracy
accuracy1(X, Y)

# Test set accuracy
tX = hcat(float.(reshape.(MNIST.images(:test), :))...)
tY = onehotbatch(MNIST.labels(:test), 0:9)

accuracy1(tX, tY)

loss(X, Y) = param(2.27113)
loss(X, Y) = param(1.46039)
loss(X, Y) = param(1.1093)
loss(X, Y) = param(0.928734)
loss(X, Y) = param(0.82016)
loss(X, Y) = param(0.742456)
loss(X, Y) = param(0.69136)
loss(X, Y) = param(0.655337)
loss(X, Y) = param(0.624003)
loss(X, Y) = param(0.598561)
loss(X, Y) = param(0.57742)
loss(X, Y) = param(0.55952)
loss(X, Y) = param(0.544128)
loss(X, Y) = param(0.530721)
loss(X, Y) = param(0.518914)
loss(X, Y) = param(0.508417)
loss(X, Y) = param(0.499009)


0.8854

# Multi-Layer Perceptron (MNIST Data)

**Knet**

In [11]:
"""
    predict(w, x)

Forward pass of a multi-layer perceptron.

`w` is read as consecutive (weight, bias) pairs. Every pair except the last is
applied followed by an element-wise `relu`; the last pair is applied without an
activation and its result is returned.
"""
function predict(w, x)
    activation = mat(x)
    # Hidden layers: all (weight, bias) pairs except the final one.
    for k in 1:2:(length(w) - 2)
        weight, bias = w[k], w[k + 1]
        activation = relu.(weight * activation .+ bias)
    end
    # Output layer: linear, no activation.
    return w[end - 1] * activation .+ w[end]
end

# Parameters of a 784 → 64 → 10 network, stored as (weight, bias) pairs:
# weights are scaled Gaussian noise, biases are zero columns (all Float32).
w = Any[0.1f0 * randn(Float32, 64, 784), zeros(Float32, 64, 1),
        0.1f0 * randn(Float32, 10, 64), zeros(Float32, 10, 1)]

"""
    train(model, data, optim)

Run one pass over `data`, which is iterated as `(x, y)` pairs. For each pair the
gradients are computed with the global `lossgradient` and handed to `update!`
together with `model` and `optim`. Returns `nothing`.
"""
function train(model, data, optim)
    for batch in data
        inputs, targets = batch
        update!(model, lossgradient(model, inputs, targets), optim)
    end
    return nothing
end

# Optimiser state for `w`, built with `optimizers(w, Adam)`.
o = optimizers(w, Adam)

# Epoch 0 reports the untrained accuracy; epochs 1–10 each run one training
# pass before reporting.
for epoch in 0:10
    epoch > 0 && train(w, dtrn, o)
    println((:epoch, epoch, :trn, accuracy(w, dtrn, predict), :tst, accuracy(w, dtst, predict)))
end

(:epoch, 0, :trn, 0.08423333333333333, :tst, 0.0838)
(:epoch, 1, :trn, 0.9283166666666667, :tst, 0.9283)
(:epoch, 2, :trn, 0.9486333333333333, :tst, 0.9453)
(:epoch, 3, :trn, 0.9605166666666667, :tst, 0.9558)
(:epoch, 4, :trn, 0.9671833333333333, :tst, 0.9618)
(:epoch, 5, :trn, 0.9726666666666667, :tst, 0.9655)
(:epoch, 6, :trn, 0.9773333333333334, :tst, 0.9675)
(:epoch, 7, :trn, 0.97965, :tst, 0.9678)
(:epoch, 8, :trn, 0.9813833333333334, :tst, 0.9691)
(:epoch, 9, :trn, 0.9831666666666666, :tst, 0.9695)
(:epoch, 10, :trn, 0.98445, :tst, 0.9702)


**Flux**

In [78]:
# Two-layer perceptron: 784 → 32 (relu) → 10, followed by softmax.
# NOTE(review): the Knet MLP in this document uses 64 hidden units, so the two
# reported accuracies are not a like-for-like comparison — confirm intent.
m = Chain(Dense(784, 32, relu), Dense(32, 10), softmax)

opt = ADAM(params(m))

# Reuses the `loss`, `dataset` and `evalcb` globals defined for the logistic
# regression example; `loss` picks up the new `m` because it reads the global.
Flux.train!(loss, dataset, opt; cb=throttle(evalcb, 10))

# Training set accuracy
accuracy1(X, Y)

# Test set accuracy
tX = hcat(float.(reshape.(MNIST.images(:test), :))...)
tY = onehotbatch(MNIST.labels(:test), 0:9)

accuracy1(tX, tY)

loss(X, Y) = param(2.28075)




loss(X, Y) = param(1.85265)


WARNING (start of message lost in export): … .repeated instead.
  likely near In[78]:9


loss(X, Y) = param(1.44539)
loss(X, Y) = param(1.12403)
loss(X, Y) = param(0.91184)
loss(X, Y) = param(0.776611)
loss(X, Y) = param(0.688019)
loss(X, Y) = param(0.62645)
loss(X, Y) = param(0.581399)
loss(X, Y) = param(0.546992)
loss(X, Y) = param(0.519825)
loss(X, Y) = param(0.500026)
loss(X, Y) = param(0.483293)
loss(X, Y) = param(0.468935)
loss(X, Y) = param(0.456461)
loss(X, Y) = param(0.445507)
loss(X, Y) = param(0.43579)
loss(X, Y) = param(0.427099)
loss(X, Y) = param(0.419269)
loss(X, Y) = param(0.412163)
loss(X, Y) = param(0.404905)
loss(X, Y) = param(0.398315)
loss(X, Y) = param(0.39229)
loss(X, Y) = param(0.386753)


0.9006