# Knet-Flux MLP benchmark based on the [Flux/model-zoo](https://github.com/FluxML/model-zoo/blob/master/vision/mnist/mlp.jl) MLP example

In [1]:
]activate ..; instantiate; st

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h

│     — /home/gridsan/dyuret/.julia/registries/General — failed to fetch from repo
└ @ Pkg.API /buildworker/worker/package_linux64/build/usr/share/julia/stdlib/v1.0/Pkg/src/API.jl:157


[32m[1m    Status[22m[39m `~/Klutz.jl/Project.toml`
 [90m [3a865a2d][39m[37m CuArrays v0.8.1[39m
 [90m [587475ba][39m[37m Flux v0.6.8[39m
 [90m [1902f260][39m[37m Knet v1.1.1[39m


In [2]:
# Uncomment this to get Knet profiling info at the end:
# ENV["KNET_TIMER"] = ENV["AUTOGRAD_TIMER"] = "true"
# using Pkg; Pkg.build("AutoGrad"); Pkg.build("Knet")

In [3]:
using Flux, Flux.Data.MNIST, Statistics
using Flux: onehotbatch, onecold, crossentropy, throttle
using Base.Iterators: repeated
using CuArrays
using Knet: Knet, KnetArray, param, param0, nll, Param, AutoGrad
Knet.gpu()

┌ Info: Recompiling stale cache file /home/gridsan/dyuret/.julia/compiled/v1.0/Knet/f4vSz.ji for Knet [1902f260-5fb4-5aff-8c31-6271790ab950]
└ @ Base loading.jl:1187


0

In [4]:
# Implement Chain and Dense in Knet
"""
    kChain(layers...)
    kChain(layers::Tuple)

Container that applies `layers` to an input in order, passing the output of
each layer to the next one. A chain without layers returns its input unchanged.
"""
struct kChain{T<:Tuple}
    layers::T  # concretely typed so that calls specialize on the layer types
    kChain(ls::T) where {T<:Tuple} = new{T}(ls)
end
kChain(ls...) = kChain(ls)

# Apply the layers one by one. Recursing over the tuple (instead of looping)
# lets the compiler see the concrete type of every layer.
_applylayers(::Tuple{}, x) = x
_applylayers(ls::Tuple, x) = _applylayers(Base.tail(ls), first(ls)(x))

(c::kChain)(x) = _applylayers(c.layers, x)
"""
    kDense(w, b, f)
    kDense(nx::Int, ny::Int, fn=identity)

Fully connected layer that computes `f.(w * x .+ b)`.

The three-argument form wraps existing weights `w`, bias `b` and activation
`f`. The size-based form creates the weights with `param(ny, nx)` and the bias
with `param0(ny)`, and uses `fn` as the activation.
"""
struct kDense{W,B,F}
    # Parametric fields keep the forward pass free of dynamic dispatch.
    w::W
    b::B
    f::F
end
kDense(nx::Int, ny::Int, fn=identity) = kDense(param(ny, nx), param0(ny), fn)
(d::kDense)(x) = d.f.(d.w * x .+ d.b)

## GPU tests

In [5]:
# Load data
imgs = MNIST.images()
# Flatten every image to a vector and join the vectors as the columns of one
# matrix. reduce(hcat, ...) concatenates the whole collection in a single call
# instead of splatting one argument per image into hcat.
X = reduce(hcat, float.(reshape.(imgs, :))) |> gpu
labels = MNIST.labels()
Y = onehotbatch(labels, 0:9) |> gpu
# The same full-batch (X, Y) pair is repeated 200 times.
dataset = repeated((X, Y), 200)

# Knet versions of the same data.
kX = KnetArray(Array(X))
# Shift the labels from 0:9 to 1:10.
# NOTE(review): presumably Knet.nll expects 1-based class indices; confirm.
kY = labels .+ 1
kdata = repeated((kX,kY),200)

summary.((X,Y,kX,kY))

("784×60000 CuArray{Float32,2}", "10×60000 Flux.OneHotMatrix{CuArray{Flux.OneHotVector,1}}", "784×60000 KnetArray{Float32,2}", "60000-element Array{Int64,1}")

In [10]:
# Flux timing cell; run it several times to get the timing for Flux.
# Output of one sample run:
# (loss(X, Y), accuracy(X, Y)) = (2.2691698f0 (tracked), 0.17285)
#  2.402374 seconds (513.62 k allocations: 20.364 MiB, 25.76% gc time)
# (loss(X, Y), accuracy(X, Y)) = (0.27879566f0 (tracked), 0.9224166666666667)
m = gpu(Chain(Dense(28^2, 32, relu), Dense(32, 10), softmax))
opt = ADAM(params(m))

# Cross-entropy between the model output for `x` and the targets `y`.
function loss(x, y)
    return crossentropy(m(x), y)
end

# Fraction of predictions that match the targets.
function accuracy(x, y)
    predicted = onecold(m(x))
    return mean(predicted .== onecold(y))
end

@show loss(X, Y), accuracy(X, Y)
@time Flux.train!(loss, dataset, opt)
@show loss(X, Y), accuracy(X, Y)

(loss(X, Y), accuracy(X, Y)) = (2.3488135f0 (tracked), 0.11515)
  2.402374 seconds (513.62 k allocations: 20.364 MiB, 25.76% gc time)
(loss(X, Y), accuracy(X, Y)) = (0.27673373f0 (tracked), 0.9230833333333334)


(0.27673373f0 (tracked), 0.9230833333333334)

In [15]:
# Run this several times to get timing for Knet.
# Output of one sample run:
# (Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (2.3258605f0, 0.0694)
#   0.581472 seconds (360.04 k allocations: 240.723 MiB, 5.80% gc time)
# (Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (0.29319283f0, 0.9203833333333333)
# Two-layer MLP: 28^2 inputs -> 32 units with Knet.relu -> 10 outputs with the
# default activation of kDense (identity).
km = kChain(kDense(28^2, 32, Knet.relu),kDense(32, 10))
"""
    iters(n)

Return a callback that ignores its argument and returns `true` for the first
`n` calls and `false` on every call after that.
"""
function iters(n)
    # A `Ref` keeps the counter concretely typed; reassigning the captured
    # argument itself would make Julia box it.
    remaining = Ref(n)
    return J -> (remaining[] -= 1) >= 0
end
# Print the loss and accuracy of the untrained model on (kX, kY).
@show Knet.nll(km,kX,kY), Knet.accuracy(km,kX,kY)
# Time one training run with Adam. The callback from iters(200) returns false
# after 200 calls.
# NOTE(review): presumably Knet.train! stops when the callback returns false; confirm.
# NOTE(review): GPU work may be queued asynchronously; confirm that @time
# covers all device work.
@time Knet.train!(km,kdata,callback=iters(200),optimizer=Knet.Adam())
# Print the same metrics after training.
@show Knet.nll(km,kX,kY), Knet.accuracy(km,kX,kY)

(Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (2.2998385f0, 0.1048)
  0.581472 seconds (360.04 k allocations: 240.723 MiB, 5.80% gc time)
(Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (0.2898485f0, 0.9210333333333334)


(0.2898485f0, 0.9210333333333334)

## Knet GPU Profile

In [20]:
using TimerOutputs: reset_timer!

# Clear both timers before the training run that is measured below.
foreach(reset_timer!, (Knet.to, AutoGrad.to))

# Fresh model, trained with the same settings as the Knet timing cell.
km = kChain(kDense(28^2, 32, Knet.relu), kDense(32, 10))
@time Knet.train!(km, kdata, callback=iters(200), optimizer=Knet.Adam())

# Show the AutoGrad report first, then the Knet report; each one is preceded
# by an empty line and a flush of stdout.
for report in (AutoGrad.to, Knet.to)
    println()
    flush(stdout)
    display(report)
end

  0.894881 seconds (347.96 k allocations: 239.375 MiB, 2.89% gc time)



 [1m──────────────────────────────────────────────────────────────────────────────────────[22m
 [1m                                      [22m        Time                   Allocations      
                                       ──────────────────────   ───────────────────────
           Tot / % measured:                907ms / 73.0%            240MiB / 59.3%    

 Section                       ncalls     time   %tot     avg     alloc   %tot      avg
 ──────────────────────────────────────────────────────────────────────────────────────
 *[1]                             402    158ms  23.8%   393μs    283KiB  0.19%        -
   Knet.A_mul_Bt                  201   36.0ms  5.44%   179μs    104KiB  0.07%        -
 *                                402    113ms  17.0%   281μs    208KiB  0.14%        -
 +.[2]                            402   74.1ms  11.2%   184μs    302KiB  0.21%        -
 Knet.cudnnSoftmaxForward         201   55.7ms  8.41%   277μs    219KiB  0.15%  1.09KiB
 getindex    




 [1m────────────────────────────────────────────────────────────────────────────────────────[22m
 [1m                                        [22m        Time                   Allocations      
                                         ──────────────────────   ───────────────────────
            Tot / % measured:                 909ms / 69.2%            240MiB / 0.59%    

 Section                         ncalls     time   %tot     avg     alloc   %tot      avg
 ────────────────────────────────────────────────────────────────────────────────────────
 cublasSgemm_v2                   1.00k    271ms  43.0%   269μs    126KiB  8.62%        -
 sum_32_21                          402   66.9ms  10.6%   166μs   25.1KiB  1.72%        -
 cudnnSoftmaxForward                201   53.6ms  8.53%   267μs    171KiB  11.7%        -
   cudnnCreateTensorDescriptor      402   1.55ms  0.25%  3.86μs         -  0.00%        -
   cudnnSetTensorNdDescriptor       402   1.29ms  0.20%  3.21μs         -  0.00% 

## CPU tests

In [21]:
# Load data
epochs = 10  # number of full-batch training steps used by both frameworks
imgs = MNIST.images()
# Flatten every image to a vector and join the vectors as the columns of one
# matrix. reduce(hcat, ...) concatenates the whole collection in a single call
# instead of splatting one argument per image into hcat.
X = reduce(hcat, float.(reshape.(imgs, :))) # Float64
labels = MNIST.labels()
Y = onehotbatch(labels, 0:9)
dataset = repeated((X, Y), epochs)

# Knet uses the very same input array (no copy) on the CPU.
kX = X
# Shift the labels from 0:9 to 1:10.
# NOTE(review): presumably Knet.nll expects 1-based class indices; confirm.
kY = labels .+ 1
kdata = repeated((kX,kY), epochs)

summary.((X,Y,kX,kY))

("784×60000 Array{Float64,2}", "10×60000 Flux.OneHotMatrix{Array{Flux.OneHotVector,1}}", "784×60000 Array{Float64,2}", "60000-element Array{Int64,1}")

In [28]:
# Flux CPU timing cell; run it several times to get the CPU timing for Flux.
# Output of one sample run:
# (loss(X, Y), accuracy(X, Y)) = (2.3871476663323263 (tracked), 0.08006666666666666)
#  8.411364 seconds (6.07 k allocations: 4.812 GiB, 30.91% gc time)
# (loss(X, Y), accuracy(X, Y)) = (1.8506316291000398 (tracked), 0.5061166666666667)

# Note: the GPU runs above work in Float32, whereas this CPU run works in Float64.
m = Chain(Dense(28^2, 32, relu), Dense(32, 10), softmax)
# Snapshot of the untrained model, taken before training modifies `m`.
m0 = deepcopy(m)
opt = ADAM(params(m))

# Cross-entropy between the model output for `x` and the targets `y`.
function loss(x, y)
    return crossentropy(m(x), y)
end

# Fraction of predictions that match the targets.
function accuracy(x, y)
    predicted = onecold(m(x))
    return mean(predicted .== onecold(y))
end

@show loss(X, Y), accuracy(X, Y)
@time Flux.train!(loss, dataset, opt)
@show loss(X, Y), accuracy(X, Y);

(loss(X, Y), accuracy(X, Y)) = (2.3871476663323263 (tracked), 0.08006666666666666)
  8.411364 seconds (6.07 k allocations: 4.812 GiB, 30.91% gc time)
(loss(X, Y), accuracy(X, Y)) = (1.8506316291000398 (tracked), 0.5061166666666667)


In [29]:
# Knet CPU timing cell; run it several times to get the CPU timing for Knet.
# Output of one sample run:
# (Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (2.3871476663323263, 0.08006666666666666)
#   3.668864 seconds (67.47 k allocations: 1.321 GiB, 53.07% gc time)
# (Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (1.8487510324791598, 0.5076166666666667)

# Copy a Flux parameter's data into a plain Array wrapped as a Knet Param.
function f2k(a)
    return Param(Array(a))
end

# Build a kDense layer from the weights and bias of a Flux Dense layer.
flux2knet(layer, activation) = kDense(f2k(layer.W.data), f2k(layer.b.data), activation)

# Start Knet from the same initial weights as the untrained Flux model `m0`.
km = kChain(flux2knet(m0.layers[1], Knet.relu), flux2knet(m0.layers[2], identity))
"""
    iters(n)

Return a callback that ignores its argument and returns `true` for the first
`n` calls and `false` on every call after that.
"""
function iters(n)
    # A `Ref` keeps the counter concretely typed; reassigning the captured
    # argument itself would make Julia box it.
    remaining = Ref(n)
    return J -> (remaining[] -= 1) >= 0
end
# Print the loss and accuracy of the untrained model on (kX, kY).
@show Knet.nll(km,kX,kY), Knet.accuracy(km,kX,kY)
# Time one training run with Adam. The callback from iters(epochs) returns
# false after `epochs` calls.
# NOTE(review): presumably Knet.train! stops when the callback returns false; confirm.
@time Knet.train!(km,kdata,callback=iters(epochs),optimizer=Knet.Adam())
# Print the same metrics after training; the trailing semicolon suppresses the cell's result value.
@show Knet.nll(km,kX,kY), Knet.accuracy(km,kX,kY);

(Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (2.3871476663323263, 0.08006666666666666)
  3.668864 seconds (67.47 k allocations: 1.321 GiB, 53.07% gc time)
(Knet.nll(km, kX, kY), Knet.accuracy(km, kX, kY)) = (1.8487510324791598, 0.5076166666666667)


In [6]:
# You can use this to start Knet with the exact same model on the GPU:
# m0 = deepcopy(m)
# c2k(a)=Param(KnetArray(Array(a)))
# km = kChain(kDense(c2k(m0.layers[1].W.data),c2k(m0.layers[1].b.data),Knet.relu), 
#             kDense(c2k(m0.layers[2].W.data),c2k(m0.layers[2].b.data),identity))

#5 (generic function with 1 method)