## Comparisson codes


There is a significant speedup but...  what is the most impactful change in the code?

Some comments

- Use **```A .+= b ```** instead of **```A += b ```** the first one allocates less memory.


#### Different progressive steps to imporve speed

Version 1) defined beforehand the arrays that are used inside the for loop.
    

In [277]:
using BenchmarkTools

# Generate data that will be used later
X_train = rand(5000,784);
X_batch_col = X_train'[:,1:200];

In [278]:
type RBM_col{T <: Real}
    W::Matrix{T}         
    vis_bias::Vector{T}     
    hid_bias::Vector{T}   
    n_vis::Int64
    n_hid::Int64
    trained::Bool
end


function initializeRBM_col(n_vis::Int64, n_hid::Int64; sigma=0.01, T=Float64)
    
    return RBM_col{T}(sigma*randn(n_hid,n_vis),  # weight matrix
                      zeros(n_vis),              # visible vector  
                      zeros(n_hid),              # Hidden vector
                      n_vis,                     # num visible units 
                      n_hid,                     # num hidden unnits
                      false)                     # trained


end

rbm = initializeRBM_col(784, 225);

In [279]:
versioninfo()

Julia Version 0.6.0-dev.2069
Commit ff9a949 (2017-01-13 02:17 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin13.4.0)
  CPU: Intel(R) Core(TM) i7-4650U CPU @ 1.70GHz
  WORD_SIZE: 64
  BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell)
  LAPACK: libopenblas64_
  LIBM: libopenlibm
  LLVM: libLLVM-3.9.1 (ORCJIT, haswell)


## 0) First implementation of my own

In [280]:

function sigmoid(vector::Array{Float64})
    return 1./(1 + e.^(-vector))
end


function contrastive_divergence_col_K(Xbatch, rbm, K::Int64, lr::Float64)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(size(rbm.W))
    Delta_b = zeros(size(rbm.vis_bias))
    Delta_c = zeros(size(rbm.hid_bias))

    
    for i in 1:batch_size
        x =  Xbatch[:,i]
        xneg =  Xbatch[:,i]

        for k in 1:K
            hneg = sigmoid( rbm.W * xneg .+ rbm.hid_bias) .> rand(rbm.n_hid)
            xneg = sigmoid( rbm.W' * hneg .+ rbm.vis_bias) .> rand(rbm.n_vis)
        end

        ehp = sigmoid(rbm.W * x + rbm.hid_bias)
        ehn = sigmoid(rbm.W * xneg + rbm.hid_bias)
 
        ### kron vs no kron???
        #Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')
        Delta_W .+= lr * ( x * ehp' - xneg * ehn')'
        Delta_b .+= lr * (x - xneg)
        Delta_c .+= lr * (ehp - ehn)
    end

    rbm.W .+= Delta_W / batch_size;
    rbm.vis_bias .+= Delta_b / batch_size;
    rbm.hid_bias .+= Delta_c / batch_size;
    
end

contrastive_divergence_col_K (generic function with 1 method)

In [281]:
contrastive_divergence_col_K(X_batch_col[:,1:10], rbm, 1, 0.01);

In [283]:
@time contrastive_divergence_col_K(X_batch_col, rbm, 1, 0.01);

  0.977352 seconds (18.21 k allocations: 1.342 GB, 28.73% gc time)


## After 5) I realized most of the speed problem came from a transpose

In [284]:

function sigmoid(vector::Array{Float64})
    return 1./(1 + e.^(-vector))
end


function contrastive_divergence_col_K_(Xbatch, rbm, K::Int64, lr::Float64)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(size(rbm.W))
    Delta_b = zeros(size(rbm.vis_bias))
    Delta_c = zeros(size(rbm.hid_bias))

    
    for i in 1:batch_size
        x =  Xbatch[:,i]
        xneg =  Xbatch[:,i]

        for k in 1:K
            hneg = sigmoid( rbm.W * xneg .+ rbm.hid_bias) .> rand(rbm.n_hid)
            xneg = sigmoid( rbm.W' * hneg .+ rbm.vis_bias) .> rand(rbm.n_vis)
        end

        ehp = sigmoid(rbm.W * x + rbm.hid_bias)
        ehn = sigmoid(rbm.W * xneg + rbm.hid_bias)
 
        ### kron vs no kron???
        Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')
        Delta_b .+= lr * (x - xneg)
        Delta_c .+= lr * (ehp - ehn)
    end

    rbm.W .+= Delta_W / batch_size;
    rbm.vis_bias .+= Delta_b / batch_size;
    rbm.hid_bias .+= Delta_c / batch_size;
    
end

contrastive_divergence_col_K_ (generic function with 1 method)

In [256]:
contrastive_divergence_col_K_(X_batch_col, rbm, 1, 0.01);

In [257]:
@time contrastive_divergence_col_K_(X_batch_col, rbm, 1, 0.01);

  0.252066 seconds (12.61 k allocations: 28.984 MB, 1.17% gc time)


## 1) Define all vectors used inside the function and "fill them"

Don't get why this version requires more gc time but then it has less allocation and it is a bit faster

This version uses the **```similar```** function to generate the arrays beforehand and fill them later.

- **B = ```similar(A)```** will generate a new array ```B``` of the same type and shape as ```A```.

- We could just do  **```B = zeros(A)```** which would generate new array ```B``` of the same type and shape as ```A``` full of zeros.
    - Notice that if A is an array then **``` zeros(A) == zeros(size(A))```** because if the input of zeros is an Array then it generates a new array of the shape of its input, if it is a tuple it generates an array of of shape equal to the tuple.

 

In [287]:
zeros(rbm.W) == zeros(size(rbm.W))

true

In [288]:
function contrastive_divergence_col_K_1(Xbatch, rbm, K::Int64, lr::Float64)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(rbm.W)
    Delta_b = zeros(rbm.vis_bias)
    Delta_c = zeros(rbm.hid_bias)

    hneg = similar(rbm.hid_bias)
    ehp = similar(rbm.hid_bias)
    ehn = similar(rbm.hid_bias)
    xneg = similar(rbm.vis_bias)
    x = similar(rbm.vis_bias)
    
    for i in 1:batch_size
        x = Xbatch[:,i]
        xneg = Xbatch[:,i]

        for k in 1:K
            hneg .= sigmoid( rbm.W * xneg .+ rbm.hid_bias) .> rand(rbm.n_hid)
            xneg .= sigmoid( rbm.W' * hneg .+ rbm.vis_bias) .> rand(rbm.n_vis)
        end

        ehp .= sigmoid(rbm.W * x + rbm.hid_bias)
        ehn .= sigmoid(rbm.W * xneg + rbm.hid_bias)
 
        ### kron vs no kron???
        Delta_W .+= lr * ( x * ehp' - xneg * ehn')'
        Delta_b .+= lr * (x - xneg)
        Delta_c .+= lr * (ehp - ehn)
    end

    rbm.W .+= Delta_W / batch_size;
    rbm.vis_bias .+= Delta_b / batch_size;
    rbm.hid_bias .+= Delta_c / batch_size;
    
end

contrastive_divergence_col_K_1 (generic function with 1 method)

In [289]:
rbm = initializeRBM_col(784, 225)
contrastive_divergence_col_K_1(X_batch_col[:,1:10], rbm, 1, 0.01);

In [290]:
@time contrastive_divergence_col_K(X_batch_col, rbm, 1, 0.01);

  1.421306 seconds (297.66 k allocations: 1.348 GB, 10.64% gc time)


In [291]:
@time contrastive_divergence_col_K_1(X_batch_col, rbm, 1, 0.01);

  0.744455 seconds (14.02 k allocations: 1.339 GB, 21.13% gc time)


## 2) Use view in order no to copy the current instance of the minibatch

We will use a view of ```Xbatch[:,i]``` instead of allocating it to x.

I though that slices of arrays where passed as views not as copies...

Performance decreases ?

In [292]:
?@view

```
@view A[inds...]
```

Creates a `SubArray` from an indexing expression. This can only be applied directly to a reference expression (e.g. `@view A[1,2:end]`), and should *not* be used as the target of an assignment (e.g. `@view(A[1,2:end]) = ...`).


In [293]:
function contrastive_divergence_col_K_2(Xbatch, rbm, K::Int64, lr::Float64)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(rbm.W)
    Delta_b = zeros(rbm.vis_bias)
    Delta_c = zeros(rbm.hid_bias)

    hneg = similar(rbm.hid_bias)
    ehp = similar(rbm.hid_bias)
    ehn = similar(rbm.hid_bias)
    xneg = similar(rbm.vis_bias)
    x = similar(rbm.vis_bias)
    
    for i in 1:batch_size
        x = @view Xbatch[:,i]
        xneg = @view Xbatch[:,i]

        for k in 1:K
            hneg .= sigmoid( rbm.W * xneg .+ rbm.hid_bias) .> rand(rbm.n_hid)
            xneg .= sigmoid( rbm.W' * hneg .+ rbm.vis_bias) .> rand(rbm.n_vis)
        end

        ehp .= sigmoid(rbm.W * Xbatch[:,i] + rbm.hid_bias)
        ehn .= sigmoid(rbm.W * xneg + rbm.hid_bias)
 
        ### kron vs no kron???
        Delta_W .+= lr * (Xbatch[:,i] * ehp' - xneg * ehn')'
        Delta_b .+= lr * (Xbatch[:,i] - xneg)
        Delta_c .+= lr * (ehp - ehn)
    end

    rbm.W .+= Delta_W / batch_size;
    rbm.vis_bias .+= Delta_b / batch_size;
    rbm.hid_bias .+= Delta_c / batch_size;
    
end

contrastive_divergence_col_K_2 (generic function with 1 method)

In [294]:
# warmup 
contrastive_divergence_col_K_2(X_batch_col, rbm, 1, 0.01);

In [295]:
@time contrastive_divergence_col_K_1(X_batch_col, rbm, 1, 0.01);

  0.721602 seconds (14.02 k allocations: 1.339 GB, 21.10% gc time)


In [296]:
@time contrastive_divergence_col_K_2(X_batch_col, rbm, 1, 0.01);

  0.799129 seconds (15.82 k allocations: 1.340 GB, 18.49% gc time)


## 3) Elementwise application of the sigmoid

Notice that you can apply a function f that takes as input Float64 to every position of an array of Float64 using a dot. I do not see any benefit though in terms of speed, though is more efficient in terms of KB memory.


In [297]:
function f(x::Float64)
    return 2*x+23 -x^2
end

f (generic function with 2 methods)

In [298]:
function f(x::Array{Float64})
    return 2x + 23 - x.^2
end

f (generic function with 2 methods)

In [299]:
a = rand(10000);

In [300]:
f.(a)
@time f.(a);

  0.000138 seconds (29 allocations: 79.359 KB)


In [301]:
f(a)
@time f(a);

  0.000036 seconds (12 allocations: 312.969 KB)


In [302]:
sigmoid(x::Float64) = 1. / (1. + exp(-x))

function contrastive_divergence_col_K_3(Xbatch, rbm, K::Int64, lr::Float64)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(rbm.W)
    Delta_b = zeros(rbm.vis_bias)
    Delta_c = zeros(rbm.hid_bias)

    hneg = similar(rbm.hid_bias)
    ehp = similar(rbm.hid_bias)
    ehn = similar(rbm.hid_bias)
    xneg = similar(rbm.vis_bias)
    x = similar(rbm.vis_bias)
    
    for i in 1:batch_size
        x = Xbatch[:,i]
        xneg = Xbatch[:,i]

        for k in 1:K
            hneg .= sigmoid.(rbm.W * xneg .+ rbm.hid_bias) .> rand()
            xneg .= sigmoid.(rbm.W' * hneg .+ rbm.vis_bias) .> rand()
        end

        ehp .= sigmoid.(rbm.W * x + rbm.hid_bias)
        ehn .= sigmoid.(rbm.W * xneg + rbm.hid_bias)
 
        ### kron vs no kron???
        Delta_W .+= lr * (x * ehp' - xneg * ehn')'
        Delta_b .+= lr * (x - xneg)
        Delta_c .+= lr * (ehp - ehn)
    end

    rbm.W .+= Delta_W / batch_size;
    rbm.vis_bias .+= Delta_b / batch_size;
    rbm.hid_bias .+= Delta_c / batch_size;
    
end


contrastive_divergence_col_K_3 (generic function with 1 method)

In [303]:
#warmup
contrastive_divergence_col_K_3(X_batch_col, rbm, 1, 0.01);

In [304]:
@time contrastive_divergence_col_K_1(X_batch_col, rbm, 1, 0.01);

  0.733203 seconds (14.02 k allocations: 1.339 GB, 20.99% gc time)


In [305]:
@time contrastive_divergence_col_K_3(X_batch_col, rbm, 1, 0.01);

  0.728931 seconds (10.02 k allocations: 1.327 GB, 21.26% gc time)


## 4) Using the BLAS

Using A_mul_B! is not in itself enough to speed up the code

In [306]:
sigmoid(x::Float64) = 1. / (1. + exp(-x))

function contrastive_divergence_col_K_4(Xbatch, rbm, K::Int64, lr::Float64)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(rbm.W)
    Delta_b = zeros(rbm.vis_bias)
    Delta_c = zeros(rbm.hid_bias)

    hneg = similar(rbm.hid_bias)
    ehp = similar(rbm.hid_bias)
    ehn = similar(rbm.hid_bias)
    xneg = similar(rbm.vis_bias)
    x = similar(rbm.vis_bias)
    b1 = similar(rbm.hid_bias)
    b2 = similar(rbm.vis_bias)
    
    for i in 1:batch_size
        x = Xbatch[:,i]
        xneg = Xbatch[:,i]

        for k in 1:K
            A_mul_B!(b1, rbm.W, xneg)
            hneg .= sigmoid.(b1 .+ rbm.hid_bias) .> rand.()
            At_mul_B!(b2, rbm.W, hneg)
            xneg .= sigmoid.(b2 .+ rbm.vis_bias) .> rand.()
        end

        A_mul_B!(b1, rbm.W, x)
        ehp .= sigmoid.(b1 .+ rbm.hid_bias)
        A_mul_B!(b1, rbm.W, xneg)
        ehn .= sigmoid.(b1 .+ rbm.hid_bias)
        
        ### kron vs no kron???
        
        Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')

        #Delta_W .+= lr * (x * ehp' - xneg * ehn')'
        Delta_b .+= lr * (x - xneg)
        Delta_c .+= lr * (ehp - ehn)
    end

    rbm.W .+= Delta_W / batch_size;
    rbm.vis_bias .+= Delta_b / batch_size;
    rbm.hid_bias .+= Delta_c / batch_size;
    
end



contrastive_divergence_col_K_4 (generic function with 1 method)

In [307]:
 contrastive_divergence_col_K_4(X_batch_col, rbm, 1, 0.01);

In [308]:
@time contrastive_divergence_col_K_4(X_batch_col, rbm, 1, 0.01);

  0.172969 seconds (2.82 k allocations: 10.858 MB, 2.46% gc time)


## 5)    Do not need transpose!!  Delta_W .+= lr * ( x * ehp' - xneg * ehn')'
 

sigmoid(x::Float64) = 1. / (1. + exp(-x))

function contrastive_divergence_col_K_5(Xbatch, rbm, K::Int64, lr::Float64)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(rbm.W)
    Delta_b = zeros(rbm.vis_bias)
    Delta_c = zeros(rbm.hid_bias)

    hneg = similar(rbm.hid_bias)
    ehp = similar(rbm.hid_bias)
    ehn = similar(rbm.hid_bias)
    xneg = similar(rbm.vis_bias)
    x = similar(rbm.vis_bias)
    b1 = similar(rbm.hid_bias)
    b2 = similar(rbm.vis_bias)
    
    for i in 1:batch_size
        x = @view Xbatch[:,i]
        xneg .= @view Xbatch[:,i]

        for k in 1:K
            A_mul_B!(b1, rbm.W, xneg)
            hneg .= sigmoid.(b1 .+ rbm.hid_bias) .> rand.()
            At_mul_B!(b2, rbm.W, hneg)
            xneg .= sigmoid.(b2 .+ rbm.vis_bias) .> rand.()
        end

        A_mul_B!(b1, rbm.W, x)
        ehp .= sigmoid.(b1 .+ rbm.hid_bias)
        A_mul_B!(b1, rbm.W, xneg)
        ehn .= sigmoid.(b1 .+ rbm.hid_bias)
    
        # THIS IS WHAT I HAD
        #Delta_W .+= lr * (x * ehp' - xneg * ehn')'
        
        #Evilzero version
        Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')
        Delta_b .+= lr * (x - xneg)
        Delta_c .+= lr * (ehp - ehn)
    end
        

    rbm.W .+= Delta_W / batch_size;
    rbm.vis_bias .+= Delta_b / batch_size;
    rbm.hid_bias .+= Delta_c / batch_size;
    
end



In [309]:
contrastive_divergence_col_K_5(X_batch_col, rbm, 1, 0.01);

In [310]:
@time contrastive_divergence_col_K_5(X_batch_col, rbm, 1, 0.01);

  0.135139 seconds (2.22 k allocations: 8.420 MB)


This version which is very similar to the first one at the top of the notebook is almost as fast as evilzero version

- Seems Julia really doesn't like transposing

# evilzero version

In [311]:
    Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')
        Delta_b .+= lr .* (x .- xneg)
        Delta_c .+= lr .* (ehp .- ehn)
    

LoadError: [91mUndefVarError: x not defined[39m

In [312]:
sigmoid(x::Float64) = 1. / (1. + exp(-x))


function contrastive_divergence_col_K_evilzero(Xbatch, rbm, K::Int64, lr::Float64)

    batch_size = size(Xbatch)[2]

    Delta_W = zeros(size(rbm.W))
    Delta_b = zeros(size(rbm.vis_bias))
    Delta_c = zeros(size(rbm.hid_bias))

    hneg = similar(rbm.hid_bias)
    b1 = similar(rbm.W * Xbatch[:,1])
    b2 = similar(rbm.W' * hneg)
    ehp = similar(rbm.hid_bias)
    ehn = similar(rbm.hid_bias)
    xneg = similar(Xbatch[:,1])
    
    @inbounds for i in 1:batch_size
        x = @view Xbatch[:,i]
        xneg .= @view Xbatch[:,i]

        for k in 1:K
            A_mul_B!(b1, rbm.W, xneg)
            hneg .= sigmoid.(b1 .+ rbm.hid_bias) .> rand.()
            At_mul_B!(b2, rbm.W, hneg)
            xneg .= sigmoid.(b2 .+ rbm.vis_bias) .> rand.()
        end

        A_mul_B!(b1, rbm.W, x)
        ehp .= sigmoid.(b1 .+ rbm.hid_bias)
        A_mul_B!(b1, rbm.W, xneg)
        ehn .= sigmoid.(b1 .+ rbm.hid_bias)

        Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')
        Delta_b .+= lr .* (x .- xneg)
        Delta_c .+= lr .* (ehp .- ehn)
    end

    rbm.W .+= Delta_W ./ batch_size;
    rbm.vis_bias .+= Delta_b ./ batch_size;
    rbm.hid_bias .+= Delta_c ./ batch_size;

    return
end

contrastive_divergence_col_K_evilzero (generic function with 1 method)

In [313]:
#warmup
contrastive_divergence_col_K_evilzero(X_batch_col, rbm, 1, 0.01);

In [314]:

@time contrastive_divergence_col_K_evilzero(X_batch_col, rbm, 1, 0.01);

  0.131786 seconds (1.42 k allocations: 3.863 MB)


#### A trivial improvement: We know the shape of b1 and b2 beforehand


In [315]:
sigmoid(x::Float64) = 1. / (1. + exp(-x))


function contrastive_divergence_col_K_evilzero_1(Xbatch, rbm, K::Int64, lr::Float64)

    batch_size = size(Xbatch)[2]

    Delta_W = zeros(size(rbm.W))
    Delta_b = zeros(size(rbm.vis_bias))
    Delta_c = zeros(size(rbm.hid_bias))

    hneg = similar(rbm.hid_bias)
    b1 = similar(rbm.W * Xbatch[:,1])
    b2 = similar(rbm.W' * hneg)
    ehp = similar(rbm.hid_bias)
    ehn = similar(rbm.hid_bias)
    xneg = similar(Xbatch[:,1])
    
    @inbounds for i in 1:batch_size
        x = @view Xbatch[:,i]
        xneg .= @view Xbatch[:,i]

        for k in 1:K
            A_mul_B!(b1, rbm.W, xneg)
            hneg .= sigmoid.(b1 .+ rbm.hid_bias) .> rand.()
            At_mul_B!(b2, rbm.W, hneg)
            xneg .= sigmoid.(b2 .+ rbm.vis_bias) .> rand.()
        end

        A_mul_B!(b1, rbm.W, x)
        ehp .= sigmoid.(b1 .+ rbm.hid_bias)
        A_mul_B!(b1, rbm.W, xneg)
        ehn .= sigmoid.(b1 .+ rbm.hid_bias)

        Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')
        Delta_b .+= lr .* (x .- xneg)
        Delta_c .+= lr .* (ehp .- ehn)

    end

    rbm.W .+= Delta_W ./ batch_size;
    rbm.vis_bias .+= Delta_b ./ batch_size;
    rbm.hid_bias .+= Delta_c ./ batch_size;

    return
end

contrastive_divergence_col_K_evilzero_1 (generic function with 1 method)

In [316]:
contrastive_divergence_col_K_evilzero_1(X_batch_col, rbm, 1, 0.01);

In [317]:
@time contrastive_divergence_col_K_evilzero_1(X_batch_col, rbm, 1, 0.01);

  0.134424 seconds (1.42 k allocations: 3.863 MB)
