# RBM implementation

Objective: Implement CRBM in Julia for time series analysis

In [2]:
# Distributions provides Normal, used to draw the random initial weight matrix W of the RBM
using Distributions
using MNIST
using BenchmarkTools
using PyPlot

In [3]:
# Parameters of a restricted Boltzmann machine. T is the element type of the
# weight matrix and of both bias vectors.
type RBM{T <: Real}
    n_vis::Int              # number of visible units
    n_hid::Int              # number of hidden units
    W::Matrix{T}            # weights (initialize_RBM builds it as n_hid x n_vis)
    vis_bias::Vector{T}     # bias of the visible units
    hid_bias::Vector{T}     # bias of the hidden units
    trained::Bool           # set to true by the fit functions once training ran
end

# One-line display of an RBM: element type, layer sizes (read off the bias
# vectors rather than the stored counters) and the trained flag.
function Base.show{T}(io::IO, rbm::RBM{T})
    n_vis, n_hid = length(rbm.vis_bias), length(rbm.hid_bias)
    trained = rbm.trained
    print(io, "RBM{$T}(n_vis=$n_vis, n_hid=$n_hid, trained=$trained)")
end

In [4]:
"""
    sigmoid(vector::AbstractArray)

Return a new array of the same shape as `vector` holding the logistic
function `1 / (1 + exp(-v))` of every element `v`.
"""
function sigmoid(vector::AbstractArray)
    # Fully dotted, with spaces around `./`: `1./(...)` is tokenised
    # ambiguously (float literal `1.` followed by `/`) and is rejected by newer
    # Julia releases, and `1 + array` relies on implicit scalar+array addition.
    return 1 ./ (1 .+ exp.(-vector))
end

sigmoid (generic function with 1 method)

In [5]:
# Logistic function of a single Float64: 1 / (1 + exp(-x)).
sigmoid(x::Float64) = inv(1 + exp(-x))

sigmoid (generic function with 2 methods)

In [6]:
# Build an untrained RBM{T} with `n_vis` visible and `n_hid` hidden units.
# The weights are drawn from Normal(0, sigma); both bias vectors start at zero.
function initialize_RBM(n_vis, n_hid, sigma, T)
    weights  = rand(Normal(0, sigma), n_hid, n_vis)   # n_hid x n_vis weight matrix
    vis_bias = zeros(n_vis)                           # visible bias
    hid_bias = zeros(n_hid)                           # hidden bias
    # The RBM{T} constructor converts the arrays to element type T.
    return RBM{T}(n_vis, n_hid, weights, vis_bias, hid_bias, false)
end

initialize_RBM (generic function with 1 method)

In [7]:
rbm = initialize_RBM(784, 100, 0.01, Float64)

RBM{Float64}(n_vis=784, n_hid=100, trained=false)

In [8]:
size(rbm.W)

(100,784)

In [9]:
X_train, y_train = MNIST.traindata()
X_test, y_test = MNIST.testdata()

(
[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

[7.0,2.0,1.0,0.0,4.0,1.0,4.0,9.0,5.0,9.0  …  7.0,8.0,9.0,0.0,1.0,2.0,3.0,4.0,5.0,6.0])

In [10]:
"""
    contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

Apply one CD-`K` update to `rbm` from the mini-batch `Xbatch` (one sample per
column), processing the samples one at a time.

For every sample a Gibbs chain is started at the sample and run for `K`
steps; the difference between the data statistics and the chain statistics,
scaled by `lr` and averaged over the batch, is added in place to `rbm.W`,
`rbm.vis_bias` and `rbm.hid_bias`. `Xbatch` is not modified. Returns `nothing`.
"""
function contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

    batch_size = size(Xbatch, 2)
    # An empty batch would divide by zero below and fill the parameters with NaN.
    batch_size == 0 && return

    Delta_W = zeros(size(rbm.W))
    Delta_b = zeros(size(rbm.vis_bias))
    Delta_c = zeros(size(rbm.hid_bias))

    # Work buffers, allocated once and reused for every sample.
    xneg = zeros(size(rbm.vis_bias))   # visible state of the Gibbs chain
    hneg = similar(rbm.hid_bias)       # hidden state of the Gibbs chain
    b1 = similar(rbm.hid_bias)         # scratch for W * v
    b2 = similar(rbm.vis_bias)         # scratch for W' * h
    ehp = similar(rbm.hid_bias)        # hidden probabilities given the data
    ehn = similar(rbm.hid_bias)        # hidden probabilities given the chain state

    for i in 1:batch_size
        x = @view Xbatch[:, i]
        # Copy the sample into the chain buffer. Binding `xneg` to a view of
        # the same column would make the sampling below overwrite the training
        # data and make `x` and `xneg` identical, so every update would be zero.
        xneg .= x

        for k in 1:K
            A_mul_B!(b1, rbm.W, xneg)
            # Explicit vectors of uniform draws: one independent threshold per
            # unit, regardless of how a zero-argument `rand.()` is broadcast.
            hneg .= sigmoid(b1 .+ rbm.hid_bias) .> rand(length(hneg))
            At_mul_B!(b2, rbm.W, hneg)
            xneg .= sigmoid(b2 .+ rbm.vis_bias) .> rand(length(xneg))
        end

        A_mul_B!(b1, rbm.W, x)
        ehp .= sigmoid(b1 .+ rbm.hid_bias)
        A_mul_B!(b1, rbm.W, xneg)
        ehn .= sigmoid(b1 .+ rbm.hid_bias)

        # `ehp .* x'` broadcasts a column against a row: an outer product.
        Delta_W .+= lr .* (ehp .* x' .- ehn .* xneg')
        Delta_b .+= lr .* (x .- xneg)
        Delta_c .+= lr .* (ehp .- ehn)
    end

    rbm.W .+= Delta_W ./ batch_size
    rbm.vis_bias .+= Delta_b ./ batch_size
    rbm.hid_bias .+= Delta_c ./ batch_size

    return
end

contrastive_divergence_K (generic function with 1 method)

In [None]:
X_batch = X_train[:,1:25]

@benchmark contrastive_divergence_K(X_batch, rbm, 1, 0.01)
#@time contrastive_divergence_K(X_batch, rbm, 1, 0.01)

In [None]:
size(X_train), size(X_batch)

# Fit RBM

In [None]:
# Train `rbm` on the columns of `X` with the per-sample CD-K update.
# The columns are split into consecutive mini-batches of at most `batch_size`
# samples; every epoch visits all mini-batches in order and prints its
# wall-clock time. Sets `rbm.trained` to true when done.
function fit_CDK(X, rbm, batch_size::Integer,  n_epochs::Integer, K::Integer, lr::Real)

    n_samples = size(X, 2)
    indicies = [lo:min(lo + batch_size - 1, n_samples) for lo in 1:batch_size:n_samples]
    print("number minibatches:", length(indicies), "\n")

    for epoch in 1:n_epochs
        seconds = @elapsed for cols in indicies
            contrastive_divergence_K(view(X, :, cols), rbm, K, lr)
        end
        print("\nepoch ", epoch, "  time epoch:", seconds)
    end
    rbm.trained = true
end

In [None]:
PyPlot.imshow(reshape(rbm.W[9,:],28,28),"gray")

In [None]:
n_epochs = 1
batch_size = 200
K = 1
lr = 0.01

@time fit_CDK(X_train, rbm, batch_size,  n_epochs, K, lr)

# vectorized cdk

In [None]:
"""
    vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

Apply one CD-`K` update to `rbm` from the mini-batch `Xbatch` (one sample per
column), handling the whole batch with matrix operations.

The Gibbs chain starts at a copy of `Xbatch` and runs for `K` steps; the
batch-averaged difference between data and chain statistics, scaled by `lr`,
is added in place to `rbm.W`, `rbm.vis_bias` and `rbm.hid_bias`.
"""
function vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

    Xneg = copy(Xbatch)
    batch_size = size(Xbatch, 2)

    for k in 1:K
        # Compare against a matrix of uniform draws, one per unit and sample.
        # A bare `rand()` is a single number, so the whole batch would be
        # thresholded at the same value instead of being sampled independently.
        Ph = sigmoid(rbm.W * Xneg .+ rbm.hid_bias)
        # `float` keeps the states as Float64 arrays rather than BitArrays.
        Hneg = float(Ph .> rand(size(Ph)...))
        Pv = sigmoid(rbm.W' * Hneg .+ rbm.vis_bias)
        Xneg = float(Pv .> rand(size(Pv)...))
    end

    Ehp = sigmoid(rbm.W * Xbatch .+ rbm.hid_bias)
    Ehn = sigmoid(rbm.W * Xneg .+ rbm.hid_bias)

    Delta_W = lr * (Ehp * Xbatch' - Ehn * Xneg')
    Delta_vis_bias = sum(lr .* (Xbatch .- Xneg), 2)[:]
    Delta_hid_bias = sum(lr .* (Ehp - Ehn), 2)[:]

    rbm.W .+= Delta_W ./ batch_size
    rbm.vis_bias .+= Delta_vis_bias ./ batch_size
    rbm.hid_bias .+= Delta_hid_bias ./ batch_size
end

In [None]:
X_batch = X_train[:,1:25]
@benchmark vec_contrastive_divergence_K(X_batch, rbm, 1, 0.01)

In [None]:
# Train `rbm` on the columns of `X` with the vectorized CD-K update.
# The columns are split into consecutive mini-batches of at most `batch_size`
# samples and every epoch visits all of them in order. Per-epoch timing is
# disabled in this variant, so no timer is started either: a `tic()` without
# its matching `toq()` would leave one timer behind per epoch.
# Sets `rbm.trained` to true when done.
function vec_fit_CDK(X, rbm, batch_size::Integer,  n_epochs::Integer, K::Integer, lr::Real)

    n_samples = size(X, 2)
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    println("number minibatches:", length(indicies), "\n")
    for epoch in 1:n_epochs
        for minibatch_ind in indicies
            Xbatch = @view X[:, minibatch_ind]
            vec_contrastive_divergence_K(Xbatch, rbm, K, lr)
        end
    end
    rbm.trained = true
end

In [None]:
n_epochs = 20
batch_size = 1000
K = 1
lr = 0.01

@time vec_fit_CDK(X_train, rbm, batch_size,  n_epochs, K, lr)

In [None]:
cm_greys = PyPlot.cm_get_cmap("Greys_r")

In [None]:
PyPlot.imshow(reshape(rbm.W[9,:],28,28),cm_greys)

In [None]:
reshape(rbm.W[9,:],28,28)

## DEFINE arrays at the beginning of the function

In [None]:
expand(:(lr.*( Ehp * Xbatch' .-  Ehn *  Xneg')))

In [None]:
?A_mul_Bc

In [None]:
expand(:(rbm.W * Xbatch .+ rbm.hid_bias))

In [None]:
"""
    vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

Vectorized CD-`K` update of `rbm` from the mini-batch `Xbatch` (one sample per
column), with the chain state arrays allocated once at the top of the function
and overwritten in place during the Gibbs steps.

Adds the batch-averaged, `lr`-scaled update in place to `rbm.W`,
`rbm.vis_bias` and `rbm.hid_bias`.
"""
function vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

    batch_size = size(Xbatch, 2)

    Hneg = zeros(rbm.n_hid, batch_size)
    Xneg = zeros(rbm.n_vis, batch_size)
    # The chain has to start at the data. Leaving Xneg at zero would make the
    # negative phase independent of the batch.
    Xneg .= Xbatch

    for k in 1:K
        # One uniform draw per unit and sample; a bare `rand()` would apply a
        # single shared threshold to the whole batch.
        Hneg .= sigmoid.(rbm.W * Xneg .+ rbm.hid_bias) .> rand(size(Hneg)...)
        Xneg .= sigmoid.(rbm.W' * Hneg .+ rbm.vis_bias) .> rand(size(Xneg)...)
    end

    Ehp = sigmoid.(rbm.W * Xbatch .+ rbm.hid_bias)
    Ehn = sigmoid.(rbm.W * Xneg .+ rbm.hid_bias)

    Delta_W = lr .* (Ehp * Xbatch' .- Ehn * Xneg')
    Delta_vis_bias = sum(lr .* (Xbatch .- Xneg), 2)[:]
    Delta_hid_bias = sum(lr .* (Ehp .- Ehn), 2)[:]

    rbm.W .+= Delta_W ./ batch_size
    rbm.vis_bias .+= Delta_vis_bias ./ batch_size
    rbm.hid_bias .+= Delta_hid_bias ./ batch_size
end

In [None]:
X_batch = X_train[:,1:25]
@benchmark vec_contrastive_divergence_K(X_batch, rbm, 3, 0.01)

In [None]:
# Train `rbm` on the columns of `X` with the vectorized CD-K update, printing
# the wall-clock time of every epoch. Mini-batches are consecutive column
# ranges of at most `batch_size` samples. Sets `rbm.trained` to true when done.
function vec_fit_CDK(X, rbm, batch_size::Integer,  n_epochs::Integer, K::Integer, lr::Real)

    n_samples = size(X, 2)
    indicies = [lo:min(lo + batch_size - 1, n_samples) for lo in 1:batch_size:n_samples]
    println("number minibatches:", length(indicies), "\n")

    for epoch in 1:n_epochs
        seconds = @elapsed for cols in indicies
            vec_contrastive_divergence_K(view(X, :, cols), rbm, K, lr)
        end
        print("\n\nepoch ", epoch, "  time epoch:", seconds, "\n")
    end
    rbm.trained = true
end

In [None]:
n_epochs = 1
batch_size = 1000
K = 1
lr = 0.01

@time vec_fit_CDK(X_train, rbm, batch_size,  n_epochs, K, lr)

### Define space for all the arrays

In [None]:
"""
    vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

Vectorized CD-`K` update of `rbm` from the mini-batch `Xbatch` (one sample per
column). All intermediate arrays (chain states and hidden probabilities) are
allocated up front and filled in place, and the parameter updates are applied
directly without separate delta variables.
"""
function vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

    batch_size = size(Xbatch, 2)

    Hneg = zeros(rbm.n_hid, batch_size)
    Xneg = zeros(rbm.n_vis, batch_size)
    Ehp = zeros(rbm.n_hid, batch_size)
    Ehn = zeros(rbm.n_hid, batch_size)

    # Start the chain at the data. Re-declaring Xneg as zeros after copying
    # the batch would throw the copy away.
    Xneg .= Xbatch

    for k in 1:K
        # Independent uniform threshold per unit and sample, not one shared `rand()`.
        Hneg .= sigmoid(rbm.W * Xneg .+ rbm.hid_bias) .> rand(size(Hneg)...)
        Xneg .= sigmoid(rbm.W' * Hneg .+ rbm.vis_bias) .> rand(size(Xneg)...)
    end

    Ehp .= sigmoid(rbm.W * Xbatch .+ rbm.hid_bias)
    Ehn .= sigmoid(rbm.W * Xneg .+ rbm.hid_bias)

    rbm.W .+= lr .* (Ehp * Xbatch' .- Ehn * Xneg') ./ batch_size
    rbm.vis_bias .+= sum(lr .* (Xbatch .- Xneg), 2)[:] ./ batch_size
    rbm.hid_bias .+= sum(lr .* (Ehp .- Ehn), 2)[:] ./ batch_size
end

In [None]:
X_batch = X_train[:,1:25]
@benchmark vec_contrastive_divergence_K(X_batch, rbm, 1, 0.01)

In [None]:
# Silent variant of the vectorized training loop, meant to be wrapped in
# @benchmark: it prints nothing and takes no per-epoch timing. No timer is
# started because a `tic()` without its matching `toq()` would leave one
# timer behind per epoch. Mini-batches are consecutive column ranges of at
# most `batch_size` samples. Sets `rbm.trained` to true when done.
function vec_fit_CDK(X, rbm, batch_size::Integer,  n_epochs::Integer, K::Integer, lr::Real)

    n_samples = size(X, 2)
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    for epoch in 1:n_epochs
        for minibatch_ind in indicies
            Xbatch = @view X[:, minibatch_ind]
            vec_contrastive_divergence_K(Xbatch, rbm, K, lr)
        end
    end
    rbm.trained = true
end

In [None]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.01

@benchmark vec_fit_CDK(X_train, rbm, batch_size,  n_epochs, K, lr)

## Use the BLAS

In [None]:
?BLAS.gemm

In [None]:
?BLAS.gemm!

In [None]:
?BLAS.gemv!

In [None]:
?BLAS.ger!

### Julia wraps the BLAS routines in functions with readable names

In [None]:
?A_mul_Bt

In [None]:
?At_mul_B

In [None]:
?A_mul_B

#### Example

In [None]:
Hneg= rand(rbm.n_hid, batch_size);

In [None]:
(rbm.W' * Hneg)[1:3]

In [None]:
BLAS.gemm('T','N', Float64(1.0), rbm.W, Hneg)[1:3]

In [None]:
At_mul_B(rbm.W, Hneg)[1:3]

In [None]:
expand(:(rbm.W' * Hneg))

In [None]:
@benchmark Ac_mul_B(rbm.W,Hneg)

In [None]:
@benchmark At_mul_B(rbm.W,Hneg)

In [None]:
"""
    vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

Vectorized CD-`K` update of `rbm` from the mini-batch `Xbatch` (one sample per
column), spelling the transposed products with `At_mul_B` / `A_mul_Bt` instead
of materialising `W'`, `Xbatch'` and `Xneg'`.

Adds the batch-averaged, `lr`-scaled update in place to `rbm.W`,
`rbm.vis_bias` and `rbm.hid_bias`.
"""
function vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

    batch_size = size(Xbatch, 2)

    Hneg = zeros(rbm.n_hid, batch_size)
    Xneg = zeros(rbm.n_vis, batch_size)
    Ehp = zeros(rbm.n_hid, batch_size)
    Ehn = zeros(rbm.n_hid, batch_size)

    # Start the chain at the data instead of at an all-zero visible state.
    Xneg .= Xbatch

    for k in 1:K
        # Fixes the `Rbm.W` typo (undefined name). The plain product is used
        # for W * X: NOTE(review) Base appears to provide only the in-place
        # `A_mul_B!`, not `A_mul_B`; `*` already dispatches to BLAS here.
        # The uniform draws are one per unit and sample, not a shared `rand()`.
        Hneg .= sigmoid.(rbm.W * Xneg .+ rbm.hid_bias) .> rand(size(Hneg)...)
        Xneg .= sigmoid.(At_mul_B(rbm.W, Hneg) .+ rbm.vis_bias) .> rand(size(Xneg)...)
    end

    Ehp .= sigmoid.(rbm.W * Xbatch .+ rbm.hid_bias)
    Ehn .= sigmoid.(rbm.W * Xneg .+ rbm.hid_bias)

    rbm.W .+= lr .* (A_mul_Bt(Ehp, Xbatch) .- A_mul_Bt(Ehn, Xneg)) ./ batch_size
    rbm.vis_bias .+= sum(lr .* (Xbatch .- Xneg), 2)[:] ./ batch_size
    rbm.hid_bias .+= sum(lr .* (Ehp .- Ehn), 2)[:] ./ batch_size
end

In [None]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.01

@benchmark vec_fit_CDK(X_train, rbm, batch_size,  n_epochs, K, lr)

### Allocating memory inside

In [None]:
"""
    vec_contrastive_divergence_K(Xbatch, rbm, K, lr, Hneg, Xneg, Ehp, Ehn)

Vectorized CD-`K` update of `rbm` from the mini-batch `Xbatch` (one sample per
column) using caller-provided work buffers, so that a training loop can reuse
the same arrays for every mini-batch.

`Xneg` must have the shape of `Xbatch`; `Hneg`, `Ehp` and `Ehn` must have one
row per hidden unit and one column per sample. All four buffers are
overwritten; `Xbatch` is left untouched. The batch-averaged, `lr`-scaled update
is added in place to `rbm.W`, `rbm.vis_bias` and `rbm.hid_bias`.
"""
function vec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real, Hneg, Xneg, Ehp, Ehn)

    # Taken from the data: the function must not depend on a global
    # `batch_size` that happens to exist in the notebook session.
    batch_size = size(Xbatch, 2)

    # Copy the data into the chain buffer. Rebinding `Xneg = Xbatch` would drop
    # the buffer and make the sampling overwrite the batch itself, so data and
    # chain would be identical and every update would be zero.
    Xneg .= Xbatch

    for k in 1:K
        # One uniform draw per unit and sample, not a single shared `rand()`.
        Hneg .= sigmoid(rbm.W * Xneg .+ rbm.hid_bias) .> rand(size(Hneg)...)
        Xneg .= sigmoid(At_mul_B(rbm.W, Hneg) .+ rbm.vis_bias) .> rand(size(Xneg)...)
    end

    Ehp .= sigmoid(rbm.W * Xbatch .+ rbm.hid_bias)
    Ehn .= sigmoid(rbm.W * Xneg .+ rbm.hid_bias)

    rbm.W .+= lr .* (A_mul_Bt(Ehp, Xbatch) .- A_mul_Bt(Ehn, Xneg)) ./ batch_size
    rbm.vis_bias .+= sum(lr .* (Xbatch .- Xneg), 2)[:] ./ batch_size
    rbm.hid_bias .+= sum(lr .* (Ehp - Ehn), 2)[:] ./ batch_size
end

In [None]:
"""
    mem_vec_fit_CDK(X, rbm, batch_size, n_epochs, K, lr)

Train `rbm` on the columns of `X` with the buffer-reusing CD-`K` update.

The work buffers are allocated once, sized for a full mini-batch, and handed
to `vec_contrastive_divergence_K` for every mini-batch of that width. A
shorter final mini-batch gets its own correctly sized buffers. Prints the
number of mini-batches and the wall-clock time of every epoch, and sets
`rbm.trained` to true when done.
"""
function mem_vec_fit_CDK(X, rbm, batch_size::Integer,  n_epochs::Integer, K::Integer, lr::Real)

    n_samples = size(X, 2)
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    println("number minibatches:", length(indicies), "\n")

    # Width of a full mini-batch. Unlike `length(indicies[1])` this is also
    # defined when X has no columns at all.
    full_width = min(batch_size, n_samples)
    Hneg = zeros(rbm.n_hid, full_width)
    Xneg = zeros(rbm.n_vis, full_width)
    Ehp = zeros(rbm.n_hid, full_width)
    Ehn = zeros(rbm.n_hid, full_width)

    for epoch in 1:n_epochs
        seconds = @elapsed for minibatch_ind in indicies
            width = length(minibatch_ind)
            # `X[:, minibatch_ind]` is a copy, so the update step can never
            # write into the training data.
            if width == full_width
                # Reset in place; `buf .= zero(buf)` would allocate a fresh
                # array on every mini-batch.
                fill!(Hneg, 0)
                fill!(Xneg, 0)
                fill!(Ehp, 0)
                fill!(Ehn, 0)
                vec_contrastive_divergence_K(X[:, minibatch_ind], rbm, K, lr,
                                             Hneg, Xneg, Ehp, Ehn)
            else
                # Short final mini-batch: the full-width buffers do not fit.
                vec_contrastive_divergence_K(X[:, minibatch_ind], rbm, K, lr,
                                             zeros(rbm.n_hid, width),
                                             zeros(rbm.n_vis, width),
                                             zeros(rbm.n_hid, width),
                                             zeros(rbm.n_hid, width))
            end
        end
        print("\n\nepoch ", epoch, "  time epoch:", seconds, "\n")
    end
    rbm.trained = true
end

In [None]:
n_epochs = 1
batch_size = 1000
K = 1
lr = 0.01

@benchmark mem_vec_fit_CDK(X_train, rbm, batch_size,  n_epochs, K, lr)

In [None]:
?zeros


# Optimized vectorial

Call BLAS directly so that the transposed products are computed without materialising the transposes.

- https://discourse.julialang.org/t/blas-performance-issues-for-common-neural-network-patterns/565

In [None]:
?BLAS.gemv

In [None]:
?BLAS.gemv!

In [None]:
Xbatch = X_train[:,1:25]
Xneg = copy(Xbatch)
Hneg = sigmoid(rbm.W * Xneg .+ rbm.hid_bias);

In [None]:
?BLAS.gemm

In [None]:
(rbm.W' * Hneg)[1:3]

In [None]:
BLAS.gemm('T','N', Float64(1.0), rbm.W, Hneg)[1:3]

In [None]:
@benchmark rbm.W' * Hneg

In [None]:
@benchmark BLAS.gemm('T','N', Float64(1.0), rbm.W, Hneg)

In [None]:
[3,4,5] .> rand()

In [None]:
rand()

In [None]:
T = Float32
"""
    optvec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

Vectorized CD-`K` update of `rbm` from the mini-batch `Xbatch` (one sample per
column) that computes `W' * Hneg` through `BLAS.gemm` with the transpose flag
instead of building `W'`.

Adds the batch-averaged, `lr`-scaled update in place to `rbm.W`,
`rbm.vis_bias` and `rbm.hid_bias`.
"""
function optvec_contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)

    batch_size = size(Xbatch, 2)

    # The chain states are kept in Float64 arrays: writing the thresholded
    # samples into them (rather than keeping the BitArray the comparison
    # yields) is what lets them be passed to BLAS.gemm.
    Hneg = zeros(rbm.n_hid, batch_size)
    Xneg = zeros(rbm.n_vis, batch_size)
    # Start the chain at the data instead of at an all-zero visible state.
    Xneg .= Xbatch

    for k in 1:K
        # One uniform draw per unit and sample, not a single shared `rand()`.
        Hneg .= sigmoid(rbm.W * Xneg .+ rbm.hid_bias) .> rand(size(Hneg)...)
        Xneg .= sigmoid(BLAS.gemm('T', 'N', Float64(1.0), rbm.W, Hneg) .+ rbm.vis_bias) .>
                rand(size(Xneg)...)
    end

    Ehp = sigmoid(rbm.W * Xbatch .+ rbm.hid_bias)
    Ehn = sigmoid(rbm.W * Xneg .+ rbm.hid_bias)

    Delta_W = lr * (Ehp * Xbatch' - Ehn * Xneg')
    Delta_vis_bias = sum(lr .* (Xbatch .- Xneg), 2)[:]
    Delta_hid_bias = sum(lr .* (Ehp - Ehn), 2)[:]

    rbm.W .+= Delta_W ./ batch_size
    rbm.vis_bias .+= Delta_vis_bias ./ batch_size
    rbm.hid_bias .+= Delta_hid_bias ./ batch_size
end

In [None]:
X_batch = X_train[:,1:25]
@benchmark optvec_contrastive_divergence_K(X_batch, rbm, 1, 0.01)

In [None]:
# Mini-batch training loop with per-epoch timing output. Mini-batches are
# consecutive column ranges of `X` holding at most `batch_size` samples.
# Sets `rbm.trained` to true when done.
# NOTE(review): this calls vec_contrastive_divergence_K, not the
# optvec_contrastive_divergence_K defined in this section -- confirm that is
# the intended update step.
function vec_fit_CDK(X, rbm, batch_size::Integer,  n_epochs::Integer, K::Integer, lr::Real)

    n_samples = size(X, 2)
    batch_starts = 1:batch_size:n_samples
    indicies = [lo:min(lo + batch_size - 1, n_samples) for lo in batch_starts]
    println("number minibatches:", length(indicies), "\n")

    for epoch in 1:n_epochs
        seconds = @elapsed for cols in indicies
            vec_contrastive_divergence_K(view(X, :, cols), rbm, K, lr)
        end
        print("\n\nepoch ", epoch, "  time epoch:", seconds, "\n")
    end
    rbm.trained = true
end

In [None]:
expand(:(Delta_W .+= lr * ( x * ehp' - xneg * ehn')'))

In [None]:
?A_mul_Bc