In [26]:
# Import Distributions to generate random numbers W matrix of the RBM
using Distributions
using MNIST
using BenchmarkTools
using Combinatorics
#using PyPlot

In [27]:
X_train, y_train = MNIST.traindata()
X_test, y_test = MNIST.testdata()

T = Float32
X_train = Array{T}( (X_train - minimum(X_train))/(maximum(X_train) - minimum(X_train)) )
y_train = Array{T}(y_train)
X_test = Array{T}(X_test - minimum(X_test))/(maximum(X_test) - minimum(X_test)) 
y_test = Array{T}(y_test);

#### Speed function return arrays vs modify arrays

In [28]:
function addstuff(a, b)
    return a .+1, b .+1
end

function addstuff!(a,b)
    a.= a .+ 1
    b.= b .+ 1
end

addstuff! (generic function with 1 method)

In [29]:
A = zeros(10);
B = zeros(20);

In [30]:
@benchmark res = addstuff(A,B)

BenchmarkTools.Trial: 
  memory estimate:  432 bytes
  allocs estimate:  3
  --------------
  minimum time:     115.934 ns (0.00% GC)
  median time:      133.447 ns (0.00% GC)
  mean time:        176.786 ns (14.76% GC)
  maximum time:     2.709 μs (84.79% GC)
  --------------
  samples:          10000
  evals/sample:     927
  time tolerance:   5.00%
  memory tolerance: 1.00%

In [31]:
@benchmark addstuff!(A,B)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     50.119 ns (0.00% GC)
  median time:      53.290 ns (0.00% GC)
  mean time:        63.692 ns (0.00% GC)
  maximum time:     1.734 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     986
  time tolerance:   5.00%
  memory tolerance: 1.00%

#### Make RBM with efficient optimizer

In [34]:
function sigmoid(x::Float32)
    return 1/(1 + exp(-x))
end

type RBM{T <: Real}
    n_vis::Int
    n_hid::Int
    W::Matrix{T}         
    vis_bias::Vector{T}     
    hid_bias::Vector{T}   
    trained::Bool
    n_epochs_trained::Int
end

function initialize_RBM(n_vis, n_hid, sigma, T)
    
    return RBM{T}(n_vis,                                 # num visible units 
                  n_hid,                                 # num hidden unnits
                  rand(Normal(0,sigma), n_hid, n_vis),  # weight matrix
                  zeros(n_vis),                          # visible vector  
                  zeros(n_hid),                          # Hidden vector
                  false,0)                                 # trained
end

function Base.show{T}(io::IO, rbm::RBM{T})
    n_vis = size(rbm.vis_bias, 1)
    n_hid = size(rbm.hid_bias, 1)
    trained = rbm.trained
    print(io, "RBM{$T}(n_vis=$n_vis, n_hid=$n_hid, trained=$trained)")
end

function generate_M(W, n_columns)

    n_hid = size(W)[1]
    n_vis = size(W)[2]
    costat = Int(sqrt(n_vis))    
    n_rows = Int(round(n_hid/n_columns))   

    print("\ncostat: ",costat,
          "\nn_rows: ", n_rows,
          "\nn_cols: ", n_columns)
    
    M = zeros(costat * n_rows, costat * n_columns)
    
    n_im = 1
    for r in 1:costat:size(M)[1]
        for c in 1:costat:size(M)[2]
            M[r:r+costat-1, c:c+costat-1] = reshape(W[n_im,:],costat,costat)
            n_im +=1
        end
    end
    return M
end

#    # Placeholders needed for the gradients of the parameters of the RBM
#    grad_W::Matrix{T}         
#    grad_vis_bias::Vector{T}     
#    grad_hid_bias::Vector{T}   
#    
#    # Placeholders needed for performing CDK in a minibatch
#    #V::Matrix{T}
#    H::Matrix{T}
#    V_hat::Matrix{T}
#    H_hat::Matrix{T}

type CDK{T}
    K::Int
    batch_size::Int
    
    # Placeholders needed for the gradients of the parameters of the RBM
    grad_W::Matrix{T}         
    grad_vis_bias::Vector{T}     
    grad_hid_bias::Vector{T}   
    
    # Placeholders needed for performing CDK in a minibatch
    H::Matrix{T}
    V_hat::Matrix{T}
    H_hat::Matrix{T}
    rec_error::Float64

    # Placeholders needed for performing sampling in a minibatch
    V_sampling::Matrix{T}
    H_sampling::Matrix{T}    
    
end

function initialize_CDK(rbm::RBM, K, batch_size)
    T = eltype(rbm.vis_bias)
    
    grad_W = zeros(T, rbm.W)
    grad_vis_bias = zeros(T, rbm.vis_bias)
    grad_hid_bias = zeros(T, rbm.hid_bias)
    V_hat = zeros(T, rbm.n_vis, batch_size)
    H_hat = zeros(T, rbm.n_hid, batch_size)
    H = zeros(T, rbm.n_hid, batch_size)
    V_sampling = zeros(T, rbm.n_vis, batch_size)
    H_sampling = zeros(T, rbm.n_hid, batch_size)
    
    cdk = CDK(K, batch_size, 
              grad_W, grad_vis_bias,grad_hid_bias,
              H, V_hat, H_hat, 0.,
              V_sampling, H_sampling)
    
    return cdk
end

function fit!(rbm::RBM, X::Matrix, batch_size::Integer, n_epochs::Integer, lr::Real, shuffle_data::Bool, opt)
        
    T = eltype(X)
    lr = T(lr)
    n_samples = size(X)[2]
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]
    sample_perm = Vector(1:n_samples)
    n_minibatches = T(length(indicies))
    rec_errors = Vector{T}([])
        
    ###### Initialize Optimizer, CDK, PCDK, ....#######
    #cdk = initialize_CDK(rbm, K, batch_size)  
    
    for epoch in 1:n_epochs
        rec_error = Float32(0.)
        
        # should  it be more efficient to Shuffle indicies not the whole data?
        # then access is not contiguous though
        if shuffle_data==true
            shuffle!(sample_perm)
            X .= X[:,sample_perm]
        end
        
        for minibatch_ind in indicies          
            partial_fit!(rbm, X[:, minibatch_ind], lr, opt)
            rec_error += opt.rec_error
        end
        
        push!(rec_errors, rec_error/n_minibatches)
        rbm.n_epochs_trained +=1
        print(rec_errors[end], "\n")
    end
    rbm.trained = true
    return rec_errors
end

function partial_fit!(rbm::RBM, X::Matrix,  lr::Real, opt::CDK)
    #lr = eltype(rbm.vis_bias)(lr)
    compute_grad!(rbm, X, opt)
    update_params!(rbm, opt, lr)    
end


# function grad_apply_momentum!{T}(rbm::RBM{T}, X::Mat,
#                                  dtheta::Tuple, ctx::Dict)
#     dW, db, dc = dtheta
#     momentum = @get(ctx, :momentum, 0.9)
#     dW_prev = @get_array(ctx, :dW_prev, size(dW), zeros(T, size(dW)))
#     # same as: dW += momentum * dW_prev
#     axpy!(momentum, dW_prev, dW)
# end

partial_fit! (generic function with 1 method)

In [35]:
function compute_grad!(rbm::RBM, X::Matrix,  opt::CDK)

    T = eltype(rbm.vis_bias)
    batch_size = size(X)[2]
    
    # Add code here that copies some instances of X to make it size of the "chosen minibatch"
    # in case the last minibatch has not the same size as all other minibatches. 
    # We want to do this to avoid destroying all the memory allocations in opt and creating them again

    # Perform gibbs sampling to compute the negative phase
    for k in 1:opt.K
        if k==1       
            opt.H .= sigmoid.(rbm.W * X .+ rbm.hid_bias)
            opt.V_hat .= sigmoid.(rbm.W'* opt.H .+ rbm.vis_bias) .> rand(T,rbm.n_vis, batch_size)
            opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) 
        else
            opt.V_hat .= sigmoid.(rbm.W'* opt.H_hat .+ rbm.vis_bias) .> rand(T,rbm.n_vis, batch_size)
            opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) 
        end      
        ## This does not impact that much
        #opt.V_sampling .= rand(T, rbm.n_vis, batch_size)
        #opt.H_sampling .= rand(T, rbm.n_hid, batch_size)
        #
        #if k==1       
        #    opt.H .= sigmoid.(rbm.W * X .+ rbm.hid_bias)
        #    opt.V_hat .= sigmoid.(rbm.W'* opt.H .+ rbm.vis_bias) .> opt.V_sampling
        #    opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) .> opt.H_sampling
        #else
        #    opt.V_hat .= sigmoid.(rbm.W'* opt.H_hat .+ rbm.vis_bias) .> opt.V_sampling
        #    opt.H_hat .= sigmoid.(rbm.W * opt.V_hat .+ rbm.hid_bias) .> opt.H_sampling
        #end               
    end   
   
    # WORKS
    opt.grad_W = (opt.H * X' .-  opt.H_hat * opt.V_hat')./ batch_size; 
    opt.grad_vis_bias = vec(sum((X .- opt.V_hat), 2))./ batch_size;
    opt.grad_hid_bias = vec(sum((opt.H .- opt.H_hat), 2))./ batch_size;
    
    # DOES NOT WORK (with .=) WHY?????
    #opt.grad_W .=  (opt.H * X' .-  opt.H_hat * opt.V_hat')./ batch_size; 
    #opt.grad_vis_bias .= vec(sum((X .- opt.V_hat), 2))./ batch_size;
    #opt.grad_hid_bias .= vec(sum((opt.H .- opt.H_hat), 2))./ batch_size;

    opt.rec_error = sqrt(sum((X.-opt.V_hat).^2))
end

function update_params!(rbm::RBM, opt::CDK, lr)
    rbm.W .+= lr .* opt.grad_W 
    rbm.vis_bias .+= lr .* opt.grad_vis_bias
    rbm.hid_bias .+= lr .* opt.grad_hid_bias
end

update_params! (generic function with 1 method)

In [36]:
rbm = initialize_RBM(784, 20, 0.01, Float32);
#cdk = initialize_CDK(rbm, 1, 500);

In [37]:
initialize_CDK(rbm, 1, 500)

LoadError: [91mMethodError: Cannot `convert` an object of type Array{Float32,2} to an object of type Int64
This may have arisen from a call to the constructor Int64(...),
since type constructors fall back to convert methods.[39m

In [15]:
@time partial_fit!(rbm, X_train[:,1:500], 0.1, cdk);

LoadError: [91mUndefVarError: cdk not defined[39m

In [14]:
# function partial_fit!(rbm::RBM, X::Matrix, K::Integer, lr::Real, optimizer::CDK)
@benchmark partial_fit!(rbm, X_train[:,1:500], 0.1, cdk)

LoadError: [91mUndefVarError: cdk not defined[39m

In [None]:
n_epochs = 1
batch_size = 500
K = 1
lr = 0.05

@benchmark fit!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

In [None]:
#This was with the update inside!
n_epochs = 1
batch_size = 500
K = 1
lr = 0.05

@benchmark fit!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

In [None]:
n_epochs = 1
batch_size = 500
K = 2
lr = 0.001

@benchmark fit!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

In [None]:
n_epochs = 1
batch_size = 500
K = 3
lr = 0.001

@benchmark fit!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

In [None]:
n_epochs = 1
batch_size = 500
K = 10
lr = 0.001

@benchmark fit!(rbm, X_train, batch_size,  n_epochs, lr, false, cdk)

In [None]:
n_epochs = 10
batch_size = 500
K = 2
lr = 0.001

res = fit!(rbm, X_train, batch_size,  n_epochs, K, lr, T, true)

In [None]:
PyPlot.plot(res)

In [None]:
rbm = initialize_RBM(784, 20, 0.01, Float32)

n_epochs = 40
batch_size = 500
K = 3
lr = 0.001

res = fit!(rbm, X_train, batch_size,  n_epochs, K, lr, T, true)

In [None]:
PyPlot.plot(res)

In [None]:
for i in 1:1
    print("HI")
end

In [None]:
M = generate_M(rbm.W,25);
PyPlot.imshow(M,"gray")

In [None]:
M = generate_M(rbm.W, 10);

PyPlot.imshow(M,"gray")

In [None]:
Hneg = zeros(rbm.n_hid, 10);

In [None]:
Hneg .= sigmoid.(rbm.W * X_train[:,1:10] .+ rbm.hid_bias) .> rand(rbm.n_hid, 10);

In [None]:
sum(Hneg)

In [None]:
rand()

### Function to plot the weights

In [None]:
# https://github.com/dfdx/Boltzmann.jl/blob/master/examples/mnistexample.jl

function plot_weights(W, imsize)
    padding=10
    h, w = imsize
    n = size(W, 1)
    rows = Int(floor(sqrt(n)))
    cols = Int(ceil(n / rows))
    halfpad = div(padding, 2)
    dat = zeros(rows * (h + padding), cols * (w + padding))
    for i=1:n
        wt = W[i, :]
        wim = reshape(wt, imsize)
        wim = wim ./ (maximum(wim) - minimum(wim))
        r = div(i - 1, cols) + 1
        c = rem(i - 1, cols) + 1
        dat[(r-1)*(h+padding)+halfpad+1 : r*(h+padding)-halfpad,
            (c-1)*(w+padding)+halfpad+1 : c*(w+padding)-halfpad] = wim
    end
    #ImageView.view(dat)
    return dat
end

In [None]:
rbm = initialize_RBM(784, 50, 0.01, Float32)
M = generate_M(rbm.W,25);
PyPlot.imshow(M,"gray")

## Train model several epochs 

Plot the weights after training

In [None]:
rbm = initialize_RBM(784, 50, 0.01, Float32)
M = generate_M(rbm.W,25);
PyPlot.imshow(M,"gray")

In [None]:
n_epochs = 100
batch_size = 200
K = 1
lr = 0.01

fit!(rbm, X_train,  batch_size,  n_epochs, K, lr, T, true)

In [None]:
M = generate_M(rbm.W, 10);
PyPlot.imshow(M,"gray")

In [None]:
PyPlot.imshow(M,"gray", vmin=minimum(M), vmax = maximum(M))

In [None]:
dat = plot_weights(rbm.W[1:50,:], (28,28));
PyPlot.imshow(dat,"gray")

## Scaling individually the features of the plot: TO DO

#### Python code for printing the feature detectors


    for i, comp in enumerate(self.W.T):
        plt.subplot(15, 15, i + 1)
        if min_max_scale:
            plt.imshow(comp.reshape((28, 28)),
                       cmap= plt.get_cmap('gray'), vmin=min_, vmax=max_)

In [None]:
function plot_per_hidden_unit(W, n_columns, minmax_scale)
    n_hid = size(W)[1]
    n_vis = size(W)[2]
    costat = Int(sqrt(n_vis))    
    n_rows = Int(round(n_hid/n_columns))   

    print("\ncostat: ", costat,
          "\nn_rows: ", n_rows,
          "\nn_cols: ", n_columns)
    
    if minmax_scale == true
        min_ = minimum(W)
        max_ = maximum(W)
    end
                    
    for i in 1:n_hid
        comp = W[i,:]
        PyPlot.subplot(10, 10, i+1)
        if minmax_scale == true
            PyPlot.imshow(reshape(comp, 28, 28),
            cmap= PyPlot.get_cmap("gray"), vmin=min_, vmax=max_)
        end
    end
    
end

In [None]:
plot_per_hidden_unit(rbm.W, rbm.n_hid, true)