# RBM and CRBM

Objective: Implement CRBM in Julia for time series analysis

In [1]:
# Import Distributions to generate the W matrix of the RBM
using Distributions

In [2]:
# Import MNIST dataset for experimenting
using MNIST

In [3]:
X_train, y_train = MNIST.traindata()
X_train_rows = X_train';
X_train_rows = X_train_rows[1:42000,:]

X_test, y_test = MNIST.testdata()
X_test_rows = X_test';


In [4]:
# Data
size(X_train_rows), size(y_train)

((42000,784),(60000,))

In [5]:
# Targets
size(X_test_rows), size(y_test)

((10000,784),(10000,))

### Define a type RBM

In [6]:
type RBM_rows{T <: Real}
    W::Matrix{T}         
    vis_bias::Vector{T}     
    hid_bias::Vector{T}   
    n_vis::Int32
    n_hid::Int32
    trained::Bool
end

The following function allow us to define what will be printed once we type RBM in our julia terminal (or notebook)

In [7]:
function Base.show{T}(io::IO, rbm::RBM_rows{T})
    n_vis = size(rbm.vis_bias, 1)
    n_hid = size(rbm.hid_bias, 1)
    trained = rbm.trained
    print(io, "RBM{$T}(n_vis=$n_vis, n_hid=$n_hid, trained=$trained)")
end

The following function is used to instanciate an RBM

In [8]:
function initializeRBM_rows(n_vis::Int64, n_hid::Int64; sigma=0.01, T=Float64)
    
    return RBM_rows{T}( rand(Normal(0,sigma),(n_vis, n_hid)),  # weight matrix
                   zeros(n_vis),                          # visible vector  
                   zeros(n_hid),                          # Hidden vector
                   n_vis,                                 # num visible units 
                   n_hid,                                 # num hidden unnits
                   false)                                 # trained


end


initializeRBM_rows (generic function with 1 method)

In [9]:
rbm = initializeRBM_rows(784, 225)

RBM{Float64}(n_vis=784, n_hid=225, trained=false)

In [10]:
size(rbm.vis_bias), size(rbm.hid_bias), size(rbm.W)

((784,),(225,),(784,225))

In [11]:
size(rbm.W), size(X_train[1:5,:])

((784,225),(5,60000))

### Train and RBM


    def update_CDK(self, 
                   Xbatch, 
                   lr=0.1,
                   K=1):

        batch_size = Xbatch.shape[0]

        Delta_W = 0
        Delta_b = 0
        Delta_c = 0

        for x in Xbatch:
            xneg = x
        
            for k in range(0, K):
                hneg = sig( npdot(xneg, self.W) + self.c) > np.random.random(self.hidden_dim).astype(np.float32)
                xneg = sig( npdot(hneg, self.W.T) + self.b) > np.random.random(self.visible_dim).astype(np.float32)
        
            ehp = sig( npdot(x, self.W) + self.c )
            ehn = sig( npdot(xneg, self.W) + self.c)

            Delta_W += lr * (np_outer(x, ehp) - np_outer(xneg, ehn))
            Delta_b += lr * (x - xneg)
            Delta_c += lr * (ehp - ehn)

        self.W += Delta_W * (1. / batch_size)
        self.b += Delta_b * (1. / batch_size)
        self.c += Delta_c * (1. / batch_size)

In [12]:
function sigmoid(vector::Array{Float64})
    return 1./(1 + e.^(-vector))
end

sigmoid (generic function with 1 method)

# Thinking the data as rows (Julia stores it as columns)

In [13]:
xneg=rand(1,rbm.n_vis)

1×784 Array{Float64,2}:
 0.103239  0.876587  0.465789  0.423347  …  0.95656  0.932866  0.536371

In [14]:
hneg= sigmoid( xneg * rbm.W .+ rbm.hid_bias') .> rand(1,rbm.n_hid)

1×225 BitArray{2}:
 false  false  true  true  true  …  true  false  true  true  true  true

In [15]:
sigmoid(xneg * rbm.W + rbm.hid_bias')

1×225 Array{Float64,2}:
 0.483447  0.530851  0.510552  0.489585  …  0.450777  0.505512  0.448985

In [16]:
sigmoid(xneg * rbm.W + rbm.hid_bias')

1×225 Array{Float64,2}:
 0.483447  0.530851  0.510552  0.489585  …  0.450777  0.505512  0.448985

In [36]:
function contrastive_divergence_rows_K(Xbatch, rbm, K::Int32, lr::Float32)
        
    batch_size = size(Xbatch)[1]

    Delta_W = zeros(rbm.W)
    Delta_b = zeros(rbm.vis_bias)
    Delta_c = zeros(rbm.hid_bias)

    for i in 1:batch_size
        x =  Xbatch[i:i,:]
        xneg = Xbatch[i:i,:]

        for k in 1:K
            hneg = sigmoid( xneg * rbm.W .+ rbm.hid_bias') .> rand(1,rbm.n_hid)
            xneg = sigmoid( hneg * rbm.W' .+ rbm.vis_bias') .> rand(1,rbm.n_vis)
        end

        ehp = sigmoid(x * rbm.W + rbm.hid_bias')
        ehn = sigmoid(xneg * rbm.W + rbm.hid_bias')
        
        Delta_W += lr * (kron(x, ehp') - kron(xneg, ehn'))'
        Delta_b += lr * (x - xneg)'
        Delta_c += lr * (ehp - ehn)'
    end
    
    rbm.W += Delta_W / batch_size;
    rbm.vis_bias += vec(Delta_b / batch_size);
    rbm.hid_bias += vec(Delta_c / batch_size);
    
    return 
end

contrastive_divergence_rows_K (generic function with 2 methods)

In [37]:
xneg = X_train_rows[1:1,:]
xneg * rbm.W

1×225 Array{Float64,2}:
 -4.07622  -20.9677  49.1053  -67.0958  …  -36.5505  6.08238  -23.6831

In [38]:
sigmoid( hneg * rbm.W' .+ rbm.vis_bias') .> rand(rbm.n_vis)'

1×784 BitArray{2}:
 true  false  false  false  false  true  …  false  true  true  false  false

In [39]:
xneg * rbm.W +rbm.hid_bias'

1×225 Array{Float64,2}:
 -4.07622  -20.9677  49.1053  -67.0958  …  -36.5505  6.08238  -23.6831

In [40]:
print(rbm.W[1,1], rbm.vis_bias[1])

-0.0058656614296453550.0

In [41]:
size(rbm.W)

(784,225)

In [51]:
X_batch_rows = X_train_rows[1:25,:];

In [43]:
size(X_batch_rows)

(25,784)

In [44]:
X_batch_rows = Matrix{Float32}(X_train_rows[1:25,:]);

In [52]:
typeof(X_batch_rows)

Array{Float64,2}

In [48]:
rbm

RBM{Float64}(n_vis=784, n_hid=225, trained=false)

In [49]:
contrastive_divergence_rows_K

contrastive_divergence_rows_K (generic function with 2 methods)

In [53]:
@time contrastive_divergence_rows_K(X_batch_rows, rbm, 1, 0.01)

LoadError: LoadError: MethodError: no method matching contrastive_divergence_rows_K(::Array{Float64,2}, ::RBM_rows{Float64}, ::Int64, ::Float64)
Closest candidates are:
  contrastive_divergence_rows_K(::Any, ::Any, !Matched::Int32, !Matched::Float32) at In[36]:3
  contrastive_divergence_rows_K(!Matched::Array{Float32,2}, ::Any, !Matched::Int32, !Matched::Float32) at In[17]:3
while loading In[53], in expression starting on line 184

#### Generate indicies for the minibatches

In [None]:
X = rand(1000, 784)

batch_size=300; 
n_samples=1000

indicies = [x:min(x+batch_size-1,n_samples) for x in 1:batch_size:n_samples]

In [None]:
size(X[indicies[end-1],:])

In [None]:
size(X[indicies[end],:])

In [None]:
function fit_CDK_rows(X, rbm, batch_size::Integer, n_epochs::Integer, K::Integer, lr::Real)
        
    n_samples = size(X)[1]
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]

    for epoch in 1:n_epochs
        tic();
        for minibatch_ind in indicies
            contrastive_divergence_K(X[minibatch_ind, :], rbm, K, lr)
        end
        print("\nepoch ", epoch, "  time epoch:", toq())
        
    end
    rbm.trained = true
end

In [None]:
size(X_train_rows)

In [None]:
size(X)

In [None]:
size(X_batch_rows)

In [None]:
size(X_train_rows[1:1000, :])

In [None]:
n_samples = size(X_train)[1]
batch_size = 200
indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]

In [None]:
@time contrastive_divergence_K(X_train_rows[1:1000, :], rbm, 1, 0.01)

In [None]:
size(X_train_rows)

In [None]:
n_epochs = 1
batch_size = 200
K = 1
lr = 0.01

fit_CDK_rows(X_train_rows, rbm, batch_size,  n_epochs, K, lr)

# Thinking the data as columns


In [None]:
type RBM_col{T <: Real}
    W::Matrix{T}         
    vis_bias::Vector{T}     
    hid_bias::Vector{T}   
    n_vis::Int32
    n_hid::Int32
    trained::Bool
end

In [None]:
function Base.show{T}(io::IO, rbm::RBM_col{T})
    n_vis = size(rbm.vis_bias, 1)
    n_hid = size(rbm.hid_bias, 1)
    trained = rbm.trained
    print(io, "RBM{$T}(n_vis=$n_vis, n_hid=$n_hid, trained=$trained)")
end

In [None]:
function sigmoid(vector::Array{Float64})
    return 1./(1 + e.^(-vector))
end

In [None]:
function initialize_RBM_col(n_vis::Int64, n_hid::Int64; sigma=0.01, T=Float64)
    
    return RBM_col{T}( rand(Normal(0,sigma),(n_hid,n_vis)),  # weight matrix
                   zeros(n_vis),                          # visible vector  
                   zeros(n_hid),                          # Hidden vector
                   n_vis,                                 # num visible units 
                   n_hid,                                 # num hidden unnits
                   false)                                 # trained


end

In [None]:
rbm = initialize_RBM_col(784, 225)

In [None]:
size(rbm.W)

In [None]:
X_train_col, y_train = MNIST.traindata()
X_test_col, y_test = MNIST.testdata()

In [None]:
size(X_train_col)

In [None]:
size(rbm.W)

In [None]:
X_batch = X_train_col[:,1:25];
size(X_batch)

In [None]:
size(rbm.W * X_batch .+ rbm.hid_bias)

In [None]:
function contrastive_divergence_K(Xbatch, rbm, K::Integer, lr::Real)
        
    batch_size = size(Xbatch)[2]

    Delta_W = zeros(size(rbm.W))
    Delta_b = zeros(size(rbm.vis_bias))
    Delta_c = zeros(size(rbm.hid_bias))

    for i in 1:batch_size
        x =  Xbatch[:,i]
        xneg = Xbatch[:,i]

        for k in 1:K
            hneg = sigmoid( rbm.W * xneg .+ rbm.hid_bias) .> rand(rbm.n_hid)
            xneg = sigmoid( rbm.W' * hneg .+ rbm.vis_bias) .> rand(rbm.n_vis)
        end

        ehp = sigmoid(rbm.W * x + rbm.hid_bias)
        ehn = sigmoid(rbm.W * xneg + rbm.hid_bias)
     
        Delta_W += lr * (kron(x, ehp') - kron(xneg, ehn'))'
        Delta_b += lr * (x - xneg)
        Delta_c += lr * (ehp - ehn)

    end

    rbm.W += Delta_W / batch_size;
    rbm.vis_bias += Delta_b / batch_size;
    rbm.hid_bias += Delta_c / batch_size;
    
    return 
end

In [None]:
X_batch = X_train[:,1:25];
size(X_batch)

In [None]:
size(X_batch)[2]

In [None]:
xneg = X_batch[:,1];
size(xneg)

In [None]:
hneg = sigmoid( rbm.W * xneg .+ rbm.hid_bias) .> rand(rbm.n_hid);
size(hneg)

In [None]:
xneg = sigmoid(  rbm.W' *hneg  .+ rbm.vis_bias) .> rand(rbm.n_vis)
size(xneg)

In [None]:
size(rbm.W)

In [None]:
@time contrastive_divergence_K(X_batch, rbm, 1, 0.01)

In [None]:
size(X_train)[2]

In [None]:
function fit_CDK(X, rbm, batch_size::Integer,  n_epochs::Integer, K::Integer, lr::Real)
        
    n_samples = size(X)[2]
    indicies = [x:min(x + batch_size-1, n_samples) for x in 1:batch_size:n_samples]

    for epoch in 1:n_epochs
        tic();
        for minibatch_ind in indicies
            contrastive_divergence_K(X[:, minibatch_ind], rbm, K, lr)
        end
        print("\nepoch ", epoch, "  time epoch:", toq())
    end
    rbm.trained = true
end

In [None]:
n_epochs = 3
batch_size = 200
K = 1
lr = 0.01

fit_CDK(X_train, rbm, batch_size,  n_epochs, K, lr)