## Toy NNets for educational purposes in Julia

Interesting discussions on how to implement the forward pass efficiently using BLAS:

- https://discourse.julialang.org/t/blas-performance-issues-for-common-neural-network-patterns/565

- http://int8.io/neural-networks-in-julia-hyperbolic-tangent-and-relu/

- http://int8.io/backpropagation-from-scratch-in-julia-part-ii-derivation-and-implementation/

- http://www.breloff.com/JuliaML-and-Plots/

- https://github.com/JuliaQuant/OnlineAI.jl

Many machine learning packages in Julia

- https://github.com/svaksha/Julia.jl/blob/master/AI.md#hmm

#### Add the following packages before executing this notebook

- Pkg.add("MNIST")


In [6]:
using LossFunctions
#Pkg.add("LossFunctions")

[1m[36mINFO: Precompiling module LossFunctions.
[0m

In [7]:
using MNIST

In [8]:
train = MNIST.traindata()

(
[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

[5.0,0.0,4.0,1.0,9.0,2.0,1.0,3.0,1.0,4.0  …  9.0,2.0,9.0,5.0,1.0,8.0,3.0,5.0,6.0,8.0])

In [9]:
# `train` is a (features, labels) tuple; the labels arrive as floats, so they are
# converted to Int32 here.
X_train = train[1];
y_train = Vector{Int32}(train[2]);

In [10]:
sort(unique(y_train))

10-element Array{Int32,1}:
 0
 1
 2
 3
 4
 5
 6
 7
 8
 9

## Defining Linear layer and relu layer

In [11]:
# Element type and sizes of a single hidden layer: 784 input features
# (rows of X_train) mapped to 500 hidden units.
T = Float32
n_visible = 784
n_hidden = 500

# Fix the seed so the random weights are reproducible between runs.
srand(1234)
W1 = randn(T, n_hidden, n_visible );
# Rescale the weights so that W1 has unit norm.
W1 = W1/norm(W1)
# Allocate the bias with the same element type as W1; a bare `zeros(n)` is Float64.
b = zeros(T, n_hidden);

In [12]:
size(W1)

(500,784)

In [13]:
size(X_train[:,1:10])

(784,10)

In [14]:
#linear layer
batch = W1 * X_train[:,1:3].+b

#relu
l1_batch = batch .* (batch .>0);

In [15]:
exp.(batch)./sum(exp.(batch),1);

In [19]:
sum(exp.(l1_batch),1)

1×3 Array{Float64,2}:
 3.69229e74  3.80362e69  4.13172e48

In [22]:
exp.(l1_batch)./sum(exp.(l1_batch),1);

#### Defining layers

In [23]:
"""
    LinearLayer{T}(input, output; seed=1234)

Fully connected (affine) layer whose parameters have element type `T`.

For a batch that stores one sample per column, the output of this layer is meant
to be the matrix product `W * X` plus the bias `b`.

# Fields
- `input_dim`: number of input features (columns of `W`).
- `output_dim`: number of output units (rows of `W`, length of `b`).
- `W`: `output_dim × input_dim` weights, drawn with `randn` and divided by
  `sqrt(input_dim)`.
- `b`: bias vector, initialised to zero.
- `seed`: seed handed to `srand` before the weights are drawn.

Note that the constructor reseeds the global random number generator.
"""
type LinearLayer{T}
    input_dim::Int
    output_dim::Int
    W::Matrix{T}
    b::Vector{T}
    seed::Int

    function LinearLayer(input, output; seed=1234)
        srand(seed)
        weights = randn(T, output, input) / sqrt(input)
        # Every field is passed explicitly: trailing fields omitted from `new`
        # are left undefined, which is what used to happen to `seed`.
        return new(input, output, weights, zeros(T, output), seed)
    end
end

# Compact one-line display. `T` is bound as a method type parameter, so the method
# applies to every `LinearLayer{T}` and prints that layer's own element type
# instead of depending on whatever global `T` happens to be defined.
Base.show{T}(io::IO, l::LinearLayer{T}) =
    print(io,"LinearLayer{$T} [input_dim: $(l.input_dim), ouput_dim: $(l.output_dim)]")

In [24]:
# Build a linear layer with Float32 weights that maps 784 inputs to 500 outputs.
input_dim = 784
output_dim = 500
l = LinearLayer{Float32}(input_dim,output_dim)

LinearLayer{Float32} [input_dim: 784, ouput_dim: 500]

In [25]:
"""
    ReluActivation{T}(dim)

ReLU activation layer. It stores no weights; `dim` is the only field and records
the dimension the activation is declared for.
"""
type ReluActivation{T}
    dim::Int
end

In [26]:
"""
    SoftMaxLayer{T}(input, output; seed=1234)

Output layer holding the weights of a linear map without a bias term. The softmax
itself is applied by the layer's `forward` method.

# Fields
- `input_dim`: number of input features (columns of `W`).
- `output_dim`: number of output units (rows of `W`).
- `W`: `output_dim × input_dim` weights, drawn with `randn` and divided by
  `sqrt(input_dim)`.
- `seed`: seed handed to `srand` before the weights are drawn.

Note that the constructor reseeds the global random number generator.
"""
type SoftMaxLayer{T}
    input_dim::Int
    output_dim::Int
    W::Matrix{T}
    seed::Int

    function SoftMaxLayer(input, output; seed=1234)
        srand(seed)
        weights = randn(T, output, input) / sqrt(input)
        # `seed` is passed explicitly: a trailing field omitted from `new` is left
        # undefined.
        return new(input, output, weights, seed)
    end
end


# Compact one-line display. `T` is bound as a method type parameter, so the method
# applies to every `SoftMaxLayer{T}` and prints that layer's own element type
# instead of depending on whatever global `T` happens to be defined.
Base.show{T}(io::IO, l::SoftMaxLayer{T}) =
    print(io,"SoftMaxLayer{$T} [input_dim: $(l.input_dim), ouput_dim: $(l.output_dim)]")


About Softmax layer

http://stats.stackexchange.com/questions/79454/softmax-layer-in-a-neural-network

### First example

Now let us define the structure and the weight type (float type) of an MLP.

In [27]:
# Layer sizes of the example MLP: input features, hidden units and output classes.
input_dim = 784
hidden_dim = 500
output_dim = 10

10

In [28]:
# The network is an ordered list of layers: linear -> ReLU -> softmax output.
mlp_classifer = [LinearLayer{Float32}(input_dim , hidden_dim),
                 ReluActivation{Float32}(hidden_dim),
                 SoftMaxLayer{Float32}(hidden_dim, output_dim)];

In [29]:
# write the code that accepts something like this
# mlp(784,500,10, ["sigmoid", "softmax"])

In [30]:
mlp_classifer[1]

LinearLayer{Float32} [input_dim: 784, ouput_dim: 500]

In [31]:
mlp_classifer[2]

ReluActivation{Float32}(500)

In [32]:
mlp_classifer[3]

SoftMaxLayer{Float32} [input_dim: 500, ouput_dim: 10]

## Making predictions with the network

We have defined an MLP as a list of layers and activation functions.

In order to make a prediction we need to make a forward pass through the network.
Let us assume for now that we have a good set of weights at each layer in the network and
that we want to make a prediction.

In [33]:
"""
    forward(linear_layer::LinearLayer, Xbatch::Array)

Propagate a batch through the linear layer.

`Xbatch` holds one sample per column. The result is the product of the layer's
weights with the batch, with the bias added to every column.
"""
function forward(linear_layer::LinearLayer, Xbatch::Array)
    weighted = linear_layer.W * Xbatch
    return weighted .+ linear_layer.b
end

"""
    forward(relu_activation::ReluActivation, Xbatch::Array)

Apply ReLU element-wise: entries that are not strictly positive are multiplied by
a `false` mask value, the others are kept.
"""
function forward(relu_activation::ReluActivation, Xbatch::Array)
    is_positive = Xbatch .> 0.
    return is_positive .* Xbatch
end

"""
    forward(softmax_layer::SoftMaxLayer, Xbatch::Array)

Map a batch (one sample per column) to class probabilities.

The scores `W * Xbatch` go through a column-wise softmax, so every entry of the
result lies in [0, 1] and every column sums to 1.
"""
function forward(softmax_layer::SoftMaxLayer, Xbatch::Array)
    scores = softmax_layer.W * Xbatch
    # Subtracting each column's maximum leaves the softmax unchanged but keeps
    # `exp` from overflowing to Inf, which would turn the ratio into NaN.
    exp_scores = exp.(scores .- maximum(scores, 1))
    # `sum(exp_scores, 1)` has one entry per column of the batch and is broadcast
    # over the rows.
    return exp_scores ./ sum(exp_scores, 1)
end

forward (generic function with 3 methods)

In [34]:
# First 25 training samples (columns of X_train) and their labels.
Xbatch = X_train[:,1:25];
ybatch = y_train[1:25];

In [35]:
aux = forward(mlp_classifer[1], Xbatch)
print(size(aux))
aux = forward(mlp_classifer[2], aux)
print(size(aux))
aux = forward(mlp_classifer[3], aux)
print(size(aux))

(500,25)(500,25)(10,25)

In [36]:
"""
    predict_proba(mlp, Xbatch::Array)

Run a forward pass: feed `Xbatch` to the first layer of `mlp`, pass each layer's
output to the next one, and return the output of the last layer.
"""
function predict_proba(mlp, Xbatch::Array)
    activations = Xbatch
    for layer in mlp
        activations = forward(layer, activations)
    end
    return activations
end

predict_proba (generic function with 1 method)

In [37]:
# Each column of the result is a vector holding the conditional probability of
# the target belonging to each class, given the corresponding observed
# input vector.

@time predict_proba(mlp_classifer, Xbatch)

  1.601142 seconds (1.63 M allocations: 60.273 MB, 2.09% gc time)


10×25 Array{Float64,2}:
 5.39905e-22  1.0           9.17328e-30  …  9.1359e-32   8.48121e-20
 2.37617e-5   1.11199e-49   2.55392e-68     5.00068e-19  2.31968e-43
 6.93362e-51  1.8805e-97    5.01312e-33     9.6184e-80   1.53352e-83
 3.41924e-13  1.70124e-72   8.44621e-12     1.31159e-34  4.3633e-48 
 2.1359e-42   2.54938e-76   1.57863e-45     5.86561e-11  9.00681e-47
 7.27582e-16  5.33328e-55   1.75377e-23  …  4.57536e-48  1.0        
 0.000291457  1.23333e-74   1.0             1.0          1.73416e-63
 8.83703e-17  1.10494e-125  1.52251e-36     1.17959e-45  3.82412e-82
 1.68701e-43  4.17899e-67   3.65337e-25     2.82104e-50  3.34436e-60
 0.999685     6.36062e-51   7.87676e-32     2.7335e-64   1.85022e-33

# Encoding class values as "onehot" vectors

In [38]:
"""
    one_hot_encoding(y_train::Vector, unique_classes::Vector, class_to_pos::Dict)

Encode class labels as one-hot columns.

Returns a `length(unique_classes) × length(y_train)` matrix of zeros in which
column `i` has a single 1, in row `class_to_pos[y_train[i]]`.
"""
function one_hot_encoding(y_train::Vector,
                          unique_classes::Vector,
                          class_to_pos::Dict)
    n_classes = length(unique_classes)
    n_samples = length(y_train)
    onehot = zeros(n_classes, n_samples)
    for sample in 1:n_samples
        row = class_to_pos[y_train[sample]]
        onehot[row, sample] = 1
    end
    return onehot
end

one_hot_encoding (generic function with 1 method)

In [39]:
# Sorted class labels, and a lookup from each label to its row in the one-hot encoding.
unique_classes = sort(unique(y_train))
class_to_pos = Dict(zip(unique_classes, 1:length(unique_classes)));

In [40]:
print("\nclass integer: ", y_train[1:3])
print("\nEncoding:\n")
one_hot_encoding(y_train[1:3], unique_classes, class_to_pos)


class integer: Int32[5,0,4]
Encoding:


In [41]:
one_hot_encoding(ybatch, unique_classes, class_to_pos)

10×25 Array{Float64,2}:
 0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  1.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  1.0
 0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     1.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0     0.0  1.0  0.0  0.0  1.0  0.0  0.0

# Loss functions



## Cross entropy loss for mlp_classifier
- http://neuralnetworksanddeeplearning.com/chap3.html
- http://datascience.stackexchange.com/questions/9302/the-cross-entropy-error-function-in-neural-networks

We will now focus on a standard loss function for classification problems: the cross-entropy loss.

$$
    \text{loss}\left(h(x), e(t) \right) = - \sum_{i=1}^{C} e(t)_i \log(h(x)_i)
$$

In [430]:
# Cross-entropy loss object; the loss `forward` method dispatches on this type.
# `dim` is used by that method as the number of rows (classes) to iterate over.
type crossentropy_loss
    dim::Int
end



In [433]:
crossentropy = crossentropy_loss(10)

crossentropy_loss(10)

In [437]:
crossentropy.dim

10

In [432]:
"""
    forward(loss::crossentropy_loss, Y_hat_batch::Matrix, Y_encoded::Matrix)

Return the cross-entropy between predictions and encoded targets, summed over the
whole batch.

Both matrices hold one sample per column and one class per row. With `C` classes,
`e(t)` the encoding (as a vector) of class `t` and `h(x)` the predicted
probabilities, each sample contributes

    loss(h(x), e(t)) = - sum_{i=1}^{C} e(t)_i * log(h(x)_i)

Entries whose predicted probability is not strictly positive are skipped: the
vectorised form `-sum(Y_encoded .* log.(Y_hat_batch))` breaks when `Y_hat_batch`
contains 0, because `log(0) = -Inf`.

Throws `DimensionMismatch` if the two matrices differ in size.
"""
function forward(loss::crossentropy_loss, Y_hat_batch::Matrix, Y_encoded::Matrix)
    # TODO: should this function do the one-hot encoding itself? It would save
    # memory, but the interface would have to take class labels instead.
    if size(Y_hat_batch) != size(Y_encoded)
        throw(DimensionMismatch(
            "predictions are $(size(Y_hat_batch)) but encoded targets are $(size(Y_encoded))"))
    end

    n_classes, n_samples = size(Y_encoded)
    cross_entropy = 0.

    # The loop bounds come from the matrices themselves, never from a global loss
    # object or from `loss.dim`, so `@inbounds` cannot read past the end of
    # either array.
    @inbounds for m in 1:n_samples
        for d in 1:n_classes
            if Y_hat_batch[d,m] > 0.0
                cross_entropy += Y_encoded[d,m] * log(Y_hat_batch[d,m])
            end
        end
    end
    return -cross_entropy
end



forward (generic function with 4 methods)

In [420]:
size(predict_proba(mlp_classifer, Xbatch))

(10,25)

In [421]:
forward(crossentropy, 
        predict_proba(mlp_classifer, Xbatch),
        one_hot_encoding(ybatch, unique_classes, class_to_pos)  )

1703.668076315073

In [422]:
forward(crossentropy, 
        predict_proba(mlp_classifer, Xbatch)[:,1:3],
        one_hot_encoding(ybatch, unique_classes, class_to_pos)[:,1:3]  )

138.01657375644282

In [423]:
forward(crossentropy, 
        predict_proba(mlp_classifer, Xbatch)[:,1:3],
        one_hot_encoding(ybatch, unique_classes, class_to_pos)[:,1:3])

138.01657375644282

In [424]:
# One-row toy example: encoded target `y` and prediction `y_hat`.
y = [1. 0. 0. 0.]
y_hat = [0.9 0.1 0. 0.]

1×4 Array{Float64,2}:
 0.9  0.1  0.0  0.0

In [460]:
y.*log.(y_hat)

1×4 Array{Float64,2}:
 -0.105361  -0.0  NaN  NaN

In [428]:
forward(crossentropy, y_hat, y)

0.10536051565782628

In [459]:
@time forward(crossentropy, y_hat, y)

  0.000003 seconds (5 allocations: 176 bytes)


0.10536051565782628


# Computing Gradients 

## Let us compute the gradient of the loss for a given input vector

Now we will deal with the learning part. That is, given a MLP architecture we will tune the weights in order to minimize some error function. 

- Let $z^L$ be the preactivation at layer $L$.
- Let $h(x)$ be the output values of the network.
- Let $e(y)$ be the encoding of class $y$.


### Equation for computing $\delta^L$ if the error is the crossentropy loss defined and the output layer is a softmax


\begin{equation}
\delta^L = \nabla_{{z^{\,L}\,\,\,}}  loss( h(x), e(y) ) = (h(x) - e(y))
\end{equation}

### Equation for computing $\delta^l$ using $\delta^{l+1}$ for any $1 \leq l<L$ 

$$
\delta^l = \Big( \big(W^{l+1 \,\,} \big)^{\,T}  \delta^{l+1} \Big) \odot g'(z^l)
$$

### Equation for computing the gradient of the weights at every layer using $\delta^l$ and $a^{l-1}$


$$
\nabla_{W^l} = \delta^l \, \big( a^{l-1\,\,} \big)^{\,T}
$$

### Equation for computing the gradient of the biases at every layer using $\delta^l$
$$
\nabla_{b^l} =  \delta^l 
$$




#### Hinton matlab code


    %%% Error back-propagation
    df = [];

    Ix = IO;
    
    %%% do not use outputLayer{nHiddenLayers}: nHiddenLayers may be 0
    dw = outputHiddenLayers' * Ix; 
    df{nHiddenLayers+1} = dw;

    for nLayer=nHiddenLayers:-1:1
      Ix = (Ix * Weights{nLayer+1}') .* MLE_MultilayerPerceptron_DerivativeFactor...
                                           (ActFunHiddens{nLayer},outputLayer{nLayer});
      
      %%% removes the constant column (the added ones for the bias)
      Ix = Ix(:,1:end-1);   
      if nLayer > 1
        dw = outputLayer{nLayer-1}' * Ix; 
      else
        dw = Data' * Ix;                  
      end;
      df{nLayer} = dw;
    end;
    
    
#### Derivatives of the activation functions


    function DerivativeFactor = MLE_MultilayerPerceptron_DerivativeFactor...
                                  (ActFun,outputLayerAct);
    %%% Returns the derivative factor of the activation function named ActFun.
    %%% Every branch is written in terms of outputLayerAct only, i.e. the
    %%% derivative is expressed through the activation values passed in.
    %%% Raises an error when ActFun is not one of the names handled below.

    if strcmp(ActFun,'tanhyper')
      DerivativeFactor = 1 - outputLayerAct .* outputLayerAct;
    elseif strcmp(ActFun,'logistic')
      DerivativeFactor = outputLayerAct .* (1 - outputLayerAct);
    elseif strcmp(ActFun,'hardtanhyper')
      DerivativeFactor = ones(size(outputLayerAct));     %%% set to 0 if outputLayerAct<-1 or outputLayerAct>+1
      DerivativeFactor = DerivativeFactor .* (outputLayerAct > -1) .* (outputLayerAct < +1);
    elseif strcmp(ActFun,'reclinear')
      DerivativeFactor = (outputLayerAct > 0);
    elseif strcmp(ActFun,'softreclinear')                %%% softplus
      DerivativeFactor = 1-exp(-outputLayerAct);         %%% y = log(1+e^x) => dy = 1/(1+e^{-x}) = 1-e^{-y}
    elseif strcmp(ActFun,'linear')
      DerivativeFactor = 1;
    elseif strcmp(ActFun,'sine')
      DerivativeFactor = +sqrt( 1 - outputLayerAct.^2 ); %%% sine/cosine: we may lose the sign (we would need Data*Weights)???
    elseif strcmp(ActFun,'cosine')
      DerivativeFactor = -sqrt( 1 - outputLayerAct.^2 ); %%% sine/cosine: we may lose the sign (we would need Data*Weights)???
    else error('MLE_MultilayerPerceptron_DerivativeFactor: ActFun not implemented');
    end;


In [None]:
# Rebuild the example network as an ordered list of layers: linear -> ReLU -> softmax.
mlp_classifer = [LinearLayer{Float32}(input_dim, hidden_dim),
                 ReluActivation{Float32}(hidden_dim),
                 SoftMaxLayer{Float32}(hidden_dim, output_dim)];

In [32]:
"""
    delta(softmax_output::SoftMaxLayer, loss::crossentropy_loss, Xbatch::Array, Y_enc::Array)

Return `Xbatch - Y_enc`, the output-layer error for a softmax layer paired with the
cross-entropy loss.

`softmax_output` and `loss` are not read; they only select this method by dispatch.
NOTE(review): despite its name, `Xbatch` is presumably the network output `h(x)`
rather than the raw input batch — confirm against the caller.
"""
delta(softmax_output::SoftMaxLayer,
      loss::crossentropy_loss,
      Xbatch::Array,
      Y_enc::Array) = Xbatch - Y_enc

delta (generic function with 1 method)

In [33]:
aux = forward(mlp_classifer[1], Xbatch)
print(size(aux))
aux = forward(mlp_classifer[2], aux)
print(size(aux))
aux = forward(mlp_classifer[3], aux)
print(size(aux))

(500,25)(500,25)(10,25)

In [None]:
"""
    compute_gradients(mlp, loss, X, Y)

Run the forward pass of `mlp` on `X` and return the output of every layer, in order.

Only the forward half is implemented so far: `loss` and `Y` are accepted for the
backward pass that is still to be written, and are not used yet.
"""
function compute_gradients(mlp, loss, X, Y)
    activations = Any[]
    current = X
    for layer in mlp
        # Each layer consumes the previous layer's output, not the raw input.
        current = forward(layer, current)
        push!(activations, current)
    end

    #loss(activations[end], Y)
    return activations
end

In [56]:
a = []

0-element Array{Any,1}

In [36]:
push!(a,[1,2,3])

1-element Array{Any,1}:
 [1,2,3]

In [54]:
push!(a,[1 2 3;  3 3 3])

5-element Array{Any,1}:
 [1,2,3]       
 [1 2 3; 3 3 3]
 [1 2 3; 3 3 3]
 [1 2 3; 3 3 3]
 [1 2 3; 3 3 3]

In [55]:
a[end]

2×3 Array{Int64,2}:
 1  2  3
 3  3  3

In [49]:
a[1]

3-element Array{Int64,1}:
 1
 2
 3

In [51]:
a[2]

2×3 Array{Int64,2}:
 1  2  3
 3  3  3

In [53]:
a[4]

2×3 Array{Int64,2}:
 1  2  3
 3  3  3

In [None]:
type linear_layer

In [None]:
T = Float32
# Weight shapes are (outputs, inputs) and have to chain with the buffers below:
# 512*512 inputs -> 2048 -> 1024 -> 10.
# NOTE: W1 and dW1 each hold 2048 * 512 * 512 Float32 values (2 GiB apiece).
W1 = rand(T, 2048, 512 * 512)
W2 = rand(T, 1024, 2048)
W3 = rand(T, 10, 1024)
# Gradient buffers with the same shapes as the weights.
dW1, dW2, dW3 = zeros(W1), zeros(W2), zeros(W3)
# Per-layer outputs and the gradients with respect to them / to the input.
out1, out2, out3 = zeros(T, 2048), zeros(T, 1024), zeros(T, 10)
dOut1, dOut2, dOut = zeros(T, 2048), zeros(T, 1024), zeros(T, 512 * 512)

# Mock forward and backward pass of a three-layer linear network for a single
# input vector, written directly against BLAS. It reads the global weights
# W1..W3 and overwrites the global buffers out1..out3, dOut1, dOut2, dOut and
# dW1..dW3. `error` is the gradient fed into the last layer; inside this function
# the parameter name shadows `Base.error`.
# The backward matrix-vector products pass `W'` to gemv! with the 'N' flag.
function mockNN(input::Array{Float32, 1}, error::Array{Float32, 1})
  # Forward
  # With alpha = 1 and beta = 0 each call overwrites its output buffer with W * x.
  BLAS.gemv!('N', T(1.0), W1, input, T(0.0), out1)
  BLAS.gemv!('N', T(1.0), W2, out1, T(0.0), out2)
  BLAS.gemv!('N', T(1.0), W3, out2, T(0.0), out3)

  # Backward
  # ∂E/∂inputs and ∂E/∂W
  # ger! accumulates the outer product into its last argument, so each dW is
  # zeroed first.
  fill!(dW3, 0)
  fill!(dOut2, 0)
  BLAS.gemv!('N', T(1.0), W3', error, T(0.0), dOut2)
  BLAS.ger!(T(1.0), error, out2, dW3)
  
  fill!(dW2, 0)
  fill!(dOut1, 0)
  BLAS.gemv!('N', T(1.0), W2', dOut2, T(0.0), dOut1)
  BLAS.ger!(T(1.0), dOut2, out1, dW2)

  fill!(dW1, 0)
  fill!(dOut, 0)
  BLAS.gemv!('N', T(1.0), W1', dOut1, T(0.0), dOut)
  BLAS.ger!(T(1.0), dOut1, input, dW1)
end


In [None]:

# Time one call on its own, then ten more calls, each on freshly drawn random
# input and output-error vectors.
input = rand(T, 512 * 512)
error = rand(T, 10)
@time mockNN(input, error)
for i in 1:10
  input = rand(T, 512 * 512)
  error = rand(T, 10)
  @time mockNN(input, error)
end

In [None]:
# Globals used by the mock network: weights, weight gradients, per-layer outputs
# and the gradients with respect to those outputs and to the input.
# Shapes chain as 512*512 inputs -> 2048 -> 1024 -> 10.
T = Float32
W1 = rand(T, 2048, 512 * 512)
W2 = rand(T, 1024, 2048)
W3 = rand(T, 10, 1024)
dW1, dW2, dW3 = zeros(W1), zeros(W2), zeros(W3)
out1, out2, out3 = zeros(T, 2048), zeros(T, 1024), zeros(T, 10)
dOut1, dOut2, dOut = zeros(T, 2048), zeros(T, 1024), zeros(T, 512 * 512)

# Same mock forward and backward pass as `mockNN`, except that the backward
# matrix-vector products pass the weight matrix itself to gemv! with the 'T'
# (transpose) flag instead of passing `W'` with 'N'.
# It reads the global weights W1..W3 and overwrites the global buffers
# out1..out3, dOut1, dOut2, dOut and dW1..dW3. Inside this function the
# parameter name `error` shadows `Base.error`.
function mockNN2(input::Array{Float32, 1}, error::Array{Float32, 1})
  # Forward
  # With alpha = 1 and beta = 0 each call overwrites its output buffer with W * x.
  BLAS.gemv!('N', T(1.0), W1, input, T(0.0), out1)
  BLAS.gemv!('N', T(1.0), W2, out1, T(0.0), out2)
  BLAS.gemv!('N', T(1.0), W3, out2, T(0.0), out3)

  # Backward
  # ∂E/∂inputs and ∂E/∂W
  # ger! accumulates the outer product into its last argument, so each dW is
  # zeroed first.
  fill!(dW3, 0)
  fill!(dOut2, 0)
  BLAS.gemv!('T', T(1.0), W3, error, T(0.0), dOut2)
  BLAS.ger!(T(1.0), error, out2, dW3)
  
  fill!(dW2, 0)
  fill!(dOut1, 0)
  BLAS.gemv!('T', T(1.0), W2, dOut2, T(0.0), dOut1)
  BLAS.ger!(T(1.0), dOut2, out1, dW2)

  fill!(dW1, 0)
  fill!(dOut, 0)
  BLAS.gemv!('T', T(1.0), W1, dOut1, T(0.0), dOut)
  BLAS.ger!(T(1.0), dOut1, input, dW1)
end

In [None]:
# Time one stand-alone call, then ten calls on freshly drawn random vectors.
# NOTE(review): the stand-alone call below times `mockNN`, while the loop times
# `mockNN2` — possibly a copy-paste slip; confirm which function was intended.
input = rand(T, 512 * 512)
error = rand(T, 10)
@time mockNN(input, error)
for i in 1:10
  input = rand(T, 512 * 512)
  error = rand(T, 10)
  @time mockNN2(input, error)
end
