# Single layer nnet julia

## Objective of the notebook

The objective is to create a multilayer perceptron with a single hidden layer


Code in python containing an example similar to this can be found here:

http://cs231n.github.io/neural-networks-case-study/#linear

## About the softmax function

https://nolanbconaway.github.io/blog/2017/softmax-numpy

### Toy NNets for education pourpouses in  Julia

Interesting discussion how to make forward pass efficiently using BLAS:

- https://discourse.julialang.org/t/blas-performance-issues-for-common-neural-network-patterns/565

- http://int8.io/neural-networks-in-julia-hyperbolic-tangent-and-relu/

- http://int8.io/backpropagation-from-scratch-in-julia-part-ii-derivation-and-implementation/

- http://www.breloff.com/JuliaML-and-Plots/

- https://github.com/JuliaQuant/OnlineAI.jl

Many machine learning packages in Julia

- https://github.com/svaksha/Julia.jl/blob/master/AI.md#hmm

#### Add the follwoing packages before executing this notebok

- Pkg.add("MLDatautils")


In [1]:
using LossFunctions
using BenchmarkTools
#Pkg.add("LossFunctions")

In [2]:
#using MNIST
#train = MNIST.traindata()
#T = Float32
#X_train = Array{T}(train[1]);
#y_train = Vector{Int32}(train[2]);

In [3]:
using MLDatasets
train = MLDatasets.MNIST.traindata()
T = Float32
X_train = Array{T}(reshape(train[1], 28*28, 60000))
y_train = Vector{Int32}(train[2]);

In [4]:
sort(unique(y_train))'

1×10 RowVector{Int32,Array{Int32,1}}:
 0  1  2  3  4  5  6  7  8  9

## Defining Linear layer and relu layer

Play with functions `A_mult_Bt`

In [5]:
n_visible = 784
n_hidden = 500
batch_size = 200

srand(1234)
W1 = randn(T, n_hidden, n_visible );
W1 = W1/norm(W1)
b = zeros(n_hidden);

In [6]:
size(W1)

(500, 784)

In [7]:
size(X_train[:,1:200])

(784, 200)

In [8]:
#linear layer
batch = W1 * X_train[:,1:3].+b

#relu
l1_batch = batch .* (batch .>0);

In [9]:
exp.(batch)./sum(exp.(batch),1);

In [10]:
sum(exp.(l1_batch),1)

1×3 Array{Float64,2}:
 545.813  551.224  535.861

In [11]:
exp.(l1_batch)./sum(exp.(l1_batch),1);

#### Timing linear layer: Investigate time of sparse matrix

- http://stackoverflow.com/questions/36673939/updating-a-dense-vector-by-a-sparse-vector-in-julia-is-slow

In [12]:
sp_batch  = sparse(X_train[:,1:5000]);
sp_b = sparsevec(b);
sp_W1 = sparse(W1);

In [13]:
@time sp_W1 * sp_batch .+ sp_b;

  1.333627 seconds (386.18 k allocations: 109.565 MiB, 1.71% gc time)


#### Dense version

In [14]:
@time W1 * X_train[:,1:5000] .+ b;

  0.077175 seconds (61 allocations: 43.566 MiB, 13.23% gc time)


In [15]:
@benchmark (BLAS.gemm('N','N', T(1.0), W1, X_train[:,1:5000]) .+b)

BenchmarkTools.Trial: 
  memory estimate:  43.57 MiB
  allocs estimate:  58
  --------------
  minimum time:     47.559 ms (6.75% GC)
  median time:      51.241 ms (12.14% GC)
  mean time:        57.046 ms (10.31% GC)
  maximum time:     89.681 ms (7.97% GC)
  --------------
  samples:          88
  evals/sample:     1

In [16]:
@benchmark (W1*view(X_train,:,1:5000) .+ b)

BenchmarkTools.Trial: 
  memory estimate:  28.61 MiB
  allocs estimate:  89
  --------------
  minimum time:     44.619 ms (4.80% GC)
  median time:      52.091 ms (8.32% GC)
  mean time:        55.400 ms (7.85% GC)
  maximum time:     85.340 ms (6.22% GC)
  --------------
  samples:          91
  evals/sample:     1

In [17]:
# We could include the bias in W1 and append a row or col in X_train
# full of ones so that the affine transformation is done in a single
# matrix multiplication call
@benchmark (BLAS.gemm('N','N', T(1.0), W1, view(X_train,:,1:5000)) )

BenchmarkTools.Trial: 
  memory estimate:  9.54 MiB
  allocs estimate:  39
  --------------
  minimum time:     37.344 ms (0.00% GC)
  median time:      49.433 ms (0.00% GC)
  mean time:        50.645 ms (3.85% GC)
  maximum time:     72.884 ms (8.01% GC)
  --------------
  samples:          99
  evals/sample:     1

We can inspect the code that  **`W1*view(X_train,:,1:5000) .+ b`** calls using the `@code_lowered` macro

In [18]:
@code_lowered (W1*view(X_train,:,1:5000) .+ b)

CodeInfo(:(begin 
        nothing # line 346:
        (Base.depwarn)((Base.string)(".+", " is no longer a function object; use broadcast(", Base.+, ", ...) instead"), :.+) # line 348:
        return (Base.broadcast)(Base.+, a, b)
    end))

# Defining layers

In [19]:
abstract type NeuralNetLayer end

type LinearLayer{T} <: NeuralNetLayer
    """
    Standard layer between activations.
    The output of this layer for a given input is meant to be a matrix product 
    of the input times W
    """
    input_dim::Int
    output_dim::Int
    W::Array{T}
    b::Vector{T}
    
    #grad_W::Array{T}
    #grad_b::Vector{T}

    seed::Int
end

function LinearLayer{T}(input, output; seed=1234) where T<:Any
    srand(seed)
    return LinearLayer(input,
                       output,
                       randn(T,output,input)/sqrt(input),
                       zeros(output), 
                       seed)
end

Base.show(io::IO, l::LinearLayer{T}) where T <: Number = \,
    print(io,"LinearLayer{$T} [input_dim: $(l.input_dim), ouput_dim: $(l.output_dim)]")

In [20]:
input_dim = 784
output_dim = 500
l = LinearLayer{Float32}(input_dim,output_dim)

LinearLayer{Float64} [input_dim: 784, ouput_dim: 500]

In [196]:
abstract type ActivationFunction end

type SigmoidActivation{T} <: ActivationFunction
    """
    Relu Activation function latyer
    """
    dim::Int
end

type ReluActivation{T}  <: ActivationFunction
    """
    Relu Activation function latyer
    """
    dim::Int
end

type SoftMaxActivation{T}  <: ActivationFunction
    """
    Standard Sotmax Activation function
    The output of this layer for a given input is meant to be 
        output_k = e^(X)_k /sum_j (X)_j
    """
    dim::Int
end

In [197]:
type SoftMaxLayer{T} <: ActivationFunction
    """
    Standard Sotmax layer.
    The output of this layer for a given input is meant to be 
        
        output_k = e^(W*X + b)_k /sum_j (W*X + b)_j
    
    """
    input_dim::Int
    output_dim::Int
    W::Array{T}
    
    seed::Int
end

function SoftMaxLayer{T}(input, output; seed=1234) where T<:Number
    srand(seed)
    return SoftMaxLayer(input,
                        output,
                        randn(T, output, input)/sqrt(input),
                        #  zeros(T, output, input),
                        seed)
end

Base.show(io::IO, l::SoftMaxLayer{T}) where T <: Number = \,
print(io,"SoftMaxLayer{$T} [input_dim: $(l.input_dim), ouput_dim: $(l.output_dim)]")


About Softmax layer

http://stats.stackexchange.com/questions/79454/softmax-layer-in-a-neural-network

### First example

Now let us define the structure, weight types (float type) of a MLP

In [198]:
input_dim = 784
hidden_dim = 500
output_dim = 10

10

In [24]:
mlp_classifer = [LinearLayer{Float32}(input_dim , hidden_dim),
                 ReluActivation{Float32}(hidden_dim),
                 SoftMaxLayer{Float32}(hidden_dim, output_dim)];

In [25]:
mlp_classifer[1]

LinearLayer{Float64} [input_dim: 784, ouput_dim: 500]

In [26]:
mlp_classifer[2]

ReluActivation{Float32}(500)

In [27]:
mlp_classifer[3]

SoftMaxLayer{Float64} [input_dim: 500, ouput_dim: 10]

## Making predictions with the network

We have defined a MLP as list of layers and activation functions.

In order to make a prediction we need to make a forward pass through the network.
Let us assume by now that we have a good set of weights at each layer in the network and
we want to make a prediction.

In [199]:
function forward(layer::LinearLayer, Xbatch::Array)
    """
    Given an input batch where the data comes as columns this method propagates 
    the batch using the weights of the linear layer
    """
    return layer.W * Xbatch .+ layer.b
end

function forward(layer::ReluActivation, Xbatch::Array)
    return Xbatch.*( Xbatch .> 0.)
end

function forward(layer::SigmoidActivation, Xbatch::Array)
    return 1./( 1 .+ exp.(-Xbatch))
end

function forward(layer::SoftMaxActivation, Xbatch::Array)
    """
    Layer shrinking the output to [0,1] values.
    Notice that sum(exp(Xbatch),1) will generate a Matrix with as many elements as
    columns in Xbatch. 
    """
    return exp.(Xbatch)./sum(exp.(Xbatch), 1)
end

function forward(layer::SoftMaxLayer, Xbatch::Array)
    """
    Layer shrinking the output to [0,1] values.
    Notice that sum(exp(Xbatch),1) will generate a Matrix with as many elements as
    columns in Xbatch. 
    """
    Xbatch_out = softmax_layer.W * Xbatch
    return exp.(Xbatch_out)./sum(exp.(Xbatch_out), 1)
end


forward (generic function with 7 methods)

In [29]:
Xbatch = X_train[:,1:25];
ybatch = y_train[1:25];

In [30]:
aux = forward(mlp_classifer[1], Xbatch)
print(size(aux))

(500, 25)

In [31]:
aux = forward(mlp_classifer[1], Xbatch)
print(size(aux))
aux = forward(mlp_classifer[2], aux)
print(size(aux))
aux = forward(mlp_classifer[3], aux)
print(size(aux))

(500, 25)(500, 25)(10, 25)

In [32]:
function predict_proba(mlp, Xbatch::Array)
    for l in mlp
        Xbatch = forward(l, Xbatch)
    end
    return Xbatch
end

predict_proba (generic function with 1 method)

In [33]:
# Each column contains a vector that represents
# The conditional probability of the target beein from a particular class having observed
# the input vector.

@time predict_proba(mlp_classifer, Xbatch)

  0.608586 seconds (938.58 k allocations: 49.863 MiB, 3.18% gc time)


10×25 Array{Float64,2}:
 0.110592   0.190445   0.0911891  …  0.0980983  0.104619   0.0916908
 0.099009   0.109728   0.0970957     0.119944   0.105477   0.0814854
 0.0734697  0.0690236  0.0826141     0.066948   0.0570211  0.0682049
 0.117364   0.103467   0.0962567     0.0986178  0.12321    0.104741 
 0.0802728  0.0745412  0.0905434     0.109321   0.101395   0.100484 
 0.115514   0.118488   0.120487   …  0.123378   0.110909   0.0904144
 0.124602   0.0953256  0.131559      0.122778   0.111915   0.147542 
 0.0772554  0.0546428  0.0759803     0.0694765  0.121296   0.0858057
 0.0728219  0.0805403  0.090678      0.10233    0.0939435  0.107779 
 0.129099   0.103798   0.123596      0.0891084  0.0702147  0.121853 

# Encoding class values as "onehot" vectors

In [34]:
function one_hot_encoding(y_train::Vector,
                          unique_classes::Vector,
                          class_to_pos::Dict)
    
    encoded_classes = zeros(length(unique_classes), length(y_train))
    for (i,y) in enumerate(y_train)
        encoded_classes[class_to_pos[y],i] = 1
    end
    return encoded_classes
end

one_hot_encoding (generic function with 1 method)

In [35]:
unique_classes = sort(unique(y_train))
class_to_pos = Dict(class => pos for (pos,class) in enumerate(unique_classes));    

In [36]:
print("\nclass integer: ", y_train[1:3])
print("\nEncoding:\n")
one_hot_encoding(y_train[1:3], unique_classes, class_to_pos)


class integer: Int32[5, 0, 4]
Encoding:


10×3 Array{Float64,2}:
 0.0  1.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  1.0
 1.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0
 0.0  0.0  0.0

In [37]:
one_hot_encoding(ybatch, unique_classes, class_to_pos)

10×25 Array{Float64,2}:
 0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  1.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0     0.0  0.0  0.0  0.0  0.0  1.0  1.0
 0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     1.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0     0.0  1.0  0.0  0.0  1.0  0.0  0.0

# Loss functions



## Cross entropy loss for mlp_classifier
- http://neuralnetworksanddeeplearning.com/chap3.html
- http://datascience.stackexchange.com/questions/9302/the-cross-entropy-error-function-in-neural-networks

We will focus now on an standard loss function for classification problems. The cross entropy loss.

$$
    \text{loss}\left(h(x), e(t) \right) = - \sum_{i=1}^{C} e(t)_i \log(h(x)_i)
$$

In [38]:
type crossentropy_loss
    dim::Int
end

In [39]:
crossentropy = crossentropy_loss(10)

crossentropy_loss(10)

In [40]:
crossentropy.dim

10

In [41]:
function forward(loss::crossentropy_loss, Y_hat_batch::Matrix, Y_encoded::Matrix)
    """
    Should this function do the onehot encoding?
    In order to save memory it seems reasonable but...

    C=number of classes
    e(t) = Encoding (as vector) of class t
      
          loss (h(x), e(t)) = \sum_{i=1}^{C} e(t)_i  log (h(x)_i)

    Returns the loss between the batch
    """
    # This breaks if Y_hat_batch contains 0 because log(0)= -Inf
    # return - sum(Y_encoded.*log.(Y_hat_batch))
    
    n_samples = size(Y_encoded, 2)
    cross_entropy = 0.
    
    @inbounds for m in 1:n_samples
        for d in 1:crossentropy.dim
            cross_entropy +=  Y_encoded[d,m] * log(Y_hat_batch[d,m])
        end
    end
    return -cross_entropy
end

function forward(loss::crossentropy_loss, y_hat::Vector, y::Vector)
    """
    Should this function do the onehot encoding?
    In order to save memory it seems reasonable but...
    C = number of classes
    e(t) = Encoding (as vector) of class t
          loss (h(x), e(t)) = \sum_{i=1}^{C} e(t)_i  log (h(x)_i)
    
    Returns the loss between the batch
    """
    # This breaks if Y_hat_batch contains 0 because log(0)= -Inf
    # return - sum(Y_encoded.*log.(Y_hat_batch))
    cross_entropy = 0.

    @inbounds for d in 1:length(y)
        cross_entropy +=  log(y_hat[d])
    end
    return -cross_entropy
end

forward (generic function with 6 methods)

In [42]:
size(predict_proba(mlp_classifer, Xbatch))

(10, 25)

In [43]:
forward(crossentropy, 
        predict_proba(mlp_classifer, Xbatch),
        one_hot_encoding(ybatch, unique_classes, class_to_pos)  )

56.796679057341976

In [44]:
forward(crossentropy, 
        predict_proba(mlp_classifer, Xbatch)[:,1:3],
        one_hot_encoding(ybatch, unique_classes, class_to_pos)[:,1:3]  )

6.218675162901754

In [45]:
y = Array([1.; 0.; 0.; 0.])
y_hat = Array([0.9; 0.1; 0.; 0.]);

In [46]:
y, y_hat

([1.0, 0.0, 0.0, 0.0], [0.9, 0.1, 0.0, 0.0])

In [47]:
# Why is this inf??
forward(crossentropy, y_hat, y)

Inf

In [48]:
@time forward(crossentropy, y_hat, y)

  0.000005 seconds (5 allocations: 176 bytes)


Inf

## Timing crossentropy onehot vs unique component implementations

https://jamesmccaffrey.wordpress.com

- Let $h(x^m)$ be the probability of the different classes for a given input $x^m$. 
- Let $y^m$ be an integer containing the class label for $x^m$. 

Then the crossentropy for the pair $(h(x^m), e(y^m))$ is defined as:

$$
    \text{cross_entropy}\left(h(x^m), e(y^m) \right) = - \sum_{i=1}^{C} e(y^m)_i \log \left( h(x^m)_i\right) = y^m \log \left(h(x^m)_{y^m}\right)
$$

Be carefull, the cross_entropy loss is not symmetric.


#### Example showing the importance of the crossentropy

The crossentropy loss measures how good the probabilities of the classes are, not just the final prediction.

```
computed       | targets              | correct?
-----------------------------------------------
0.3  0.3  0.4  | 0  0  1 (democrat)   | yes
0.3  0.4  0.3  | 0  1  0 (republican) | yes
0.1  0.2  0.7  | 1  0  0 (other)      | no
```

This neural network has classification error of 1/3 = 0.33, or equivalently a classification accuracy of 2/3 = 0.67. Notice that the NN just barely gets the first two training items correct and is way off on the third training item. But now consider the following neural network:

```
computed       | targets              | correct?
-----------------------------------------------
0.1  0.2  0.7  | 0  0  1 (democrat)   | yes
0.1  0.7  0.2  | 0  1  0 (republican) | yes
0.3  0.4  0.3  | 1  0  0 (other)      | no
```

This NN also has a classification error of 1/3 = 0.33. But this second NN is better than the first because it nails the first two training items and just barely misses the third training item. To summarize, classification error is a very crude measure of error.

Now consider cross-entropy error. The cross-entropy error for the first training item in the first neural network above is:

In [49]:
y_pred = Array([ [0.3  0.3  0.4]; [0.3  0.4  0.3]; [0.1  0.2  0.7]])'
y_true = Array([ [0.   0.   1. ]; [0.0  0.1  0.0]; [1.0  0.0  0.0]])'

3×3 Array{Float64,2}:
 0.0  0.0  1.0
 0.0  0.1  0.0
 1.0  0.0  0.0

In [50]:
-( (log(0.3)*0) + (log(0.3)*0) + (log(0.4)*1) )

0.916290731874155

Notice that in the case of neural network classification, the computation is a bit odd because all terms but one will go away. (There are several good explanations of how to compute cross-entropy on the Internet.) 

The average cross-entropy error (ACE) for the first neural network is computed as:



In [51]:
-(log(0.4) + log(0.4) + log(0.1)) / 3

1.3783888522474517

The average cross-entropy error for the second neural network is: 0.639, 



In [52]:
-(log(0.7) + log(0.7) + log(0.3)) / 3

0.6391075640678003

Notice that the average cross-entropy error for the second, superior neural network is smaller than the ACE error for the first neural network. The ln() function in cross-entropy takes into account the closeness of a prediction and is a more granular way to compute error.


By the way, you can also measure neural network quality by using mean squared error but this has problems too. The squared error term for the first item in the first neural network would be:

```
(0.3 - 0)^2 + (0.3 - 0)^2 + (0.4 - 1)^2 = 0.09 + 0.09 + 0.36 = 0.54
```

And so the mean squared error for the first neural network is:

```
(0.54 + 0.54 + 1.34) / 3 = 0.81
```
The mean squared error for the second, better, neural network is:

```
(0.14 + 0.14 + 0.74) / 3 = 0.34
```

MSE isn’t a hideously bad approach but if you think about how MSE is computed you’ll see that, compared to ACE, MSE gives too much emphasis to the incorrect outputs. It might also be possible to compute a modified MSE that uses only the values associated with the 1s in the target, but I have never seen that approach used or discussed.

So, I think this example explains why using cross-entropy error is clearly preferable to using classification error. Somewhat unfortunately there are some additional issues here. The discussion above refers to computing error during the training process. After training, to get an estimate of the effectiveness of the neural network, classification error is usually preferable to MSE or ACE. The idea is that classification error is ultimately what you’re interested in.

Suppose you are using back-propagation for training. The back-propagation algorithm computes gradient values which are derived from some implicit measure of error. Typically the implicit error is mean squared error, which gives a particular gradient equation that involves the calculus derivative of the softmax output activation function. But you can use implicit cross-entropy error instead of implicit mean squared error. This approach changes the back-propagation equation for the gradients. I have never seen research which directly addresses the question of whether to use cross-entropy error for both the implicit training measure of error and also neural network quality evaluation, or to use cross-entropy just for quality evaluation. Such research may (and fact, probably) exists, but I’ve been unable to track any papers down.

In [53]:
function cross_entropy(prob_Y_given_X::Matrix, Y_encoded::Matrix)
    cross_entropy = 0.  
    @inbounds for m in 1: size(Y_encoded,2)
        for d in 1:size(Y_encoded,1)
            cross_entropy +=  Y_encoded[d,m] * log(prob_Y_given_X[d,m])
        end
    end
    return -cross_entropy
end

function cross_entropy(prob_Y_given_X::Matrix, y::Vector)
    cross_entropy = 0.
    @simd for m in 1:length(y)
        cross_entropy +=  log(prob_Y_given_X[y[m],m])
    end
    return -cross_entropy
end

cross_entropy (generic function with 2 methods)

In [54]:
y_pred     = Array([ [0.3  0.3  0.4]; [0.3  0.4  0.3]; [0.1  0.2  0.7]])'
y_true_enc = Array([ [0.   0.   1. ]; [0.0  1.0  0.0]; [1.0  0.0  0.0]])'
y_true     = Array([ 3, 2, 1])

println("crossentropy: ",  cross_entropy(y_pred , y_true_enc))
println("crossentropy: ", -(log(0.4) + log(0.4) + log(0.1)))
println("crossentropy: ", -(log(y_pred[y_true[1],1]) + log(y_pred[y_true[2],2]) + log(y_pred[y_true[3],3])))
println("crossentropy: ",  cross_entropy(y_pred , y_true))

crossentropy: 4.135166556742355
crossentropy: 4.135166556742355
crossentropy: 4.135166556742355
crossentropy: 4.135166556742355



# TODO: Computing Gradients 

### Le us compute the gradient of the loss for a given input vector

Now we will deal with the learning part. That is, given a MLP architecture we will tune the weights in order to minimize some error function. 

- Let $z^L$ be the preactivation at layer $L$.
- Let $h(x)$ be the output values of the network.
- Let $e(y)$ be the onehot encoding of class $y$.


####  Computing $\delta^L$ if the loss function is the crossentropy and the output layer is a softmax


\begin{equation}
\delta^L = \nabla_{{z^{\,L}\,\,\,}}  loss( h(x), e(y) ) = (h(x) - e(y))
\end{equation}

#### Computing $\delta^l$ using $\delta^{l+1}$ for any $1 \leq l<L$ 

$$
\delta^l = \big(W^{l+1 \,\,} \big)^{\,T}  \delta^{l+1} .* g'(z^l)
$$

#### Computing  gradient of the weignts at every layer using $\delta^l$ and $a^{l-1}$


$$
\nabla_{W^l} = \big( a^{l-1\,\,} \big)^{\,T}  \delta^l 
$$

#### Computing  gradient of the biases at every layer using $\delta^l$ and $a^{l-1}$
$$
\nabla_{b^l} =  \delta^l 
$$




#### Hinton matlab code


    %%% Error back-propagation
    df = [];

    Ix = IO;
    
    %%% do not use outputLayer{nHiddenLayers}: nHiddenLayers may be 0
    dw = outputHiddenLayers' * Ix; 
    df{nHiddenLayers+1} = dw;

    for nLayer=nHiddenLayers:-1:1
      Ix = (Ix * Weights{nLayer+1}') .* MLE_MultilayerPerceptron_DerivativeFactor(ActFunHiddens{nLayer},
                                          outputLayer{nLayer});
      
      %%% removes the constant column (the added ones for the bias)
      Ix = Ix(:,1:end-1);   
      if nLayer > 1
        dw = outputLayer{nLayer-1}' * Ix; 
      else
        dw = Data' * Ix;                  
      end;
      df{nLayer} = dw;
    end;
    
    
#### derivatives activations
 
 The old MATLAB hinton's code look like


    function DerivativeFactor = MLE_MultilayerPerceptron_DerivativeFactor(ActFun,outputLayerAct);

    if strcmp(ActFun,'tanhyper')
      DerivativeFactor = 1 - outputLayerAct .* outputLayerAct;
    elseif strcmp(ActFun,'logistic')
      DerivativeFactor = outputLayerAct .* (1 - outputLayerAct);
    elseif strcmp(ActFun,'hardtanhyper')
      DerivativeFactor = ones(size(outputLayerAct));     %%% set to 0 if outputLayerAct<-1 or outputLayerAct>+1
      DerivativeFactor = DerivativeFactor .* (outputLayerAct > -1) .* (outputLayerAct < +1);
    elseif strcmp(ActFun,'reclinear')
      DerivativeFactor = (outputLayerAct > 0);
    elseif strcmp(ActFun,'softreclinear')                %%% softplus
      DerivativeFactor = 1-exp(-outputLayerAct);         %%% y = log(1+e^x) => dy = 1/(1+e^{-x}) = 1-e^{-y}
    elseif strcmp(ActFun,'linear')
      DerivativeFactor = 1;
    elseif strcmp(ActFun,'sine')
      DerivativeFactor = +sqrt( 1 - outputLayerAct.^2 ); %%% sine/cosine: we may lose the sign (we would need Data*Weights)???
    elseif strcmp(ActFun,'cosine')
      DerivativeFactor = -sqrt( 1 - outputLayerAct.^2 ); %%% sine/cosine: we may lose the sign (we would need Data*Weights)???
    else error('MLE_MultilayerPerceptron_DerivativeFactor: ActFun not implemented');
    end;


In [None]:
mlp_classifer = [LinearLayer{Float32}(input_dim, hidden_dim),
                 SigmoidActivation{Float32}(hidden_dim),
                 SoftMaxLayer{Float32}(hidden_dim, output_dim)];

In [230]:
mlp_classifer = [LinearLayer{Float32}(input_dim, hidden_dim),
                 SigmoidActivation{Float32}(hidden_dim),
                 LinearLayer{Float32}(hidden_dim, output_dim),
                 SoftMaxActivation{Float32}(output_dim)];

#### Delta term for the output layer for (softmax, crossentropy loss)

Since the delta terms are the gradients of the loss with respect to the preactivation at some layer, the gradient wil depend on both the softmax and the loss function. 

**If the softmax output uses sigmoid activations the delta term has the exact same form**

In [231]:
function backward(softmax::SoftMaxLayer,
                  loss::crossentropy_loss,
                  A_batch::Matrix, 
                  Y_enc_batch::Matrix)
    """
    Returns the Gradient of the crossentropy loss with respect to the preactivation
    This is usualy called the error at the output layer.
    """
    return softmax.W' * (A_batch - Y_enc_batch)
end

backward (generic function with 6 methods)

In [232]:
y_batch_enc = one_hot_encoding(y_train[1:25], unique_classes, class_to_pos);
ce_loss_10_classes = crossentropy_loss(10)

crossentropy_loss(10)

In [248]:
act1 = forward(mlp_classifer[1], Xbatch)
act2 = forward(mlp_classifer[2], act1)
act3 = forward(mlp_classifer[3], act2);
act4 = forward(mlp_classifer[4], act3);

In [251]:
size(act3 - y_batch_enc)

(10, 25)

In [252]:
function backward(softmax::SoftMaxActivation,
                  loss::crossentropy_loss,
                  A_batch::Matrix, 
                  Y_enc_batch::Matrix)
    """
    Returns the Gradient of the crossentropy loss with respect to the preactivation
    This is usualy called the error at the output layer.
    """
    return (A_batch - Y_enc_batch)
end

backward (generic function with 7 methods)

In [262]:
err_4 = backward(mlp_classifer[4], ce_loss_10_classes, act3, y_batch_enc);

In [275]:
size((mlp_classifer[3].W'*err_4) * act3')

(500, 10)

#### Delta term for the hidden layers

      
    def bprop(self, delta_in):
        x_t = np.transpose(self.x)
        self.grad_W = np.dot(x_t, delta_in)
        self.grad_b = delta_in.sum(axis=0)
        W_T = np.transpose(self.W)
        self.delta_out = np.dot(delta_in,W_T)
        return self.delta_out
        

In [235]:
# 2 ways to write the backward mehotd for a SigmoidActivation

# This needs the layer to store the activations during the forward pass
function backward(layer::SigmoidActivation, delta_in)
    delta_out = layer.act * (1 - layer.act) * delta_in
    return delta_out
end

# This DOES NOT need the layer to store the activations during the forward pass
function backward(layer::SigmoidActivation, delta_in, activation_batch)
    delta_out = activation_batch .* (1 .- activation_batch) .* delta_in
    return delta_out
end

backward (generic function with 6 methods)

In [225]:
size(err_3), size(act2)

((500, 25), (500, 25))

In [222]:
act2 * err_3

LoadError: [91mDimensionMismatch("A has dimensions (500,25) but B has dimensions (500,25)")[39m

In [183]:
mlp_classifer[2]

SigmoidActivation{Float32}(500)

In [181]:
backward(mlp_classifer[2], err_3, act1)

LoadError: [91mDimensionMismatch("A has dimensions (500,25) but B has dimensions (500,25)")[39m

In [140]:
# 2 ways to write the backward mehotd for a  Liner Layer

function backward(layer::LinearLayer, delta_in, activation_batch)
    delta_out =  dela_in * a'

    
    return delta_out
end

backward (generic function with 6 methods)

In [None]:

back_3 = backward(mlp_classifer[3], ce_loss_10_classes, act3, y_batch_enc)
back_2 = backward(mlp_classifer[2],  act2, y_batch_enc)
back_1 = 

In [136]:
# Gradient of the grad_W of the softmax
size((act2*delta_out')'), 
size((delta_out * act2')),
size(mlp_classifer[3].W)

((10, 500), (10, 500), (10, 500))

In [130]:

grad_output = delta_out * act2'
grad_input = delta_1 * act1'



((10, 500), (10, 500), (10, 500))

In [131]:
function backward(softmax::SoftMaxLayer, delta_in)
    return delta_out
end

backward (generic function with 3 methods)

In [65]:
aux = forward(mlp_classifer[1], Xbatch)
print(size(aux))
aux = forward(mlp_classifer[2], aux)
print(size(aux))
aux = forward(mlp_classifer[3], aux)
print(size(aux))

(500, 25)(500, 25)(10, 25)

#### Delta term for different layers

## Gradient computation

We can use a the type hierarchy to treat diferently layers. We will now focus on  3 types of layers:

- ActivationFunction
- NeuralNetLayer 
- LossLayer




In [105]:
println(typeof(mlp_classifer[1]) <: ActivationFunction)
println(typeof(mlp_classifer[1]) <: NeuralNetLayer)

println(typeof(mlp_classifer[2]) <: ActivationFunction)
println(typeof(mlp_classifer[2]) <: NeuralNetLayer)

false
true
true
false


In [106]:
mlp_classifer[3]

SoftMaxLayer{Float64} [input_dim: 500, ouput_dim: 10]

In [104]:
typeof(mlp_classifer[3]) <: ActivationFunction

true

In [101]:
mlp_classifier

LoadError: [91mUndefVarError: mlp_classifier not defined[39m

In [68]:
function compute_gradients(mlp, loss, X, Y)
    activations = [X]
    signal = X
    for (l,layer) in enumerate(mlp)
        signal = forward(layer, signal)
        if typeof(layer) <: ActivationFunction  
            push!(activations, signal)
        end
    end
    for (l,layer) in reverse(mlp)
        backsignal = backward(layer, signal)
        if typeof(layer) <: ActivationFunction  
            push!(activations, signal)
        end
    end
    return activations
end

compute_gradients (generic function with 1 method)

In [69]:
Y_train = one_hot_encoding(y_train, unique_classes, class_to_pos);
size(X_train), size(Y_train)

((784, 60000), (10, 60000))

In [70]:
act = compute_gradients(mlp_classifer,
                        crossentropy,
                        X_train[:,1:100],
                        Y_train[:,1:100])

3-element Array{Array{Float32,2},1}:
 Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]                                                                                            
 Float32[0.409869 0.508489 … 0.511245 0.548481; 0.549566 0.502302 … 0.589455 0.429273; … ; 0.440444 0.618377 … 0.419173 0.409908; 0.523728 0.596093 … 0.517019 0.494896]            
 Float32[0.0614294 0.0790431 … 0.0604551 0.0566084; 0.0895738 0.0996882 … 0.0827413 0.0943303; … ; 0.0561401 0.0575206 … 0.0509058 0.0558417; 0.119631 0.112974 … 0.107626 0.101359]

In [71]:
size.(act)

3-element Array{Tuple{Int64,Int64},1}:
 (784, 100)
 (500, 100)
 (10, 100) 

In [72]:
@benchmark act[2][:,1]* rand(100)'

BenchmarkTools.Trial: 
  memory estimate:  393.72 KiB
  allocs estimate:  5
  --------------
  minimum time:     35.655 μs (0.00% GC)
  median time:      165.822 μs (0.00% GC)
  mean time:        200.527 μs (22.15% GC)
  maximum time:     4.218 ms (90.84% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [73]:
mlp_classifer

3-element Array{Any,1}:
 LinearLayer{Float64} [input_dim: 784, ouput_dim: 500]
 SigmoidActivation{Float32}(500)                      
 SoftMaxLayer{Float64} [input_dim: 500, ouput_dim: 10]

In [74]:
# we can use the reverse function to iterate from the last layer 
# of mlp_classifier to the first one.

reverse([1,2,3,4,5])

5-element Array{Int64,1}:
 5
 4
 3
 2
 1

### Benchmark gradient computation

    class LinearLayer():
        def __init__(self, num_inputs, num_units, scale=0.01):
            self.num_units = num_units
            self.num_inputs = num_inputs
            self.W = np.random.random((num_inputs, num_units)) * scale
            self.b = np.zeros(num_units)

        def __str__(self): 
            return "LinearLayer(%i, %i)" % (self.num_inputs,
                                            self.num_units)

        def fprop(self, x, *args):
            self.x = x
            self.a = np.dot(x, self.W) + self.b
            return self.a

        def bprop(self, delta_in):
            x_t = np.transpose(self.x)
            self.grad_W = np.dot(x_t, delta_in)
            self.grad_b = delta_in.sum(axis=0)
            W_T = np.transpose(self.W)
            self.delta_out = np.dot(delta_in,W_T)
            return self.delta_out

        def update_params(self, lr):
            self.W = self.W - self.grad_W*lr
            self.b = self.b - self.grad_b*lr

In [75]:
function backward(linear_layer::LinearLayer, delta_batch_in::Array, Xbatch::Array)
    """
    Given an input batch of delta terms (where the data comes as columns)
    compute
    """
    grad_W = linear_layer.W * Xbatch' 
    grad_b = delta_batch_in
    return grad_W, grad_b
end

function forward(relu_activation::ReluActivation, Xbatch::Array)
    return Xbatch.*( Xbatch .> 0.)
end

function forward(sigmoid_activation::SigmoidActivation, Xbatch::Array)
    return 1./( 1 .+ exp.(-Xbatch))
end

function forward(softmax_layer::SoftMaxLayer, Xbatch::Array)
    """
    Layer shrinking the output to [0,1] values.
    Notice that sum(exp(Xbatch),1) will generate a Matrix with as many elements as
    columns in Xbatch. 
    """
    Xbatch_out = softmax_layer.W * Xbatch
    return exp.(Xbatch_out)./sum(exp.(Xbatch_out), 1)
end

forward (generic function with 6 methods)

In [76]:
Y_train = one_hot_encoding(y_train, unique_classes, class_to_pos);
size(X_train), size(Y_train)

((784, 60000), (10, 60000))

In [77]:
delta

delta (generic function with 1 method)

In [78]:
#backward(mlp)

In [79]:
#mlp_classifer[3]

In [80]:
@benchmark compute_gradients(mlp_classifer,
                             crossentropy,
                             X_train[:,1:1000],
                             Y_train[:,1:1000])

BenchmarkTools.Trial: 
  memory estimate:  20.51 MiB
  allocs estimate:  39
  --------------
  minimum time:     563.439 ms (0.41% GC)
  median time:      582.655 ms (0.48% GC)
  mean time:        591.289 ms (0.40% GC)
  maximum time:     636.607 ms (0.57% GC)
  --------------
  samples:          9
  evals/sample:     1

In [81]:
@benchmark compute_gradients(mlp_classifer,
                             crossentropy,
                             X_train[:,1:100],
                             Y_train[:,1:100])

BenchmarkTools.Trial: 
  memory estimate:  2.05 MiB
  allocs estimate:  34
  --------------
  minimum time:     54.938 ms (0.00% GC)
  median time:      63.822 ms (0.00% GC)
  mean time:        63.734 ms (0.35% GC)
  maximum time:     68.753 ms (3.58% GC)
  --------------
  samples:          79
  evals/sample:     1

In [82]:
aux = compute_gradients(mlp_classifer,
                        crossentropy,
                        X_train[:,1:100],
                        Y_train[:,1:100])

3-element Array{Array{Float32,2},1}:
 Float32[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0]                                                                                            
 Float32[0.409869 0.508489 … 0.511245 0.548481; 0.549566 0.502302 … 0.589455 0.429273; … ; 0.440444 0.618377 … 0.419173 0.409908; 0.523728 0.596093 … 0.517019 0.494896]            
 Float32[0.0614294 0.0790431 … 0.0604551 0.0566084; 0.0895738 0.0996882 … 0.0827413 0.0943303; … ; 0.0561401 0.0575206 … 0.0509058 0.0558417; 0.119631 0.112974 … 0.107626 0.101359]

In [83]:
size(X_train), size(Y_train), size(aux)

((784, 60000), (10, 60000), (3,))

In [84]:
size(aux[1]), size(aux[2]), size(aux[3])

((784, 100), (500, 100), (10, 100))

In [85]:
##@time compute_gradients(mlp_classifer,
#                        crossentropy,
#                        X_train,
#                        Y_train)

In [86]:
#BLAS.vendor()

In [87]:
mlp_classifer

3-element Array{Any,1}:
 LinearLayer{Float64} [input_dim: 784, ouput_dim: 500]
 SigmoidActivation{Float32}(500)                      
 SoftMaxLayer{Float64} [input_dim: 500, ouput_dim: 10]

# SGD on the weights

    y_probs = forward(x_batch)
    loss = LossLayer.fprop(y_probs, target_batch)
    losses += [loss]
    backward(y_probs, target_batch)
    update(learning_rate)
        

In [57]:
mlp_classifer

3-element Array{Any,1}:
 LinearLayer{Float64} [input_dim: 784, ouput_dim: 500]
 ReluActivation{Float32}(500)                         
 SoftMaxLayer{Float64} [input_dim: 500, ouput_dim: 10]

In [99]:
aux[3]

10×100 Array{Float32,2}:
 0.0614294  0.0790431  0.0548797  …  0.0664131  0.0604551  0.0566084
 0.0895738  0.0996882  0.088398      0.0908731  0.0827413  0.0943303
 0.0398186  0.0402922  0.0377777     0.0349294  0.0393121  0.0346216
 0.129657   0.125822   0.127724      0.133245   0.131919   0.125785 
 0.0559305  0.0581756  0.0621343     0.0606674  0.0574023  0.0604792
 0.136068   0.142551   0.138938   …  0.126478   0.14952    0.140141 
 0.209436   0.185521   0.225227      0.227448   0.214934   0.225157 
 0.102316   0.0984117  0.0999063     0.103221   0.105185   0.105676 
 0.0561401  0.0575206  0.0536506     0.0546465  0.0509058  0.0558417
 0.119631   0.112974   0.111365      0.102078   0.107626   0.101359 



## Implementation using BLAS

In [None]:
T = Float32
W1 = rand(T, 784, 500)
out1, out2, out3 = zeros(T, 500), zeros(T, 1024), zeros(T, 10)

BLAS.gemv!('N', T(1.0), W1, Array{Float32}(X_train[:,1]), T(0.0), out1)


In [None]:
T = Float32
W1 = rand(T, 500, 1000)
W2 = rand(T, 500, 500)
W3 = rand(T, 10, 500)
dW1, dW2, dW3 = zeros(W1), zeros(W2), zeros(W3)
out1, out2, out3 = zeros(T, 2048), zeros(T, 1024), zeros(T, 10)
dOut1, dOut2, dOut = zeros(T, 2048), zeros(T, 1024), zeros(T, 512 * 512)

function mockNN(input::Array{Float32, 1}, error::Array{Float32, 1})
  # Forward
  BLAS.gemv!('N', T(1.0), W1, input, T(0.0), out1)
  BLAS.gemv!('N', T(1.0), W2, out1, T(0.0), out2)
  BLAS.gemv!('N', T(1.0), W3, out2, T(0.0), out3)

  # Backward
  # ∂E/∂inputs and ∂E/∂W
  fill!(dW3, 0)
  fill!(dOut2, 0)
  BLAS.gemv!('N', T(1.0), W3', error, T(0.0), dOut2)
  BLAS.ger!(T(1.0), error, out2, dW3)
  
  fill!(dW2, 0)
  fill!(dOut1, 0)
  BLAS.gemv!('N', T(1.0), W2', dOut2, T(0.0), dOut1)
  BLAS.ger!(T(1.0), dOut2, out1, dW2)

  fill!(dW1, 0)
  fill!(dOut, 0)
  BLAS.gemv!('N', T(1.0), W1', dOut1, T(0.0), dOut)
  BLAS.ger!(T(1.0), dOut1, input, dW1)
end

In [None]:
input = rand(T, 512 * 512)
error = rand(T, 10)
@time mockNN(input, error)
for i in 1:10
  input = rand(T, 512 * 512)
  error = rand(T, 10)
  @time mockNN(input, error)
end

In [None]:
T = Float32
W1 = rand(T, 2048, 512 * 512)
W2 = rand(T, 1024, 2048)
W3 = rand(T, 10, 1024)
dW1, dW2, dW3 = zeros(W1), zeros(W2), zeros(W3)
out1, out2, out3 = zeros(T, 2048), zeros(T, 1024), zeros(T, 10)
dOut1, dOut2, dOut = zeros(T, 2048), zeros(T, 1024), zeros(T, 512 * 512)

function mockNN2(input::Array{Float32, 1}, error::Array{Float32, 1})
  # Forward
  BLAS.gemv!('N', T(1.0), W1, input, T(0.0), out1)
  BLAS.gemv!('N', T(1.0), W2, out1, T(0.0), out2)
  BLAS.gemv!('N', T(1.0), W3, out2, T(0.0), out3)

  # Backward
  # ∂E/∂inputs and ∂E/∂W
  fill!(dW3, 0)
  fill!(dOut2, 0)
  BLAS.gemv!('T', T(1.0), W3, error, T(0.0), dOut2)
  BLAS.ger!(T(1.0), error, out2, dW3)
  
  fill!(dW2, 0)
  fill!(dOut1, 0)
  BLAS.gemv!('T', T(1.0), W2, dOut2, T(0.0), dOut1)
  BLAS.ger!(T(1.0), dOut2, out1, dW2)

  fill!(dW1, 0)
  fill!(dOut, 0)
  BLAS.gemv!('T', T(1.0), W1, dOut1, T(0.0), dOut)
  BLAS.ger!(T(1.0), dOut1, input, dW1)
end

In [None]:
input = rand(T, 512 * 512)
error = rand(T, 10)
@time mockNN(input, error)
for i in 1:10
  input = rand(T, 512 * 512)
  error = rand(T, 10)
  @time mockNN2(input, error)
end
