## Toy NNets for education pourpouses in  Julia

Interesting discussion how to make forward pass efficiently using BLAS:

- https://discourse.julialang.org/t/blas-performance-issues-for-common-neural-network-patterns/565

- http://int8.io/neural-networks-in-julia-hyperbolic-tangent-and-relu/

- http://int8.io/backpropagation-from-scratch-in-julia-part-ii-derivation-and-implementation/



#### 


In [2]:
using MNIST

In [3]:
train = MNIST.traindata()

(
[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0],

[5.0,0.0,4.0,1.0,9.0,2.0,1.0,3.0,1.0,4.0  …  9.0,2.0,9.0,5.0,1.0,8.0,3.0,5.0,6.0,8.0])

In [4]:
X_train = train[1];
y_train = train[2];

In [5]:
sort(unique(train[2]))

10-element Array{Float64,1}:
 0.0
 1.0
 2.0
 3.0
 4.0
 5.0
 6.0
 7.0
 8.0
 9.0

## Defining Linear layer and relu layer

In [6]:
T = Float32
n_visible = 784
n_hidden = 500

srand(1234)
W1 = randn(T, n_hidden, n_visible );
W1 = W1/norm(W1)
b = zeros(n_hidden);

In [7]:
size(W1)

(500,784)

In [8]:
size(X_train[:,1:10])

(784,10)

In [9]:
#linear layer
batch = W1 * X_train[:,1:3].+b

#relu
l1_batch = batch .* (batch .>0);

In [10]:
exp(batch)./sum(exp(batch),1);

In [11]:
sum(exp(l1_batch),1)

1×3 Array{Float64,2}:
 3.69209e74  3.80344e69  4.13157e48

In [12]:
exp(l1_batch)./sum(exp(l1_batch),1);

#### Defining layers

In [13]:
type LinearLayer{T}
    """
    Standard layer between activations.
    The output of this layer for a given input is meant to be a matrix product 
    of the input times W
    """
    input_dim::Int
    output_dim::Int
    W::Array{T}
    b::Vector{T}
    seed::Int
    
    function LinearLayer(input, output; seed=1234)
        srand(seed)
        return new(input,
                   output,
                   randn(T,output,input)/sqrt(input),
                   zeros(output))
    end
end

In [14]:
input_dim = 784
output_dim = 500
l = LinearLayer{Float32}(input_dim,output_dim)

LinearLayer{Float32}(784,500,Float32[0.0309767 0.0511305 … 0.0373952 -0.00012965; -0.0322051 -0.0588398 … 0.0261659 0.0156726; … ; -0.0580577 0.0256086 … 0.116446 -0.0452519; -0.0132269 -0.00453994 … 0.0292845 0.0139892],Float32[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0  …  0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],4492689440)

In [15]:
type ReluActivation{T}
    """
    Relu Activation function latyer
    """
    dim::Int
end

In [16]:
type SoftMaxLayer{T}
    """
    Standard layer between activations.
    The output of this layer for a given input is meant to be a matrix product 
    of the input times W
    """
    input_dim::Int
    output_dim::Int
    W::Array{T}
    seed::Int
    
    function SoftMaxLayer(input, output; seed=1234)
        srand(seed)
        return new(input,
                   output,
                   randn(T,output, input)/sqrt(input))
    end
end


About Softmax layer

http://stats.stackexchange.com/questions/79454/softmax-layer-in-a-neural-network

### First example

Now let us define the structure, weight types (float type) of a MLP

In [17]:
input_dim = 784
hidden_dim = 500
output_dim = 10

10

In [18]:
mlp_classifer = [LinearLayer{Float32}(input_dim , hidden_dim),
                 ReluActivation{Float32}(hidden_dim),
                 SoftMaxLayer{Float32}(hidden_dim, output_dim)];

In [19]:
# write the code that accepts something like this
# mlp(784,500,10, ["sigmoid", "softmax"])

## Making predictions with the network

We have defined a MLP as list of layers and activation functions.

In order to make a prediction we need to make a forward pass through the network.
Let us assume by now that we have a good set of weights at each layer in the network and
we want to make a prediction.

In [20]:
function forward(linear_layer::LinearLayer, Xbatch::Array)
    """
    Given an input batch where the data comes as columns this method propagates 
    the batch using the weights of the linear layer
    """
    return linear_layer.W * Xbatch .+ linear_layer.b
end

function forward(relu_activation::ReluActivation, Xbatch::Array)
    return Xbatch.*( Xbatch .> 0.)
end

function forward(softmax_layer::SoftMaxLayer, Xbatch::Array)
    """
    Layer shrinking the output to [0,1] values.
    Notice that sum(exp(Xbatch),1) will generate a vector with as many elements as
    columns in Xbatch. 
    """
    Xbatch = softmax_layer.W * Xbatch
    return exp(Xbatch)./sum(exp(Xbatch),1)
end

forward (generic function with 3 methods)

In [21]:
Xbatch = X_train[:,1:25];

In [22]:
aux = forward(mlp_classifer[1], Xbatch)
print(size(aux))
aux = forward(mlp_classifer[2], aux)
print(size(aux))
aux = forward(mlp_classifer[3], aux)
print(size(aux))

(500,25)(500,25)(10,25)

In [23]:
function predict_proba(mlp, Xbatch::Array)
    for l in mlp
        Xbatch = forward(l, Xbatch)
    end
    return Xbatch
end

predict_proba (generic function with 1 method)

In [24]:
# Each column contains a vector that represents
# The conditional probability of the target beein from a particular class having observed
# the input vector.

@time predict_proba(mlp_classifer, Xbatch)

  0.504008 seconds (745.09 k allocations: 27.207 MB, 2.97% gc time)


10×25 Array{Float64,2}:
 5.39905e-22  1.0           9.17328e-30  …  9.1359e-32   8.48121e-20
 2.37617e-5   1.11199e-49   2.55392e-68     5.00068e-19  2.31968e-43
 6.93362e-51  1.8805e-97    5.01312e-33     9.6184e-80   1.53352e-83
 3.41924e-13  1.70124e-72   8.44621e-12     1.31159e-34  4.3633e-48 
 2.1359e-42   2.54938e-76   1.57863e-45     5.86561e-11  9.00681e-47
 7.27582e-16  5.33328e-55   1.75377e-23  …  4.57536e-48  1.0        
 0.000291457  1.23333e-74   1.0             1.0          1.73416e-63
 8.83703e-17  1.10494e-125  1.52251e-36     1.17959e-45  3.82412e-82
 1.68701e-43  4.17899e-67   3.65337e-25     2.82104e-50  3.34436e-60
 0.999685     6.36062e-51   7.87676e-32     2.7335e-64   1.85022e-33

# Training the network

Now we will deal with the learning part. That is, given a MLP architecture we will tune the weights in order to minimize some error function.


## Cross entropy loss for mlp_classifier

We will focus now on an standard loss function for classification problems. The cross entropy loss.

In [25]:
X_train[:,1:2];

In [26]:
unique_classes = sort(unique(y_train));

In [27]:
n_classes = length(unique_classes)

10

In [28]:
encoded_classes = zeros(Int32,n_classes, length(y_train)) ;

In [29]:
encoded_classes[:,2][]

0

In [30]:
for (a,b) in enumerate(unique_classes)
    print(a, " ", b, "\n")
end

1 0.0
2 1.0
3 2.0
4 3.0
5 4.0
6 5.0
7 6.0
8 7.0
9 8.0
10 9.0


In [31]:
class_to_pos = Dict(class =>pos for (pos,class) in enumerate(unique_classes))

Dict{Float64,Int64} with 10 entries:
  0.0 => 1
  4.0 => 5
  7.0 => 8
  9.0 => 10
  2.0 => 3
  3.0 => 4
  5.0 => 6
  8.0 => 9
  6.0 => 7
  1.0 => 2

In [32]:
function one_hot_encoding(y_train)
    unique_classes = sort(unique(y_train))
    class_to_pos = Dict(class =>pos for (pos,class) in enumerate(unique_classes))    
    encoded_classes = zeros(length(unique_classes), length(y_train))
    for (i,y) in enumerate(y_train)
        encoded_classes[:,i][class_to_pos[y]] = 1.
    end
    return encoded_classes
end

one_hot_encoding (generic function with 1 method)

In [35]:
Y_train = one_hot_encoding(y_train)[:,2]

10-element Array{Float64,1}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [36]:
### Cross entropy

In [34]:
type linear_layer

LoadError: syntax: incomplete: premature end of input

In [None]:
T = Float32
W1 = rand(T, 500, 1000)
W2 = rand(T, 500, 500)
W3 = rand(T, 10, 500)
dW1, dW2, dW3 = zeros(W1), zeros(W2), zeros(W3)
out1, out2, out3 = zeros(T, 2048), zeros(T, 1024), zeros(T, 10)
dOut1, dOut2, dOut = zeros(T, 2048), zeros(T, 1024), zeros(T, 512 * 512)

function mockNN(input::Array{Float32, 1}, error::Array{Float32, 1})
  # Forward
  BLAS.gemv!('N', T(1.0), W1, input, T(0.0), out1)
  BLAS.gemv!('N', T(1.0), W2, out1, T(0.0), out2)
  BLAS.gemv!('N', T(1.0), W3, out2, T(0.0), out3)

  # Backward
  # ∂E/∂inputs and ∂E/∂W
  fill!(dW3, 0)
  fill!(dOut2, 0)
  BLAS.gemv!('N', T(1.0), W3', error, T(0.0), dOut2)
  BLAS.ger!(T(1.0), error, out2, dW3)
  
  fill!(dW2, 0)
  fill!(dOut1, 0)
  BLAS.gemv!('N', T(1.0), W2', dOut2, T(0.0), dOut1)
  BLAS.ger!(T(1.0), dOut2, out1, dW2)

  fill!(dW1, 0)
  fill!(dOut, 0)
  BLAS.gemv!('N', T(1.0), W1', dOut1, T(0.0), dOut)
  BLAS.ger!(T(1.0), dOut1, input, dW1)
end


In [None]:

input = rand(T, 512 * 512)
error = rand(T, 10)
@time mockNN(input, error)
for i in 1:10
  input = rand(T, 512 * 512)
  error = rand(T, 10)
  @time mockNN(input, error)
end

In [None]:
T = Float32
W1 = rand(T, 2048, 512 * 512)
W2 = rand(T, 1024, 2048)
W3 = rand(T, 10, 1024)
dW1, dW2, dW3 = zeros(W1), zeros(W2), zeros(W3)
out1, out2, out3 = zeros(T, 2048), zeros(T, 1024), zeros(T, 10)
dOut1, dOut2, dOut = zeros(T, 2048), zeros(T, 1024), zeros(T, 512 * 512)

function mockNN2(input::Array{Float32, 1}, error::Array{Float32, 1})
  # Forward
  BLAS.gemv!('N', T(1.0), W1, input, T(0.0), out1)
  BLAS.gemv!('N', T(1.0), W2, out1, T(0.0), out2)
  BLAS.gemv!('N', T(1.0), W3, out2, T(0.0), out3)

  # Backward
  # ∂E/∂inputs and ∂E/∂W
  fill!(dW3, 0)
  fill!(dOut2, 0)
  BLAS.gemv!('T', T(1.0), W3, error, T(0.0), dOut2)
  BLAS.ger!(T(1.0), error, out2, dW3)
  
  fill!(dW2, 0)
  fill!(dOut1, 0)
  BLAS.gemv!('T', T(1.0), W2, dOut2, T(0.0), dOut1)
  BLAS.ger!(T(1.0), dOut2, out1, dW2)

  fill!(dW1, 0)
  fill!(dOut, 0)
  BLAS.gemv!('T', T(1.0), W1, dOut1, T(0.0), dOut)
  BLAS.ger!(T(1.0), dOut1, input, dW1)
end

In [None]:
input = rand(T, 512 * 512)
error = rand(T, 10)
@time mockNN(input, error)
for i in 1:10
  input = rand(T, 512 * 512)
  error = rand(T, 10)
  @time mockNN2(input, error)
end
