In [None]:
using Knet, Plots, JLD, NBInclude
nbinclude("mnist.ipynb")  # loads MNIST, defines dtrn,dtst,Atype,train,softmax,zeroone
mlpdata = load("mlp.jld") # loads MLP results for comparison
ENV["COLUMNS"]=80         # column width for array printing
plotlyjs();               # for interactive plots

## Introduction to convolution

In [2]:
# Convolution operator in Knet
@doc conv4

```
conv4(w, x; kwargs...)
```

Execute convolutions or cross-correlations using filters specified with `w` over tensor `x`.

Currently KnetArray{Float32/64,4/5} and Array{Float32/64,4} are supported as `w` and `x`.  If `w` has dimensions `(W1,W2,...,I,O)` and `x` has dimensions `(X1,X2,...,I,N)`, the result `y` will have dimensions `(Y1,Y2,...,O,N)` where

```
Yi=1+floor((Xi+2*padding[i]-Wi)/stride[i])
```

Here `I` is the number of input channels, `O` is the number of output channels, `N` is the number of instances, and `Wi,Xi,Yi` are spatial dimensions.  `padding` and `stride` are keyword arguments that can be specified as a single number (in which case they apply to all dimensions), or an array/tuple with entries for each spatial dimension.

# Keywords

  * `padding=0`: the number of extra zeros implicitly concatenated at the start and at the end of each dimension.
  * `stride=1`: the number of elements to slide to reach the next filtering window.
  * `upscale=1`: upscale factor for each dimension.
  * `mode=0`: 0 for convolution and 1 for cross-correlation.
  * `alpha=1`: can be used to scale the result.
  * `handle`: handle to a previously created cuDNN context. Defaults to a Knet allocated handle.


In [3]:
# Convolution in 1-D
@show w = reshape([1.0,2.0,3.0], (3,1,1,1))
@show x = reshape([1.0:7.0...], (7,1,1,1))
@show y = conv4(w, x);  # size Y = X - W + 1 = 5 by default

w = reshape([1.0, 2.0, 3.0], (3, 1, 1, 1)) = [1.0; 2.0; 3.0]
x = reshape([1.0:7.0...], (7, 1, 1, 1)) = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0]
y = conv4(w, x) = [10.0; 16.0; 22.0; 28.0; 34.0]


In [4]:
# Padding
@show y2 = conv4(w, x, padding=(1,0));  # size Y = X + 2P - W + 1 = 7 with padding=1
# To preserve input size (Y=X) for a given W, what padding P should we use?

y2 = conv4(w, x, padding=(1, 0)) = [4.0; 10.0; 16.0; 22.0; 28.0; 34.0; 32.0]


In [5]:
# Stride
@show y3 = conv4(w, x; padding=(1,0), stride=3);  # size Y = 1 + floor((X+2P-W)/S)

y3 = conv4(w, x; padding=(1, 0), stride=3) = [4.0; 22.0; 32.0]


In [6]:
# Mode
@show y4 = conv4(w, x, mode=0);  # Default mode (convolution) flips (reverses) w before sliding it over x
@show y5 = conv4(w, x, mode=1);  # mode=1 (cross-correlation) applies w as is, without flipping

y4 = conv4(w, x, mode=0) = [10.0; 16.0; 22.0; 28.0; 34.0]
y5 = conv4(w, x, mode=1) = [14.0; 20.0; 26.0; 32.0; 38.0]


In [7]:
# Convolution in more dimensions
x = reshape([1.0:9.0...], (3,3,1,1))

3×3×1×1 Array{Float64,4}:
[:, :, 1, 1] =
 1.0  4.0  7.0
 2.0  5.0  8.0
 3.0  6.0  9.0

In [8]:
w = reshape([1.0:4.0...], (2,2,1,1))

2×2×1×1 Array{Float64,4}:
[:, :, 1, 1] =
 1.0  3.0
 2.0  4.0

In [9]:
y = conv4(w, x)

2×2×1×1 Array{Float64,4}:
[:, :, 1, 1] =
 23.0  53.0
 33.0  63.0

In [10]:
# Convolution with multiple channels, filters, and instances
# size X = [X1,X2,...,Xd,Cx,N] where d is the number of spatial dimensions, Cx is the number of channels, N is the number of instances
x = reshape([1.0:18.0...], (3,3,2,1)) 

3×3×2×1 Array{Float64,4}:
[:, :, 1, 1] =
 1.0  4.0  7.0
 2.0  5.0  8.0
 3.0  6.0  9.0

[:, :, 2, 1] =
 10.0  13.0  16.0
 11.0  14.0  17.0
 12.0  15.0  18.0

In [11]:
# size W = [W1,W2,...,Wd,Cx,Cy] where d is the number of spatial dimensions, Cx is the number of input channels, Cy is the number of output channels
w = reshape([1.0:24.0...], (2,2,2,3));

In [12]:
# size Y = [Y1,Y2,...,Yd,Cy,N]  where Yi = 1 + floor((Xi+2Pi-Wi)/Si), Cy is the number of output channels, N is the number of instances
y = conv4(w,x)

2×2×3×1 Array{Float64,4}:
[:, :, 1, 1] =
 328.0  436.0
 364.0  472.0

[:, :, 2, 1] =
 808.0  1108.0
 908.0  1208.0

[:, :, 3, 1] =
 1288.0  1780.0
 1452.0  1944.0

See http://cs231n.github.io/assets/conv-demo/index.html for an animated example.

## Introduction to Pooling

In [13]:
# Pooling operator in Knet
@doc pool

```
pool(x; kwargs...)
```

Compute pooling of input values (i.e., the maximum or average of several adjacent values) to produce an output with smaller height and/or width.

Currently 4 or 5 dimensional KnetArrays with `Float32` or `Float64` entries are supported.  If `x` has dimensions `(X1,X2,...,I,N)`, the result `y` will have dimensions `(Y1,Y2,...,I,N)` where

```
Yi=1+floor((Xi+2*padding[i]-window[i])/stride[i])
```

Here `I` is the number of input channels, `N` is the number of instances, and `Xi,Yi` are spatial dimensions.  `window`, `padding` and `stride` are keyword arguments that can be specified as a single number (in which case they apply to all dimensions), or an array/tuple with entries for each spatial dimension.

# Keywords:

  * `window=2`: the pooling window size for each dimension.
  * `padding=0`: the number of extra zeros implicitly concatenated at the start and at the end of each dimension.
  * `stride=window`: the number of elements to slide to reach the next pooling window.
  * `mode=0`: 0 for max, 1 for average including padded values, 2 for average excluding padded values.
  * `maxpoolingNanOpt=0`: Nan numbers are not propagated if 0, they are propagated if 1.
  * `alpha=1`: can be used to scale the result.
  * `handle`: Handle to a previously created cuDNN context. Defaults to a Knet allocated handle.


In [14]:
# 1-D pooling example
@show x = reshape([1.0:6.0...], (6,1,1,1))
@show pool(x);

x = reshape([1.0:6.0...], (6, 1, 1, 1)) = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0]
pool(x) = [2.0; 4.0; 6.0]


In [15]:
# Window size
@show pool(x; window=3);  # size Y = floor(X/W)

pool(x; window=3) = [3.0; 6.0]


In [16]:
# Padding
@show pool(x; padding=(1,0));  # size Y = floor((X+2P)/W)

pool(x; padding=(1, 0)) = [1.0; 3.0; 5.0; 6.0]


In [17]:
# Stride
@show x = reshape([1.0:10.0...], (10,1,1,1));
@show pool(x; stride=4);  # size Y = 1 + floor((X+2P-W)/S)

x = reshape([1.0:10.0...], (10, 1, 1, 1)) = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0; 8.0; 9.0; 10.0]
pool(x; stride=4) = [2.0; 6.0; 10.0]


In [18]:
# Mode
x = ka(reshape([1.0:6.0...], (6,1,1,1)))
@show Array(x)
@show Array(pool(x; padding=(1,0), mode=0))  # max pooling
@show Array(pool(x; padding=(1,0), mode=1))  # avg pooling
@show Array(pool(x; padding=(1,0), mode=2)); # avg pooling excluding padded values (is not implemented on CPU)

Array(x) = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0]
Array(pool(x; padding=(1, 0), mode=0)) = [1.0; 3.0; 5.0; 6.0]
Array(pool(x; padding=(1, 0), mode=1)) = [0.5; 2.5; 4.5; 3.0]
Array(pool(x; padding=(1, 0), mode=2)) = [1.0; 2.5; 4.5; 6.0]


In [19]:
# More dimensions
x = reshape([1.0:16.0...], (4,4,1,1))

4×4×1×1 Array{Float64,4}:
[:, :, 1, 1] =
 1.0  5.0   9.0  13.0
 2.0  6.0  10.0  14.0
 3.0  7.0  11.0  15.0
 4.0  8.0  12.0  16.0

In [20]:
pool(x)

2×2×1×1 Array{Float64,4}:
[:, :, 1, 1] =
 6.0  14.0
 8.0  16.0

In [21]:
# Multiple channels and instances
x = reshape([1.0:32.0...], (4,4,2,1))

4×4×2×1 Array{Float64,4}:
[:, :, 1, 1] =
 1.0  5.0   9.0  13.0
 2.0  6.0  10.0  14.0
 3.0  7.0  11.0  15.0
 4.0  8.0  12.0  16.0

[:, :, 2, 1] =
 17.0  21.0  25.0  29.0
 18.0  22.0  26.0  30.0
 19.0  23.0  27.0  31.0
 20.0  24.0  28.0  32.0

In [22]:
# each channel and each instance is pooled separately
pool(x)  # size Y = (Y1,...,Yd,Cx,N) where Yi are spatial dims, Cx and N are identical to input X

2×2×2×1 Array{Float64,4}:
[:, :, 1, 1] =
 6.0  14.0
 8.0  16.0

[:, :, 2, 1] =
 22.0  30.0
 24.0  32.0

## A convolutional neural network model for MNIST

In [None]:
"""
    convnet(w, x; pdrop=(0,0,0))

Apply the layers described by `w` to the input `x` and return the output of the last layer.

`w` is a flat list of alternating weights and biases, `[w1,b1,w2,b2,...]`.  The number of
dimensions of each weight array selects the layer type:

* 4-D weight: dropout, `conv4` plus bias, `relu`, then `pool`.
* 2-D weight: dropout, `w*mat(x)` plus bias, then `relu` on every layer except the last.

# Keywords
* `pdrop=(0,0,0)`: dropout probabilities.  `pdrop[1]` is applied to the input of the first
  layer, `pdrop[2]` to the input of later convolutional layers and `pdrop[3]` to the input
  of later fully connected layers.

# Throws
* `ArgumentError` if `w` has an odd number of entries (a weight without its bias).
* `ErrorException` if a weight array is neither 2-D nor 4-D.
"""
function convnet(w,x; pdrop=(0,0,0))    # pdrop[1]:input, pdrop[2]:conv, pdrop[3]:fc
    # Fail early instead of hitting a BoundsError on w[i+1] half way through the network.
    iseven(length(w)) ||
        throw(ArgumentError("w must hold (weight,bias) pairs, got $(length(w)) arrays"))
    for i=1:2:length(w)
        if ndims(w[i]) == 4     # convolutional layer
            # The first layer uses the input dropout rate; spaces around ? and : are
            # mandatory from Julia 0.7 on.
            x = dropout(x, pdrop[i == 1 ? 1 : 2])
            x = conv4(w[i],x) .+ w[i+1]
            x = pool(relu.(x))
        elseif ndims(w[i]) == 2 # fully connected layer
            x = dropout(x, pdrop[i == 1 ? 1 : 3])
            x = w[i]*mat(x) .+ w[i+1]   # mat(x) turns x into a matrix for the product
            # w[end-1] is the weight of the last layer: leave its output linear.
            if i < length(w)-1; x = relu.(x); end
        else
            error("Unknown layer type: $(size(w[i]))")
        end
    end
    return x
end;

In [None]:
"""
    cinit(h...)

Initialize the weights of a multi-layer model; use `cinit(x,h1,h2,...,hn,y)` for a model
with `n` hidden layers.

`h[1]` describes the input: a `(width,height,channels)` triple for image input, or any
size whose `prod` gives the input length when the first layer is fully connected.  Every
following entry describes one layer:

* an integer: a fully connected layer with that many output units;
* a triple of integers `(w1,w2,cy)`: a convolution layer with `cy` filters of size `w1×w2`.

Returns an array `[w0,b0,w1,b1,...,wn,bn]` where `wi,bi` are the weight matrix/tensor and
the bias of the i'th layer.  Weights come from `xavier`, biases start at zero, and every
array is converted with `Atype`.

# Throws
* `ArgumentError` if a convolution layer is given an input that is not a 3-element shape
  (e.g. it follows a fully connected layer) or a filter spec with fewer than 3 entries.
* `ErrorException` if a layer spec is neither a tuple nor an integer.
"""
function cinit(h...)  # use cinit(x,h1,h2,...,hn,y) for n hidden layer model
    w = Any[]
    x = h[1]            # shape of the input to the layer currently being built
    for i=2:length(h)
        if isa(h[i],Tuple)
            # Validate before destructuring so a bad spec gives a readable error
            # instead of a BoundsError.
            length(x) >= 3 || throw(ArgumentError(
                "convolution layer $(h[i]) needs a (width,height,channels) input, got $(x)"))
            length(h[i]) >= 3 || throw(ArgumentError(
                "convolution layer must be given as (w1,w2,cy), got $(h[i])"))
            (x1,x2,cx) = x
            (w1,w2,cy) = h[i]
            push!(w, xavier(w1,w2,cx,cy))
            push!(w, zeros(1,1,cy,1))   # one bias per output channel
            x = (div(x1-w1+1,2),div(x2-w2+1,2),cy) # assuming conv4 with p=0, s=1 and pool with p=0,w=s=2
        elseif isa(h[i],Integer)
            push!(w, xavier(h[i],prod(x)))  # prod(x) flattens an image shape to a length
            push!(w, zeros(h[i],1))
            x = h[i]
        else
            error("Unknown layer type: $(h[i])")
        end
    end
    return map(Atype, w)
end;

In [25]:
lenet=cinit((28,28,1), (5,5,20), (5,5,50), 500, 10)

8-element Array{Knet.KnetArray{Float32,N} where N,1}:
 Knet.KnetArray{Float32,4}(Knet.KnetPtr(Ptr{Void} @0x00000081055e0000, 2000, 0, nothing), (5, 5, 1, 20))   
 Knet.KnetArray{Float32,4}(Knet.KnetPtr(Ptr{Void} @0x00000081052e0e00, 80, 0, nothing), (1, 1, 20, 1))     
 Knet.KnetArray{Float32,4}(Knet.KnetPtr(Ptr{Void} @0x00000081056e0000, 100000, 0, nothing), (5, 5, 20, 50))
 Knet.KnetArray{Float32,4}(Knet.KnetPtr(Ptr{Void} @0x00000081052e1000, 200, 0, nothing), (1, 1, 50, 1))    
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081057e0000, 1600000, 0, nothing), (500, 800))   
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081055e0800, 2000, 0, nothing), (500, 1))        
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081054e8000, 20000, 0, nothing), (10, 500))      
 Knet.KnetArray{Float32,2}(Knet.KnetPtr(Ptr{Void} @0x00000081052e1200, 40, 0, nothing), (10, 1))           

In [26]:
(x,y) = first(dtst)
softmax(lenet,x,y,convnet)

2.2925642f0

In [27]:
# Train LeNet and evaluate it only when no cached results exist; the four metric
# arrays are saved to cnn.jld so that later runs can simply reload them.
if !isfile("cnn.jld")
    setseed(1)
    lenet=cinit((28,28,1), (5,5,20), (5,5,50), 500, 10)
    # pdrop=(0,0,0.3): per the pdrop comment on convnet, dropout is used on fully connected layers only.
    # `weights` is iterated below, one model per element -- presumably one snapshot per epoch,
    # given the "Epochs" axis of the plots that follow; confirm against `train` in mnist.ipynb.
    # The trailing "# ...s" comments look like previously recorded @time results.
    @time weights=train(lenet,dtrn,convnet,lr=0.1,pdrop=(0,0,0.3)) # 233.8s
    @time trnloss = [ softmax(w,dtrn,convnet) for w in weights ]   # 85.4s
    @time tstloss = [ softmax(w,dtst,convnet) for w in weights ]   # 14.3s
    @time trnerr = [ zeroone(w,dtrn,convnet) for w in weights ]    # 84.9s
    @time tsterr = [ zeroone(w,dtst,convnet) for w in weights ]    # 14.1s
    @save "cnn.jld" trnloss tstloss trnerr tsterr
else    
    # NOTE(review): wrapping in @eval presumably delays expansion of the argument-less
    # @load until this branch runs, so the file is not read when it does not exist -- confirm
    # against the JLD documentation.
    @eval (@load "cnn.jld")
end
# Best test loss and test error over all evaluated models.
minimum(tstloss),minimum(tsterr)  # 0.0176, 0.0046

(0.017571094f0, 0.0046000000000000485)

In [None]:
plot([mlpdata["trnloss"] mlpdata["tstloss"] trnloss tstloss],ylim=(0.0,0.1),
    labels=[:trnMLP :tstMLP :trnCNN :tstCNN],xlabel="Epochs",ylabel="Loss")  

In [None]:
plot([mlpdata["trnerr"] mlpdata["tsterr"] trnerr tsterr],ylim=(0.0,0.03),
    labels=[:trnMLP :tstMLP :trnCNN :tstCNN],xlabel="Epochs",ylabel="Error")  

## Convolution vs Matrix Multiplication

In [30]:
# Convolution and matrix multiplication can be implemented in terms of each other.
# Convolutional networks have no additional representational power, only statistical efficiency.
# Our original 1-D example
@show w = reshape([1.0,2.0,3.0], (3,1,1,1))
@show x = reshape([1.0:7.0...], (7,1,1,1))
@show y = conv4(w, x);  # size Y = X - W + 1 = 5 by default

w = reshape([1.0, 2.0, 3.0], (3, 1, 1, 1)) = [1.0; 2.0; 3.0]
x = reshape([1.0:7.0...], (7, 1, 1, 1)) = [1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0]
y = conv4(w, x) = [10.0; 16.0; 22.0; 28.0; 34.0]


In [31]:
# Convolution as matrix multiplication (1)
# Turn w into a (Y,X) sparse matrix
w2 = Float64[3 2 1 0 0 0 0; 0 3 2 1 0 0 0; 0 0 3 2 1 0 0; 0 0 0 3 2 1 0; 0 0 0 0 3 2 1]

5×7 Array{Float64,2}:
 3.0  2.0  1.0  0.0  0.0  0.0  0.0
 0.0  3.0  2.0  1.0  0.0  0.0  0.0
 0.0  0.0  3.0  2.0  1.0  0.0  0.0
 0.0  0.0  0.0  3.0  2.0  1.0  0.0
 0.0  0.0  0.0  0.0  3.0  2.0  1.0

In [32]:
@show y2 = w2 * mat(x);

y2 = w2 * mat(x) = [10.0; 16.0; 22.0; 28.0; 34.0]


In [33]:
# Convolution as matrix multiplication (2)
# Turn x into a (W,Y) dense matrix (aka the im2col operation)
# This is used to speed up convolution with known efficient matmul algorithms
x3 = Float64[1 2 3 4 5; 2 3 4 5 6; 3 4 5 6 7]

3×5 Array{Float64,2}:
 1.0  2.0  3.0  4.0  5.0
 2.0  3.0  4.0  5.0  6.0
 3.0  4.0  5.0  6.0  7.0

In [34]:
@show w3 = [3.0 2.0 1.0]
@show y3 = w3 * x3;

w3 = [3.0 2.0 1.0] = [3.0 2.0 1.0]
y3 = w3 * x3 = [10.0 16.0 22.0 28.0 34.0]


In [35]:
# Matrix multiplication as convolution
# This could be used to make a fully connected network accept variable sized inputs.
w = reshape([1.0:6.0...], (2,3))

2×3 Array{Float64,2}:
 1.0  3.0  5.0
 2.0  4.0  6.0

In [36]:
x = reshape([1.0:3.0...], (3,1))

3×1 Array{Float64,2}:
 1.0
 2.0
 3.0

In [37]:
y = w * x

2×1 Array{Float64,2}:
 22.0
 28.0

In [38]:
# Consider w with size (Y,X)
# Treat each of the Y rows of w as a convolution filter
w2 = reshape(Array(w)', (3,1,1,2))

3×1×1×2 Array{Float64,4}:
[:, :, 1, 1] =
 1.0
 3.0
 5.0

[:, :, 1, 2] =
 2.0
 4.0
 6.0

In [39]:
# Reshape x for convolution
x2 = reshape(x, (3,1,1,1))

3×1×1×1 Array{Float64,4}:
[:, :, 1, 1] =
 1.0
 2.0
 3.0

In [40]:
# Use conv4 for matrix multiplication
y2 = conv4(w2, x2; mode=1)

1×1×2×1 Array{Float64,4}:
[:, :, 1, 1] =
 22.0

[:, :, 2, 1] =
 28.0

In [None]:
# So there is no difference between the class of functions representable with an MLP vs CNN.
# Sparse connections and weight sharing give CNNs more generalization power with images.
# Number of parameters in MLP256: (256x784)+256+(10x256)+10 = 203530
# Number of parameters in LeNet: (5*5*1*20)+20+(5*5*20*50)+50+(500*800)+500+(10*500)+10 = 431080