# Convolutional Neural Networks
(c) Deniz Yuret, 2019

* Objectives: See the effect of sparse and shared weights implemented by convolutional networks.
* Prerequisites: MLP models (mlp.ipynb), KnetArray, param, param0, dropout, relu, nll

In [None]:
# Setup display width, load packages, import symbols
ENV["COLUMNS"]=72
using Pkg; for p in ("Knet","Plots"); haskey(Pkg.installed(),p) || Pkg.add(p); end
using Base.Iterators: flatten
using Statistics: mean
using Knet: Knet, conv4, pool, mat, KnetArray, nll, zeroone, progress, sgd, param, param0, dropout, relu, Data

## Introduction to convolution

In [None]:
# Convolution operator in Knet
@doc conv4

In [None]:
# Convolution in 1-D
# The 1-D filter and input are stored as 4-D arrays: a length-3 filter as (3,1,1,1) and a
# length-7 input as (7,1,1,1); the last two dimensions are channels and filters/instances.
w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w
x = reshape([1.0:7.0...], (7,1,1,1)); @show x
@show y = conv4(w, x);  # size Y = X - W + 1 = 5 by default

In [None]:
# Padding
w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w
x = reshape([1.0:7.0...], (7,1,1,1)); @show x
@show y2 = conv4(w, x, padding=(1,0));  # size Y = X + 2P - W + 1 = 7 with padding=1
# To preserve input size (Y=X) for a given W, what padding P should we use?

In [None]:
# Stride
w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w
x = reshape([1.0:7.0...], (7,1,1,1)); @show x
@show y3 = conv4(w, x; padding=(1,0), stride=3);  # size Y = 1 + floor((X+2P-W)/S)

In [None]:
# Mode
w = reshape([1.0,2.0,3.0], (3,1,1,1)); @show w
x = reshape([1.0:7.0...], (7,1,1,1)); @show x
@show y4 = conv4(w, x, mode=0);  # Default mode (convolution) inverts w
@show y5 = conv4(w, x, mode=1);  # mode=1 (cross-correlation) does not invert w

In [None]:
# Convolution in more dimensions
x = reshape([1.0:9.0...], (3,3,1,1))

In [None]:
w = reshape([1.0:4.0...], (2,2,1,1))

In [None]:
y = conv4(w, x)

In [None]:
# Convolution with multiple channels, filters, and instances
# size X = [X1,X2,...,Xd,Cx,N] where d is the number of dimensions, Cx is channels, N is instances
x = reshape([1.0:18.0...], (3,3,2,1)) 

In [None]:
# size W = [W1,W2,...,Wd,Cx,Cy] where d is the number of dimensions, Cx is input channels, Cy is output channels
w = reshape([1.0:24.0...], (2,2,2,3));

In [None]:
# size Y = [Y1,Y2,...,Yd,Cy,N]  where Yi = 1 + floor((Xi+2Pi-Wi)/Si), Cy is channels, N is instances
y = conv4(w,x)

See http://cs231n.github.io/assets/conv-demo/index.html for an animated example.

## Introduction to Pooling

In [None]:
# Pooling operator in Knet
@doc pool

In [None]:
# 1-D pooling example
x = reshape([1.0:6.0...], (6,1,1,1)); @show x
@show pool(x);

In [None]:
# Window size
x = reshape([1.0:6.0...], (6,1,1,1)); @show x
@show pool(x; window=3);  # size Y = floor(X/W)

In [None]:
# Padding
x = reshape([1.0:6.0...], (6,1,1,1)); @show x
@show pool(x; padding=(1,0));  # size Y = floor((X+2P)/W)

In [None]:
# Stride
x = reshape([1.0:10.0...], (10,1,1,1)); @show x
@show pool(x; stride=4);  # size Y = 1 + floor((X+2P-W)/S)

In [None]:
# Mode (using KnetArray here; not all modes are implemented on the CPU)
x = KnetArray(reshape([1.0:6.0...], (6,1,1,1))); @show x
@show pool(x; padding=(1,0), mode=0)  # max pooling
@show pool(x; padding=(1,0), mode=1)  # avg pooling
@show pool(x; padding=(1,0), mode=2); # avg pooling excluding padded values (is not implemented on CPU)

In [None]:
# More dimensions
x = reshape([1.0:16.0...], (4,4,1,1))

In [None]:
pool(x)

In [None]:
# Multiple channels and instances
x = reshape([1.0:32.0...], (4,4,2,1))

In [None]:
# each channel and each instance is pooled separately
pool(x)  # size Y = (Y1,...,Yd,Cx,N) where Yi are spatial dims, Cx and N are identical to input X

## Experiment setup

In [None]:
# Load data (see mnist.ipynb)
include(Knet.dir("data","mnist.jl"))  # Load data
dtrn,dtst = mnistdata();              # dtrn and dtst = [ (x1,y1), (x2,y2), ... ] where xi,yi are minibatches of 100

In [None]:
# Take the first minibatch of the test set and print a one-line summary of its two parts;
# x and y are reused below as a sample input for lenet(x,y).
(x,y) = first(dtst)
println.(summary.((x,y)));

In [None]:
# For running experiments
"""
    trainresults(file, model; o...)

Return a 4-row matrix of training results for `model`, obtained either by training or by
loading previously saved results from `file`.

The user is asked on standard input whether to train from scratch:

- An answer starting with `y` trains `model` with `sgd` on 100 repetitions of the global
  `dtrn`. Once every `length(dtrn)` training iterations (starting with the first) it records
  the column `[model(dtrn), model(dtst), zeroone(model,dtrn), zeroone(model,dtst)]`.
  The resulting `Float32` matrix is saved in `file` under the key `"results"`.
- Any other answer, including an empty line, loads the matrix from `file`, downloading
  the file first if it does not exist locally.

The row-wise minima are printed before the matrix is returned. The keyword arguments
`o...` are accepted but not used.
"""
function trainresults(file,model; o...)
    print("Train from scratch? ")
    # startswith instead of readline()[1]: an empty answer means "no" rather than a BoundsError
    if startswith(readline(), "y")
        # Keep items 1, n+1, 2n+1, ...; (i-1) % n == 0 also works for n == 1 (keeps everything),
        # whereas i % n == 1 would keep nothing in that case.
        takeevery(n,itr) = (x for (i,x) in enumerate(itr) if (i-1) % n == 0)
        # Lazy generator: the metrics are evaluated while the sgd iterator advances
        r = ((model(dtrn), model(dtst), zeroone(model,dtrn), zeroone(model,dtst))
             for x in takeevery(length(dtrn), progress(sgd(model,repeat(dtrn,100)))))
        r = reshape(collect(Float32,flatten(r)),(4,:))  # one column per recorded point
        Knet.save(file,"results",r)
        Knet.gc() # To save gpu memory
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        r = Knet.load(file,"results")
    end
    println(minimum(r,dims=2))
    return r
end

## A convolutional neural network model for MNIST

In [None]:
# Define a convolutional layer:
# Layer state: filter weights w, bias b, activation function f, dropout probability p
struct Conv
    w
    b
    f
    p
end

# Forward pass: dropout on the input, convolution plus bias, pooling, then the activation
function (layer::Conv)(x)
    dropped = dropout(x, layer.p)
    biased = conv4(layer.w, dropped) .+ layer.b
    return layer.f.(pool(biased))
end

# Build a layer with a (w1,w2,cx,cy) weight tensor and a (1,1,cy,1) bias
function Conv(w1::Int, w2::Int, cx::Int, cy::Int, f=relu; pdrop=0)
    weights = param(w1, w2, cx, cy)
    bias = param0(1, 1, cy, 1)
    return Conv(weights, bias, f, pdrop)
end

In [None]:
# Redefine dense layer (See mlp.ipynb):
# Layer state: weight matrix w, bias b, activation function f, dropout probability p
struct Dense
    w
    b
    f
    p
end

# Forward pass: dropout, flatten, affine transform, activation
function (layer::Dense)(x)
    # mat turns the 4-D tensor into a 2-D matrix so that matrix multiplication applies
    flat = mat(dropout(x, layer.p))
    return layer.f.(layer.w * flat .+ layer.b)
end

# Build a layer mapping i inputs to o outputs: weights are (o,i), bias has length o
function Dense(i::Int, o::Int, f=relu; pdrop=0)
    weights = param(o, i)
    bias = param0(o)
    return Dense(weights, bias, f, pdrop)
end

In [None]:
# Let's define a chain of layers
# A model is a sequence of layers applied one after another
struct Chain
    layers
    function Chain(layers...)
        return new(layers)
    end
end

# Prediction: feed x through every layer in order
function (c::Chain)(x)
    for layer in c.layers
        x = layer(x)
    end
    return x
end

# Loss for one minibatch: negative log likelihood of the predictions against y
function (c::Chain)(x, y)
    scores = c(x)
    return nll(scores, y)
end

# Loss for a dataset: mean of the minibatch losses
function (c::Chain)(d::Data)
    return mean(c(x, y) for (x, y) in d)
end

In [None]:
# LeNet-style model: two Conv layers followed by two Dense layers.
# Conv(w1,w2,cx,cy): filter sizes w1,w2, cx input channels, cy output channels.
# Dense(i,o): i inputs, o outputs. Both default to relu and take dropout via pdrop.
lenet =   Chain(Conv(5,5,1,20),  # 5x5 filters, 1 input channel -> 20 output channels
                Conv(5,5,20,50),  # 5x5 filters, 20 -> 50 channels
                # NOTE(review): 800 presumably equals 4*4*50 for 28x28 input images — confirm input size
                Dense(800,500,pdrop=0.3),  # 800 -> 500 units, dropout 0.3 applied to the layer input
                Dense(500,10,identity,pdrop=0.3))  # 500 -> 10 outputs, identity activation; nll is applied by Chain
summary.(l.w for l in lenet.layers)  # summarize the weight array of each layer

In [None]:
lenet(x,y)

## CNN vs MLP

In [None]:
# 225s [0.000159805; 0.0163911; 0.0; 0.0046]
cnn = trainresults("cnn113.jld2", lenet);

In [None]:
# Load the saved MLP results to compare against the CNN.
# NOTE(review): unlike trainresults, this does not download the file when it is missing;
# mlp113f.jld2 is presumably produced by mlp.ipynb — confirm it is in the working directory.
mlp = Knet.load("mlp113f.jld2","results");

In [None]:
using Plots; default(fmt=:png,ls=:auto)

In [None]:
# Comparison to MLP shows faster convergence, better generalization
# Rows 1 and 2 of cnn are model(dtrn) and model(dtst) as recorded by trainresults, one column
# per recorded point; mlp is assumed to have the same row layout — TODO confirm.
plot([mlp[1,:], mlp[2,:], cnn[1,:], cnn[2,:]],ylim=(0.0,0.1),
     labels=[:trnMLP :tstMLP :trnCNN :tstCNN],xlabel="Epochs",ylabel="Loss")  

In [None]:
# Same comparison for rows 3 and 4: zeroone(model,dtrn) and zeroone(model,dtst) in trainresults
# (mlp is assumed to have the same row layout — TODO confirm).
plot([mlp[3,:], mlp[4,:], cnn[3,:], cnn[4,:]],ylim=(0.0,0.03),
    labels=[:trnMLP :tstMLP :trnCNN :tstCNN],xlabel="Epochs",ylabel="Error")  

## Convolution vs Matrix Multiplication

In [None]:
# Convolution and matrix multiplication can be implemented in terms of each other.
# Convolutional networks have no additional representational power, only statistical efficiency.
# Our original 1-D example
@show w = reshape([1.0,2.0,3.0], (3,1,1,1))
@show x = reshape([1.0:7.0...], (7,1,1,1))
@show y = conv4(w, x);  # size Y = X - W + 1 = 5 by default

In [None]:
# Convolution as matrix multiplication (1)
# Turn w into a (Y,X) sparse matrix
w2 = Float64[3 2 1 0 0 0 0; 0 3 2 1 0 0 0; 0 0 3 2 1 0 0; 0 0 0 3 2 1 0; 0 0 0 0 3 2 1]

In [None]:
@show y2 = w2 * mat(x);

In [None]:
# Convolution as matrix multiplication (2)
# Turn x into a (W,Y) dense matrix (aka the im2col operation)
# This is used to speed up convolution with known efficient matmul algorithms
x3 = Float64[1 2 3 4 5; 2 3 4 5 6; 3 4 5 6 7]

In [None]:
@show w3 = [3.0 2.0 1.0]
@show y3 = w3 * x3;

In [None]:
# Matrix multiplication as convolution
# This could be used to make a fully connected network accept variable sized inputs.
w = reshape([1.0:6.0...], (2,3))

In [None]:
x = reshape([1.0:3.0...], (3,1))

In [None]:
y = w * x

In [None]:
# Consider w with size (Y,X)
# Treat each of the Y rows of w as a convolution filter
w2 = copy(reshape(Array(w)', (3,1,1,2)))

In [None]:
# Reshape x for convolution
x2 = reshape(x, (3,1,1,1))

In [None]:
# Use conv4 for matrix multiplication
y2 = conv4(w2, x2; mode=1)

* So there is no difference between the class of functions representable with an MLP vs CNN.
* Sparse connections and weight sharing give CNNs more generalization power with images.
* Number of parameters in MLP256: (256x784)+256+(10x256)+10 = 203530
* Number of parameters in LeNet: (5x5x1x20)+20+(5x5x20x50)+50+(500x800)+500+(10x500)+10 = 431080