In [1]:
using LinearAlgebra
import Base: \
# Build a Bidiagonal matrix from a diagonal `dv` and an off-diagonal `ev` whose element
# types differ. Both bands are stored with the common type `promote_type(T, S)`.
# `uplo` is passed through unchanged to the parametric constructor.
function LinearAlgebra.Bidiagonal(dv::Vector{T}, ev::Vector{S}, uplo::Symbol) where {T,S}
    P = promote_type(T, S)
    Band = Vector{P}
    return Bidiagonal{P,Band}(dv, ev, uplo)
end


## The base method narrows the element type too much. We have to ensure that it is at least as wide as the input.
# Solve `adjA \ B` where `adjA` is the adjoint of a unit triangular matrix.
# The right-hand side is copied into a vector whose element type is wide enough to hold
# the elements of both operands as well as the result of a multiply-accumulate between them,
# and the solve is then done in place on that copy.
function \(adjA::Adjoint{<:Any,<:Union{UnitUpperTriangular,UnitLowerTriangular}}, B::AbstractVector)
    tri = adjA.parent
    Ta, Tb = eltype(tri), eltype(B)
    # Type produced by one "product plus product" step between entries of `tri` and `B`.
    Tacc = typeof(zero(Ta) * zero(Tb) + zero(Ta) * zero(Tb))
    Twide = promote_type(Ta, Tb, Tacc)
    rhs = copyto!(similar(B, Twide, size(B)), B)
    return ldiv!(adjoint(convert(AbstractArray{Twide}, tri)), rhs)
end

\ (generic function with 152 methods)

In [73]:
# Activation function.
function h(x)
    return exp(-x)
end

# Derivative of the activation, written in terms of the already computed activation
# value `y`; the pre-activation `x` is accepted but not used.
function h′(x, y)
    return -y
end

# Loss: half of the summed squared difference between `x` and `y`.
function 𝓁(x, y)
    return sum(abs2, x - y) / 2
end

# Derivative of the loss with respect to `x`.
function 𝓁′(x, y)
    return x - y
end

# Random initial values: 0.01 times `randn` of the requested size (a scalar for no sizes).
init(sizes...) = 0.01 * randn(sizes...)

init (generic function with 1 method)

In [3]:
𝜀 = 1e-4             # step size used by the finite-difference gradient checks
n = [5, 4, 3, 1]     # sizes used to shape the parameters of the matrix networks
N = length(n) - 1    # number of layers
B = 7                # number of columns of the input and output batches

7

### Scalar Neural Network

In [4]:
# Forward pass of the scalar network.
#   params : one [weight, bias] pair per layer
#   input  : value fed into the first layer
#   h, h′  : activation and its derivative, called as h(x) and h′(x, h(x))
#   N      : number of layers to evaluate (defaults to all of them)
# Returns (X, δ): X holds the input followed by every layer's activation,
# δ holds the activation derivative of every layer.
function neural_net(params, input; h=h, h′=h′, N=length(params))
    X = [input]
    δ = []
    for layer in 1:N
        # weight * previous value + bias * 1
        pre = sum(params[layer] .* [X[layer], 1])
        push!(X, h(pre))
        # X[end] is the activation that was just stored
        push!(δ, h′.(pre, X[end]))
    end
    return X, δ
end


neural_net (generic function with 1 method)

In [5]:
# One [W, B] pair of random scalars per layer.
params = [[init(), init()] for _ in 1:N]
# Random scalar input and output.
x, y = init(), init()

(0.09969491180448342, 0.17669682675943965)

In [6]:
# Forward pass: X collects the value at every layer, δ the activation derivatives.
X,δ = neural_net(params,x)
# L is lower bidiagonal with a zero diagonal; its subdiagonal holds δ[i] * params[i][1] for i = 2..N.
L   = Bidiagonal(zeros(N),[δ[i] * params[i][1] for i=2:N],:L)
# D is diagonal; entry i is the row vector δ[i] * [X[i] 1], one column per parameter of layer i.
D   = Diagonal(δ.*[[X[i],1]' for i=1:N])
# g is zero except for its last entry, which is 𝓁′ evaluated at the network output.
g   = [zeros(N-1);𝓁′(X[N+1],y)]
# Solve (I - L') z = g and apply D' to obtain one [weight, bias] gradient pair per layer.
∇J  = D'*((I-L')\g)

3-element Array{Array{Float64,1},1}:
 [0.000332745, 0.00333763]
 [0.0391587, 0.0434531]   
 [-0.585641, -0.560265]   

In [7]:
# ∇Jfd is the gradient estimated with central finite differences, for comparison with ∇J.
∇Jfd = ∇J * 0
ϵ    = ∇J * 0   # perturbation: zero everywhere except the entry being probed
for i in 1:N
    for j in 1:2
        ϵ[i][j] = 𝜀
        loss_plus  = 𝓁(neural_net(params .+ ϵ, x)[1][N+1], y)
        loss_minus = 𝓁(neural_net(params .- ϵ, x)[1][N+1], y)
        ∇Jfd[i][j] = (loss_plus - loss_minus) / 2𝜀
        ϵ[i][j] = 0.0   # reset before probing the next entry
    end
end
∇Jfd

3-element Array{Array{Float64,1},1}:
 [0.000332745, 0.00333763]
 [0.0391587, 0.0434531]   
 [-0.585642, -0.560265]   

### Matrix Neural Network

In [8]:
import Base: +,-,*,/,∘

# An operator stored as a pair of callables: `f` applies the operator and `fadj` applies
# its adjoint.
struct LinearMatrixOp # Is parametric type necessary? It causes un-readable error messages and some other issues.
    f::Any
    fadj::Any
end

# Operator whose adjoint is the same callable as the operator itself.
function LinearMatrixOp(f::Function)
    return LinearMatrixOp(f, f)
end

# X -> A*X, with adjoint X -> A'*X.
function LeftMul(A::Matrix)
    return LinearMatrixOp(X -> A * X, X -> A' * X)
end

# X -> X*A, with adjoint X -> X*A'.
function RightMul(A::Matrix)
    return LinearMatrixOp(X -> X * A, X -> X * A')
end

# Element-wise (Hadamard) product with A; the same callable is used for the adjoint.
function HadMul(A::Matrix)
    return LinearMatrixOp(X -> X .* A)
end

# Operator that maps everything to `Zero()`.
function ZeroMul()
    return LinearMatrixOp(X -> Zero())
end

# Identity operator. Not necessary, can be commented out.
function IdentMul()
    return LinearMatrixOp(identity)
end

# Additive and multiplicative identities for the operator type.
Base.zero(::Type{LinearMatrixOp}) = ZeroMul()
Base.one(::Type{LinearMatrixOp}) = IdentMul()

# The adjoint swaps the forward and adjoint callables.
function Base.adjoint(op::LinearMatrixOp)
    return LinearMatrixOp(op.fadj, op.f)
end

# A copy is a new wrapper around the same two callables.
function Base.copy(op::LinearMatrixOp)
    return LinearMatrixOp(op.f, op.fadj)
end

# Applying an operator to an array or a number calls its forward callable.
function *(A::LinearMatrixOp, X::Union{AbstractArray,Number})
    return A.f(X)
end

# Negation negates the result of both the forward and the adjoint callable.
function -(A::LinearMatrixOp)
    return LinearMatrixOp(X -> -A.f(X), X -> -A.fadj(X))
end

# Composition: (A ∘ B) applies B first, then A; the adjoint applies them in reverse order.
function ∘(A::LinearMatrixOp, B::LinearMatrixOp)
    return LinearMatrixOp(X -> A.f(B.f(X)), X -> B.fadj(A.fadj(X)))
end

# A symbolic zero: a placeholder that behaves like an additive identity and annihilates
# products, without having a size or element type of its own.
struct Zero end

# `zero(Any)` yields the symbolic zero.
Base.zero(::Type{Any}) = Zero()

# Addition and subtraction with the zero on the left.
+(::Zero, ::Zero) = Zero()
+(::Zero, rhs) = rhs
-(::Zero, rhs) = -rhs

# Products with the zero on the right (or on both sides) collapse to the zero.
*(::Zero, ::Zero) = Zero()
*(lhs, ::Zero) = Zero()

* (generic function with 349 methods)

In [9]:
# Forward pass of the matrix network.
#   params : one [W, b] pair per layer; layer i computes h.(W * X[i] .+ b)
#   input  : value fed into the first layer
#   h, h′  : activation and its derivative, broadcast as h.(x) and h′.(x, h.(x))
# Returns (X, δ): X holds the input followed by every layer's activation,
# δ holds the element-wise activation derivative of every layer.
function neural_net(params, input; h=h, h′=h′)
    X = [input]
    δ = []
    for i in 1:length(params)
        W, b = params[i][1], params[i][2]
        pre = W * X[i] .+ b
        push!(X, h.(pre))
        push!(δ, h′.(pre, X[i+1]))
    end
    return X, δ
end
# Wrap `x` as the single entry of a 1×1 array.
function array(x)
    return fill(x, 1, 1)
end

array (generic function with 1 method)

In [10]:
# params: one [W_i, b_i] pair per layer, used as x_{i+1} <- W_i*x_i .+ b_i
params = map(1:N) do i
    W = init(n[i+1], n[i])
    b = init(n[i+1])
    [W, b]
end
# Random input with n[1] rows and output with 1 row, B columns each.
x, y = init(n[1], B), init(1, B);

In [11]:
# Forward pass: X collects the value at every layer, δ the element-wise activation derivatives.
X,δ = neural_net(params,x)
# D is diagonal; entry i is the 1×2 row of operators
# [HadMul(δ[i]) ∘ RightMul(X[i])   HadMul(δ[i])].
D = Diagonal([[HadMul(δ[i]) ∘ RightMul(X[i]) HadMul(δ[i])] for i=1:N])
# ImL is lower bidiagonal: I on the diagonal and the negated operators
# HadMul(δ[i]) ∘ LeftMul(params[i][1]), i = 2..N, on the subdiagonal.
ImL = Bidiagonal([I for i in 1:N], -[HadMul(δ[i]) ∘ LeftMul(params[i][1]) for i=2:N] , :L)
# Right-hand side: Zero() placeholders except for the last entry, which is 𝓁′ at the network output.
g = push!(Any[Zero() for i=1:N-1],𝓁′(X[N+1],y))
# Solve ImL' z = g, wrap every entry of z in a 1×1 array, and apply D'.
∇J = D'*array.(ImL'\g);

In [12]:
# ∇Jfd is the gradient estimated with central finite differences, one parameter entry at a time.
∇Jfd = params*0
ϵ = params*0   # perturbation with the same nested shape as params
for i in 1:length(params)
    for wb in 1:2
        for j in 1:length(ϵ[i][wb])
            ϵ[i][wb][j] = 𝜀
            loss_plus  = 𝓁(neural_net(params + ϵ, x)[1][N+1], y)
            loss_minus = 𝓁(neural_net(params - ϵ, x)[1][N+1], y)
            ∇Jfd[i][wb][j] = (loss_plus - loss_minus) / 2𝜀
            ϵ[i][wb][j] = 0.0   # reset before probing the next entry
        end
    end
end
∇Jfd;

In [13]:
# Finite-difference gradient for the first entry of the first layer's parameters (its weight matrix).
∇Jfd[1][1]

4×5 Array{Float64,2}:
 -0.00918551   0.0105151   -0.0103996    0.0021086   -0.00231879
  0.0078814   -0.00946623   0.010044    -0.00155979   0.0025162 
  0.00581915  -0.00715716   0.00616917  -0.00195801   0.00147022
  0.0050518   -0.00622673   0.00536373  -0.00161102   0.00129137

In [14]:
# Gradient from the linear-system formulation for the same parameter block, for comparison.
∇J[1][1]

4×5 Array{Float64,2}:
 -0.00918551   0.0105151   -0.0103996    0.0021086   -0.00231879
  0.0078814   -0.00946623   0.010044    -0.00155979   0.0025162 
  0.00581915  -0.00715716   0.00616917  -0.00195801   0.00147022
  0.0050518   -0.00622673   0.00536373  -0.00161102   0.00129137

### A Showcase: Densely Connected Matrix Network

In [15]:
# Forward pass of the densely connected network: layer i receives every earlier value
# X[1], ..., X[i].
#   params : params[i] is multiplied entry by entry with [X..., I], so it holds one factor
#            per earlier value followed by one factor for the trailing `I`
#   input  : value fed into the first layer
#   h, h′  : activation and its derivative, broadcast as h.(x) and h′.(x, h.(x))
# Returns (X, δ) exactly as the other forward functions do.
function neural_net(params, input; h=h, h′=h′)
    X = [input]
    δ = []
    for i in 1:length(params)
        terms = params[i] .* [X..., I]
        pre = broadcast(+, terms...)
        push!(X, h.(pre))
        push!(δ, h′.(pre, X[i+1]))
    end
    return X, δ
end;
# Wrap `x` as the single entry of a 1×1 array.
function array(x)
    return fill(x, 1, 1)
end;

In [16]:
# Layer i has i+1 parameter blocks: an n[i+1]×n[j] matrix for each earlier value j = 1..i,
# followed by an n[i+1]×1 column as the last block.
params = [[j <= i ? init(n[i+1], n[j]) : init(n[i+1], 1) for j in 1:i+1] for i in 1:N]
# Random input with n[1] rows and output with 1 row, B columns each.
x, y = init(n[1], B), init(1, B);

In [17]:
# Forward pass: X collects the value at every layer, δ the element-wise activation derivatives.
X,δ = neural_net(params,x)
# D is diagonal; entry i is a row of operators built from HadMul(δ[i]) ∘ RightMul(X[j]) for
# j = 1..i, followed by HadMul(δ[i]).
# NOTE(review): the inner and outer adjoints presumably cancel because adjoint of a vector is
# applied to its elements as well — confirm.
D = Diagonal([[[(HadMul(δ[i]) ∘ RightMul(X[j]))' for j=1:i]' HadMul(δ[i])] for i=1:N])
# ImL wraps uninitialised storage; only the entries below the diagonal are assigned here.
ImL = UnitLowerTriangular(Matrix{Any}(undef,N,N))
for i=2:N, j=1:i-1
    # Negated operator linking layer value j+1 to layer i.
    ImL[i,j] = -HadMul(δ[i]) ∘ LeftMul(params[i][j+1]) 
end
# Right-hand side: Zero() placeholders except for the last entry, which is 𝓁′ at the network output.
g = push!(Any[Zero() for i=1:N-1],𝓁′(X[N+1],y))
# Solve ImL' z = g, wrap every entry of z in a 1×1 array, and apply D'.
∇J = D'*array.(ImL'\g)

3-element Array{Array{Any,2},1}:
 [[-0.0146373 -0.0171189 … -0.0210885 0.0410024; -0.0143086 -0.018308 … -0.0222181 0.0421256; -0.00966715 -0.0121301 … -0.0141614 0.0269208; 0.00152569 0.00182626 … 0.00191509 -0.00355642]; [-0.10794 -0.0975917 … -0.118599 -0.103915; -0.108819 -0.0990499 … -0.121469 -0.11092; -0.0704849 -0.0637224 … -0.0766635 -0.0709802; 0.0094296 0.00829117 … 0.009679 0.00939663]]                         
 [[0.0142146 0.0169136 … 0.0204834 -0.0398483; 0.0325147 0.0388068 … 0.0432487 -0.082327; -0.000177677 -0.000228124 … -0.000263707 0.000485406]; [0.720403 0.635437 0.737404 0.637144; 1.49741 1.32101 1.53354 1.32525; -0.00880898 -0.00777354 -0.00902285 -0.00779906]; [0.104437 0.0943859 … 0.114782 0.101125; 0.218172 0.194104 … 0.230717 0.213583; -0.0012477 -0.0011209 … -0.0013769 -0.0012992]]
 [[-0.168753 -0.210445 … -0.249917 0.47049]; [-8.52873 -7.52481 -8.73304 -7.54717]; [-10.1459 -10.852 -6.52011]; [-1.22108 -1.10112 … -1.34801 -1.23476]]                          

In [18]:
# ∇Jfd is the gradient estimated with central finite differences, one parameter entry at a time.
∇Jfd = params*0
ϵ = params*0   # perturbation with the same nested shape as params
for i in 1:length(ϵ)
    for j in 1:length(ϵ[i])
        for k in 1:length(ϵ[i][j])
            ϵ[i][j][k] = 𝜀
            loss_plus  = 𝓁(neural_net(params + ϵ, x)[1][N+1], y)
            loss_minus = 𝓁(neural_net(params - ϵ, x)[1][N+1], y)
            ∇Jfd[i][j][k] = (loss_plus - loss_minus) / 2𝜀
            ϵ[i][j][k] = 0.0   # reset before probing the next entry
        end
    end
end

In [19]:
# Finite-difference gradient for the first parameter block of the first layer.
∇Jfd[1][1]

4×5 Array{Float64,2}:
 -0.0146373   -0.0171189   -0.0466687   -0.0210885    0.0410024 
 -0.0143086   -0.018308    -0.0482024   -0.0222181    0.0421256 
 -0.00966715  -0.0121301   -0.0302245   -0.0141614    0.0269208 
  0.00152569   0.00182626   0.00356372   0.00191509  -0.00355642

In [20]:
# Gradient from the linear-system formulation for the same parameter block, for comparison.
∇J[1][1]

4×5 Array{Float64,2}:
 -0.0146373   -0.0171189   -0.0466687   -0.0210885    0.0410024 
 -0.0143086   -0.018308    -0.0482024   -0.0222181    0.0421256 
 -0.00966715  -0.0121301   -0.0302245   -0.0141614    0.0269208 
  0.00152569   0.00182626   0.00356372   0.00191509  -0.00355642

## MNIST MLP Example

In [219]:
# Data
using Knet
import Knet: Data
include(Knet.dir("data","mnist.jl"))
# dtrn and dtst = [ (x1,y1), (x2,y2), ... ] where xi,yi are presumably one minibatch of inputs
# (784 rows, per `xsize`) and the matching labels — TODO confirm against Knet's mnistdata.
dtrn,dtst = mnistdata(xsize=(784,:));

# Layers: sizes from the 784-row input down to the 10-row output.
n = [784, 128, 64, 10]
N = length(n) - 1   # number of layers
# Random initial values: 0.1 times `randn` of the requested size.
init(sizes...) = 0.1 * randn(sizes...)

# Nonlinearity: relu.
function h(x)
    if x > 0
        return x
    end
    return zero(x)
end

# Derivative of relu, decided from the activation value `y` and returned in the type of `x`.
function h′(x, y)
    if y > 0
        return one(x)
    end
    return zero(x)
end

#Loss
# Negative log likelihood loss, computed by `nll` with `average=true`.
# x is a dxb matrix; a is an integer array that keeps the correct answers.
# NOTE(review): the original note calls `a` "d-length"; it looks like it should hold one
# answer per column of x — confirm.
function 𝓁(x, a)
    return nll(x, a; average=true)
end
# Derivative of the loss with respect to x: (softmax(x) - onehot(a)) / length(a).
# Note!: this will be simplified if we can figure out how to integrate the derivative of
# getindex into our formulation.
function 𝓁′(x, a)
    # Array shaped like x with ones at the positions that `findindices` reports for `a`.
    onehot = zero(x)
    onehot[Knet.findindices(x, a, dims=1)] .= 1
    return (softmax(x, dims=1) .- onehot) ./ length(a)
end

# Forward function.
#   params : one [W, b] pair per layer
#   input  : value fed into the first layer
#   h, h′  : activation and its derivative, used on every layer except the last
# The last layer is purely affine (no activation), so its derivative entry is all ones.
# Returns (X, δ): X holds the input followed by every layer's output, δ the matching
# element-wise derivatives.
function neural_net(params, input; h=h, h′=h′)
    X = [input]
    δ = []
    # Affine map of one layer applied to the most recent entry of X.
    affine(layer) = layer[1] * X[end] .+ layer[2]
    for layer in params[1:end-1]
        pre = affine(layer)
        push!(X, h.(pre))
        push!(δ, h′.(pre, X[end]))
    end
    out = affine(params[end])
    push!(X, out)
    push!(δ, one.(out))
    return X, δ
end

neural_net (generic function with 1 method)

In [None]:
# Model parameters: one [W_i, b_i] pair per layer, with random weights and zero biases.
params =[[init(n[i+1],n[i]),zeros(n[i+1])] for i=1:N]
α = 0.5    # learning rate
epochs = 3 # number of epochs to train model
@time for epoch=1:epochs # 1 epoch takes ~ 65 seconds  in my macbook
    for (x,y) in dtrn
        # Forward pass, then build and solve the operator-valued linear system for the gradient.
        X,δ = neural_net(params,x;h=h, h′= h′)
        D = Diagonal([[HadMul(δ[i]) ∘ RightMul(X[i]) HadMul(δ[i])] for i=1:N])
        ImL = Bidiagonal([I for i in 1:N], -[HadMul(δ[i]) ∘ LeftMul(params[i][1]) for i=2:N] , :L)
        g = push!(Any[Zero() for i=1:N-1],𝓁′(X[N+1],y))
        ∇J = D'*array.(ImL'\g);
        # Update every layer using the learning rate α.
        for i in eachindex(params)
            params[i][1] = params[i][1] - α*∇J[i][1]
            # The bias term is summed over dimension 2. `sum(...; dims=2)` returns an
            # n×1 Matrix, so flatten it with `vec`; otherwise the bias would change from a
            # Vector to an n×1 Matrix after the first update.
            params[i][2] = params[i][2] - α*vec(sum(∇J[i][2],dims=2))
        end
    end
end

In [None]:
# Accuracy over dtst: count the samples whose predicted answer equals the given one.
zeroone = 0   # number of correct predictions
total = 0     # number of samples seen
for (x, y) in dtst
    # Output of the last layer for this batch.
    scores = last(first(neural_net(params, x; h=h, h′=h′)))
    # Row index of the largest score in every column.
    answers = vec(getindex.(argmax(scores, dims=1), 1))
    global zeroone += sum(y .== answers)
    global total += length(answers)
end
accuracy = 100 * zeroone / total