# Part 1: Introduction to Tensors

In [None]:
import Base:println,+

# Minimal tensor wrapper: a mutable container with a single untyped `data`
# field holding the wrapped values.
mutable struct Tensor
    data 
end

# Element-wise addition of two tensors. The sum is wrapped in a new `Tensor`
# so that results can be chained (e.g. `x + x + x`) instead of degrading to a
# bare array after the first addition.
+(a::Tensor, b::Tensor) = Tensor(a.data + b.data)

# Print a tensor by printing its underlying data followed by a newline.
function println(t::Tensor)
    return println(t.data)
end
    
# Build a tensor and display it. `println` is used (rather than `print`) so
# that the `println(::Tensor)` method of this cell is actually exercised and
# each result ends up on its own line.
x = Tensor([1,2,3,4,5])
println(x)

# Adding a tensor to itself doubles every element.
y = x + x
println(y)

In [None]:
# Restart helper: registers an exit hook that launches a new Julia process
# with the command returned by `Base.julia_cmd()`, then exits this session.
function workspace()
    relaunch() = run(`$(Base.julia_cmd())`)
    atexit(relaunch)
    exit()
end

In [None]:
# Invoke the restart helper: exits this session; the registered exit hook
# then starts a new Julia process.
workspace()

# Part 2: Introduction to Autograd

In [None]:
import Base:println,+

# Tensor that remembers how it was produced so gradients can be propagated.
#
# Fields
#   data        - the wrapped array
#   creators    - the tensors this one was computed from (`nothing` for leaves)
#   creation_op - name of the operation that produced it (`nothing` for leaves)
#   grad        - gradient assigned by `backward`; `nothing` until then
mutable struct Tensor
    data
    creators
    creation_op
    grad
    # `grad` is initialised explicitly: leaving it out of `new` would make the
    # field undefined, so reading `t.grad` before `backward` has run would
    # throw an UndefRefError instead of yielding `nothing`.
    Tensor(data; creators=nothing, creation_op=nothing) =
        new(data, creators, creation_op, nothing)
end

# Store `grad` on `t`; when `t` was produced by an addition, hand the same
# gradient on to its first and second creator.
function backward(t::Tensor, grad)
    t.grad = grad
    t.creation_op == "add" || return nothing

    for k in 1:2
        backward(t.creators[k], grad)
    end
    return nothing
end

# Adding two tensors records both operands as creators of the result.
function +(a::Tensor, b::Tensor)
    return Tensor(a.data + b.data; creators=[a,b], creation_op="add")
end

# Printing a tensor shows its data; printing a vector of tensors shows the
# data of every element.
function println(t::Tensor)
    return println(t.data)
end

function println(t::Array{Tensor,1})
    return println(map(item -> item.data, t))
end
    
# Two leaf tensors (no creators).
x = Tensor([1,2,3,4,5])
y = Tensor([2,2,2,2,2])

# z records x and y as its creators; back-propagating a gradient of ones
# from z stores that gradient on z, x and y.
z = x + y
backward(z, Tensor([1,1,1,1,1]))

In [None]:
# Inspect what the backward pass stored on the leaves, and the bookkeeping
# (creators and creation op) recorded on z.
println(x.grad)
println(y.grad)
println(z.creators)
println(z.creation_op)

In [None]:
# Four leaf tensors.
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])
d = Tensor([-1,-2,-3,-4,-5])

# Two levels of additions: g = (a + b) + (c + d).
e = a + b
f = c + d
g = e + f

# Back-propagate a gradient of ones from g down to the leaves.
backward(g, Tensor([1,1,1,1,1]))

println(a.grad)

# Part 3: Tensors That Are Used Multiple Times

In [None]:
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])

# b is used twice: once in d and once in e.
d = a + b
e = b + c
f = d + e
backward(f, Tensor([1,1,1,1,1]))

# Check whether both uses of b contributed to its gradient.
# NOTE(review): the two-argument `backward` from Part 2 assigns `t.grad = grad`
# on every call rather than accumulating — confirm which definition is active
# when this cell runs.
b.grad.data == [2,2,2,2,2]

In [None]:
# Show the gradient data actually stored on b.
b.grad.data

# Part 4: Upgrading Autograd to Support Multiple Tensors

In [None]:
using Random
import Base:+,println

# Autograd-aware tensor.
#
# Fields
#   data        - wrapped array
#   autograd    - whether gradients are tracked for this tensor
#   creators    - tensors this one was computed from, or `nothing`
#   creation_op - name of the producing operation, or `nothing`
#   id          - identifier used as the key in the creators' `children` tables
#   children    - maps a child's id to how many times that child uses this tensor
#   grad        - starts as `nothing`
mutable struct Tensor
    data
    autograd
    creators
    creation_op
    id
    children
    grad
    function Tensor(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        # Draw a random identifier only when the caller did not supply one.
        id = isnothing(id) ? rand(1:100000) : id
        T = new(data, autograd, creators, creation_op, id, Dict(), nothing)

        # Register the new tensor as a child of each creator; a creator that
        # appears several times is counted once per occurrence.
        if creators !== nothing
            for parent in creators
                parent.children[T.id] = get(parent.children, T.id, 0) + 1
            end
        end
        return T
    end
end

# True when every count stored in `t.children` is zero (also true when the
# table is empty).
function all_children_grads_accounted_for(t::Tensor)
    return all(pending -> pending == 0, values(t.children))
end

# Back-propagate `grad` through the graph rooted at `t`.
#
# Arguments
#   t           - tensor receiving the gradient; nothing happens unless `t.autograd`
#   grad        - incoming gradient as a non-autograd `Tensor`; defaults to a
#                 tensor of ones shaped like `t.data`
#   grad_origin - the child tensor the gradient comes from, or `nothing` when
#                 `backward` is called on `t` directly
#
# Throws the string "cannot backprop more than once" when `grad_origin` has
# already delivered all of its gradients to `t`.
function backward(t::Tensor, grad=nothing, grad_origin=nothing)
    if t.autograd
        # Fall back to ones only when no gradient was supplied; replacing it
        # unconditionally would silently discard the caller's gradient.
        if isnothing(grad)
            grad = Tensor(ones(size(t.data)))
        end

        # Consume one pending gradient from the child that sent it.
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end

        # Accumulate, so a tensor used several times sums all contributions.
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end

        # grads must not have grads of their own
        @assert !grad.autograd

        # Only continue when there is something to backprop into and either
        # every child has reported its gradient or `backward` was called on
        # this tensor directly (no `grad_origin`).
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
        end
    end
end

# Add two tensors. The result is tracked by autograd (with both operands
# recorded as creators) only when both operands are tracked themselves.
function +(a::Tensor, b::Tensor)
    tracked = a.autograd && b.autograd
    tracked || return Tensor(a.data+b.data)
    return Tensor(a.data .+ b.data; autograd=true, creators=[a,b], creation_op = "add")
end

# Print a tensor by printing its underlying data followed by a newline.
function println(t::Tensor)
    return println(t.data)
end

# Three tracked leaf tensors.
a = Tensor([1,2,3,4,5]; autograd=true)
b = Tensor([2,2,2,2,2]; autograd=true)
c = Tensor([5,4,3,2,1]; autograd=true)

# b is used twice (in d and in e), so it has two children.
d = a + b
e = b + c
f = d + e

backward(f, Tensor([1,1,1,1,1]))

# b's gradient is compared against the sum of both contributions.
println(b.grad.data == [2,2,2,2,2])

# Part 5: Add Support for Negation

In [None]:
using Random
import Base:+,-,println

# Autograd-aware tensor.
#
# Fields
#   data        - wrapped array
#   autograd    - whether gradients are tracked for this tensor
#   creators    - tensors this one was computed from, or `nothing`
#   creation_op - name of the producing operation, or `nothing`
#   id          - identifier used as the key in the creators' `children` tables
#   children    - maps a child's id to how many times that child uses this tensor
#   grad        - starts as `nothing`
mutable struct Tensor
    data
    autograd
    creators
    creation_op
    id
    children
    grad
    function Tensor(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        # Draw a random identifier only when the caller did not supply one.
        id = isnothing(id) ? rand(1:100000) : id
        T = new(data, autograd, creators, creation_op, id, Dict(), nothing)

        # Register the new tensor as a child of each creator; a creator that
        # appears several times is counted once per occurrence.
        if creators !== nothing
            for parent in creators
                parent.children[T.id] = get(parent.children, T.id, 0) + 1
            end
        end
        return T
    end
end

# True when every count stored in `t.children` is zero (also true when the
# table is empty).
function all_children_grads_accounted_for(t::Tensor)
    return all(pending -> pending == 0, values(t.children))
end

# Back-propagate `grad` through the graph rooted at `t`.
#
# Arguments
#   t           - tensor receiving the gradient; nothing happens unless `t.autograd`
#   grad        - incoming gradient as a non-autograd `Tensor`; defaults to a
#                 tensor of ones shaped like `t.data`
#   grad_origin - the child tensor the gradient comes from, or `nothing` when
#                 `backward` is called on `t` directly
#
# Throws the string "cannot backprop more than once" when `grad_origin` has
# already delivered all of its gradients to `t`.
function backward(t::Tensor, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor(ones(size(t.data)))
        end

        # Consume one pending gradient from the child that sent it.
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end

        # Accumulate, so a tensor used several times sums all contributions.
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end

        # grads must not have grads of their own
        @assert !grad.autograd

        # Only continue when there is something to backprop into and either
        # every child has reported its gradient or `backward` was called on
        # this tensor directly (no `grad_origin`).
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            if t.creation_op == "neg"
                # Pass `t` as the origin like the "add" branch does. Without
                # it the creator skips its child accounting and propagates
                # further immediately, even while other children of the
                # creator have not delivered their gradients yet.
                backward(t.creators[1], -t.grad, t)
            end
        end
    end
end

# Add two tensors. The result is tracked by autograd (with both operands
# recorded as creators) only when both operands are tracked themselves.
function +(a::Tensor, b::Tensor)
    tracked = a.autograd && b.autograd
    tracked || return Tensor(a.data+b.data)
    return Tensor(a.data .+ b.data; autograd=true, creators=[a,b], creation_op = "add")
end

# Negate a tensor; the result is tracked when the operand is tracked.
function -(a::Tensor)
    negated = a.data .* -1
    a.autograd || return Tensor(negated)
    return Tensor(negated; autograd=true, creators=[a], creation_op = "neg")
end


# Print a tensor by printing its underlying data followed by a newline.
function println(t::Tensor)
    return println(t.data)
end

# Three tracked leaf tensors.
a = Tensor([1,2,3,4,5]; autograd=true)
b = Tensor([2,2,2,2,2]; autograd=true)
c = Tensor([5,4,3,2,1]; autograd=true)

# b enters the graph twice, each time through its own negation.
d = a + (-b)
e = (-b) + c
f = d + e

backward(f, Tensor([1,1,1,1,1]))

# b's gradient is compared against the sum of both negated contributions.
print(b.grad.data == [-2,-2,-2,-2,-2])

# Part 6: Add Support for Additional Functions

In [None]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims

# Autograd-aware tensor.
#
# Fields
#   data        - wrapped array
#   autograd    - whether gradients are tracked for this tensor
#   creators    - tensors this one was computed from, or `nothing`
#   creation_op - name of the producing operation, or `nothing`
#   id          - identifier used as the key in the creators' `children` tables
#   children    - maps a child's id to how many times that child uses this tensor
#   grad        - starts as `nothing`
mutable struct Tensor
    data
    autograd
    creators
    creation_op
    id
    children
    grad
    function Tensor(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        # Draw a random identifier only when the caller did not supply one.
        id = isnothing(id) ? rand(1:100000) : id
        T = new(data, autograd, creators, creation_op, id, Dict(), nothing)

        # Register the new tensor as a child of each creator; a creator that
        # appears several times is counted once per occurrence.
        if creators !== nothing
            for parent in creators
                parent.children[T.id] = get(parent.children, T.id, 0) + 1
            end
        end
        return T
    end
end

# True when every count stored in `t.children` is zero (also true when the
# table is empty).
function all_children_grads_accounted_for(t::Tensor)
    return all(pending -> pending == 0, values(t.children))
end

# Back-propagate `grad` through the computation graph rooted at `t`.
#
# Arguments
#   t           - tensor receiving the gradient; nothing happens unless `t.autograd`
#   grad        - incoming gradient as a non-autograd `Tensor`; defaults to a
#                 tensor of ones shaped like `t.data`
#   grad_origin - the child tensor the gradient comes from, or `nothing` when
#                 `backward` is called on `t` directly
#
# Throws the string "cannot backprop more than once" when `grad_origin` has
# already delivered all of its gradients to `t`.
#
# NOTE(review): only the "add", "sub" and "mul" branches pass `t` on as
# `grad_origin`. The other branches omit it, so the creator neither decrements
# its child counter nor waits for its remaining children — confirm this is
# intended before changing any branch.
function backward(t::Tensor, grad=nothing, grad_origin=nothing)
    if t.autograd
        # Default gradient: ones with the shape of the data.
        if isnothing(grad)
            grad = Tensor(ones(size(t.data)))
        end
    
        # Consume one pending gradient from the child that sent it.
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        # Accumulate, so a tensor used several times sums all contributions.
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # Only continue backpropagating if there is something to backprop
        # into and all gradients (from children) are accounted for. Waiting
        # for children is overridden when `backward` was called on this
        # tensor directly (no `grad_origin`).
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            # Addition: both operands receive the gradient unchanged.
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            # Subtraction: the second operand receives the negated gradient.
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            # Element-wise product: each operand receives the gradient times
            # the other operand.
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            # Matrix product t = c1 * c2: c1 receives grad * c2' and c2
            # receives c1' * grad.
            # NOTE(review): `c2'` / `c1'` go through `adjoint(::Tensor)`, which
            # builds a tracked tensor when the operand is tracked and thereby
            # adds an entry to that operand's `children` table.
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2' # gradient for the left operand
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            # Transpose: pass the transposed gradient back.
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            # Sum ops are named "sum_<dim>": re-expand the gradient along the
            # summed dimension to the creator's extent in that dimension.
            # The match is a substring test on the op name.
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            # Expand ops are named "expand_<dim>": sum the gradient over the
            # expanded dimension; `dropdims` returns its input unchanged when
            # it already has as many dimensions as the creator's data.
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end              
            
            # Negation: pass the negated gradient back.
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
        end
    end
end

# The size of a tensor is the size of its underlying data.
function size(a::Tensor)
    return size(a.data)
end

# Sum of two tensors; tracked by autograd (op "add") only when both
# operands are tracked.
function +(a::Tensor, b::Tensor)
    total = a.data + b.data
    if a.autograd && b.autograd
        return Tensor(total; autograd=true, creators=[a,b], creation_op = "add")
    end
    return Tensor(total)
end

# Negation; tracked (op "neg") when the operand is tracked.
function -(a::Tensor)
    negated = a.data .* -1
    a.autograd || return Tensor(negated)
    return Tensor(negated; autograd=true, creators=[a], creation_op = "neg")
end

# Difference of two tensors; tracked (op "sub") only when both operands are
# tracked.
function -(a::Tensor, b::Tensor)
    difference = a.data - b.data
    if a.autograd && b.autograd
        return Tensor(difference; autograd=true, creators=[a,b], creation_op = "sub")
    end
    return Tensor(difference)
end

# Element-wise multiplication (`a .* b`). Elements are paired by linear index
# and written into a `zeros` array shaped like `a.data`. The result is
# tracked (op "mul") only when both operands are tracked.
function broadcasted(f::typeof(*), a::Tensor, b::Tensor)
    product = zeros(size(a.data))
    for k in eachindex(product)
        product[k] = a.data[k] * b.data[k]
    end
    if a.autograd && b.autograd
        return Tensor(product; autograd=true, creators=[a,b], creation_op ="mul")
    else
        return Tensor(product)
    end
end

# Element-wise subtraction (`a .- b`), built the same way. The result is
# tracked (op "sub") only when both operands are tracked.
function broadcasted(f::typeof(-), a::Tensor, b::Tensor)
    difference = zeros(size(a.data))
    for k in eachindex(difference)
        difference[k] = a.data[k] - b.data[k]
    end
    if a.autograd && b.autograd
        return Tensor(difference; autograd=true, creators=[a,b], creation_op ="sub")
    else
        return Tensor(difference)
    end
end

# Sum a tensor along dimension `dims` and drop that dimension from the result.
#
# Keywords
#   dims - integer dimension to reduce (required). It is embedded in the
#          creation op name ("sum_<dims>") so that `backward` can expand the
#          gradient along the same dimension again.
#
# Returns a `Tensor`; it is tracked by autograd when `a` is.
function sum(a::Tensor; dims)
    # Drop exactly the reduced dimension (not whichever input dimensions
    # happen to be singleton), so that `expand` in `backward` rebuilds a
    # gradient with the shape of the input.
    new_ = dropdims(sum(a.data; dims=dims); dims=dims)
    if (a.autograd)
        return Tensor(new_; autograd=true, creators=[a], creation_op = "sum_"*string(dims))
    end
    return Tensor(new_)
end

# Drop dimension(s) `dims` from a tensor. When the data already has
# `ndims_cr` dimensions the tensor itself is returned untouched. Otherwise the
# result is a new tensor, tracked (op "dropdims") when `a` is tracked.
function dropdims(a::Tensor;dims=dims,ndims_cr=ndims_cr)
    ndims(a.data) == ndims_cr && return a
    squeezed = dropdims(a.data; dims=dims)
    a.autograd || return Tensor(squeezed)
    return Tensor(squeezed; autograd=true, creators=[a], creation_op = "dropdims")
end

# Insert a new dimension at position `dim` and repeat the data `copies` times
# along it. The result is tracked (op "expand_<dim>") when `a` is tracked.
function expand(a::Tensor, dim, copies)
    old_dims = size(a)
    rank = length(old_dims) + 1
    # Shape with a singleton slot at `dim`; dimensions after it shift by one.
    slotted = ntuple(rank) do d
        if d < dim
            old_dims[d]
        elseif d == dim
            1
        else
            old_dims[d-1]
        end
    end
    # Repeat only along the new slot.
    reps = ntuple(d -> d == dim ? copies : 1, rank)
    widened = repeat(reshape(a.data, slotted), outer=reps)
    if a.autograd
        return Tensor(widened; autograd=true, creators=[a], creation_op = "expand_"*string(dim))
    end
    return Tensor(widened)
end

# Transpose (`a'`); tracked (op "transpose") when the operand is tracked.
function adjoint(a::Tensor)
    flipped = a.data'
    a.autograd || return Tensor(flipped)
    return Tensor(flipped;autograd=true, creators=[a], creation_op = "transpose")
end

# Matrix multiplication; tracked (op "mm") only when both operands are
# tracked.
function *(a::Tensor, b::Tensor)
    product = a.data * b.data
    if a.autograd && b.autograd
        return Tensor(product; autograd=true, creators=[a,b], creation_op = "mm")
    end
    return Tensor(product)
end


# Printing a tensor prints its data.
function println(t::Tensor)
    return println(t.data)
end

# Rich ("text/plain") display of a tensor delegates to the display of its data.
function show(io::IO, m::MIME"text/plain", a::Tensor)
    return show(io, m, a.data)
end

# Three tracked leaf tensors.
a = Tensor([1,2,3,4,5]; autograd=true)
b = Tensor([2,2,2,2,2]; autograd=true)
c = Tensor([5,4,3,2,1]; autograd=true)

# b enters the graph twice, each time through its own negation.
d = a + (-b)
e = (-b) + c
f = d + e

backward(f, Tensor([1,1,1,1,1]))

# Element-wise comparison of b's gradient with the expected values.
print(b.grad.data .== [-2,-2,-2,-2,-2])

# A few Notes on Sum and Expand

In [None]:
# A tracked 2x3 tensor to experiment with.
x = Tensor([1 2 3;4 5 6];autograd=true)

In [None]:
# Sum along dimension 2.
sum(x;dims=2)

In [None]:
# Sum along dimension 1.
sum(x;dims=1)

In [None]:
# Insert a new third dimension and repeat the data 4 times along it.
expand(x,3,4)

# Part 7: Use Autograd to Train a Neural Network

#### Previously we would train a model like this

In [None]:
using Random: seed!
seed!(0)

# Four two-feature samples (one per row) and one target value per sample.
data = [ 0 0; 0 1; 1 0; 1 1;]
target = [0; 1; 0; 1]

# Randomly initialised weights: 2 inputs -> 3 hidden units -> 1 output.
weights_0_1 = rand(2,3)
weights_1_2 = rand(3,1)

for i=1:10
    
    # Predict: two successive matrix products
    layer_1 = data * weights_0_1
    layer_2 = layer_1 * weights_1_2
    
    # Compare
    diff = (layer_2 - target)
    sqdiff = (diff .* diff)
    loss = sum(sqdiff;dims=1) # squared errors summed over the samples

    # Learn: this is the backpropagation piece, written out by hand
    layer_1_grad = diff * weights_1_2'
    weight_1_2_update = layer_1' * diff
    weight_0_1_update = data' * layer_1_grad
    
    # In-place gradient step with a step size of 0.1
    weights_1_2 .-= weight_1_2_update .* 0.1
    weights_0_1 .-= weight_0_1_update .* 0.1
    println(loss[1])
end

In [None]:
using Random: seed!
seed!(0)

# Same samples and targets as arrays wrapped in tracked tensors.
data = Tensor([ 0 0; 0 1; 1 0; 1 1;], autograd=true)
target = Tensor([0; 1; 0; 1], autograd=true)

# Weight tensors: 2 inputs -> 3 hidden units -> 1 output.
w = []
push!(w, Tensor(rand(2,3), autograd=true))
push!(w, Tensor(rand(3,1), autograd=true))

for i=1:10

    # Predict
    pred_1 = data * w[1]
    pred_2 = pred_1 * w[2]
    diff_1 = pred_2 .- target
    diff_2 = diff_1 .* diff_1
    
    # Compare: squared errors summed over the samples
    loss = sum(diff_2;dims=1)
    
    # Learn: start backpropagation from the loss with a gradient of ones
    backward(loss, Tensor(ones(Float32, size(loss.data))))

    # Gradient step with a step size of 0.1, then reset the gradient in place
    for w_ in w
        w_.data .-= w_.grad.data .* 0.1
        w_.grad.data .*= 0
    end

    println(loss)
end

# Part 8: Adding Automatic Optimization

In [None]:
# Stochastic gradient descent optimizer.
#
# Fields
#   parameters - collection of objects exposing `data` and `grad` (whose own
#                `data` holds the gradient values)
#   alpha      - step size applied to the gradients
mutable struct SGD
    parameters
    alpha
    SGD(parameters, alpha) = new(parameters, alpha)
end

# Reset the gradient of every parameter to zero, in place. Parameters that
# have not received a gradient yet (`grad` is `nothing`) are skipped.
function zero!(opt::SGD)
    for p in opt.parameters
        isnothing(p.grad) && continue
        # `fill!` instead of multiplying by zero: NaN * 0 and Inf * 0 are NaN,
        # so a multiplication could never clear a gradient that blew up.
        fill!(p.grad.data, 0)
    end
    return nothing
end

# Apply one update `data -= grad * alpha` to every parameter.
#
# Arguments
#   opt  - the optimizer
#   zero - when true (the default) each gradient is reset after its update
#
# Parameters whose `grad` is `nothing` are left untouched.
function step(opt::SGD, zero=true)
    for p in opt.parameters
        isnothing(p.grad) && continue
        p.data -= (p.grad.data .* opt.alpha)
        if zero
            fill!(p.grad.data, 0)
        end
    end
    return nothing
end

In [None]:
using Random: seed!
seed!(0)

# Four two-feature samples (one per row) and their targets, as tracked tensors.
data = Tensor([ 0 0; 0 1; 1 0; 1 1;], autograd=true)
target = Tensor([0; 1; 0; 1], autograd=true)

# Weight tensors: 2 inputs -> 3 hidden units -> 1 output.
w = []
push!(w, Tensor(rand(2,3), autograd=true))
push!(w, Tensor(rand(3,1), autograd=true))

# The optimizer owns the weight list and uses a step size of 0.1.
opt = SGD(w, 0.1)

for i=1:10

    # Predict
    pred_1 = data * w[1]
    pred_2 = pred_1 * w[2]
    diff_1 = pred_2 .- target
    diff_2 = diff_1 .* diff_1
    
    # Compare: squared errors summed over the samples
    loss = sum(diff_2;dims=1)
    
    # Learn: start backpropagation from the loss with a gradient of ones
    backward(loss, Tensor(ones(Float32, size(loss.data))))

    # Update the weights and reset their gradients
    step(opt)

    println(loss)
end

# Part 9: Adding Support for Layer Types

In [None]:
# Root of the layer type hierarchy.
abstract type Layer end

# Default accessor: a layer exposes whatever it stores in its `parameters`
# field.
get_parameters(l::Layer) = l.parameters

# Fully connected layer.
#
# Fields
#   W          - tracked weight tensor of size (n_outputs, n_inputs)
#   b          - tracked bias tensor of length n_outputs, initially zero
#   parameters - the vector `[W, b]`
mutable struct Linear <: Layer
    W
    b
    parameters

    function Linear(n_inputs, n_outputs)
        # Normally distributed weights scaled by sqrt(1 / n_inputs).
        scale = sqrt(1.0/n_inputs)
        weights = Tensor(randn(n_outputs, n_inputs) .* scale, autograd=true)
        bias = Tensor(zeros(n_outputs), autograd=true)
        return new(weights, bias, [weights, bias])
    end
end

# Apply the layer: multiply the weights with the input and add the bias,
# expanded along a new second dimension to the input's extent in dimension 2.
function forward(l::Linear, input)
    projected = l.W * input
    bias = expand(l.b, 2, size(input.data, 2))
    return projected + bias
end

# Part 10: Layers Which Contain Layers

In [None]:
# Container layer that applies its member layers one after another.
mutable struct Sequential <: Layer
    layers
    Sequential(layers) = new(layers)
end

# Append a layer to the end of the sequence.
add(s::Sequential, layer) = push!(s.layers, layer)

# Feed the input through every layer in order; the output of one layer is the
# input of the next.
function forward(s::Sequential, input)
    return foldl((activation, layer) -> forward(layer, activation), s.layers; init=input)
end

# Collect the parameters of all member layers into one flat vector.
function get_parameters(s::Sequential)
    per_layer = [get_parameters(member) for member in s.layers]
    return collect(Iterators.flatten(per_layer))
end

using Random: seed!; seed!(0)
# Inputs are laid out as a 2x4 matrix and targets as a 1x4 row, matching
# `Linear`, which multiplies its weights from the left (`W * input`).
data = Tensor([ 0  0  1  1;0  1  0  1], autograd=true)
target = Tensor([0 1 0 1], autograd=true)

# Two stacked linear layers: 2 -> 3 -> 1.
model = Sequential([Linear(2,3), Linear(3,1)])

# Optimize all parameters of the model with a step size of 0.1.
optim = SGD(get_parameters(model),0.1)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare: squared errors summed along dimension 2
    loss = sum((pred - target) .* (pred - target);dims=2)
    
    # Learn
    backward(loss, Tensor(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

# Part 11: Loss Function Layers

In [None]:
# Loss layer without any fields.
struct MSELoss <: Layer end

# Squared difference between prediction and target, summed along dimension 2.
# The difference is computed separately for each factor of the product.
function forward(l::MSELoss, pred, target)
    lhs = pred - target
    rhs = pred - target
    return sum(lhs .* rhs; dims=2)
end

using Random: seed!; seed!(0)
# Inputs are laid out as a 2x4 matrix and targets as a 1x4 row.
data = Tensor([ 0  0  1  1;0  1  0  1], autograd=true)
target = Tensor([0 1 0 1], autograd=true)

# Two stacked linear layers (2 -> 3 -> 1) and the loss layer.
model = Sequential([Linear(2,3), Linear(3,1)])
criterion = MSELoss()

# Optimize all parameters of the model with a step size of 0.1.
optim = SGD(get_parameters(model),0.1)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn
    backward(loss, Tensor(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

# Part 12: Non-linearity Layers

In [None]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims, tanh

# Autograd-aware tensor.
#
# Fields
#   data        - wrapped array
#   autograd    - whether gradients are tracked for this tensor
#   creators    - tensors this one was computed from, or `nothing`
#   creation_op - name of the producing operation, or `nothing`
#   id          - identifier used as the key in the creators' `children` tables
#   children    - maps a child's id to how many times that child uses this tensor
#   grad        - starts as `nothing`
mutable struct Tensor
    data
    autograd
    creators
    creation_op
    id
    children
    grad
    function Tensor(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        # Draw a random identifier only when the caller did not supply one.
        id = isnothing(id) ? rand(1:100000) : id
        T = new(data, autograd, creators, creation_op, id, Dict(), nothing)

        # Register the new tensor as a child of each creator; a creator that
        # appears several times is counted once per occurrence.
        if creators !== nothing
            for parent in creators
                parent.children[T.id] = get(parent.children, T.id, 0) + 1
            end
        end
        return T
    end
end

# True when every count stored in `t.children` is zero (also true when the
# table is empty).
function all_children_grads_accounted_for(t::Tensor)
    return all(pending -> pending == 0, values(t.children))
end

# Back-propagate `grad` through the computation graph rooted at `t`.
#
# Arguments
#   t           - tensor receiving the gradient; nothing happens unless `t.autograd`
#   grad        - incoming gradient as a non-autograd `Tensor`; defaults to a
#                 tensor of ones shaped like `t.data`
#   grad_origin - the child tensor the gradient comes from, or `nothing` when
#                 `backward` is called on `t` directly
#
# Throws the string "cannot backprop more than once" when `grad_origin` has
# already delivered all of its gradients to `t`.
#
# NOTE(review): only the "add", "sub" and "mul" branches pass `t` on as
# `grad_origin`. The other branches omit it, so the creator neither decrements
# its child counter nor waits for its remaining children — confirm this is
# intended before changing any branch.
function backward(t::Tensor, grad=nothing, grad_origin=nothing)
    if t.autograd
        # Default gradient: ones with the shape of the data.
        if isnothing(grad)
            grad = Tensor(ones(size(t.data)))
        end
    
        # Consume one pending gradient from the child that sent it.
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        # Accumulate, so a tensor used several times sums all contributions.
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # Only continue backpropagating if there is something to backprop
        # into and all gradients (from children) are accounted for. Waiting
        # for children is overridden when `backward` was called on this
        # tensor directly (no `grad_origin`).
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            # Addition: both operands receive the gradient unchanged.
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            # Subtraction: the second operand receives the negated gradient.
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            # Element-wise product: each operand receives the gradient times
            # the other operand.
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            # Matrix product t = c1 * c2: c1 receives grad * c2' and c2
            # receives c1' * grad.
            # NOTE(review): `c2'` / `c1'` go through `adjoint(::Tensor)`, which
            # builds a tracked tensor when the operand is tracked and thereby
            # adds an entry to that operand's `children` table.
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2' # gradient for the left operand
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            # Transpose: pass the transposed gradient back.
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            # Sum ops are named "sum_<dim>": re-expand the gradient along the
            # summed dimension to the creator's extent in that dimension.
            # The match is a substring test on the op name.
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            # Expand ops are named "expand_<dim>": sum the gradient over the
            # expanded dimension; `dropdims` returns its input unchanged when
            # it already has as many dimensions as the creator's data.
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end
            
            # Negation: pass the negated gradient back.
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
            
            # Sigmoid output t: gradient times t * (1 - t).
            if t.creation_op == "sigmoid"
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* t .* (ones_ - t) )
            end
            
            # Tanh output t: gradient times (1 - t * t).
            if t.creation_op == "tanh"
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* (ones_ - (t .* t)))
            end
        end
    end
end

# The size of a tensor is the size of its underlying data.
function size(a::Tensor)
    return size(a.data)
end

# Sum of two tensors; tracked by autograd (op "add") only when both
# operands are tracked.
function +(a::Tensor, b::Tensor)
    total = a.data + b.data
    if a.autograd && b.autograd
        return Tensor(total; autograd=true, creators=[a,b], creation_op = "add")
    end
    return Tensor(total)
end

# Negation; tracked (op "neg") when the operand is tracked.
function -(a::Tensor)
    negated = a.data .* -1
    a.autograd || return Tensor(negated)
    return Tensor(negated; autograd=true, creators=[a], creation_op = "neg")
end

# Difference of two tensors; tracked (op "sub") only when both operands are
# tracked.
function -(a::Tensor, b::Tensor)
    difference = a.data - b.data
    if a.autograd && b.autograd
        return Tensor(difference; autograd=true, creators=[a,b], creation_op = "sub")
    end
    return Tensor(difference)
end

# Element-wise multiplication (`a .* b`). Elements are paired by linear index
# and written into a `zeros` array shaped like `a.data`. The result is
# tracked (op "mul") only when both operands are tracked.
function broadcasted(f::typeof(*), a::Tensor, b::Tensor)
    product = zeros(size(a.data))
    for k in eachindex(product)
        product[k] = a.data[k] * b.data[k]
    end
    if a.autograd && b.autograd
        return Tensor(product; autograd=true, creators=[a,b], creation_op ="mul")
    else
        return Tensor(product)
    end
end

# Element-wise subtraction (`a .- b`), built the same way. The result is
# tracked (op "sub") only when both operands are tracked.
function broadcasted(f::typeof(-), a::Tensor, b::Tensor)
    difference = zeros(size(a.data))
    for k in eachindex(difference)
        difference[k] = a.data[k] - b.data[k]
    end
    if a.autograd && b.autograd
        return Tensor(difference; autograd=true, creators=[a,b], creation_op ="sub")
    else
        return Tensor(difference)
    end
end

# Sum a tensor along dimension `dims` and drop that dimension from the result.
#
# Keywords
#   dims - integer dimension to reduce (required). It is embedded in the
#          creation op name ("sum_<dims>") so that `backward` can expand the
#          gradient along the same dimension again.
#
# Returns a `Tensor`; it is tracked by autograd when `a` is.
function sum(a::Tensor; dims)
    # Drop exactly the reduced dimension (not whichever input dimensions
    # happen to be singleton), so that `expand` in `backward` rebuilds a
    # gradient with the shape of the input.
    new_ = dropdims(sum(a.data; dims=dims); dims=dims)
    if (a.autograd)
        return Tensor(new_; autograd=true, creators=[a], creation_op = "sum_"*string(dims))
    end
    return Tensor(new_)
end

# Drop dimension(s) `dims` from a tensor. When the data already has
# `ndims_cr` dimensions the tensor itself is returned untouched. Otherwise the
# result is a new tensor, tracked (op "dropdims") when `a` is tracked.
function dropdims(a::Tensor;dims=dims,ndims_cr=ndims_cr)
    ndims(a.data) == ndims_cr && return a
    squeezed = dropdims(a.data; dims=dims)
    a.autograd || return Tensor(squeezed)
    return Tensor(squeezed; autograd=true, creators=[a], creation_op = "dropdims")
end

# Insert a new dimension at position `dim` and repeat the data `copies` times
# along it. The result is tracked (op "expand_<dim>") when `a` is tracked.
function expand(a::Tensor, dim, copies)
    old_dims = size(a)
    rank = length(old_dims) + 1
    # Shape with a singleton slot at `dim`; dimensions after it shift by one.
    slotted = ntuple(rank) do d
        if d < dim
            old_dims[d]
        elseif d == dim
            1
        else
            old_dims[d-1]
        end
    end
    # Repeat only along the new slot.
    reps = ntuple(d -> d == dim ? copies : 1, rank)
    widened = repeat(reshape(a.data, slotted), outer=reps)
    if a.autograd
        return Tensor(widened; autograd=true, creators=[a], creation_op = "expand_"*string(dim))
    end
    return Tensor(widened)
end

# Transpose (`a'`); tracked (op "transpose") when the operand is tracked.
function adjoint(a::Tensor)
    flipped = a.data'
    a.autograd || return Tensor(flipped)
    return Tensor(flipped;autograd=true, creators=[a], creation_op = "transpose")
end

# Matrix multiplication; tracked (op "mm") only when both operands are
# tracked.
function *(a::Tensor, b::Tensor)
    product = a.data * b.data
    if a.autograd && b.autograd
        return Tensor(product; autograd=true, creators=[a,b], creation_op = "mm")
    end
    return Tensor(product)
end

# Logistic sigmoid of a scalar.
function σ(x)
    return inv(1 + exp(-x))
end

# Printing a tensor prints its data.
function println(t::Tensor)
    return println(t.data)
end

# Rich ("text/plain") display of a tensor delegates to the display of its data.
function show(io::IO, m::MIME"text/plain", a::Tensor)
    return show(io, m, a.data)
end

######Layers
# Root of the layer type hierarchy.
abstract type Layer end

# Default accessor: a layer exposes whatever it stores in its `parameters`
# field.
get_parameters(l::Layer) = l.parameters

# Fully connected layer.
#
# Fields
#   W          - tracked weight tensor of size (n_outputs, n_inputs)
#   b          - tracked bias tensor of length n_outputs, initially zero
#   parameters - the vector `[W, b]`
mutable struct Linear <: Layer
    W
    b
    parameters

    function Linear(n_inputs, n_outputs)
        # Normally distributed weights scaled by sqrt(1 / n_inputs).
        scale = sqrt(1.0/n_inputs)
        weights = Tensor(randn(n_outputs, n_inputs) .* scale, autograd=true)
        bias = Tensor(zeros(n_outputs), autograd=true)
        return new(weights, bias, [weights, bias])
    end
end

# Apply the layer: multiply the weights with the input and add the bias,
# expanded along a new second dimension to the input's extent in dimension 2.
function forward(l::Linear, input)
    projected = l.W * input
    bias = expand(l.b, 2, size(input.data, 2))
    return projected + bias
end

# Loss layer without any fields.
struct MSELoss <: Layer end

# Squared difference between prediction and target, summed along dimension 2.
# The difference is computed separately for each factor of the product.
function forward(l::MSELoss, pred, target)
    lhs = pred - target
    rhs = pred - target
    return sum(lhs .* rhs; dims=2)
end


# Activation layers; neither has fields.
struct Tanh <: Layer end

struct Sigmoid <: Layer end

# Activation layers have nothing to train, so they report an empty list.
get_parameters(l::Tanh) = []

get_parameters(l::Sigmoid) = []

# Apply the sigmoid element-wise; tracked (op "sigmoid") when the input is
# tracked.
function forward(l::Sigmoid, a::Tensor)
    activated = σ.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op = "sigmoid")
end

# Apply tanh element-wise; tracked (op "tanh") when the input is tracked.
function forward(l::Tanh, a::Tensor)
    activated = tanh.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op = "tanh")
end

# Stochastic gradient descent optimizer.
#
# Fields
#   parameters - collection of objects exposing `data` and `grad` (whose own
#                `data` holds the gradient values)
#   alpha      - step size applied to the gradients
mutable struct SGD
    parameters
    alpha
    SGD(parameters, alpha) = new(parameters, alpha)
end

# Reset the gradient of every parameter to zero, in place. Parameters that
# have not received a gradient yet (`grad` is `nothing`) are skipped.
function zero!(opt::SGD)
    for p in opt.parameters
        isnothing(p.grad) && continue
        # `fill!` instead of multiplying by zero: NaN * 0 and Inf * 0 are NaN,
        # so a multiplication could never clear a gradient that blew up.
        fill!(p.grad.data, 0)
    end
    return nothing
end

# Apply one update `data -= grad * alpha` to every parameter.
#
# Arguments
#   opt  - the optimizer
#   zero - when true (the default) each gradient is reset after its update
#
# Parameters whose `grad` is `nothing` are left untouched.
function step(opt::SGD, zero=true)
    for p in opt.parameters
        isnothing(p.grad) && continue
        p.data -= (p.grad.data .* opt.alpha)
        if zero
            fill!(p.grad.data, 0)
        end
    end
    return nothing
end

In [10]:
# Container layer that applies its member layers one after another.
mutable struct Sequential <: Layer
    layers
    Sequential(layers) = new(layers)
end

# Append a layer to the end of the sequence.
add(s::Sequential, layer) = push!(s.layers, layer)

# Feed the input through every layer in order; the output of one layer is the
# input of the next.
function forward(s::Sequential, input)
    return foldl((activation, layer) -> forward(layer, activation), s.layers; init=input)
end

# Collect the parameters of all member layers into one flat vector.
function get_parameters(s::Sequential)
    per_layer = [get_parameters(member) for member in s.layers]
    return collect(Iterators.flatten(per_layer))
end

get_parameters (generic function with 4 methods)

In [None]:
# Loss layer without any fields.
struct MSELoss <: Layer end

# Squared difference between prediction and target, summed along dimension 2.
# The difference is computed separately for each factor of the product.
function forward(l::MSELoss, pred, target)
    lhs = pred - target
    rhs = pred - target
    return sum(lhs .* rhs; dims=2)
end

In [None]:
using Random: seed!; seed!(0)
# Inputs are laid out as a 2x4 matrix and targets as a 1x4 row.
data = Tensor([ 0  0  1  1;0  1  0  1], autograd=true)
target = Tensor([0 1 0 1], autograd=true)

# Linear 2 -> 3, tanh, linear 3 -> 1, sigmoid.
model = Sequential([Linear(2,3), Tanh(), Linear(3,1), Sigmoid()])
criterion = MSELoss()

# Optimize all parameters of the model with a step size of 1.0.
optim = SGD(get_parameters(model),1.0)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn
    backward(loss, Tensor(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

In [None]:
# Recompute the prediction of the model on the inputs.
pred = forward(model, data)

In [None]:
# Raw target values for comparison.
target.data

In [None]:
# Difference between prediction and target.
diff1 = pred - target

In [None]:
# Element-wise squared difference.
diff1 .* diff1

In [None]:
# Squared differences summed along dimension 2.
sum(diff1 .* diff1;dims=2)

In [None]:
# The same quantity computed through the loss layer.
forward(criterion,pred, target)

In [None]:
# Last loss value assigned to the global `loss`.
loss

# Part 13: The Embedding Layer

In [None]:
# Embedding table.
#
# Fields
#   vocab_size - number of entries (columns of `weight`)
#   dim        - length of each embedding (rows of `weight`)
#   weight     - plain `dim` x `vocab_size` array of initial values
mutable struct Embedding <: Layer
    vocab_size
    dim
    weight

    function Embedding(vocab_size, dim)
        # this random initialisation style is just a convention from word2vec
        initial = (randn(dim, vocab_size) .- 0.5) ./ dim
        return new(vocab_size, dim, initial)
    end
end

In [276]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims, tanh
using Base.Iterators:partition, flatten

# Autograd-aware tensor.
#
# Fields
#   data                 - wrapped array
#   autograd             - whether gradients are tracked for this tensor
#   creators             - tensors this one was computed from, or `nothing`
#   creation_op          - name of the producing operation, or `nothing`
#   id                   - identifier used as the key in the creators'
#                          `children` tables
#   children             - maps a child's id to how many times that child uses
#                          this tensor
#   grad                 - starts as `nothing`
#   index_select_indices - starts as `nothing`; read by the "index_select"
#                          branch of `backward`
mutable struct Tensor
    data
    autograd
    creators
    creation_op
    id
    children
    grad
    index_select_indices
    function Tensor(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        # Draw a random identifier only when the caller did not supply one.
        id = isnothing(id) ? rand(1:100000) : id
        T = new(data, autograd, creators, creation_op, id, Dict(), nothing, nothing)

        # Register the new tensor as a child of each creator; a creator that
        # appears several times is counted once per occurrence.
        if creators !== nothing
            for parent in creators
                parent.children[T.id] = get(parent.children, T.id, 0) + 1
            end
        end
        return T
    end
end

# True when every count stored in `t.children` is zero (also true when the
# table is empty).
function all_children_grads_accounted_for(t::Tensor)
    return all(pending -> pending == 0, values(t.children))
end

# Back-propagate `grad` through the computation graph rooted at `t`.
#
# Arguments
#   t           - tensor receiving the gradient; nothing happens unless `t.autograd`
#   grad        - incoming gradient as a non-autograd `Tensor`; defaults to a
#                 tensor of ones shaped like `t.data`
#   grad_origin - the child tensor the gradient comes from, or `nothing` when
#                 `backward` is called on `t` directly
#
# Throws the string "cannot backprop more than once" when `grad_origin` has
# already delivered all of its gradients to `t`.
#
# NOTE(review): only the "add", "sub" and "mul" branches pass `t` on as
# `grad_origin`. The other branches omit it, so the creator neither decrements
# its child counter nor waits for its remaining children — confirm this is
# intended before changing any branch.
function backward(t::Tensor, grad=nothing, grad_origin=nothing)
    if t.autograd
        # Default gradient: ones with the shape of the data.
        if isnothing(grad)
            grad = Tensor(ones(size(t.data)))
        end
    
        # Consume one pending gradient from the child that sent it.
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        # Accumulate, so a tensor used several times sums all contributions.
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # Only continue backpropagating if there is something to backprop
        # into and all gradients (from children) are accounted for. Waiting
        # for children is overridden when `backward` was called on this
        # tensor directly (no `grad_origin`).
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            # Addition: both operands receive the gradient unchanged.
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            # Subtraction: the second operand receives the negated gradient.
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            # Element-wise product: each operand receives the gradient times
            # the other operand.
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            # Matrix product t = c1 * c2: c1 receives grad * c2' and c2
            # receives c1' * grad.
            # NOTE(review): `c2'` / `c1'` go through `adjoint(::Tensor)`;
            # presumably, as in the earlier cells, that builds a tracked tensor
            # and adds an entry to the operand's `children` table — confirm
            # against the `adjoint` definition of this cell.
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2' # gradient for the left operand
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            # Transpose: pass the transposed gradient back.
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            # Sum ops are named "sum_<dim>": re-expand the gradient along the
            # summed dimension to the creator's extent in that dimension.
            # The match is a substring test on the op name.
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            # Expand ops are named "expand_<dim>": sum the gradient over the
            # expanded dimension; `dropdims` returns its input unchanged when
            # it already has as many dimensions as the creator's data.
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end
            
            # Negation: pass the negated gradient back.
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
            
            # Sigmoid output t: gradient times t * (1 - t).
            if t.creation_op == "sigmoid"
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* t .* (ones_ - t) )
            end
            
            # Tanh output t: gradient times (1 - t * t).
            if t.creation_op == "tanh"
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* (ones_ - (t .* t)))
            end
            
            # Index select: scatter-add the columns of the gradient into a
            # zero array shaped like the creator, at the column positions
            # stored in `index_select_indices`.
            if t.creation_op == "index_select"
                new_grad = zeros(size(t.creators[1]))
                indices = t.index_select_indices.data
                # Split the gradient's column range into chunks of
                # `length(indices)` columns, then list every column in order.
                major_chunks = partition(1:size(t.grad,2),length(indices))
                grad_chunks = [t.grad.data[:,inds][:,j]  for(i,inds) in enumerate(major_chunks) for j=1:size(inds)[1]]
    
                # The i-th gradient column is added to the column named by the
                # i-th (flattened) index; repeated indices accumulate.
                for (i,ind) in enumerate(flatten(indices))
                    new_grad[:,ind] +=  grad_chunks[i]
                end
                backward(t.creators[1], Tensor(new_grad))
            end
        end
    end
end

# The size of a tensor is the size of its underlying data.
function size(a::Tensor)
    return size(a.data)
end

# Extent of the tensor along the single dimension `ind`.
function size(a::Tensor, ind::Int)
    return size(a.data, ind)
end

# Sum of two tensors. The result is recorded in the graph (creation_op "add")
# only when both operands have autograd enabled.
function +(a::Tensor, b::Tensor)
    total = a.data + b.data
    if !(a.autograd && b.autograd)
        return Tensor(total)
    end
    return Tensor(total; autograd=true, creators=[a,b], creation_op="add")
end

# Negation. Recorded as creation_op "neg" when `a` has autograd enabled.
function -(a::Tensor)
    negated = a.data .* -1
    a.autograd || return Tensor(negated)
    return Tensor(negated; autograd=true, creators=[a], creation_op="neg")
end

# Difference of two tensors. Recorded as creation_op "sub" only when both
# operands have autograd enabled.
function -(a::Tensor, b::Tensor)
    difference = a.data - b.data
    if !(a.autograd && b.autograd)
        return Tensor(difference)
    end
    return Tensor(difference; autograd=true, creators=[a,b], creation_op="sub")
end

# Element-wise multiplication (`a .* b`).
# The output buffer is a Float64 array shaped like `a.data`; entries are read
# from both operands by linear index, so no shape broadcasting takes place.
function broadcasted(f::typeof(*), a::Tensor, b::Tensor)
    product = zeros(size(a.data))
    for k in eachindex(product)
        product[k] = f(a.data[k], b.data[k])
    end
    tracked = a.autograd && b.autograd
    if tracked
        return Tensor(product; autograd=true, creators=[a,b], creation_op="mul")
    end
    return Tensor(product)
end

# Element-wise subtraction (`a .- b`).
# The output buffer is a Float64 array shaped like `a.data`; entries are read
# from both operands by linear index, so no shape broadcasting takes place.
function broadcasted(f::typeof(-), a::Tensor, b::Tensor)
    difference = zeros(size(a.data))
    for k in eachindex(difference)
        difference[k] = a.data[k] - b.data[k]
    end
    tracked = a.autograd && b.autograd
    if tracked
        return Tensor(difference; autograd=true, creators=[a,b], creation_op="sub")
    end
    return Tensor(difference)
end

# Sum a tensor along dimension `dims` and drop that dimension.
#
# `dims` is a required keyword and must be a single integer: it is encoded in
# the creation_op string ("sum_<dims>") and parsed back with `parse(Int, ...)`
# during the backward pass.
#
# The reduced axis itself is removed, so the result always has exactly one
# dimension fewer than the input. The "sum_N" backward step relies on this:
# it calls `expand(grad, N, copies)`, which re-inserts one axis at position N.
# (Dropping axes based on which *input* dimensions have size 1 leaves a
# trailing singleton axis for inputs such as 3x4 and yields a gradient whose
# rank does not match the creator.)
function sum(a::Tensor; dims)
    reduced = dropdims(sum(a.data; dims=dims); dims=dims)
    if a.autograd
        return Tensor(reduced; autograd=true, creators=[a],
                      creation_op="sum_" * string(dims))
    end
    return Tensor(reduced)
end

# Remove dimension(s) `dims` from a tensor unless it already has `ndims_cr`
# dimensions, in which case the very same tensor object is returned.
# Both keywords have to be supplied by the caller.
function dropdims(a::Tensor;dims=dims,ndims_cr=ndims_cr)
    ndims(a.data) == ndims_cr && return a
    squeezed = dropdims(a.data; dims=dims)
    if a.autograd
        return Tensor(squeezed; autograd=true, creators=[a], creation_op="dropdims")
    end
    return Tensor(squeezed)
end

# Insert a new axis at position `dim` and repeat the data `copies` times along
# it. The result has one dimension more than `a`; it is recorded as
# creation_op "expand_<dim>" when `a` has autograd enabled.
function expand(a::Tensor, dim, copies)
    old_shape = size(a)
    rank = length(old_shape) + 1
    # Repeat only along the inserted axis.
    reps = ntuple(rank) do d
        d == dim ? copies : 1
    end
    # Old axes keep their extent; the inserted axis starts with extent 1.
    shape = ntuple(rank) do d
        if d < dim
            old_shape[d]
        elseif d == dim
            1
        else
            old_shape[d-1]
        end
    end
    expanded = repeat(reshape(a.data, shape), outer=reps)
    a.autograd || return Tensor(expanded)
    return Tensor(expanded; autograd=true, creators=[a],
                  creation_op="expand_" * string(dim))
end

# Transpose (`a'`). Recorded as creation_op "transpose" when `a` has autograd
# enabled.
function adjoint(a::Tensor)
    flipped = a.data'
    a.autograd || return Tensor(flipped)
    return Tensor(flipped; autograd=true, creators=[a], creation_op="transpose")
end

# Matrix multiplication. Recorded as creation_op "mm" only when both operands
# have autograd enabled.
function *(a::Tensor, b::Tensor)
    product = a.data * b.data
    if !(a.autograd && b.autograd)
        return Tensor(product)
    end
    return Tensor(product; autograd=true, creators=[a,b], creation_op="mm")
end


# Gather columns of `a`: every entry of `indices` selects one column (or one
# group of columns) and the pieces are concatenated horizontally.
function index_select_helper(a::Array, indices)
    pieces = [a[:, ind] for ind in indices]
    return reduce(hcat, pieces)
end

# Select columns of `a` according to `indices`. When `a` has autograd enabled
# the result is recorded as creation_op "index_select" and keeps the index
# tensor, which the backward pass needs to route gradients to the columns.
function index_select(a::Tensor, indices::Tensor)
    picked = index_select_helper(a.data, indices.data)
    a.autograd || return Tensor(picked)
    out = Tensor(picked; autograd=true, creators=[a], creation_op="index_select")
    out.index_select_indices = indices
    return out
end


# Logistic sigmoid of a scalar.
function σ(x)
    return inv(1 + exp(-x))
end

# Printing and rich display of a Tensor both show the wrapped data only.
function println(t::Tensor)
    return println(t.data)
end

function show(io::IO, m::MIME"text/plain", a::Tensor)
    return show(io, m, a.data)
end

# Parameter-free activation layers; they carry no state.
struct Tanh <: Layer
    function Tanh()
        return new()
    end
end

struct Sigmoid <: Layer
    function Sigmoid()
        return new()
    end
end

# Activation layers have nothing to train: both return an empty `Any` vector.
get_parameters(l::Tanh) = Any[]

get_parameters(l::Sigmoid) = Any[]

# Apply the logistic sigmoid element-wise. Recorded as creation_op "sigmoid"
# when the input has autograd enabled.
function forward(l::Sigmoid, a::Tensor)
    activated = σ.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op="sigmoid")
end
        
# Apply tanh element-wise. Recorded as creation_op "tanh" when the input has
# autograd enabled.
function forward(l::Tanh, a::Tensor)
    activated = tanh.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op="tanh")
end

ErrorException: invalid redefinition of constant Tensor

In [277]:
using LinearAlgebra:I
x = Tensor(1.0* Matrix(I, 5, 5), autograd=true)
backward(index_select(x, Tensor([[2,3,4],[3,4,5]])))
x.grad

5×5 Array{Float64,2}:
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0

In [None]:
# Stateless squared-error loss layer.
struct MSELoss <: Layer
    function MSELoss()
        return new()
    end
end

# Squared error summed along dimension 2.
# The difference is built twice on purpose, exactly like the one-line form
# `(pred - target) .* (pred - target)`, so the recorded graph is unchanged.
function forward(l::MSELoss, pred, target)
    lhs = pred - target
    rhs = pred - target
    return sum(lhs .* rhs; dims=2)
end


# Parameter-free activation layers; they carry no state.
struct Tanh <: Layer
    function Tanh()
        return new()
    end
end

struct Sigmoid <: Layer
    function Sigmoid()
        return new()
    end
end

# Activation layers have nothing to train: both return an empty `Any` vector.
get_parameters(l::Tanh) = Any[]

get_parameters(l::Sigmoid) = Any[]

# Apply the logistic sigmoid element-wise. Recorded as creation_op "sigmoid"
# when the input has autograd enabled.
function forward(l::Sigmoid, a::Tensor)
    activated = σ.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op="sigmoid")
end
        
# Apply tanh element-wise. Recorded as creation_op "tanh" when the input has
# autograd enabled.
function forward(l::Tanh, a::Tensor)
    activated = tanh.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op="tanh")
end

# Plain stochastic-gradient-descent optimiser.
mutable struct SGD
    parameters   # collection of tensors to update
    alpha        # learning rate applied in `step`

    function SGD(parameters, alpha)
        return new(parameters, alpha)
    end
end

# Reset every parameter's gradient in place by multiplying it by 0.0.
function zero!(opt::SGD)
    foreach(opt.parameters) do param
        param.grad.data .*= 0.0
    end
end

# One gradient-descent update: data <- data - alpha * grad for every parameter.
# With `zero` true (the default) each gradient is cleared in place afterwards.
function step(opt::SGD, zero=true)
    for param in opt.parameters
        delta = param.grad.data .* opt.alpha
        param.data = param.data - delta
        zero && (param.grad.data .*= 0.0)
    end
end

# Part 15: The Embedding Layer (revisited)

In [8]:
# Embedding table: `weight` is a dim x vocab_size tensor, one column per
# vocabulary entry.
mutable struct Embedding <: Layer
    vocab_size
    dim
    weight
    parameters

    # The (randn - 0.5) / dim initialization is just a convention from word2vec.
    function Embedding(dim, vocab_size)
        weight = Tensor((randn(dim, vocab_size) .- 0.5) ./ dim; autograd=true)
        layer = new(vocab_size, dim, weight)
        layer.parameters = [layer.weight]
        return layer
    end
end

# Look up embedding columns for `indices` via `index_select` on the weights.
forward(E::Embedding, indices) = index_select(E.weight, indices)

forward (generic function with 6 methods)

In [None]:
embed.weight.data[:,[1,2,1,2]]

In [11]:
# Toy end-to-end run: embedding -> tanh -> linear -> sigmoid, trained with MSE.
# `data` holds one list of column indices into the embedding weight.
data = Tensor([[1,2,1,2]], autograd=true)
target = Tensor([0 1 0 1], autograd=true)

# Embedding(dim, vocab_size): 3-dimensional vectors for 5 vocabulary entries.
embed = Embedding(3,5)
model = Sequential([embed, Tanh(), Linear(3,1), Sigmoid()])
criterion = MSELoss()


# SGD over all model parameters with learning rate 1.0.
optim = SGD(get_parameters(model),1.0)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn: seed the backward pass with ones shaped like the loss, then
    # update the parameters (step also clears the gradients by default).
    backward(loss, Tensor(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

[1.5092489280966477]
[0.8497104549613475]
[0.5855322382497556]
[0.30006913685114744]
[0.14342165299394644]
[0.08387454091466479]
[0.05714767010896077]
[0.04263753720419443]
[0.03369465167571825]
[0.027691594175269455]


# Part 16: The Cross Entropy Layer

In [2]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims, tanh
using Base.Iterators:partition, flatten

# Array wrapper that records how it was produced so gradients can be
# propagated back through the computation graph.
mutable struct Tensor
    data                  # wrapped value
    autograd              # whether this tensor takes part in backprop
    creators              # tensors this one was computed from, or nothing
    creation_op           # name of the producing operation, or nothing
    id                    # identifier used as key in the creators' `children`
    children              # Dict: child id => gradients still expected from it
    grad                  # accumulated gradient (a Tensor) or nothing
    index_select_indices  # index tensor, set by index_select; otherwise nothing
    softmax_output        # set by the cross-entropy forward pass; otherwise nothing
    target_dist           # set by the cross-entropy forward pass; otherwise nothing

    # When `id` is not given a random one in 1:100000 is drawn.
    # Every optional field is initialised to `nothing`, so reading it on a
    # tensor that never set it yields `nothing` instead of an UndefRefError.
    function Tensor(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        if isnothing(id)
            id = rand(1:100000)
        end
        T = new(data, autograd, creators, creation_op, id)
        T.children = Dict()
        T.grad = nothing
        T.index_select_indices = nothing
        T.softmax_output = nothing
        T.target_dist = nothing

        # Register this tensor with each creator: one expected gradient per
        # occurrence in `creators` (the same creator may appear twice).
        if !(isnothing(creators))
            for c in creators
                c.children[T.id] = get(c.children, T.id, 0) + 1
            end
        end
        return T
    end
end

# True when no child still owes this tensor a gradient, i.e. every counter in
# `t.children` is zero (vacuously true for a tensor without children).
function all_children_grads_accounted_for(t::Tensor)
    return all(pending -> pending == 0, values(t.children))
end

# Accumulate `grad` into `t.grad` and propagate it to the creators of `t`.
#
# Arguments
#   t           - tensor to backpropagate into; nothing happens unless
#                 `t.autograd` is true
#   grad        - incoming gradient (a Tensor without autograd); defaults to
#                 ones shaped like `t.data`
#   grad_origin - the child tensor the gradient comes from, or nothing when
#                 backward is called directly on `t`
#
# Throws the string "cannot backprop more than once" when `grad_origin` has
# already delivered all the gradients it owed to `t`.
function backward(t::Tensor, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor(ones(size(t.data)))
        end
    
        # Tick off one expected gradient from the child that sent it.
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # Only continue backpropagating if there is something to backprop
        # into and all gradients from children are accounted for. Waiting for
        # children is overridden when backward was called on this tensor
        # directly (no grad_origin).
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                # d(c1*c2)/dc1 = grad * c2',  d(c1*c2)/dc2 = c1' * grad
                new_ =  t.grad * c2'
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            # creation_op is "sum_<dim>": spread the gradient back over the
            # dimension that was summed away.
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            # creation_op is "expand_<dim>": sum the gradient over the
            # dimension that was added.
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end
            
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
            
            if t.creation_op == "sigmoid"
                # derivative of sigmoid in terms of its output: t * (1 - t)
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* t .* (ones_ - t) )
            end
            
            if t.creation_op == "tanh"
                # derivative of tanh in terms of its output: 1 - t^2
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* (ones_ - (t .* t)))
            end
            
            if t.creation_op == "index_select"
                # Scatter-add each gradient column back onto the column of
                # the creator it was selected from.
                new_grad = zeros(size(t.creators[1]))
                indices = t.index_select_indices.data
                major_chunks = partition(1:size(t.grad,2),length(indices))
                grad_chunks = [t.grad.data[:,inds][:,j]  for(i,inds) in enumerate(major_chunks) for j=1:size(inds)[1]]
    
                for (i,ind) in enumerate(flatten(indices))
                    new_grad[:,ind] +=  grad_chunks[i]
                end
                backward(t.creators[1], Tensor(new_grad))
            end

            if t.creation_op == "cross_entropy"
                # gradient w.r.t. the logits: softmax output minus target
                dx = t.softmax_output .- t.target_dist
                backward(t.creators[1], Tensor(dx))
            end
        end
    end
end

# Shape queries on a Tensor are answered by the wrapped array.
function size(a::Tensor)
    return size(a.data)
end

# Extent of the wrapped array along dimension `ind`.
function size(a::Tensor, ind::Int)
    return size(a.data, ind)
end

# Sum of two tensors. The result is recorded in the graph (creation_op "add")
# only when both operands have autograd enabled.
function +(a::Tensor, b::Tensor)
    total = a.data + b.data
    if !(a.autograd && b.autograd)
        return Tensor(total)
    end
    return Tensor(total; autograd=true, creators=[a,b], creation_op="add")
end

# Negation. Recorded as creation_op "neg" when `a` has autograd enabled.
function -(a::Tensor)
    negated = a.data .* -1
    a.autograd || return Tensor(negated)
    return Tensor(negated; autograd=true, creators=[a], creation_op="neg")
end

# Difference of two tensors. Recorded as creation_op "sub" only when both
# operands have autograd enabled.
function -(a::Tensor, b::Tensor)
    difference = a.data - b.data
    if !(a.autograd && b.autograd)
        return Tensor(difference)
    end
    return Tensor(difference; autograd=true, creators=[a,b], creation_op="sub")
end

# Element-wise multiplication (`a .* b`).
# The output buffer is a Float64 array shaped like `a.data`; entries are read
# from both operands by linear index, so no shape broadcasting takes place.
function broadcasted(f::typeof(*), a::Tensor, b::Tensor)
    product = zeros(size(a.data))
    for k in eachindex(product)
        product[k] = f(a.data[k], b.data[k])
    end
    tracked = a.autograd && b.autograd
    if tracked
        return Tensor(product; autograd=true, creators=[a,b], creation_op="mul")
    end
    return Tensor(product)
end

# Element-wise subtraction (`a .- b`).
# The output buffer is a Float64 array shaped like `a.data`; entries are read
# from both operands by linear index, so no shape broadcasting takes place.
function broadcasted(f::typeof(-), a::Tensor, b::Tensor)
    difference = zeros(size(a.data))
    for k in eachindex(difference)
        difference[k] = a.data[k] - b.data[k]
    end
    tracked = a.autograd && b.autograd
    if tracked
        return Tensor(difference; autograd=true, creators=[a,b], creation_op="sub")
    end
    return Tensor(difference)
end

# Sum a tensor along dimension `dims` and drop that dimension.
#
# `dims` is a required keyword and must be a single integer: it is encoded in
# the creation_op string ("sum_<dims>") and parsed back with `parse(Int, ...)`
# during the backward pass.
#
# The reduced axis itself is removed, so the result always has exactly one
# dimension fewer than the input. The "sum_N" backward step relies on this:
# it calls `expand(grad, N, copies)`, which re-inserts one axis at position N.
# (Dropping axes based on which *input* dimensions have size 1 leaves a
# trailing singleton axis for inputs such as 3x4 and yields a gradient whose
# rank does not match the creator.)
function sum(a::Tensor; dims)
    reduced = dropdims(sum(a.data; dims=dims); dims=dims)
    if a.autograd
        return Tensor(reduced; autograd=true, creators=[a],
                      creation_op="sum_" * string(dims))
    end
    return Tensor(reduced)
end

# Remove dimension(s) `dims` from a tensor unless it already has `ndims_cr`
# dimensions, in which case the very same tensor object is returned.
# Both keywords have to be supplied by the caller.
function dropdims(a::Tensor;dims=dims,ndims_cr=ndims_cr)
    ndims(a.data) == ndims_cr && return a
    squeezed = dropdims(a.data; dims=dims)
    if a.autograd
        return Tensor(squeezed; autograd=true, creators=[a], creation_op="dropdims")
    end
    return Tensor(squeezed)
end

# Insert a new axis at position `dim` and repeat the data `copies` times along
# it. The result has one dimension more than `a`; it is recorded as
# creation_op "expand_<dim>" when `a` has autograd enabled.
function expand(a::Tensor, dim, copies)
    old_shape = size(a)
    rank = length(old_shape) + 1
    # Repeat only along the inserted axis.
    reps = ntuple(rank) do d
        d == dim ? copies : 1
    end
    # Old axes keep their extent; the inserted axis starts with extent 1.
    shape = ntuple(rank) do d
        if d < dim
            old_shape[d]
        elseif d == dim
            1
        else
            old_shape[d-1]
        end
    end
    expanded = repeat(reshape(a.data, shape), outer=reps)
    a.autograd || return Tensor(expanded)
    return Tensor(expanded; autograd=true, creators=[a],
                  creation_op="expand_" * string(dim))
end

# Transpose (`a'`). Recorded as creation_op "transpose" when `a` has autograd
# enabled.
function adjoint(a::Tensor)
    flipped = a.data'
    a.autograd || return Tensor(flipped)
    return Tensor(flipped; autograd=true, creators=[a], creation_op="transpose")
end

# Matrix multiplication. Recorded as creation_op "mm" only when both operands
# have autograd enabled.
function *(a::Tensor, b::Tensor)
    product = a.data * b.data
    if !(a.autograd && b.autograd)
        return Tensor(product)
    end
    return Tensor(product; autograd=true, creators=[a,b], creation_op="mm")
end


# Gather columns of `a`: every entry of `indices` selects one column (or one
# group of columns) and the pieces are concatenated horizontally.
function index_select_helper(a::Array, indices)
    pieces = [a[:, ind] for ind in indices]
    return reduce(hcat, pieces)
end

# Select columns of `a` according to `indices`. When `a` has autograd enabled
# the result is recorded as creation_op "index_select" and keeps the index
# tensor, which the backward pass needs to route gradients to the columns.
function index_select(a::Tensor, indices::Tensor)
    picked = index_select_helper(a.data, indices.data)
    a.autograd || return Tensor(picked)
    out = Tensor(picked; autograd=true, creators=[a], creation_op="index_select")
    out.index_select_indices = indices
    return out
end


# Logistic sigmoid of a scalar.
function σ(x)
    return inv(1 + exp(-x))
end

# Printing and rich display of a Tensor both show the wrapped data only.
function println(t::Tensor)
    return println(t.data)
end

function show(io::IO, m::MIME"text/plain", a::Tensor)
    return show(io, m, a.data)
end

# Parameter-free activation layers; they carry no state.
struct Tanh <: Layer
    function Tanh()
        return new()
    end
end

struct Sigmoid <: Layer
    function Sigmoid()
        return new()
    end
end

# Activation layers have nothing to train: both return an empty `Any` vector.
get_parameters(l::Tanh) = Any[]

get_parameters(l::Sigmoid) = Any[]

# Apply the logistic sigmoid element-wise. Recorded as creation_op "sigmoid"
# when the input has autograd enabled.
function forward(l::Sigmoid, a::Tensor)
    activated = σ.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op="sigmoid")
end
        
# Apply tanh element-wise. Recorded as creation_op "tanh" when the input has
# autograd enabled.
function forward(l::Tanh, a::Tensor)
    activated = tanh.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op="tanh")
end

show (generic function with 234 methods)

In [242]:
using Statistics: mean
using LinearAlgebra: I
# Column-wise softmax: every column of the result sums to 1.
#
# The per-column maximum is subtracted before exponentiating. This leaves the
# result mathematically unchanged but keeps `exp` from overflowing to Inf
# (and the quotient from becoming NaN) when the inputs are large.
function softmax(x)
    shifted = exp.(x .- maximum(x; dims=1))
    return shifted ./ sum(shifted; dims=1)
end

# Stateless cross-entropy loss marker type.
struct CrossEntropyLoss
    function CrossEntropyLoss()
        return new()
    end
end

# Softmax cross-entropy between the logits `a` (classes along dimension 1)
# and integer class labels `target`. Labels index columns of an identity
# matrix directly, i.e. they are 1-based. The loss is the mean over columns of
# the negative log-probability of the target class.
#
# When `a` has autograd enabled the returned tensor also stores the softmax
# output and the one-hot targets for the "cross_entropy" backward step.
function forward(l::CrossEntropyLoss, a::Tensor, target::Tensor)
    probs = softmax(a.data)
    n_classes = size(a.data, 1)
    eye = Matrix{Float64}(I, n_classes, n_classes)
    # one-hot column per sample, reshaped to match the logits
    onehot = reshape(eye[:, target.data], size(a.data))
    nll = -mean(sum(log.(probs) .* onehot; dims=1))
    a.autograd || return Tensor(nll)
    out = Tensor(nll; autograd=true, creators=[a], creation_op="cross_entropy")
    out.softmax_output = probs
    out.target_dist = onehot
    return out
end

forward (generic function with 9 methods)

In [243]:
# Toy run of the cross-entropy loss: embedding -> tanh -> linear(3 -> 4 classes).
# The global RNG is seeded with 0 first.
using Random:seed!;seed!(0)
data = Tensor([[1,2,1,2]], autograd=true)
# Class labels are used as 1-based column indices by the loss.
target = Tensor([4 2 4 2], autograd=true)

# Embedding(dim, vocab_size): 3-dimensional vectors for 3 vocabulary entries.
embed = Embedding(3,3)
model = Sequential([embed, Tanh(), Linear(3,4)])
criterion = CrossEntropyLoss()


# SGD over all model parameters with learning rate 1.0.
optim = SGD(get_parameters(model),1.0)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn: seed the backward pass with ones shaped like the loss, then
    # update the parameters (step also clears the gradients by default).
    backward(loss, Tensor(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

1.4461881980473767
0.32134282627951916
0.07050547863798148
0.030112221326762667
0.02294860815584766
0.01865850502637032
0.015762310723879135
0.013664053181812933
0.012069325190316341
0.010814145520457268


In [244]:
pred = forward(model, data)

4×4 Array{Float64,2}:
 -1.14958   -2.2855   -1.14958   -2.2855
 -0.906039   4.40229  -0.906039   4.40229
 -1.54885   -1.14252  -1.54885   -1.14252
  4.4664    -1.16551   4.4664    -1.16551

In [104]:
data = Tensor([[1,2]], autograd=true)
target = Tensor([0 1], autograd=true)

embed = Embedding(3,3)
model = Sequential([embed, Tanh(), Linear(3,4)])
criterion = CrossEntropyLoss()

CrossEntropyLoss()

In [105]:
pred = forward(model, data)

4×2 Array{Float64,2}:
  0.225142  -0.172391
  0.444496  -0.309534
 -0.170179   0.032395
  0.468161  -0.335253

In [106]:
softmax_output = softmax(pred.data)
#     log_out = log.(softmax_output)

4×2 Array{Float64,2}:
 0.238446  0.25324
 0.29693   0.220787
 0.160585  0.310792
 0.30404   0.215181

In [100]:
identity = 1.0 .* Matrix(I, size(pred.data))

5×2 Array{Float64,2}:
 1.0  0.0
 0.0  1.0
 0.0  0.0
 0.0  0.0
 0.0  0.0

In [101]:
identity[:,target.data.+1]

5×1×2 Array{Float64,3}:
[:, :, 1] =
 1.0
 0.0
 0.0
 0.0
 0.0

[:, :, 2] =
 0.0
 1.0
 0.0
 0.0
 0.0

In [103]:
target_dist = reshape(identity[:,target.data.+1],(size(pred.data)))

5×2 Array{Float64,2}:
 1.0  0.0
 0.0  1.0
 0.0  0.0
 0.0  0.0
 0.0  0.0

In [None]:
target_dist = reshape(identity[target.data .+ 1,:],(size(a.data)))

# Part 17: The Recurrent Neural Network Layer

In [249]:
# Single recurrent cell built from three Linear layers and one activation.
mutable struct RNNCell_ <: Layer
    n_hidden     # width of the hidden state

    activation   # Sigmoid() or Tanh() layer

    w_ih         # Linear(n_inputs, n_hidden)
    w_hh         # Linear(n_hidden, n_hidden)
    w_ho         # Linear(n_hidden, n_output)

    parameters   # flattened parameters of the three Linear layers

    # `activation` is "sigmoid" (default) or "tanh"; any other value throws
    # the string "Non-linearity not found".
    function RNNCell_(n_inputs, n_hidden, n_output, activation="sigmoid")
        act = if activation == "sigmoid"
            Sigmoid()
        elseif activation == "tanh"
            Tanh()
        else
            throw("Non-linearity not found")
        end

        w_ih = Linear(n_inputs, n_hidden)
        w_hh = Linear(n_hidden, n_hidden)
        w_ho = Linear(n_hidden, n_output)

        per_layer = Any[get_parameters(w_ih), get_parameters(w_hh), get_parameters(w_ho)]
        parameters = collect(Iterators.flatten(per_layer))
        return new(n_hidden, act, w_ih, w_hh, w_ho, parameters)
    end
end

# One recurrent step: combine the transformed input with the transformed
# previous hidden state, apply the activation, and project to the output.
# Returns the tuple (output, new_hidden).
function forward(rnn::RNNCell_, input::Tensor, hidden::Tensor)
    recurrent_part = forward(rnn.w_hh, hidden)
    input_part = forward(rnn.w_ih, input)
    pre_activation = input_part + recurrent_part
    next_hidden = forward(rnn.activation, pre_activation)
    return forward(rnn.w_ho, next_hidden), next_hidden
end

# Fresh all-zero hidden state of size n_hidden x batch_size with autograd on.
function init_hidden(rnn::RNNCell_; batch_size=1)
    blank = zeros(rnn.n_hidden, batch_size)
    return Tensor(blank; autograd=true)
end

init_hidden (generic function with 2 methods)

In [250]:
# Build the integer-encoded dataset from the first 1000 lines of the text file.
raw = readlines("tasksv11/en/qa1_single-supporting-fact_train.txt")

# Lower-case each line, split on single spaces and drop the first token
# (presumably a line number -- verify against the data file).
tokens = []
for line in raw[1:1000]
    push!(tokens, split(lowercase(line)," ")[2:end])
end

# Left-pad every sentence with "-" up to 6 tokens.
# NOTE(review): assumes no sentence has more than 6 tokens; `repeat` with a
# negative count would fail otherwise -- confirm.
new_tokens = []
for line in tokens
    push!(new_tokens, cat(repeat(["-"],6-length(line)), line;dims=1))
end

tokens = new_tokens

# Vocabulary: every distinct non-empty token. The order of `vocab` is whatever
# iteration order the Set yields.
vocab = Set()
for sent in tokens
    for word in sent
        if length(word)>0
            push!(vocab, word)
        end
    end
end
vocab = collect(vocab)

# Map each word to its 1-based position in `vocab`.
word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end

# Encode every sentence as a vector of vocabulary indices.
indices = []
for line in tokens
    idx = []
    for w in line
        push!(idx,word2index[w])
    end
    push!(indices, idx)
end

# One column per sentence, one row per token position.
data = reduce(hcat,indices);

In [251]:
size(data)

(6, 1000)

In [252]:
data[:,1:3]

6×3 Array{Any,2}:
 16  16  16
 26  51  16
 78   8  54
 19  19  24
 76  76  29
 79  47  34

In [255]:
# Model setup: 16-dimensional embeddings feeding an RNN cell with 16 hidden
# units and one output per vocabulary entry.
embed = Embedding(16, length(vocab))
# Python equivalent kept for reference:
# model = RNNCell(n_inputs=16, n_hidden=16, n_output=len(vocab))
model = RNNCell_(16, 16, length(vocab))
# Python equivalent kept for reference:
# optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)
criterion = CrossEntropyLoss()
# Optimise the RNN and the embedding parameters together, learning rate 0.05.
optim = SGD(cat(get_parameters(model), get_parameters(embed); dims=1), 0.05);

In [256]:
# Train the RNN to predict the 6th token of each sentence from the first five.
# Every iteration uses the same first `batch_size` columns of `data`.
for iter=1:1000
    batch_size = 100
    # Reset on every iteration, so it only ever holds the current loss.
    total_loss = 0
    
    hidden = init_hidden(model, batch_size=batch_size)
    output = nothing # declared here so it is still visible after the inner loop
    # Feed token positions 1..5 through the cell, carrying the hidden state.
    for t=1:5
        input = Tensor(data[t,1:batch_size], autograd=true)
        rnn_input = forward(embed, input)
        output, hidden = forward(model, rnn_input, hidden)
    end
    # The 6th token of each sentence is the prediction target.
    target = Tensor(data[6,1:batch_size], autograd=true)
    loss = forward(criterion, output, target)
    backward(loss)
    step(optim)
    total_loss += loss.data
    
    # Report on iterations 1, 201, 401, ...
    if (iter-1) %200 ==0
        # Row index (class) of the largest output in every column.
        max_ind = argmax(output.data;dims=1)
        p_correct = dropdims(map(x->x.I[1], max_ind);dims=1)
        # Fraction of columns whose predicted class equals the target.
        p_correct = mean(target.data .== p_correct)
        # NOTE(review): the divisor 10 is hard-coded -- confirm its meaning.
        println("Loss: $(total_loss/10) correct: $(p_correct)")
    end
end

Loss: 0.4420528179387115 correct: 0.01
Loss: 0.18246921958357343 correct: 0.22
Loss: 0.1689847350724935 correct: 0.29
Loss: 0.15953953989369668 correct: 0.31
Loss: 0.14527913965836317 correct: 0.34


In [275]:
# Run the trained model on the first sentence only and print its prediction.
batch_size = 1
hidden = init_hidden(model, batch_size=batch_size)

output = nothing # declared here so it is still visible after the loop
# Feed token positions 1..5 through the cell, carrying the hidden state.
for t=1:5
    input = Tensor(data[t,1:batch_size], autograd=true)
    rnn_input = forward(embed, input)
    output, hidden = forward(model, rnn_input, hidden)
end

target = Tensor(data[6,1:batch_size], autograd=true)   
loss = forward(criterion, output, target)

# Rebuild the context text from the first five token indices of sentence 1.
ctx = ""
for idx in data[:,1][1:end-1]
    global ctx *= vocab[idx] * " "
end
println("Context: ",ctx)
println("True: ",vocab[target.data[1]])
# Predicted word: row index of the largest output value.
println("Pred: ", vocab[argmax(output.data).I[1]])

Context: - mary moved to the 
True: bathroom.
Pred: garden.


In [272]:
argmax(output.data).I[1]

80

In [264]:
ctx

"- mary moved to the "

In [263]:
for idx in data[:,1][1:end-1]
    global ctx *= vocab[idx] * " "
end

In [217]:
output1 = nothing
for t=1:5
        input = Tensor(data[t,1:100], autograd=true)
        rnn_input = forward(embed, input)
        output1, hidden = forward(model, rnn_input, hidden)
    end

In [218]:
target = Tensor(data[6,1:100], autograd=true);

In [220]:
loss = forward(criterion, output1, target)

1.328779950086296

In [207]:
max_ind = argmax(output1.data;dims=1)

1×100 Array{CartesianIndex{2},2}:
 CartesianIndex(81, 1)  CartesianIndex(81, 2)  …  CartesianIndex(81, 100)

In [229]:
output_states

UndefVarError: UndefVarError: output_states not defined

In [213]:
output1.data[:,1]

83-element Array{Float64,1}:
 -1.2804134890495997
 -0.6347676486185593
 -1.7461817631847776
 -1.0208964867225314
 -0.6528698780254869
 -0.9156203628562507
 -1.510750661413532
 -0.7465251833226221
 -1.2718211548000362
 11.861122086093369
 -0.011673233720826293
 -0.9172690321238208
 -1.509406875013792
  ⋮
 -1.9214146251264579
  0.43806584074565247
 -0.8626455647947088
 -1.4045141461918484
 -1.2207898535130632
 -0.67708099448368
  7.414199956870108
 -0.9657370420256977
 11.995593282581163
 12.273136203149102
 -1.5010536303728732
 -0.8132843049002384

In [None]:
# NOTE(review): this cell is Python syntax (`range(...)`, `True`, `np.argmax`,
# method calls such as `loss.backward()`), not Julia, and has no recorded
# execution. Presumably it is the reference implementation that the Julia
# training loop was ported from -- confirm before removing it.
for iter in range(1000):
    batch_size = 100
    total_loss = 0
    
    hidden = model.init_hidden(batch_size=batch_size)

    for t in range(5):
        input = Tensor(data[0:batch_size,t], autograd=True)
        rnn_input = embed.forward(input=input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)

    target = Tensor(data[0:batch_size,t+1], autograd=True)    
    loss = criterion.forward(output, target)
    loss.backward()
    optim.step()
    total_loss += loss.data
    if(iter % 200 == 0):
        p_correct = (target.data == np.argmax(output.data,axis=1)).mean()
        print("Loss:",total_loss / (len(data)/batch_size),"% Correct:",p_correct)

In [155]:
hidden = init_hidden(model, batch_size=100);

In [158]:
t=5
input = Tensor(data[t,1:100], autograd=true)
        rnn_input = forward(embed, input)
        output, hidden = forward(model, rnn_input, hidden);

In [159]:
target = Tensor(data[6,1:100], autograd=true);

In [161]:
loss = forward(criterion, output, target)

4.733301387326336

In [162]:
typeof(loss)

Tensor

In [142]:
input = Tensor(data[1:4,1], autograd=true)


4-element Array{Any,1}:
 16
 26
 78
 19

In [115]:
 rnn_input = forward(embed, input)

16×2 Array{Float64,2}:
  0.0631712   -0.0375915
  0.0581039   -0.00897466
 -0.068481     0.0134853
 -0.025504     0.0989633
  0.105523    -0.0881361
 -0.0072102   -0.0684704
  0.0688439   -0.157776
 -0.0446448   -0.0911237
 -0.0920594   -0.0205452
  0.0728195   -0.16497
  0.041313    -0.0991488
 -0.00904722  -0.122643
  0.0446028   -0.130049
  0.00830057  -0.0368163
 -0.128805    -0.097122
 -0.0560882    0.0575168

In [116]:
output, hidden = forward(model, rnn_input, hidden);

In [117]:
target = Tensor(data[1:2,2], autograd=true)

2-element Array{Any,1}:
 16
 51

In [118]:
loss = forward(criterion, output, target)

4.534548642241808

In [128]:
max_ind = argmax(output.data;dims=1)

1×2 Array{CartesianIndex{2},2}:
 CartesianIndex(3, 1)  CartesianIndex(3, 2)

In [137]:
dropdims(map(x->x.I[1], max_ind);dims=1)

2-element Array{Int64,1}:
 3
 3

In [139]:
target.data|>typeof

Array{Any,1}

In [140]:
target.data .== dropdims(map(x->x.I[1], max_ind);dims=1)

2-element BitArray{1}:
 0
 0

In [29]:
# NOTE(review): this cell is Python syntax (`range(...)`, `True`, `np.argmax`,
# method calls such as `loss.backward()`), not Julia. The output recorded
# below it (a 1000-element array of index vectors) does not correspond to
# this code -- presumably a stale output; confirm before relying on it.
for iter in range(1000):
    batch_size = 100
    total_loss = 0
    
    hidden = model.init_hidden(batch_size=batch_size)

    for t in range(5):
        input = Tensor(data[0:batch_size,t], autograd=True)
        rnn_input = embed.forward(input=input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)

    target = Tensor(data[0:batch_size,t+1], autograd=True)    
    loss = criterion.forward(output, target)
    loss.backward()
    optim.step()
    total_loss += loss.data
    if(iter % 200 == 0):
        p_correct = (target.data == np.argmax(output.data,axis=1)).mean()
        print("Loss:",total_loss / (len(data)/batch_size),"% Correct:",p_correct)

1000-element Array{Any,1}:
 Any[16, 26, 78, 19, 76, 79]
 Any[16, 51, 8, 19, 76, 47]
 Any[16, 16, 54, 24, 29, 34]
 Any[6, 8, 25, 19, 76, 47]
 Any[16, 2, 78, 19, 76, 80]
 Any[16, 16, 54, 24, 49, 50]
 Any[16, 51, 78, 19, 76, 9]
 Any[16, 2, 40, 19, 76, 79]
 Any[16, 16, 54, 24, 49, 50]
 Any[16, 26, 78, 19, 76, 47]
 Any[16, 6, 15, 19, 76, 9]
 Any[16, 16, 54, 24, 49, 20]
 Any[51, 8, 25, 19, 76, 80]
 ⋮
 Any[16, 2, 15, 19, 76, 30]
 Any[16, 16, 54, 24, 35, 11]
 Any[16, 2, 78, 19, 76, 77]
 Any[16, 26, 40, 19, 76, 77]
 Any[16, 16, 54, 24, 29, 31]
 Any[16, 26, 78, 19, 76, 79]
 Any[16, 6, 15, 19, 76, 77]
 Any[16, 16, 54, 24, 35, 59]
 Any[16, 26, 78, 19, 76, 77]
 Any[16, 2, 8, 19, 76, 30]
 Any[16, 16, 54, 24, 29, 5]
 Any[16, 2, 8, 19, 76, 80]

In [None]:
target_dist = reshape(identity[:,target.data.+1],(size(pred.data)))

In [None]:
mean(sum(softmax_output .* target_dist;dims=1))

In [None]:
softmax_output .- target_dist

In [None]:
dropdims(identity[:,target.data.+1];dims=2)

In [None]:
softmax_output =  softmax(pred.data)

In [None]:
l = CrossEntropyLoss()

In [None]:
loss = forward(l, pred, target)

In [None]:
identity  = 

In [None]:
# Softmax cross-entropy for ZERO-based integer class labels: `target.data .+ 1`
# selects the one-hot column, so label 0 means the first class.
#
# Differences from the earlier draft of this method:
#   * the one-hot targets are reshaped to `size(a.data)`, the argument, rather
#     than to the shape of a global `pred` variable;
#   * the identity matrix is square (n_classes x n_classes), so any valid class
#     label can be encoded regardless of the batch size;
#   * when `a` has autograd enabled, the softmax output and one-hot targets
#     are stored on the returned tensor, because the "cross_entropy" backward
#     step reads `softmax_output` and `target_dist` from it.
function forward(l::CrossEntropyLoss, a::Tensor, target::Tensor)
    softmax_output = softmax(a.data)
    log_out = log.(softmax_output)

    n_classes = size(a.data, 1)
    onehot_basis = Matrix{Float64}(I, n_classes, n_classes)
    target_dist = reshape(onehot_basis[:, target.data .+ 1], size(a.data))
    loss = -mean(sum(log_out .* target_dist; dims=1))
    if a.autograd
        out = Tensor(loss; autograd=true, creators=[a], creation_op = "cross_entropy")
        out.softmax_output = softmax_output
        out.target_dist = target_dist
        return out
    end
    return Tensor(loss)
end

In [None]:
temp = exp.(pred.data)

In [None]:
# Column-wise softmax: every column of the result sums to 1.
#
# The per-column maximum is subtracted before exponentiating. This leaves the
# result mathematically unchanged but keeps `exp` from overflowing to Inf
# (and the quotient from becoming NaN) when the inputs are large.
function softmax(x)
    shifted = exp.(x .- maximum(x; dims=1))
    return shifted ./ sum(shifted; dims=1)
end

In [None]:
logts = softmax(pred.data)

In [None]:
label_logits = -mean([log(logts[j+1,i]) for (i,j) in enumerate(target.data)])

In [None]:
using Statistics: mean

In [None]:
backward(pred, )

In [None]:
pred1 = forward(embed, data)

In [None]:
pred1.index_select_indices

In [None]:
if t.creation_op == "index_select"
                new_grad = zeros(size(t.creators[1]))
                indices = t.index_select_indices
                major_chunks = partition(1:size(t.grad,2),length(indices))
                grad_chunks = [t.grad.data[:,inds][:,j]  for(i,inds) in enumerate(major_chunks) for j=1:size(inds)[1]]
    
                for (i,ind) in enumerate(flatten(indices))
                    new_grad[:,ind] +=  grad_chunks[i]
                end
                backward(t.creators[1], Tensor(new_grad))
            end

In [None]:
parent = zeros(size(embed.weight.data))

In [None]:
indices = pred1.index_select_indices

In [None]:
grad_chunks = [pred1.data[:,i] for i in partition(1:size(pred1.data,2),length(indices.data))]

In [None]:
embed.weight

In [None]:
x = index_select_helper(embed.weight.data, [[1,2,1,2],[2,3,1,1]])

In [None]:
new_grad = zeros(size(embed.weight))

In [None]:
indices = [[1,2,1,2],[2,3,1,1]];

In [None]:
major_chunks = partition(1:size(x,2),length(indices[1]))

In [None]:
major_chunks = collect(major_chunks)

In [None]:
model.layers

In [None]:
x[:,major_chunks[2]]

In [None]:
size(major_chunks[2])[1]

In [None]:
grad_chunks = [x[:,inds][:,j]  for(i,inds) in enumerate(major_chunks) for j=1:size(inds)[1] ]

In [None]:
for (i,ind) in enumerate(flatten(indices))
    new_grad[:,ind] +=  grad_chunks[i]
end

In [None]:
new_grad

In [None]:
new_grad[:,1] ./4

In [None]:
major_chunk = collect(partition(1:size(x,2), length([[1,2,1,2],[2,3,1,1]])))

In [None]:
grad_chunks = [x[:,i] for i in partition(1:trunc(Int,size(x,2)/2),length([1,2,1,2]))]

In [None]:
partition(1:8,1)|>collect

In [None]:
# Select columns of `a` according to `indices`. When `a` has autograd enabled
# the result is recorded as creation_op "index_select" and keeps the index
# tensor on the result.
function index_select(a::Tensor, indices::Tensor)
    picked = index_select_helper(a.data, indices.data)
    a.autograd || return Tensor(picked)
    out = Tensor(picked; autograd=true, creators=[a], creation_op="index_select")
    out.index_select_indices = indices
    return out
end

In [None]:
identity = embed.weight.data

In [None]:
embed_nd = index_select_helper(embed.weight.data, [1,2,1,2])

In [None]:
using Base.Iterators:partition

In [None]:
new_grad = zeros(size(identity))

In [None]:
pred1.index_select_indices

In [None]:
size()

In [None]:
grad_chunks = [embed_nd[:,i] for i in partition(1:size(t.grad.data,1),size(new_grad,1))]

In [None]:
indices = t.index_select_indices
grad_chunks = [t.grad.data[i,:] for i in partition(1:size(t.grad.data,1),size(new_grad,1))]


In [None]:
map_f(x,y) = reshape(reduce(hcat,map(a -> y[:,a], x)), (size(y,1),:,length(x) ))

In [None]:
slices = map_f([2:4, 1:3, [1,2,4]],identity)

In [None]:
ref_[:,[1,2,4]] += slices[:,:,3]

In [None]:
ref_[:,1:3] += slices[:,:,2]

In [None]:
ref_[:,2:4] += slices[:,:,1]

In [None]:
ref_[:,1] = ref_[:,1] ./2

In [None]:
identity[:,1:3]

In [None]:
identity[:,[1,2,4]]

In [None]:
sd = map(x -> identity[:,x], [2:4])

In [None]:
reshape(reduce(hcat, sd), (4,3,1))

In [None]:
typeof(identity)

In [None]:
collect(1:3:12)

In [None]:
using Base.Iterators:partition

In [None]:
partition(1:12,3)|>collect

In [None]:
using LinearAlgebra:I

In [None]:
# Gather columns of `a`: every entry of `indices` selects one column (or one
# group of columns) and the pieces are stacked vertically.
function index_select_helper(a::Array, indices)
    pieces = [a[:, ind] for ind in indices]
    return reduce(vcat, pieces)
end

In [None]:

"""
    backward(t::Tensor, grad=nothing, grad_origin=nothing)

Accumulate `grad` into `t.grad` and propagate it to `t.creators` according to
`t.creation_op`. Does nothing when `t.autograd` is false.

# Arguments
- `grad`: incoming gradient tensor; when `nothing`, a tensor of ones with the
  shape of `t.data` is used.
- `grad_origin`: the child tensor the gradient comes from; when given, its
  counter in `t.children` is decremented.

# Throws
- The string `"cannot backprop more than once"` when the counter for
  `grad_origin` in `t.children` is already 0.
- `AssertionError` when `grad.autograd` is true.
"""
function backward(t::Tensor, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor(ones(size(t.data)))
        end
    
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        # First gradient is stored as-is; later ones are summed onto it.
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # Only continue backpropagating if there is something to backprop
        # into and all gradients (from children) are accounted for. Waiting
        # for children is overridden when backward was called on this
        # tensor directly (no grad_origin).
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2'
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            # The op name carries the dimension after the underscore.
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end
            
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
            
            if t.creation_op == "sigmoid"
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* t .* (ones_ - t) )
            end
            
            if t.creation_op == "tanh"
                ones_ = Tensor(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* (ones_ - (t .* t)))
            end
            
            if t.creation_op == "index_select"
                # Scatter-add: column k of the incoming gradient belongs to the
                # k-th selected index, so an index selected several times
                # accumulates several contributions.
                new_grad = zeros(size(t.creators[1].data))
                indices = t.index_select_indices.data
                for (k, ind) in enumerate(Iterators.flatten(indices))
                    new_grad[:, ind] .+= t.grad.data[:, k]
                end
                backward(t.creators[1], Tensor(new_grad))
            end
        end
    end
end

In [None]:
# Stateless tanh activation layer.
struct Tanh <: Layer
    function Tanh()
        return new()
    end
end

# Stateless sigmoid activation layer.
struct Sigmoid <: Layer
    function Sigmoid()
        return new()
    end
end

# Activation layers hold no parameters, so both return an empty list.
get_parameters(l::Tanh) = []

get_parameters(l::Sigmoid) = []

In [None]:
"""
    forward(l::Sigmoid, a::Tensor)

Apply `σ` element-wise to `a.data`. When `a.autograd` is true the result
records `a` as its creator with op `"sigmoid"`.
"""
function forward(l::Sigmoid, a::Tensor)
    activated = σ.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op = "sigmoid")
end
        
"""
    forward(l::Tanh, a::Tensor)

Apply `tanh` element-wise to `a.data`. When `a.autograd` is true the result
records `a` as its creator with op `"tanh"`.
"""
function forward(l::Tanh, a::Tensor)
    activated = tanh.(a.data)
    a.autograd || return Tensor(activated)
    return Tensor(activated; autograd=true, creators=[a], creation_op = "tanh")
end

In [None]:
"""
    index_select_helper(a::Array, indices)

Index the columns of `a` with each entry of `indices` and concatenate the
pieces horizontally, in order.
"""
function index_select_helper(a::Array, indices)
    picked = map(indices) do ind
        a[:, ind]
    end
    return reduce(hcat, picked)
end

"""
    index_select(a::Tensor, indices::Tensor)

Wrap `index_select_helper(a.data, indices.data)` in a `Tensor`. With
`a.autograd` set, the result is linked to `a` through op `"index_select"` and
stores `indices` in its `index_select_indices` field.
"""
function index_select(a::Tensor, indices::Tensor)
    picked = index_select_helper(a.data, indices.data)
    if !(a.autograd)
        return Tensor(picked)
    end
    result = Tensor(picked, autograd=true, creators=[a], creation_op = "index_select")
    result.index_select_indices = indices
    return result
end


In [None]:
# Build an Embedding layer with arguments (3, 5).
# NOTE(review): presumably (vocabulary size, embedding dimension) — confirm
# against the Embedding constructor.
embed = Embedding(3,5)

In [None]:
"""
    forward(l::Linear, input)

Apply the linear layer: `l.W * input` plus the bias `l.b` expanded along
dimension 2 to `size(input.data, 2)` copies.
"""
function forward(l::Linear, input)
    # The bias is expanded exactly once; a diagnostic print that expanded it a
    # second time would create an extra tensor on every call.
    return (l.W * input)  + expand(l.b,2,size(input.data, 2))
end


In [7]:
"""
    forward(l::Linear, input)

Compute `l.W * input` and add the bias `l.b`, expanded along dimension 2 to
`size(input.data, 2)` copies.
"""
function forward(l::Linear, input)
    ncols = size(input.data, 2)
    projected = l.W * input
    return projected + expand(l.b, 2, ncols)
end

# Squared-error loss layer; it carries no state.
struct MSELoss <: Layer
    function MSELoss()
        return new()
    end
end

"""
    forward(l::MSELoss, pred, target)

Return the element-wise squared difference of `pred` and `target`, summed
with `dims=2`.
"""
function forward(l::MSELoss, pred, target)
    # The difference is evaluated twice on purpose: each subtraction yields
    # its own result, and both feed the element-wise product.
    lhs = pred - target
    rhs = pred - target
    return sum(lhs .* rhs; dims=2)
end


# Stateless tanh activation layer.
struct Tanh <: Layer
    function Tanh()
        return new()
    end
end

# Stateless sigmoid activation layer.
struct Sigmoid <: Layer
    function Sigmoid()
        return new()
    end
end

# Activation layers hold no parameters, so both return an empty list.
get_parameters(l::Tanh) = []

get_parameters(l::Sigmoid) = []

"""
    forward(l::Sigmoid, a::Tensor)

Apply `σ` element-wise to `a.data`. When `a.autograd` is true the result
records `a` as its creator with op `"sigmoid"`.
"""
function forward(l::Sigmoid, a::Tensor)
    squashed = σ.(a.data)
    if !(a.autograd)
        return Tensor(squashed)
    end
    return Tensor(squashed; autograd=true, creators=[a], creation_op = "sigmoid")
end
        
"""
    forward(l::Tanh, a::Tensor)

Apply `tanh` element-wise to `a.data`. When `a.autograd` is true the result
records `a` as its creator with op `"tanh"`.
"""
function forward(l::Tanh, a::Tensor)
    squashed = tanh.(a.data)
    if !(a.autograd)
        return Tensor(squashed)
    end
    return Tensor(squashed; autograd=true, creators=[a], creation_op = "tanh")
end

"""
    SGD(parameters, alpha)

Gradient-descent optimizer holding the collection of `parameters` to update
and the step size `alpha`.
"""
mutable struct SGD
    parameters
    alpha
    SGD(parameters, alpha) = new(parameters, alpha)
end

"""
    zero!(opt::SGD)

Reset the gradient array of every parameter in `opt` to zero, in place.
Parameters whose `grad` is `nothing` have no gradient to reset and are skipped.
"""
function zero!(opt::SGD)
    for p in opt.parameters
        isnothing(p.grad) && continue
        # fill! instead of multiplying by 0.0: NaN and Inf entries times zero
        # stay NaN, so multiplication cannot clear a corrupted gradient.
        fill!(p.grad.data, 0)
    end
    return nothing
end

"""
    step(opt::SGD, zero=true)

Update every parameter as `p.data -= p.grad.data .* opt.alpha`. When `zero`
is true the gradient array is reset to zero afterwards.
"""
function step(opt::SGD, zero=true)
    for p in opt.parameters
        p.data -= (p.grad.data .* opt.alpha)
        if zero
            # Overwrite rather than scale by 0.0 so NaN/Inf entries are cleared.
            fill!(p.grad.data, 0)
        end
    end
    return nothing
end

step (generic function with 2 methods)

In [None]:
using Base.Iterators:partition, flatten

In [6]:
# Common supertype of all layers.
abstract type Layer end

# Default accessor: a layer exposes whatever it keeps in its `parameters` field.
get_parameters(l::Layer) = l.parameters

# Fully connected layer holding a weight tensor `W`, a bias tensor `b`, and the
# list `parameters` containing both.
mutable struct Linear <: Layer
    W
    b
    parameters
    
    # Weights are drawn with `randn` in shape (n_outputs, n_inputs) and scaled
    # by sqrt(1/n_inputs); the bias starts at zero. Both track gradients.
    function Linear(n_inputs, n_outputs)
        scale = sqrt(1.0/n_inputs)
        weights = Tensor(randn(n_outputs, n_inputs) .* scale, autograd=true)
        bias = Tensor(zeros(n_outputs), autograd=true)
        return new(weights, bias, [weights, bias])
    end
end

"""
    forward(l::Linear, input)

Multiply `input` by the weights `l.W`, then add the bias `l.b` expanded along
dimension 2 to `size(input.data, 2)` copies.
"""
function forward(l::Linear, input)
    width = size(input.data, 2)
    weighted = l.W * input
    return weighted + expand(l.b, 2, width)
end

# Squared-error loss layer; it carries no state.
struct MSELoss <: Layer
    function MSELoss()
        return new()
    end
end

"""
    forward(l::MSELoss, pred, target)

Return the element-wise squared difference of `pred` and `target`, summed
with `dims=2`.
"""
function forward(l::MSELoss, pred, target)
    # Two separate subtractions, as in the one-line form: each yields its own
    # result, and both feed the element-wise product.
    first_diff = pred - target
    second_diff = pred - target
    return sum(first_diff .* second_diff; dims=2)
end


# Stateless tanh activation layer.
struct Tanh <: Layer
    function Tanh()
        return new()
    end
end

# Stateless sigmoid activation layer.
struct Sigmoid <: Layer
    function Sigmoid()
        return new()
    end
end

# Activation layers hold no parameters, so both return an empty list.
get_parameters(l::Tanh) = []

get_parameters(l::Sigmoid) = []

"""
    forward(l::Sigmoid, a::Tensor)

Apply `σ` element-wise to `a.data`. When `a.autograd` is true the result
records `a` as its creator with op `"sigmoid"`.
"""
function forward(l::Sigmoid, a::Tensor)
    values = σ.(a.data)
    return if a.autograd
        Tensor(values; autograd=true, creators=[a], creation_op = "sigmoid")
    else
        Tensor(values)
    end
end
        
"""
    forward(l::Tanh, a::Tensor)

Apply `tanh` element-wise to `a.data`. When `a.autograd` is true the result
records `a` as its creator with op `"tanh"`.
"""
function forward(l::Tanh, a::Tensor)
    values = tanh.(a.data)
    return if a.autograd
        Tensor(values; autograd=true, creators=[a], creation_op = "tanh")
    else
        Tensor(values)
    end
end

"""
    SGD(parameters, alpha)

Gradient-descent optimizer holding the collection of `parameters` to update
and the step size `alpha`.
"""
mutable struct SGD
    parameters
    alpha
    SGD(parameters, alpha) = new(parameters, alpha)
end

"""
    zero!(opt::SGD)

Reset the gradient array of every parameter in `opt` to zero, in place.
Parameters whose `grad` is `nothing` have no gradient to reset and are skipped.
"""
function zero!(opt::SGD)
    for p in opt.parameters
        isnothing(p.grad) && continue
        # fill! instead of multiplying by 0.0: NaN and Inf entries times zero
        # stay NaN, so multiplication cannot clear a corrupted gradient.
        fill!(p.grad.data, 0)
    end
    return nothing
end

"""
    step(opt::SGD, zero=true)

Update every parameter as `p.data -= p.grad.data .* opt.alpha`. When `zero`
is true the gradient array is reset to zero afterwards.
"""
function step(opt::SGD, zero=true)
    for p in opt.parameters
        p.data -= (p.grad.data .* opt.alpha)
        if zero
            # Overwrite rather than scale by 0.0 so NaN/Inf entries are cleared.
            fill!(p.grad.data, 0)
        end
    end
    return nothing
end

step (generic function with 2 methods)