# Part 1: Introduction to Tensors

In [2]:
import Base:println,+

abstract type Tensor end

mutable struct Tensor_v1 <: Tensor
    data 
end

+(a::Tensor, b::Tensor) = a.data + b.data

println(t::Tensor) = println(t.data)
    
x = Tensor_v1([1,2,3,4,5])
print(x)

y = x + x
print(y)

Tensor_v1([1, 2, 3, 4, 5])[2, 4, 6, 8, 10]

# Part 2: Introduction to Autograd

In [9]:
import Base:println, +, show

mutable struct Tensor_v2 <: Tensor
    data
    creators
    creation_op
    grad 
    Tensor_v2(data; creators=nothing, creation_op = nothing) = 
    new(data, creators, creation_op)
end

function backward(t::Tensor, grad)
    t.grad = grad
    
    if t.creation_op == "add"
        backward(t.creators[1], grad)
        backward(t.creators[2], grad)
    end
end

+(a::Tensor, b::Tensor) = Tensor_v2(a.data + b.data; creators=[a,b], creation_op="add")

println(t::Tensor) = println(t.data)
println(t::Array{Tensor_v2,1}) = println([i.data for i in t])
show(io::IO,m::MIME"text/plain",a::Tensor) = show(io,m,a.data)
    
x = Tensor_v2([1,2,3,4,5])
y = Tensor_v2([2,2,2,2,2])

z = x + y
backward(z, Tensor_v2([1,1,1,1,1]))

In [10]:
println(x.grad)
println(y.grad)
println(z.creators)
println(z.creation_op)

[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[[1, 2, 3, 4, 5], [2, 2, 2, 2, 2]]
add


In [11]:
a = Tensor_v2([1,2,3,4,5])
b = Tensor_v2([2,2,2,2,2])
c = Tensor_v2([5,4,3,2,1])
d = Tensor_v2([-1,-2,-3,-4,-5])

e = a + b
f = c + d
g = e + f

backward(g, Tensor_v2([1,1,1,1,1]))

println(a.grad)

[1, 1, 1, 1, 1]


# Part 3: Tensors That Are Used Multiple Times

In [12]:
a = Tensor_v2([1,2,3,4,5])
b = Tensor_v2([2,2,2,2,2])
c = Tensor_v2([5,4,3,2,1])

d = a + b
e = b + c
f = d + e
backward(f, Tensor_v2([1,1,1,1,1]))

b.grad.data == [2,2,2,2,2]

false

In [13]:
b.grad.data

5-element Array{Int64,1}:
 1
 1
 1
 1
 1

# Part 4: Upgrading Autograd to Support Multiple Tensors

In [14]:
using Random
import Base:+,println

mutable struct Tensor_v3 <: Tensor
    data
    autograd
    creators
    creation_op
    id
    children
    grad 
    function Tensor_v3(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        if isnothing(id)
            id = rand(1:100000)
        end
        T = new(data, autograd, creators, creation_op, id)
        T.children = Dict()
        T.grad = nothing
        
        if !(isnothing(creators))
            for c in creators
                if haskey(c.children, T.id)
                    c.children[T.id] += 1
                else
                    c.children[T.id] = 1
                end
            end
        end
        return T
    end
end

function all_children_grads_accounted_for(t::Tensor)
    for (id, cnt) in t.children
        if (cnt != 0)
            return false
        end
    end
    return true
end

function backward(t::Tensor_v3, grad=nothing, grad_origin=nothing)
    if t.autograd
        grad = Tensor_v3(ones(size(t.data)))
    
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # only continue backpropping if there's something to
        # backprop into and if all gradients (from children)
        # are accounted for override waiting for children if
        # "backprop" was called on this variable directly
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
        end
    end
end

function +(a::Tensor_v3, b::Tensor_v3)
    if (a.autograd && b.autograd)
        return Tensor_v3(a.data .+ b.data; autograd=true, creators=[a,b], creation_op = "add")
    end
    return Tensor_v3(a.data+b.data)
end

println(t::Tensor_v3) = println(t.data)

a = Tensor_v3([1,2,3,4,5]; autograd=true)
b = Tensor_v3([2,2,2,2,2]; autograd=true)
c = Tensor_v3([5,4,3,2,1]; autograd=true)

d = a + b
e = b + c
f = d + e

backward(f, Tensor_v3([1,1,1,1,1]))

println(b.grad.data == [2,2,2,2,2])

true


# Part 5: Add Support for Negation

In [15]:
using Random
import Base:+,-,println


function backward(t::Tensor_v3, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor_v3(ones(size(t.data)))
        end
    
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # only continue backpropping if there's something to
        # backprop into and if all gradients (from children)
        # are accounted for override waiting for children if
        # "backprop" was called on this variable directly
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
        end
    end
end

function -(a::Tensor_v3)
    if (a.autograd)
        return Tensor_v3(a.data .* -1; autograd=true, creators=[a], creation_op = "neg")
    end
    return Tensor_v3(a.data .* -1)
end

a = Tensor_v3([1,2,3,4,5]; autograd=true)
b = Tensor_v3([2,2,2,2,2]; autograd=true)
c = Tensor_v3([5,4,3,2,1]; autograd=true)

d = a + (-b)
e = (-b) + c
f = d + e

backward(f, Tensor_v3([1,1,1,1,1]))

print(b.grad.data == [-2,-2,-2,-2,-2])

true

# Part 6: Add Support for Additional Functions

In [17]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims


function backward(t::Tensor_v3, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor_v3(ones(size(t.data)))
        end
    
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # only continue backpropping if there's something to
        # backprop into and if all gradients (from children)
        # are accounted for override waiting for children if
        # "backprop" was called on this variable directly
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2' ################
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end              
            
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
        end
    end
end

size(a::Tensor_v3) = size(a.data)

function -(a::Tensor_v3, b::Tensor_v3)
    if (a.autograd && b.autograd)
        return Tensor_v3(a.data - b.data; autograd=true, creators=[a,b], creation_op = "sub")
    end
    return Tensor_v3(a.data-b.data)
end

#element-wise multiplication
function broadcasted(f::typeof(*), a::Tensor_v3, b::Tensor_v3)
    new_data = zeros(size(a.data))
    for i=1:length(new_data)
        new_data[i] = f(a.data[i] ,b.data[i])
    end
    if (a.autograd && b.autograd)
        return Tensor_v3(new_data; autograd=true, creators=[a,b], creation_op ="mul")
    end
    return Tensor_v3(new_data)
end

function broadcasted(f::typeof(-), a::Tensor_v3, b::Tensor_v3)
    new_data = zeros(size(a.data))
    for i=1:length(new_data)
        new_data[i] = -(a.data[i] ,b.data[i])
    end
    if (a.autograd && b.autograd)
        return Tensor_v3(new_data; autograd=true, creators=[a,b], creation_op ="sub")
    end
    return Tensor_v3(new_data)
end

function sum(a::Tensor_v3; dims=dims)
    new_ = dropdims(sum(a.data ;dims=dims), dims = tuple(findall(size(a) .== 1)...))
    if (a.autograd)
        return Tensor_v3(new_; autograd=true, creators=[a], creation_op = "sum_"*string(dims))
    end
    return Tensor_v3(new_)
end

function dropdims(a::Tensor_v3;dims=dims,ndims_cr=ndims_cr)
    if ndims(a.data) == ndims_cr
        return a
    end
    if (a.autograd)
        return Tensor_v3(dropdims(a.data ;dims=dims); autograd=true, creators=[a], creation_op = "dropdims")
    end
    return Tensor_v3(dropdims(a.data ;dims=dims))
end

function expand(a::Tensor_v3, dim, copies)
    sz = size(a)
    rep = ntuple(d->d==dim ? copies : 1, length(sz)+1)
    new_size = ntuple(d->d<dim ? sz[d] : d == dim ? 1 : sz[d-1], length(sz)+1)
    new_data =  repeat(reshape(a.data, new_size), outer=rep)
    if (a.autograd)
        return Tensor_v3(new_data; autograd=true, creators=[a], creation_op = "expand_"*string(dim))
    end
    return Tensor_v3(new_data)
end

#transpose
function adjoint(a::Tensor_v3)
    if (a.autograd)
        return Tensor_v3(a.data';autograd=true, creators=[a], creation_op = "transpose")
    end
    return Tensor_v3(a.data')
end

#matrix multiply 
function *(a::Tensor_v3, b::Tensor_v3)
    if (a.autograd && b.autograd)
        return Tensor_v3(a.data * b.data; autograd=true, creators=[a,b], creation_op = "mm")
    end
    return Tensor_v3(a.data * b.data)
end

a = Tensor_v3([1,2,3,4,5]; autograd=true)
b = Tensor_v3([2,2,2,2,2]; autograd=true)
c = Tensor_v3([5,4,3,2,1]; autograd=true)

d = a + (-b)
e = (-b) + c
f = d + e

backward(f, Tensor_v3([1,1,1,1,1]))

print(b.grad.data .== [-2,-2,-2,-2,-2])

Bool[1, 1, 1, 1, 1]

# A few Notes on Sum and Expand

In [18]:
x = Tensor_v3([1 2 3;4 5 6];autograd=true)

2×3 Array{Int64,2}:
 1  2  3
 4  5  6

In [19]:
sum(x;dims=2)

2×1 Array{Int64,2}:
  6
 15

In [20]:
sum(x;dims=1)

1×3 Array{Int64,2}:
 5  7  9

In [21]:
expand(x,3,4)

2×3×4 Array{Int64,3}:
[:, :, 1] =
 1  2  3
 4  5  6

[:, :, 2] =
 1  2  3
 4  5  6

[:, :, 3] =
 1  2  3
 4  5  6

[:, :, 4] =
 1  2  3
 4  5  6

# Part 7: Use Autograd to Train a Neural Network

#### Previously we would train a model like this

In [22]:
using Random: seed!
seed!(0)

data = [ 0 0; 0 1; 1 0; 1 1;]
target = [0; 1; 0; 1]

weights_0_1 = rand(2,3)
weights_1_2 = rand(3,1)

for i=1:10
    
#     # Predict
    layer_1 = data * weights_0_1
    layer_2 = layer_1 * weights_1_2
    
#     # Compare
    diff = (layer_2 - target)
    sqdiff = (diff .* diff)
    loss = sum(sqdiff;dims=1) # mean squared error loss

#     # Learn: this is the backpropagation piece
    layer_1_grad = diff * weights_1_2'
    weight_1_2_update = layer_1' * diff
    weight_0_1_update = data' * layer_1_grad
    
    weights_1_2 .-= weight_1_2_update .* 0.1
    weights_0_1 .-= weight_0_1_update .* 0.1
    println(loss[1])
end

1.319677517363771
0.645370221071993
0.48046558059136113
0.42673209144608215
0.3930618130835104
0.363594896295545
0.33605797739677096
0.3100545140899324
0.28544274299374384
0.2621351937537131


In [23]:
using Random: seed!
seed!(0)

data = Tensor_v3([ 0 0; 0 1; 1 0; 1 1;], autograd=true)
target = Tensor_v3([0; 1; 0; 1], autograd=true)

w = []
push!(w, Tensor_v3(rand(2,3), autograd=true))
push!(w, Tensor_v3(rand(3,1), autograd=true))

for i=1:10

#     # Predict
    pred_1 = data * w[1]
    pred_2 = pred_1 * w[2]
    diff_1 = pred_2 .- target
    diff_2 = diff_1 .* diff_1
    
#     # Compare
    loss = sum(diff_2;dims=1)
    
#     # Learn
    backward(loss, Tensor_v3(ones(Float32, size(loss.data))))

    for w_ in w
        w_.data .-= w_.grad.data .* 0.1
        w_.grad.data .*= 0
    end

    println(loss)
end

[0.6764031397217589]
[0.23583373606478314]
[0.09934872031663397]
[0.039683547503183744]
[0.015016987627295843]
[0.0054452677736085454]
[0.0019163531702008321]
[0.0006614488249024953]
[0.0002255690501204964]
[7.636649139276776e-5]


# Part 8: Adding Automatic Optimization

In [24]:
mutable struct SGD
    parameters
    alpha
    SGD(parameters, alpha) = new(parameters, alpha)
end

function zero!(opt::SGD)
    for p in opt.parameters
        p.grad.data .*= 0.0
    end
end

function step(opt::SGD, zero=true)
    for p in opt.parameters
        p.data -= (p.grad.data .* opt.alpha)
        if zero
            p.grad.data .*= 0.0
        end
    end
end

step (generic function with 2 methods)

In [25]:
using Random: seed!
seed!(0)

data = Tensor_v3([ 0 0; 0 1; 1 0; 1 1;], autograd=true)
target = Tensor_v3([0; 1; 0; 1], autograd=true)

w = []
push!(w, Tensor_v3(rand(2,3), autograd=true))
push!(w, Tensor_v3(rand(3,1), autograd=true))

opt = SGD(w, 0.1)

for i=1:10

#     # Predict
    pred_1 = data * w[1]
    pred_2 = pred_1 * w[2]
    diff_1 = pred_2 .- target
    diff_2 = diff_1 .* diff_1
    
#     # Compare
    loss = sum(diff_2;dims=1)
    
#     # Learn
    backward(loss, Tensor_v3(ones(Float32, size(loss.data))))

    step(opt)

    println(loss)
end

[0.6764031397217589]
[0.23583373606478314]
[0.09934872031663397]
[0.039683547503183744]
[0.015016987627295843]
[0.0054452677736085454]
[0.0019163531702008321]
[0.0006614488249024953]
[0.0002255690501204964]
[7.636649139276776e-5]


# Part 9: Adding Support for Layer Types

In [26]:
abstract type Layer end

function get_parameters(l::Layer)
    return l.parameters
end

mutable struct Linear <: Layer
    W
    b
    parameters
    
    function Linear(n_inputs, n_outputs)
        W = Tensor_v3(randn(n_outputs, n_inputs) .* sqrt(1.0/n_inputs), autograd=true)
        b = Tensor_v3(zeros(n_outputs), autograd=true)
        parameters = [W,b]
        return new(W,b,parameters)
    end
end

function forward(l::Linear, input)
    return (l.W * input)  + expand(l.b,2,size(input.data, 2))
end

forward (generic function with 1 method)

# Part 10: Layers Which Contain Layers

In [27]:
mutable struct Sequential <: Layer
    layers
    function Sequential(layers)
        return new(layers)
    end
end

function add(s::Sequential, layer)
    push!(s.layers, layer)
end

function forward(s::Sequential, input)
    for layer in s.layers
        input = forward(layer, input)
    end
    return input
end

function get_parameters(s::Sequential)
    parameters = [get_parameters(layer) for layer in s.layers]
    return collect(Iterators.flatten(parameters))
end

using Random: seed!; seed!(0)
data = Tensor_v3([ 0  0  1  1;0  1  0  1], autograd=true)
target = Tensor_v3([0 1 0 1], autograd=true)

model = Sequential([Linear(2,3), Linear(3,1)])

optim = SGD(get_parameters(model),0.1)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = sum((pred - target) .* (pred - target);dims=2)
    
    # Learn
    backward(loss, Tensor_v3(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

[2.097122200876772]
[1.090072584706214]
[0.8848315119040038]
[0.7588875978275563]
[0.6498970345986242]
[0.5416092338969248]
[0.43359472039645397]
[0.32996001398541697]
[0.23650275816675004]
[0.1586239346924714]


# Part 11: Loss Function Layers

In [28]:
struct MSELoss <: Layer
    MSELoss() = new()
end

function forward(l::MSELoss, pred, target)
    return sum((pred - target) .* (pred - target);dims=2)
end

using Random: seed!; seed!(0)
data = Tensor_v3([ 0  0  1  1;0  1  0  1], autograd=true)
target = Tensor_v3([0 1 0 1], autograd=true)

model = Sequential([Linear(2,3), Linear(3,1)])
criterion = MSELoss()

optim = SGD(get_parameters(model),0.1)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn
    backward(loss, Tensor_v3(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

[2.097122200876772]
[1.090072584706214]
[0.8848315119040038]
[0.7588875978275563]
[0.6498970345986242]
[0.5416092338969248]
[0.43359472039645397]
[0.32996001398541697]
[0.23650275816675004]
[0.1586239346924714]


# Part 12: Non-linearity Layers

In [31]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims, tanh


function backward(t::Tensor_v3, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor_v3(ones(size(t.data)))
        end
    
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # only continue backpropping if there's something to
        # backprop into and if all gradients (from children)
        # are accounted for override waiting for children if
        # "backprop" was called on this variable directly
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2' ################
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end
            
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
            
            if t.creation_op == "sigmoid"
                ones_ = Tensor_v3(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* t .* (ones_ - t) )
            end
            
            if t.creation_op == "tanh"
                ones_ = Tensor_v3(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* (ones_ - (t .* t)))
            end
        end
    end
end

σ(x) = 1/(1+exp(-x))        

struct Tanh <: Layer
    Tanh() = new()
end

struct Sigmoid <: Layer
    Sigmoid() = new()
end

function get_parameters(l::Tanh)
    return []
end

function get_parameters(l::Sigmoid)
    return []
end

function forward(l::Sigmoid, a::Tensor_v3)
    if a.autograd
        return Tensor_v3(σ.(a.data); autograd=true, creators=[a], creation_op = "sigmoid")
    end
    return Tensor_v3(σ.(a.data))
end
        
function forward(l::Tanh, a::Tensor_v3)
    if a.autograd
        return Tensor_v3(tanh.(a.data); autograd=true, creators=[a], creation_op = "tanh")
    end
    return Tensor_v3(tanh.(a.data))
end   


forward (generic function with 7 methods)

In [32]:
using Random: seed!; seed!(0)
data = Tensor_v3([ 0  0  1  1;0  1  0  1], autograd=true)
target = Tensor_v3([0 1 0 1], autograd=true)

model = Sequential([Linear(2,3), Tanh(), Linear(3,1), Sigmoid()])
criterion = MSELoss()

optim = SGD(get_parameters(model),1.0)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn
    backward(loss, Tensor_v3(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

[1.0640387281517036]
[0.9685119577123646]
[0.8989054438898241]
[0.8006851582324737]
[0.65271510312216]
[0.46696476003616044]
[0.2989261336464314]
[0.18861277315979982]
[0.12615799262916702]
[0.09048263176049438]


# Part 13: The Embedding Layer

In [38]:
mutable struct Embedding_v1 <: Layer
    vocab_size
    dim
    weight
    
    # this random initialiation style is just a convention from word2vec
    Embedding_v1(vocab_size, dim) = new(vocab_size, dim, (randn(dim, vocab_size) .- 0.5) ./ dim) 
end

In [40]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims, tanh
using Base.Iterators:partition, flatten

mutable struct Tensor_v4 <: Tensor
    data
    autograd
    creators
    creation_op
    id
    children
    grad 
    index_select_indices
    function Tensor_v4(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        if isnothing(id)
            id = rand(1:100000)
        end
        T = new(data, autograd, creators, creation_op, id)
        T.children = Dict()
        T.grad = nothing
        T.index_select_indices = nothing
        
        if !(isnothing(creators))
            for c in creators
                if haskey(c.children, T.id)
                    c.children[T.id] += 1
                else
                    c.children[T.id] = 1
                end
            end
        end
        return T
    end
end

function all_children_grads_accounted_for(t::Tensor_v4)
    for (id, cnt) in t.children
        if (cnt != 0)
            return false
        end
    end
    return true
end

function backward(t::Tensor_v4, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor_v4(ones(size(t.data)))
        end
    
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # only continue backpropping if there's something to
        # backprop into and if all gradients (from children)
        # are accounted for override waiting for children if
        # "backprop" was called on this variable directly
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2' ################
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end
            
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
            
            if t.creation_op == "sigmoid"
                ones_ = Tensor_v4(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* t .* (ones_ - t) )
            end
            
            if t.creation_op == "tanh"
                ones_ = Tensor_v4(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* (ones_ - (t .* t)))
            end
            
            if t.creation_op == "index_select"
                new_grad = zeros(size(t.creators[1]))
                indices = t.index_select_indices.data
                major_chunks = partition(1:size(t.grad,2),length(indices))
                grad_chunks = [t.grad.data[:,inds][:,j]  for(i,inds) in enumerate(major_chunks) for j=1:size(inds)[1]]
    
                for (i,ind) in enumerate(flatten(indices))
                    new_grad[:,ind] +=  grad_chunks[i]
                end
                backward(t.creators[1], Tensor_v4(new_grad))
            end
        end
    end
end

size(a::Tensor_v4) = size(a.data)
size(a::Tensor_v4, ind::Int) = size(a.data, ind)

function +(a::Tensor_v4, b::Tensor_v4)
    if (a.autograd && b.autograd)
        return Tensor_v4(a.data + b.data; autograd=true, creators=[a,b], creation_op = "add")
    end
    return Tensor_v4(a.data+b.data)
end

function -(a::Tensor_v4)
    if (a.autograd)
        return Tensor_v4(a.data .* -1; autograd=true, creators=[a], creation_op = "neg")
    end
    return Tensor_v4(a.data .* -1)
end

function -(a::Tensor_v4, b::Tensor_v4)
    if (a.autograd && b.autograd)
        return Tensor_v4(a.data - b.data; autograd=true, creators=[a,b], creation_op = "sub")
    end
    return Tensor_v4(a.data-b.data)
end

#element-wise multiplication
function broadcasted(f::typeof(*), a::Tensor_v4, b::Tensor_v4)
    new_data = zeros(size(a.data))
    for i=1:length(new_data)
        new_data[i] = f(a.data[i] ,b.data[i])
    end
    if (a.autograd && b.autograd)
        return Tensor_v4(new_data; autograd=true, creators=[a,b], creation_op ="mul")
    end
    return Tensor_v4(new_data)
end

function broadcasted(f::typeof(-), a::Tensor_v4, b::Tensor_v4)
    new_data = zeros(size(a.data))
    for i=1:length(new_data)
        new_data[i] = -(a.data[i] ,b.data[i])
    end
    if (a.autograd && b.autograd)
        return Tensor_v4(new_data; autograd=true, creators=[a,b], creation_op ="sub")
    end
    return Tensor_v4(new_data)
end

function sum(a::Tensor_v4; dims=dims)
    new_ = dropdims(sum(a.data ;dims=dims), dims = tuple(findall(size(a) .== 1)...))
    if (a.autograd)
        return Tensor_v4(new_; autograd=true, creators=[a], creation_op = "sum_"*string(dims))
    end
    return Tensor_v4(new_)
end

function dropdims(a::Tensor_v4;dims=dims,ndims_cr=ndims_cr)
    if ndims(a.data) == ndims_cr
        return a
    end
    if (a.autograd)
        return Tensor_v4(dropdims(a.data ;dims=dims); autograd=true, creators=[a], creation_op = "dropdims")
    end
    return Tensor_v4(dropdims(a.data ;dims=dims))
end

function expand(a::Tensor_v4, dim, copies)
    sz = size(a)
    rep = ntuple(d->d==dim ? copies : 1, length(sz)+1)
    new_size = ntuple(d->d<dim ? sz[d] : d == dim ? 1 : sz[d-1], length(sz)+1)
    new_data =  repeat(reshape(a.data, new_size), outer=rep)
    if (a.autograd)
        return Tensor_v4(new_data; autograd=true, creators=[a], creation_op = "expand_"*string(dim))
    end
    return Tensor_v4(new_data)
end

#transpose
function adjoint(a::Tensor_v4)
    if (a.autograd)
        return Tensor_v4(a.data';autograd=true, creators=[a], creation_op = "transpose")
    end
    return Tensor_v4(a.data')
end

#matrix multiply 
function *(a::Tensor_v4, b::Tensor_v4)
    if (a.autograd && b.autograd)
        return Tensor_v4(a.data * b.data; autograd=true, creators=[a,b], creation_op = "mm")
    end
    return Tensor_v4(a.data * b.data)
end


function index_select_helper(a::Array, indices)
    return reduce(hcat,map(ind -> a[:,ind], indices))
end

function index_select(a::Tensor_v4, indices::Tensor_v4)
    new_ = index_select_helper(a.data, indices.data)
    if (a.autograd)
        T = Tensor_v4(new_, autograd=true, creators=[a], creation_op = "index_select")
        T.index_select_indices = indices
        return T
    end
    return Tensor_v4(new_)
end


abstract type Layer end

function get_parameters(l::Layer)
    return l.parameters
end

mutable struct Linear <: Layer
    W
    b
    parameters
    
    function Linear(n_inputs, n_outputs)
        W = Tensor_v4(randn(n_outputs, n_inputs) .* sqrt(1.0/n_inputs), autograd=true)
        b = Tensor_v4(zeros(n_outputs), autograd=true)
        parameters = [W,b]
        return new(W,b,parameters)
    end
end

function forward(l::Linear, input)
    return (l.W * input)  + expand(l.b,2,size(input.data, 2))
end

σ(x) = 1/(1+exp(-x))            

println(t::Tensor_v4) = println(t.data)
show(io::IO,m::MIME"text/plain",a::Tensor_v4) = show(io,m,a.data)

struct Tanh <: Layer
    Tanh() = new()
end

struct Sigmoid <: Layer
    Sigmoid() = new()
end

function get_parameters(l::Tanh)
    return []
end

function get_parameters(l::Sigmoid)
    return []
end

function forward(l::Sigmoid, a::Tensor_v4)
    if a.autograd
        return Tensor_v4(σ.(a.data); autograd=true, creators=[a], creation_op = "sigmoid")
    end
    return Tensor_v4(σ.(a.data))
end
        
function forward(l::Tanh, a::Tensor_v4)
    if a.autograd
        return Tensor_v4(tanh.(a.data); autograd=true, creators=[a], creation_op = "tanh")
    end
    return Tensor_v4(tanh.(a.data))
end    

forward (generic function with 10 methods)

In [35]:
using LinearAlgebra:I
x = Tensor_v4(1.0* Matrix(I, 5, 5), autograd=true)
backward(index_select(x, Tensor_v4([[2,3,4],[3,4,5]])))
x.grad

5×5 Array{Float64,2}:
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0
 0.0  1.0  2.0  2.0  1.0

# Part 15: The Embedding Layer (revisited)

In [37]:
mutable struct Embedding_v2 <: Layer
    vocab_size
    dim
    weight
    parameters
    # this random initialiation style is just a convention from word2vec
    function Embedding_v2(dim, vocab_size) 
        E = new(vocab_size, dim, Tensor_v4((randn(dim, vocab_size) .- 0.5) ./ dim; autograd=true))
        E.parameters = [E.weight]
        return E
    end
end

function forward(E::Embedding_v2, indices)
    return index_select(E.weight, indices)
end

forward (generic function with 10 methods)

In [42]:
using Random:seed!;seed!(0)
data = Tensor_v4([[1,2,1,2]], autograd=true)
target = Tensor_v4([0 1 0 1], autograd=true)

embed = Embedding_v2(3,5)
model = Sequential([embed, Tanh(), Linear(3,1), Sigmoid()])
criterion = MSELoss()


optim = SGD(get_parameters(model),1.0)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn
    backward(loss, Tensor_v4(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

[0.8728139692976189]
[0.35474837968994166]
[0.19046978251082405]
[0.1201225262673748]
[0.0849317092081909]
[0.06457754793742479]
[0.05154228148446023]
[0.042573745189638766]
[0.03607112398001075]
[0.0311650999160659]


# Part 16: The Cross Entropy Layer

In [50]:
using Random
import Base:+,-,*,println, sum, broadcasted, size, adjoint, show, dropdims, tanh
using Base.Iterators:partition, flatten

mutable struct Tensor_v5
    data
    autograd
    creators
    creation_op
    id
    children
    grad 
    index_select_indices
    softmax_output
    target_dist
    
    function Tensor_v5(data; autograd=false, creators=nothing, creation_op = nothing, id=nothing)
        if isnothing(id)
            id = rand(1:100000)
        end
        T = new(data, autograd, creators, creation_op, id)
        T.children = Dict()
        T.grad = nothing
        T.index_select_indices = nothing
        
        if !(isnothing(creators))
            for c in creators
                if haskey(c.children, T.id)
                    c.children[T.id] += 1
                else
                    c.children[T.id] = 1
                end
            end
        end
        return T
    end
end

function all_children_grads_accounted_for(t::Tensor_v5)
    for (id, cnt) in t.children
        if (cnt != 0)
            return false
        end
    end
    return true
end

function backward(t::Tensor_v5, grad=nothing, grad_origin=nothing)
    if t.autograd
        if isnothing(grad)
            grad = Tensor_v5(ones(size(t.data)))
        end
    
        if !(isnothing(grad_origin))
            if t.children[grad_origin.id] == 0
                throw("cannot backprop more than once")
            else
                t.children[grad_origin.id] -= 1
            end
        end
        
        if isnothing(t.grad)
            t.grad = grad
        else
            t.grad += grad
        end
        
        # grads must not have grads of their own
        @assert !grad.autograd
        
        # only continue backpropping if there's something to
        # backprop into and if all gradients (from children)
        # are accounted for override waiting for children if
        # "backprop" was called on this variable directly
        
        if (!isnothing(t.creators) && (all_children_grads_accounted_for(t) || isnothing(grad_origin)))
            if t.creation_op == "add"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], t.grad, t)
            end
            
            if t.creation_op == "sub"
                backward(t.creators[1], t.grad, t)
                backward(t.creators[2], -t.grad, t)
            end
            
            if t.creation_op == "mul"
                new_ = t.grad .* t.creators[2]
                backward(t.creators[1], new_, t)
                new_ = t.grad .* t.creators[1]
                backward(t.creators[2], new_, t)
            end
            
            if t.creation_op == "mm"
                c1 = t.creators[1]
                c2 = t.creators[2]
                new_ =  t.grad * c2' ################
                backward(c1, new_)
                new_ = c1' * t.grad
                backward(c2, new_)
            end
                  
            if t.creation_op == "transpose"
                backward(t.creators[1], t.grad')
            end
            
            if occursin("sum", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                backward(t.creators[1], expand(t.grad, dim, size(t.creators[1].data)[dim]))
            end
            
            if occursin("expand", t.creation_op)
                dim = parse(Int, split(t.creation_op, "_")[2])
                ndims_cr = ndims(t.creators[1].data)
                backward(t.creators[1], dropdims(sum(t.grad;dims=dim);dims=dim, ndims_cr=ndims_cr))
            end
            
            if t.creation_op == "neg"
                backward(t.creators[1], -t.grad)
            end
            
            if t.creation_op == "sigmoid"
                ones_ = Tensor_v5(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* t .* (ones_ - t) )
            end
            
            if t.creation_op == "tanh"
                ones_ = Tensor_v5(ones(size(t.grad.data)))
                backward(t.creators[1], t.grad .* (ones_ - (t .* t)))
            end
            
            if t.creation_op == "index_select"
                new_grad = zeros(size(t.creators[1]))
                indices = t.index_select_indices.data
                major_chunks = partition(1:size(t.grad,2),length(indices))
                grad_chunks = [t.grad.data[:,inds][:,j]  for(i,inds) in enumerate(major_chunks) for j=1:size(inds)[1]]
    
                for (i,ind) in enumerate(flatten(indices))
                    new_grad[:,ind] +=  grad_chunks[i]
                end
                backward(t.creators[1], Tensor_v5(new_grad))
            end
            if t.creation_op == "cross_entropy"
                dx = t.softmax_output .- t.target_dist
                backward(t.creators[1], Tensor_v5(dx))
            end
        end
    end
end
                        
size(a::Tensor_v5) = size(a.data)
size(a::Tensor_v5, ind::Int) = size(a.data, ind)

function +(a::Tensor_v5, b::Tensor_v5)
    if (a.autograd && b.autograd)
        return Tensor_v5(a.data + b.data; autograd=true, creators=[a,b], creation_op = "add")
    end
    return Tensor_v5(a.data+b.data)
end

function -(a::Tensor_v5)
    if (a.autograd)
        return Tensor_v5(a.data .* -1; autograd=true, creators=[a], creation_op = "neg")
    end
    return Tensor_v5(a.data .* -1)
end

function -(a::Tensor_v5, b::Tensor_v5)
    if (a.autograd && b.autograd)
        return Tensor_v5(a.data - b.data; autograd=true, creators=[a,b], creation_op = "sub")
    end
    return Tensor_v5(a.data-b.data)
end

#element-wise multiplication
function broadcasted(f::typeof(*), a::Tensor_v5, b::Tensor_v5)
    new_data = zeros(size(a.data))
    for i=1:length(new_data)
        new_data[i] = f(a.data[i] ,b.data[i])
    end
    if (a.autograd && b.autograd)
        return Tensor_v5(new_data; autograd=true, creators=[a,b], creation_op ="mul")
    end
    return Tensor_v5(new_data)
end

function broadcasted(f::typeof(-), a::Tensor_v5, b::Tensor_v5)
    new_data = zeros(size(a.data))
    for i=1:length(new_data)
        new_data[i] = -(a.data[i] ,b.data[i])
    end
    if (a.autograd && b.autograd)
        return Tensor_v5(new_data; autograd=true, creators=[a,b], creation_op ="sub")
    end
    return Tensor_v5(new_data)
end

function sum(a::Tensor_v5; dims=dims)
    new_ = dropdims(sum(a.data ;dims=dims), dims = tuple(findall(size(a) .== 1)...))
    if (a.autograd)
        return Tensor_v5(new_; autograd=true, creators=[a], creation_op = "sum_"*string(dims))
    end
    return Tensor_v5(new_)
end

function dropdims(a::Tensor_v5;dims=dims,ndims_cr=ndims_cr)
    if ndims(a.data) == ndims_cr
        return a
    end
    if (a.autograd)
        return Tensor_v5(dropdims(a.data ;dims=dims); autograd=true, creators=[a], creation_op = "dropdims")
    end
    return Tensor_v5(dropdims(a.data ;dims=dims))
end

function expand(a::Tensor_v5, dim, copies)
    sz = size(a)
    rep = ntuple(d->d==dim ? copies : 1, length(sz)+1)
    new_size = ntuple(d->d<dim ? sz[d] : d == dim ? 1 : sz[d-1], length(sz)+1)
    new_data =  repeat(reshape(a.data, new_size), outer=rep)
    if (a.autograd)
        return Tensor_v5(new_data; autograd=true, creators=[a], creation_op = "expand_"*string(dim))
    end
    return Tensor_v5(new_data)
end

#transpose
function adjoint(a::Tensor_v5)
    if (a.autograd)
        return Tensor_v5(a.data';autograd=true, creators=[a], creation_op = "transpose")
    end
    return Tensor_v5(a.data')
end

#matrix multiply 
function *(a::Tensor_v5, b::Tensor_v5)
    if (a.autograd && b.autograd)
        return Tensor_v5(a.data * b.data; autograd=true, creators=[a,b], creation_op = "mm")
    end
    return Tensor_v5(a.data * b.data)
end


function index_select_helper(a::Array, indices)
    return reduce(hcat,map(ind -> a[:,ind], indices))
end

function index_select(a::Tensor_v5, indices::Tensor_v5)
    new_ = index_select_helper(a.data, indices.data)
    if (a.autograd)
        T = Tensor_v5(new_, autograd=true, creators=[a], creation_op = "index_select")
        T.index_select_indices = indices
        return T
    end
    return Tensor_v5(new_)
end

println(t::Tensor_v5) = println(t.data)
show(io::IO,m::MIME"text/plain",a::Tensor_v5) = show(io,m,a.data)
                        
abstract type Layer end

function get_parameters(l::Layer)
    return l.parameters
end

mutable struct Linear <: Layer
    W
    b
    parameters
    
    function Linear(n_inputs, n_outputs)
        W = Tensor_v5(randn(n_outputs, n_inputs) .* sqrt(1.0/n_inputs), autograd=true)
        b = Tensor_v5(zeros(n_outputs), autograd=true)
        parameters = [W,b]
        return new(W,b,parameters)
    end
end


σ(x) = 1/(1+exp(-x))                        

struct Tanh <: Layer
    Tanh() = new()
end

struct Sigmoid <: Layer
    Sigmoid() = new()
end

function get_parameters(l::Tanh)
    return []
end

function get_parameters(l::Sigmoid)
    return []
end

function forward(l::Sigmoid, a::Tensor_v5)
    if a.autograd
        return Tensor_v5(σ.(a.data); autograd=true, creators=[a], creation_op = "sigmoid")
    end
    return Tensor_v5(σ.(a.data))
end
        
function forward(l::Tanh, a::Tensor_v5)
    if a.autograd
        return Tensor_v5(tanh.(a.data); autograd=true, creators=[a], creation_op = "tanh")
    end
    return Tensor_v5(tanh.(a.data))
end    

forward (generic function with 13 methods)

In [48]:
mutable struct Embedding_v2 <: Layer
    vocab_size
    dim
    weight
    parameters
    # this random initialiation style is just a convention from word2vec
    function Embedding_v2(dim, vocab_size) 
        E = new(vocab_size, dim, Tensor_v5((randn(dim, vocab_size) .- 0.5) ./ dim; autograd=true))
        E.parameters = [E.weight]
        return E
    end
end

In [45]:
using Statistics: mean
using LinearAlgebra: I
function softmax(x)
    temp = exp.(x)
    return temp ./ sum(temp;dims=1)
end

struct CrossEntropyLoss 
    CrossEntropyLoss() = new()
end

function forward(l::CrossEntropyLoss, a::Tensor_v5, target::Tensor_v5)
    softmax_output = softmax(a.data)
    log_out = log.(softmax_output)
    sz = size(a.data, 1)
    identity = 1.0 .* Matrix(I, (sz, sz))
    target_dist = reshape(identity[:,target.data],(size(a.data)))
    loss = -mean(sum(log_out .* target_dist;dims=1))
    if a.autograd
        loss = Tensor_v5(loss; autograd=true, creators=[a], creation_op = "cross_entropy")
        loss.softmax_output = softmax_output
        loss.target_dist = target_dist
        return loss
    end
    return Tensor_v5(loss)
end

forward (generic function with 13 methods)

In [51]:
using Random:seed!;seed!(0)
data = Tensor_v5([[1,2,1,2]], autograd=true)
target = Tensor_v5([4 2 4 2], autograd=true)

embed = Embedding_v2(3,3)
model = Sequential([embed, Tanh(), Linear(3,4)])
criterion = CrossEntropyLoss()


optim = SGD(get_parameters(model),1.0)

for i=1:10
    
    # Predict
    pred = forward(model, data)
    
    # Compare
    loss = forward(criterion,pred, target)
    
    # Learn
    backward(loss, Tensor_v5(ones(Float32, size(loss.data))))
    step(optim)
    println(loss)
end

1.4461881980473767
0.32134282627951916
0.07050547863798148
0.030112221326762667
0.02294860815584766
0.01865850502637032
0.015762310723879135
0.013664053181812933
0.012069325190316341
0.010814145520457268


# Part 17: The Recurrent Neural Network Layer

In [53]:
mutable struct RNNCell <: Layer
    n_hidden
    
    activation
    
    w_ih
    w_hh
    w_ho
    
    parameters
    
    function RNNCell(n_inputs, n_hidden, n_output, activation="sigmoid")
        if activation == "sigmoid"
            act = Sigmoid()
        elseif activation == "tanh"
            act = Tanh()
        else
            throw("Non-linearity not found")
        end
        
        parameters = []

        w_ih = Linear(n_inputs, n_hidden)
        w_hh = Linear(n_hidden, n_hidden)
        w_ho = Linear(n_hidden, n_output)
        
        push!(parameters, get_parameters(w_ih))
        push!(parameters, get_parameters(w_hh))
        push!(parameters, get_parameters(w_ho))
        parameters = collect(Iterators.flatten(parameters))
        return new(n_hidden, act, w_ih, w_hh, w_ho, parameters)
    end
end

function forward(rnn::RNNCell, input::Tensor_v5, hidden::Tensor_v5)
    from_prev_hidden = forward(rnn.w_hh, hidden)
    combined = forward(rnn.w_ih, input) + from_prev_hidden
    new_hidden = forward(rnn.activation, combined)
    output = forward(rnn.w_ho, new_hidden)
    return output, new_hidden
end

function init_hidden(rnn::RNNCell; batch_size=1)
    return Tensor_v5(zeros(rnn.n_hidden, batch_size), autograd=true)
end

init_hidden (generic function with 1 method)

In [54]:
raw = readlines("tasksv11/en/qa1_single-supporting-fact_train.txt")

tokens = []
for line in raw[1:1000]
    push!(tokens, split(lowercase(line)," ")[2:end])
end

new_tokens = []
for line in tokens
    push!(new_tokens, cat(repeat(["-"],6-length(line)), line;dims=1))
end

tokens = new_tokens

vocab = Set()
for sent in tokens
    for word in sent
        if length(word)>0
            push!(vocab, word)
        end
    end
end
vocab = collect(vocab)

word2index = Dict()
for (i,word) in enumerate(vocab)
    word2index[word] = i
end

indices = []
for line in tokens
    idx = []
    for w in line
        push!(idx,word2index[w])
    end
    push!(indices, idx)
end

data = reduce(hcat,indices);

In [55]:
embed = Embedding_v2(16, length(vocab))
model = RNNCell(16, 16, length(vocab))
criterion = CrossEntropyLoss()
optim = SGD(cat(get_parameters(model), get_parameters(embed); dims=1), 0.05);

In [57]:
for iter=1:1000
    batch_size = 100
    total_loss = 0
    
    hidden = init_hidden(model, batch_size=batch_size)
    output = nothing #to access from for loop
    for t=1:5
        input = Tensor_v5(data[t,1:batch_size], autograd=true)
        rnn_input = forward(embed, input)
        output, hidden = forward(model, rnn_input, hidden)
    end
    target = Tensor_v5(data[6,1:batch_size], autograd=true)
    loss = forward(criterion, output, target)
    backward(loss)
    step(optim)
    total_loss += loss.data
    
    if (iter-1) %200 ==0
        max_ind = argmax(output.data;dims=1)
        p_correct = dropdims(map(x->x.I[1], max_ind);dims=1)
        p_correct = mean(target.data .== p_correct)
        println("Loss: $(total_loss/10) correct: $(p_correct)")
    end
end

Loss: 0.44647163118068134 correct: 0.0
Loss: 0.18065613931415564 correct: 0.21
Loss: 0.17841670806583004 correct: 0.22
Loss: 0.16513976605062758 correct: 0.31
Loss: 0.1459613675665265 correct: 0.36


In [58]:
batch_size = 1
hidden = init_hidden(model, batch_size=batch_size)

output = nothing #to access from for loop
for t=1:5
    input = Tensor_v5(data[t,1:batch_size], autograd=true)
    rnn_input = forward(embed, input)
    output, hidden = forward(model, rnn_input, hidden)
end

target = Tensor_v5(data[6,1:batch_size], autograd=true)   
loss = forward(criterion, output, target)

ctx = ""
for idx in data[:,1][1:end-1]
    global ctx *= vocab[idx] * " "
end
println("Context: ",ctx)
println("True: ",vocab[target.data[1]])
println("Pred: ", vocab[argmax(output.data).I[1]])

Context: - mary moved to the 
True: bathroom.
Pred: garden.
