# Tensorial Automatic Differentiation
The aim of this notebook is to generalize the previous autodiff notebook to tensor-valued parameters, rather than scalar parameters only.

## Computational Graph
Operations to manipulate the directed acyclic computational graphs, which are built using Julia's built-in dictionary data structure.

In [1]:

# Record the directed edge source -> sink in the adjacency dictionary `Edges`
# (source => Set of sinks).
# The sink set stored under `source` is replaced by a freshly built Set that
# contains `sink` plus every sink already recorded. Returns `Edges`.
function AddEdge(Edges, source, sink)
    UpdatedSinks = haskey(Edges, source) ? Set([sink, Edges[source]...]) : Set([sink])
    Edges[source] = UpdatedSinks
    return Edges
end

# Delete the directed edge source -> sink from `Edges`, in place.
# Returns the (mutated) sink set of `source`, or `nothing` when `source`
# has no entry in `Edges`.
function RemoveEdge(Edges, source, sink)
    haskey(Edges, source) || return nothing
    return delete!(Edges[source], sink)
end

# Build the transposed graph: every edge source -> sink of `Edges` becomes
# sink -> source in the result.
# `Edges` maps each source to a collection of sinks and is left untouched.
# Returns a new IdDict (node => Set of its former sources); nodes without
# incoming edges in `Edges` do not appear as keys.
function ReverseEdges(Edges)::IdDict
    ReversedEdges = IdDict()
    for (source, sinks) in Edges
        for sink in sinks
            # the former sink becomes the key, the former source one of its targets
            push!(get!(ReversedEdges, sink, Set()), source)
        end
    end
    return ReversedEdges
end

# Tell whether `Node` is the sink of at least one edge, i.e. whether it
# appears in any of the sink collections stored in `Edges`.
function HasIncomingEdge(Edges, Node)::Bool
    return any(sinks -> Node in sinks, values(Edges))
end


# Kahn's topological sorting algorithm.
#
# Nodes : set of graph nodes used as candidate starting points
# edges : DAG given as a dictionary (source => collection of sinks); it is
#         only read, never modified
#
# Returns a Vector{Any} in which every source appears before all of its sinks.
# Which of several valid orders is produced is unspecified.
# If the graph contains a cycle (or an edge whose source is not reachable from
# `Nodes`), the nodes that can never be released are simply absent from the
# result; no error is raised.
function KahnTopoSort(Nodes::Set, edges::IdDict)
    # number of not-yet-processed incoming edges for every sink
    InDegree = IdDict{Any, Int}()
    for sinks in values(edges), sink in sinks
        InDegree[sink] = get(InDegree, sink, 0) + 1
    end

    Sorted = Any[]
    # nodes whose incoming edges have all been processed
    Ready = Any[Node for Node in Nodes if get(InDegree, Node, 0) == 0]
    while !isempty(Ready)
        source = pop!(Ready)
        push!(Sorted, source)
        for sink in get(edges, source, ())
            # "remove" the edge source -> sink; release the sink once it is free
            InDegree[sink] -= 1
            InDegree[sink] == 0 && push!(Ready, sink)
        end
    end
    return Sorted
end


# Small example DAG: 1 -> {2, 3}, 2 -> {4, 5}, 3 -> {4}, 4 -> {5}
Nodes = Set([1, 2, 3, 4, 5])
Edges = IdDict(1=>Set([2, 3]), 4=>Set([5]), 2=>Set([4, 5]), 3=>Set([4]))
@show KahnTopoSort(Nodes, Edges) # sources come before their sinks: should return [1, 2, 3, 4, 5] or [1, 3, 2, 4, 5]

KahnTopoSort(Nodes, Edges) = Any[1, 2, 3, 4, 5]


5-element Vector{Any}:
 1
 2
 3
 4
 5

## Forward Pass
Almost every function can be broken down into a chain of very simple operations.

In [2]:
import Base.:+
import Base.:-
import Base.:*
import Base.:/
import Base.:^
import Base.sin
import Base.cos

# Store `Jacobian`, the jacobian of `sink` with respect to `source`, in
# `Jacobians` under the key (source, sink). An existing entry for the same
# key is overwritten. Returns `Jacobians`.
function AddJacobian(Jacobians, source, sink, Jacobian)
    Jacobians[(source, sink)] = Jacobian
    return Jacobians
end

# Generate a new node identifier for the graph.
# Integers in 0:9999999 are drawn at random until one is found that is not
# yet a member of `Nodes`; that identifier is added to `Nodes` and returned.
function GenNode(Nodes)
    Draw() = floor(Int64, 10000000 * rand())
    Candidate = Draw()
    while Candidate in Nodes
        Candidate = Draw()
    end
    push!(Nodes, Candidate)
    return Candidate
end

# A value that carries its identity in the computational graph.
mutable struct Tracked{T} <: Real
    val::T      # wrapped value
    Node::Int64 # identification in the computational graph

    # Given the set of existing node ids: generate a fresh id with `GenNode`
    # (which also registers it in `Nodes`). Dispatching on the set type keeps
    # this constructor distinct from the explicit-id constructor below;
    # with two untyped signatures the second definition overwrote the first.
    Tracked(val, Nodes::AbstractSet) = new{typeof(val)}(val, GenNode(Nodes))

    # Given an explicit node id.
    Tracked(val, Node) = new{typeof(val)}(val, Node)
end

#Base.convert(::Type{AbstractFloat}, t::Type{Tracked{Float64}}) = t
#Base.promote_rule(::Type{AbstractFloat}, ::Type{Tracked{Float64}}) =AbstractFloat

# Local derivatives of the elementary operations.
# GetJacobian(op, a, b) returns the partial derivative of op(a, b) with
# respect to whichever argument is `Tracked`; the other argument is a plain
# scalar (Integer or AbstractFloat) and is treated as a constant.
for t in (:Integer, :AbstractFloat)
    @eval begin
        # d(a + b)
        GetJacobian(f::typeof(+), a::($t), b::Tracked) = 1
        GetJacobian(f::typeof(+), a::Tracked, b::($t)) = 1

        # d(a - b)
        GetJacobian(f::typeof(-), a::($t), b::Tracked) = -1
        GetJacobian(f::typeof(-), a::Tracked, b::($t)) = 1

        # d(a * b)
        GetJacobian(f::typeof(*), a::($t), b::Tracked) = a
        GetJacobian(f::typeof(*), a::Tracked, b::($t)) = b

        # d(a / b): with respect to b this is -a / b^2 (the numerator matters)
        GetJacobian(f::typeof(/), a::($t), b::Tracked) = -a / b.val^2
        GetJacobian(f::typeof(/), a::Tracked, b::($t)) = 1 / b

        # d(a ^ b)
        GetJacobian(f::typeof(^), a::Tracked, b::($t)) = b * a.val^(b - 1)
        GetJacobian(f::typeof(^), a::($t), b::Tracked) = a^(b.val) * log(a)
    end
end

# Unary operations do not depend on the scalar type, so they are defined once.
GetJacobian(f::typeof(sin), a::Tracked) = cos(a.val)
GetJacobian(f::typeof(cos), a::Tracked) = -sin(a.val)


# Forward pass: evaluate f(x) while recording the computational graph.
#
# f : function to evaluate
# x : argument passed to f
# w : set of node ids of the tracked parameters; a copy of it seeds the node set
#
# The arithmetic operators (+, -, *, /, ^) and sin/cos are overloaded for
# `Tracked` values so that every operation creates a new node, records the
# edge(s) from its tracked operand(s) and stores the local jacobian(s) for
# backpropagation at a later stage.
#
# Side effects: (re)binds the globals `Nodes`, `Edges` and `Jacobians` and
# (re)defines the operator methods at global scope through `eval`; the
# generated methods read those globals.
#
# Returns the tuple (y, Nodes, Edges, Jacobians) where y = f(x).
function WithGradient(f, x, w::Set)

    global Nodes = deepcopy(w)
    global Edges = IdDict()
    global Jacobians = IdDict()

    for op in (Symbol(+), Symbol(-), Symbol(*), Symbol(/), Symbol(^))

        for t in (Symbol(Integer), Symbol(AbstractFloat))

            # plain scalar (op) tracked value
            eval(:(global function ($op)(a::T, b::Tracked) where {T <: ($t)}
                Node = GenNode(Nodes)
                J = GetJacobian(($op), a, b)
                AddEdge(Edges, b.Node, Node)
                AddJacobian(Jacobians, b.Node, Node, J)
                return Tracked(($op)(a, b.val), Node)
            end))

            # tracked value (op) plain scalar
            eval(:(global function ($op)(a::Tracked, b::T) where {T <: ($t)}
                Node = GenNode(Nodes)
                J = GetJacobian(($op), a, b)
                AddEdge(Edges, a.Node, Node)
                AddJacobian(Jacobians, a.Node, Node, J)
                return Tracked(($op)(a.val, b), Node)
            end))

        end

        # tracked value (op) tracked value; independent of the scalar type,
        # so it is defined once per operator
        eval(:(global function ($op)(a::Tracked, b::Tracked)
            Node = GenNode(Nodes)
            Ja = GetJacobian(($op), a, b.val)
            Jb = GetJacobian(($op), a.val, b)
            AddEdge(Edges, a.Node, Node)
            AddEdge(Edges, b.Node, Node)
            if a.Node == b.Node
                # both operands are the same node (e.g. a * a): the two partial
                # derivatives share the key (source, sink) and must be summed,
                # otherwise the second one would overwrite the first
                AddJacobian(Jacobians, a.Node, Node, Ja + Jb)
            else
                AddJacobian(Jacobians, a.Node, Node, Ja)
                AddJacobian(Jacobians, b.Node, Node, Jb)
            end
            return Tracked(($op)(a.val, b.val), Node)
        end))

    end

    for op in (Symbol(sin), Symbol(cos))
        eval(
            :(
            global function ($op)(a::Tracked)
                Node = GenNode(Nodes)
                J = GetJacobian(($op), a)
                AddEdge(Edges, a.Node, Node)
                AddJacobian(Jacobians, a.Node, Node, J)
                return Tracked(($op)(a.val), Node)
            end
            )
        )
    end

    # the methods above were defined after this function started running, so
    # f has to be called in the latest world age to see them
    y = Base.invokelatest(f, x)
    return (y, Nodes, Edges, Jacobians)
end


WithGradient (generic function with 1 method)

## Backward Pass
$$\frac{d (f \circ g)}{dx} = \frac{d f}{d g} \frac{d g}{dx}$$

In [3]:

# Backward pass: gradients of f(x) with respect to the tracked parameters.
#
# f : function to differentiate
# x : argument passed to f
# w : set of node ids of the parameters whose gradient is requested
#
# Runs the forward pass (which computes the intermediate jacobians), then
# walks the graph in reverse topological order and chains the jacobians:
#   d y / d source = sum over sinks of (d sink / d source) * (d y / d sink)
#
# Returns an IdDict (node id => gradient) restricted to the nodes in `w`.
# Nodes from which the output cannot be reached get a gradient of 0.
function Backprop(f, x, w)::IdDict
    y, Nodes, Edges, Jacobians = WithGradient(f, x, w) # forward pass
    TopoSortNodes = KahnTopoSort(Nodes, Edges)
    ChainedJacobians = IdDict{Any, Float64}(y.Node => 1)
    for source in reverse(TopoSortNodes)
        # the output node is the seed (d y / d y = 1); it is skipped explicitly
        # because it is not guaranteed to be the last node of the sorted list
        source == y.Node && continue
        CJ = 0.0
        for sink in get(Edges, source, ())
            CJ += Jacobians[(source, sink)] * get(ChainedJacobians, sink, 0.0)
        end
        ChainedJacobians[source] = CJ
    end
    return IdDict([source => CJ for (source, CJ) in ChainedJacobians if source in w])
end

Backprop (generic function with 1 method)

## Tensor Structures

In [9]:
#TODO: create abstract type tensor

# Fully connected layer: applies `af` element-wise to W * v + B.
mutable struct Dense
    W  # weight tensor of the linear map
    B  # bias tensor of the linear map
    af # activation function, applied element-wise
end

# Layer mapping `InDim` inputs to `OutDim` outputs, with weights and biases
# drawn by `rand` as Float16 values (W is OutDim × InDim, B has length OutDim).
Dense(InDim::Int, OutDim::Int, f::Function) =
    Dense(rand(Float16, OutDim, InDim), rand(Float16, OutDim), f)

# Apply the layer to a single input vector `v`: af.(W * v + B).
function (d::Dense)(v::AbstractArray{<:Real, 1})
    PreActivation = d.W * v + d.B
    return d.af.(PreActivation)
end

# Apply the layer to every row of `x` (one sample per row).
# Accepts any real-valued matrix (plain arrays, views, transposes, ...).
# Returns a Matrix{Number} with one row per row of `x` and one column per
# row of `d.W`.
function (d::Dense)(x::AbstractMatrix{<:Real})
    y = Array{Number, 2}(undef, size(x, 1), size(d.W, 1))
    for (RowInd, v) in enumerate(eachrow(x))
        y[RowInd, :] = d(v) # each row is handled by the vector method
    end
    return y
end

# Wrap `Tensor` in a `Tracked` value built from the node set `Nodes`
# and return it.
Track(Nodes, Tensor) = Tracked(Tensor, Nodes)

# Track the parameters of a dense layer: the weight and bias tensors of `d`
# are each replaced, in place, by their tracked counterpart obtained from
# `Track(Nodes, tensor)`. Returns the modified layer `d`.
function Track(Nodes, d::Dense)
    d.W = Track(Nodes, d.W)
    d.B = Track(Nodes, d.B)
    return d
end

# Euler's number at full Float64 precision. It is kept as a plain float
# (rather than the irrational constant ℯ) so that e^(-x) is an
# AbstractFloat ^ x operation.
e = float(ℯ)

# Logistic sigmoid: 1 / (1 + e^(-x)).
Sigmoid(x) = 1 / (1 + e^(-x))

# very simple feed forward neural network: 5 -> 16 -> 16 -> 1
Da = Dense(5, 16, Sigmoid)
Db = Dense(16, 16, Sigmoid)
Dc = Dense(16, 1, Sigmoid)
# The composition must be parenthesised before it is applied: a call binds
# tighter than ∘, so `Dc ∘ Db ∘ Da(x)` would compose Dc and Db with the
# *value* Da(x) and return a function instead of the network output.
FFNN(x) = (Dc ∘ Db ∘ Da)(x)

# very simple dataset: 1000 random samples with 5 features each, labelled by
# a randomly initialised single-layer network
X = rand(1000, 5)
@show Y = Dense(5, 1, Sigmoid)(X)

# simple training loop
# NOTE(review): `w` (the set of tracked parameter nodes) and `GradStep` are not
# defined in the visible part of this notebook; they must exist before this
# cell can run. TODO confirm where they are meant to come from.
for step in 1:10
    # (x, y) <- random data point from (X, Y)
    i = rand(1:size(X, 1))
    x, y = X[i, :], Y[i, :]
    # Backprop expects (f, x, w): the input comes before the parameter set
    ∇l = Backprop(x -> y - FFNN(x), x, w)
    GradStep(∇l, 0.001)
    println(y - FFNN(x)) # print current loss
end

Y = (Dense(5, 1, Sigmoid))(X) = Number[0.9080292585687291; 0.8564901408475858; 0.8523414433797354; 0.8645437706491295; 0.7831680425623144; 0.8188060674287045; 0.8812721662262332; 0.8455189551892558; 0.8738407355778981; 0.8761731260903656; 0.8630952516752345; 0.8998914670819859; 0.8225994708945382; 0.8676477660245053; 0.7751220587206836; 0.830656130301018; 0.8571127618543188; 0.9046609594309414; 0.7036996894139926; 0.8841508551132914; 0.8568248805448078; 0.907135610718878; 0.7853229855362016; 0.8901308639005044; 0.7248987792685629; 0.9294241214051233; 0.8743692106325728; 0.8324090425970911; 0.7992805658513482; 0.7627403667702118; 0.8197904521791337; 0.8803536699975644; 0.7834602715013538; 0.7960530677271381; 0.8150375713970585; 0.8442523979525213; 0.8503018317772694; 0.7819994710693793; 0.9081624079994182; 0.8918897069492316; 0.7827238476015407; 0.8890643782355109; 0.9101569852378032; 0.8116818074044986; 0.8424101016125997; 0.8530588706854765; 0.7957549735629282; 0.8689590981658831; 0.8




1000×1 Matrix{Number}:
 0.9080292585687291
 0.8564901408475858
 0.8523414433797354
 0.8645437706491295
 0.7831680425623144
 0.8188060674287045
 0.8812721662262332
 0.8455189551892558
 0.8738407355778981
 0.8761731260903656
 ⋮
 0.817609236329076
 0.8990066246557379
 0.8113374322344309
 0.8658901756228635
 0.7782016510189036
 0.9061928746864668
 0.8391371205046636
 0.8846899848134575
 0.8746660651049465