![](intro_0.png)

In [2]:
class Node:
    """Base class for expression-tree nodes.

    A node keeps its operands in ``children``, a list with one slot per
    operand. All slots start out as ``None``.
    """

    def __init__(self, nary=0):
        """Create a node with ``nary`` empty (``None``) child slots."""
        self.children = [None]*nary

    def __str__(self):
        """Return the class name as a fallback textual form.

        Without this method ``__repr__`` would call ``object.__str__``, which
        in turn calls ``repr()`` again and recurses until a RecursionError is
        raised. Subclasses that define their own ``__str__`` are unaffected.
        """
        return type(self).__name__

    def __repr__(self):
        """Use the string form as the representation."""
        return self.__str__()

In [3]:
class Symbol(Node):
    """Leaf node identified by its name.

    Two symbols of the same class with equal names compare equal and hash
    to the same value.
    """

    def __init__(self, name):
        """Create a childless node called ``name``."""
        super().__init__(nary=0)
        self.name = name

    def __str__(self):
        return self.name

    def __hash__(self):
        # Hash on the name so that it stays consistent with __eq__.
        return hash(self.name)

    def __eq__(self, other):
        same_kind = isinstance(other, self.__class__)
        return same_kind and self.name == other.name

In [5]:
class Constant(Node):
    """Leaf node that stores a fixed value."""

    def __init__(self, value):
        """Create a childless node holding ``value``."""
        super().__init__(nary=0)
        self.value = value

    def __str__(self):
        # The textual form is simply that of the stored value.
        text = str(self.value)
        return text

In [6]:
class Add(Node):
    """Binary node whose string form is ``(left+right)``."""

    def __init__(self):
        """Create the node with two empty child slots."""
        super().__init__(nary=2)

    def __str__(self):
        lhs = str(self.children[0])
        rhs = str(self.children[1])
        return '({}+{})'.format(lhs, rhs)
    
class Mul(Node):
    """Binary node whose string form is ``(left*right)``."""

    def __init__(self):
        """Create the node with two empty child slots."""
        super().__init__(nary=2)

    def __str__(self):
        lhs = str(self.children[0])
        rhs = str(self.children[1])
        return '({}*{})'.format(lhs, rhs)

In [7]:
def gen_f(x, y):
    """Build and return the expression tree for ``(x + y) * x``.

    The root is a ``Mul`` node whose first child is an ``Add`` node over
    ``x`` and ``y`` and whose second child is ``x`` itself.
    """
    total = Add()
    total.children[0], total.children[1] = x, y

    product = Mul()
    product.children[0], product.children[1] = total, x
    return product

# Create the two input symbols and build the example expression (x + y) * x.
x = Symbol('x')
y = Symbol('y')
f = gen_f(x, y)
# Bare expression: the notebook displays the representation of f.
f

((x+y)*x)

In [9]:
def postorder(node):
    """Yield ``node`` and all of its descendants in post-order.

    Every child is visited (left to right) before its parent, so the last
    item yielded is ``node`` itself. A node that is reachable several times
    is yielded once per occurrence.

    The traversal uses an explicit stack instead of recursion so that very
    deep trees (for example long operator chains) do not hit Python's
    recursion limit.
    """
    # Each stack entry pairs a node with an iterator over the children that
    # still have to be visited.
    stack = [(node, iter(node.children))]
    while stack:
        current, pending = stack[-1]
        for child in pending:
            # Descend into the next unvisited child first.
            stack.append((child, iter(child.children)))
            break
        else:
            # All children have been handled: emit the node itself.
            stack.pop()
            yield current

In [10]:
# List the nodes of f in post-order: children appear before their parents.
[n for n in postorder(f)]

[x, y, (x+y), x, ((x+y)*x)]

In [11]:
def values(f, fargs):
    """Returns a dictionary of computed values for each node in the expression tree including `f`.

    `fargs` supplies the values that are already known (keyed by node). The
    remaining nodes are evaluated in post-order via their `compute_value`
    method, which receives the values gathered so far.
    """
    computed = dict(fargs)
    for node in postorder(f):
        if node in computed:
            # Already known: either given in `fargs` or evaluated earlier.
            continue
        computed[node] = node.compute_value(computed)
    return computed

In [13]:
# Attach a compute_value method to every node class. Each implementation
# receives the dictionary of values computed so far and returns the value
# of the node itself.
# Symbol: looked up in the dictionary.
Symbol.compute_value = lambda self, values: values[self]
# Constant: the stored value; the dictionary is not consulted.
Constant.compute_value = lambda self, values : self.value
# Add / Mul: combine the values of the two children.
Add.compute_value = lambda self, values: values[self.children[0]] + values[self.children[1]]
Mul.compute_value = lambda self, values: values[self.children[0]] * values[self.children[1]]

In [14]:
# Evaluate all nodes for x=2, y=3 and pick the entry of the root node f.
values(f, {x:2, y:3})[f]

10

In [15]:
def value(f, fargs):
    """Return the value of the expression `f` for the node values in `fargs`."""
    all_values = values(f, fargs)
    return all_values[f]

# Evaluate f at x=2, y=3.
value(f, {x:2, y:3})

10

In [16]:
from numbers import Number

def wrap_args(func):
    """Wraps function arguments that are numbers as Constant objects.

    Only positional arguments are inspected; keyword arguments are passed
    through unchanged. The returned wrapper keeps the name and docstring of
    `func`.
    """
    # Local import so that the cell does not depend on imports made elsewhere.
    import functools

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        new_args = [Constant(a) if isinstance(a, Number) else a for a in args]
        return func(*new_args, **kwargs)
    return wrapped

In [18]:
@wrap_args
def sym_add(x, y):
    """Return a new Add node with `x` and `y` as its two children."""
    node = Add()
    node.children[0], node.children[1] = x, y
    return node

@wrap_args
def sym_mul(x, y):
    """Return a new Mul node with `x` and `y` as its two children."""
    node = Mul()
    node.children[0], node.children[1] = x, y
    return node

In [19]:
# Overload + and * on every Node so that expressions can be written with
# ordinary operators. The reflected variants (__radd__, __rmul__) handle the
# case where the left operand is not a Node, e.g. `3 + x`; operand order is
# preserved in the resulting tree.
Node.__add__ = lambda self, other: sym_add(self, other)
Node.__radd__ = lambda self, other: sym_add(other, self)
Node.__mul__ = lambda self, other: sym_mul(self, other)
Node.__rmul__ = lambda self, other: sym_mul(other, self)

In [20]:
# Rebuild the example expression, this time with the overloaded operators.
f = (x + y)*x
# Bare expression: the notebook displays the representation of f.
f

((x+y)*x)

In [21]:
def bfs(node, node_data):
    """Traverse the tree breadth-first, yielding ``(node, data)`` tuples.

    The generator is driven with ``send``: after receiving a tuple the
    caller sends back a sequence holding one entry per child of the yielded
    node. Entry ``i`` becomes the data that is yielded together with child
    ``i`` once that child is reached. The first tuple is
    ``(node, node_data)`` as passed in.
    """
    # Local import so that the cell does not depend on imports made elsewhere.
    from collections import deque

    # deque gives O(1) removal at the front, unlike list.pop(0).
    q = deque([(node, node_data)])
    while q:
        t = q.popleft()
        # Hand out the current pair and wait for the per-child data.
        node_data = yield t
        for idx, c in enumerate(t[0].children):
            q.append((c, node_data[idx]))

In [22]:
from collections import defaultdict

def numeric_gradient(f, fargs):
    """Return the derivatives of `f` with respect to every node of its tree.

    The node values are computed from `fargs` first. The tree is then walked
    breadth-first starting at `f` with an incoming gradient of 1. At each
    node the incoming gradient is added to that node's entry, and the node's
    local gradients (from `compute_gradient`) scaled by the incoming gradient
    are passed on to its children.

    The result is a defaultdict, so nodes that were never reached report 0.
    """
    vals = values(f, fargs)
    derivatives = defaultdict(int)  # unknown nodes default to a derivative of 0

    walker = bfs(f, 1)
    try:
        node, upstream = next(walker)
        while True:
            derivatives[node] += upstream
            local_grads = node.compute_gradient(vals)
            scaled = [local * upstream for local in local_grads]
            node, upstream = walker.send(scaled)
    except StopIteration:
        # The traversal is exhausted: every node has been processed.
        return derivatives

In [23]:
# Monkey patch for compute_gradient: each implementation returns a list with
# one local partial derivative per child, in child order.
Symbol.compute_gradient = lambda self, values: [] # No children, nothing to do
Constant.compute_gradient = lambda self, values: [] # No children, nothing to do

Add.compute_gradient = lambda self, values: [1, 1] # d(x+y)/dx = 1, d(x+y)/dy = 1
Mul.compute_gradient = lambda self, values: [values[self.children[1]], values[self.children[0]]] # d(x*y)/dx = y, d(x*y)/dy = x

In [24]:
# Derivatives of f = (x+y)*x with respect to every node, at x=2, y=3.
numeric_gradient(f, {x:2, y:3})

defaultdict(int, {y: 2, ((x+y)*x): 1, x: 7, (x+y): 2})

In [25]:
# Same for x*x + y*y, where each symbol occurs twice in the tree.
numeric_gradient(x*x+y*y, {x:2, y:3})

defaultdict(int, {y: 6, (y*y): 1, ((x*x)+(y*y)): 1, (x*x): 1, x: 4})

In [26]:
# A larger example with a third symbol and numeric literals, which are
# wrapped as Constant nodes by the overloaded operators.
z = Symbol('z')
numeric_gradient((x+3)*(y+4)*z*z, {x:2, y:3, z:5})

defaultdict(int,
            {4: 125,
             3: 175,
             ((x+3)*(y+4)): 25,
             (y+4): 125,
             ((((x+3)*(y+4))*z)*z): 1,
             y: 125,
             (((x+3)*(y+4))*z): 5,
             z: 350,
             x: 175,
             (x+3): 175})

In [27]:
def symbolic_gradient(f):
    """Return symbolic derivatives of `f` with respect to every node of its tree.

    Works like a breadth-first gradient propagation starting at `f` with an
    incoming gradient of `Constant(1)`, but all sums and products are built
    as expression nodes instead of being evaluated. Entries of the returned
    defaultdict start out as `Constant(0)`.
    """
    derivatives = defaultdict(lambda: Constant(0))

    walker = bfs(f, Constant(1))
    try:
        # Original author's note: need to use edge info when expressions are reused!
        node, upstream = next(walker)
        while True:
            accumulated = derivatives[node]
            derivatives[node] = accumulated + upstream
            local_grads = node.symbolic_gradient()
            scaled = [local * upstream for local in local_grads]
            node, upstream = walker.send(scaled)
    except StopIteration:
        # The traversal is exhausted: every node has been processed.
        return derivatives

In [28]:
# Monkey patch for symbolic_gradient: each implementation returns a list with
# one local partial derivative per child, expressed as nodes.
Symbol.symbolic_gradient = lambda self: [] # No children, nothing to do
Constant.symbolic_gradient = lambda self: [] # No children, nothing to do

Add.symbolic_gradient = lambda self: [Constant(1), Constant(1)] # d(x+y)/dx = 1, d(x+y)/dy = 1
Mul.symbolic_gradient = lambda self: [self.children[1], self.children[0]] # d(x*y)/dx = y, d(x*y)/dy = x

In [29]:
# Symbolic derivatives of f with respect to every node (not yet simplified).
symbolic_gradient(f)

defaultdict(<function __main__.symbolic_gradient.<locals>.<lambda>>,
            {y: (0+(1*(x*1))),
             ((x+y)*x): (0+1),
             x: ((0+((x+y)*1))+(1*(x*1))),
             (x+y): (0+(x*1))})

In [30]:
# The symbolic derivatives are expression trees themselves, so they can be
# evaluated with value() like any other expression.
d = symbolic_gradient(f)
print('df/dx at (x=2,y=3) is {}'.format(value(d[x], {x:2, y:3})))
print('df/dy at (x=2,y=3) is {}'.format(value(d[y], {x:2, y:3})))

# Let's try second order derivatives
# Differentiate the first-order derivative expressions once more.
ddx = symbolic_gradient(d[x])
ddy = symbolic_gradient(d[y])
print('ddf/dxdx at (x=2,y=3) is {}'.format(value(ddx[x], {x:2, y:3})))
print('ddf/dxdy at (x=2,y=3) is {}'.format(value(ddx[y], {x:2, y:3})))
print('ddf/dydx at (x=2,y=3) is {}'.format(value(ddy[x], {x:2, y:3})))
print('ddf/dydy at (x=2,y=3) is {}'.format(value(ddy[y], {x:2, y:3})))

df/dx at (x=2,y=3) is 7
df/dy at (x=2,y=3) is 2
ddf/dxdx at (x=2,y=3) is 2
ddf/dxdy at (x=2,y=3) is 1
ddf/dydx at (x=2,y=3) is 1
ddf/dydy at (x=2,y=3) is 0


In [31]:
def applies_to(*klasses):
    """Decorates rule functions to match specific nodes in simplification.

    The decorated rule is only invoked for nodes that are instances of one
    of `klasses`; any other node is returned unchanged. The wrapper keeps
    the name and docstring of the decorated rule.
    """
    # Local import so that the cell does not depend on imports made elsewhere.
    import functools

    def wrapper(func):
        @functools.wraps(func)
        def wrapped_func(node):
            if isinstance(node, klasses):
                return func(node)
            # Not a matching node type: leave it untouched.
            return node
        return wrapped_func
    return wrapper

In [32]:
def is_const(node, value=None):
    """Tell whether `node` is a Constant, optionally one holding `value`.

    With `value` left at None any Constant matches; otherwise the result is
    the comparison of the stored value with `value`.
    """
    if not isinstance(node, Constant):
        return False
    if value is None:
        return True
    return node.value == value

In [33]:
@applies_to(Mul)
def mul_identity_rule(node):
    """Replace a multiplication by the constant 1 with the other operand."""
    left, right = node.children[0], node.children[1]
    if is_const(left, 1):
        return right
    if is_const(right, 1):
        return left
    # Neither operand is the constant 1: nothing to simplify.
    return node

In [34]:
import copy

rules = [mul_identity_rule]
def simplify(node):
    """Returns a simplified version of the forward graph associated with the given node.

    The tree is processed in post-order. Every non-Symbol node is copied,
    the copy's children are replaced by their already simplified versions,
    and then each function in `rules` is applied in turn. Symbols are kept
    as they are.

    The input tree is not modified. If `node` itself is a Symbol it is
    returned unchanged.
    """

    nodemap = {}
    for n in postorder(node):
        if isinstance(n, Symbol):
            continue

        nc = copy.copy(n)
        # copy.copy is shallow, so the copy would share its children list
        # with the original. Assign a fresh list instead of writing into the
        # shared one, otherwise the caller's tree would be modified.
        nc.children = [nodemap.get(c, c) for c in n.children]
        for r in rules:
            nc = r(nc)
        nodemap[n] = nc

    # A Symbol root never enters nodemap; fall back to the node itself.
    return nodemap.get(node, node)

In [35]:
# Compare the raw symbolic derivative with its simplified form.
d = symbolic_gradient(f)
print(d[x]) # Hardly readable
print(simplify(d[x])) # A bit better

((0+((x+y)*1))+(1*(x*1)))
((0+(x+y))+x)


In [36]:
@applies_to(Add)
def add_identity_rule(node):
    """Replace an addition of the constant 0 with the other operand."""
    left, right = node.children[0], node.children[1]
    if is_const(left, 0):
        return right
    if is_const(right, 0):
        return left
    # Neither operand is the constant 0: nothing to simplify.
    return node

# Register the rule so that simplify() applies it from now on.
rules.append(add_identity_rule)

In [37]:
# With the additive identity rule registered, the zero terms disappear too.
print(simplify(d[x])) # Even better

((x+y)+x)


In [38]:
# A sum in which the same symbol occurs five times: the derivative with
# respect to x collects one contribution per occurrence.
k = x + x + x + x + x
d = symbolic_gradient(k)
print(d[x])
print(simplify(d[x]))

(((((0+(1*1))+(1*(1*1)))+(1*(1*(1*1))))+(1*(1*(1*(1*1)))))+(1*(1*(1*(1*1)))))
((((1+1)+1)+1)+1)


As you can see, $\frac{\mathrm{d}f(x,y)}{\mathrm{d}x}\Bigr|_{\substack{x=2\\y=3}} = 7$ and $\frac{\mathrm{d}f(x,y)}{\mathrm{d}y}\Bigr|_{\substack{x=2\\y=3}} = 2$. But the dictionary provides more: the derivative w.r.t. $(x+y)$ is given as well! Some more examples:

Before diving into code, we need to cover the concept of [expression trees](https://en.wikipedia.org/wiki/Binary_expression_tree). Expression trees will be used to represent function decompositions in the Python code to be developed. While they are not a fundamentally new concept, they deserve a few words at this point.

An expression tree is similar to the computational graphs introduced, but the arrows by default point backwards. It turns out that constructing function expressions in a tree-like manner (the top node is the function itself, the function parameters are leaves) simplifies development dramatically.

Take the computational graph (CG) of the toy example used, $f(x,y)=(x+y)x$:

<img src="intro_0.png" width="400">

The following expression tree represents the same function

<img src="exp_tree.png" width="400">

Notice that we now have a tree-like structure. Our root node is the final operation to be executed to obtain the result of $f(x,y)$. Also notice that $x$ shows up twice. Finding the value of an expression tree requires computing the values of nodes in lower layers first and bubbling the information up towards the root node. Backpropagation, on the other hand, is just a matter of following the forward edges. Again, when computing derivatives, a summation over all paths from the top that lead to a given node will be performed.
