In [10]:
import matplotlib.pyplot as plt
import numpy as np
import math

In [12]:
class Value():
    def __init__(self, data, _children=(), _op = '', label=''):
        self.data = data
        self._prev = set(_children)
        self._op = _op
        self.label = label
        self.grad = 0.0     # x.grad means derivative of loss fn (final node) wrt x
        self._backward = lambda: None # it is defined None function as default in case it is used for a leaf node

    def __repr__(self):
        return f'Value {self.label}(data = {self.data})'

    def __add__(self, other):
        out = Value(self.data + other.data, (self, other), '+')
        
        def _backward():
            '''
            if  x = y + z (think of it in terms of graph, will be easier)
            then gradient of x wrt y or z will be 1, i.e.,
            gradient of an addition operator propagates to its components, 
            therefore gradient of loss fn wrt components (y or z) will be
            equal to gradient of loss fn wrt x
            '''
            self.grad += 1.0 * out.grad      #since it is backpropagation, we already know gradient of loss fn wrt out
            other.grad += 1.0 * out.grad    # += because gradients accumulate, helful when a same node is used multiple times

        out._backward = _backward       #since self._backward is already defined as a function, therefore _backward not _backward()
        return out

    def __mul__(self, other):
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            '''
            if  x = y * z (think of it in terms of graph, will be easier)
            then gradient of x wrt y and z will be z and y respectively, 
            therefore gradient of loss fn wrt components y or z will be
            equal to gradient of loss fn wrt x * z or gradient of loss fn wrt x * y,
            respectively
            '''
            self.grad += other.data * out.grad      
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def tanh(self):
        x = self.data
        t = (math.exp(2*x) - 1)/(math.exp(2*x) + 1)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            self.grad += (1 - t**2)*out.grad
        out._backward = _backward
        return out
    
    def backward(self):
        self.grad = 1.0

        # using topological sort to arrange nodes in topological order
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        print('Gradients before backprop')
        for node in topo:
            print(node, node.grad)

        for node in reversed(topo):
            node._backward()

        print('\nGradients after backprop')
        for node in topo:
            print(node, node.grad)

### A workflow where a, b and c are three leaf nodes, d is intermediary result and e is the final output

In [180]:
a = Value(2, label='a')
b = Value(-3, label='b')
c = Value(10.0, label='c')
d = a*b; d.label = 'd'
e = d + c; e.label='e'
print(d, e)

Value d(data = -6) Value e(data = 4.0)


In [181]:
d._prev

{Value a(data = 2), Value b(data = -3)}

In [182]:
d._op

'*'

### Manually calling the backward function to check if the derivatives are flowing correctly

In [183]:
e.grad = 1 # this has to be set since e is the output node and e.grad means derivative of output node e wrt e

In [184]:
e._backward()

In [185]:
print(d.grad, c.grad)

1.0 1.0


In [186]:
d._backward()

In [187]:
print(a.grad, b.grad)

-3.0 2.0


In [188]:
a._backward()
print(a.grad)   #since a is a leaf node, a._backward() returns None function

-3.0


### Automating backpropagation

#### Topological Sort
All the arrows go in a single direction (left to right).
Required because we want to automate gradient calculation (calling backward function) and this should occur in order going from back to front.

In [189]:
e.backward()

Gradients before backprop
Value c(data = 10.0) 1.0
Value b(data = -3) 2.0
Value a(data = 2) -3.0
Value d(data = -6) 1.0
Value e(data = 4.0) 1.0

Gradients after backprop
Value c(data = 10.0) 1.0
Value b(data = -3) 2.0
Value a(data = 2) -3.0
Value d(data = -6) 1.0
Value e(data = 4.0) 1.0


### Correcting backpropagation code

#### Before Correction

If a node is used multiple times as shown in example below, the code where accumulation of gradients is not taken into consideration i.e., only **`=`** is used like `self.grad =` won't work correctly. **`=`** should be replaced by **`+=`** to accumulate gradients.

In [4]:
n = Value(4.0, label='n')
m = n + n; m.label = 'm'
print(m, n)

Value m(data = 8.0) Value n(data = 4.0)


In [5]:
m._prev

{Value n(data = 4.0)}

In [6]:
print(n.grad, m.grad)

0.0 0.0


In [8]:
m.grad = 1.0
print(n.grad, m.grad)

0.0 1.0


In [10]:
m.backward()

Gradients before backprop
Value n(data = 4.0) 0.0
Value m(data = 8.0) 1.0

Gradients after backprop
Value n(data = 4.0) 1.0
Value m(data = 8.0) 1.0


#### After Correction

In [13]:
n = Value(4.0, label='n')
m = n + n; m.label = 'm'
print(m, n)

Value m(data = 8.0) Value n(data = 4.0)


In [14]:
m._prev

{Value n(data = 4.0)}

In [15]:
print(n.grad, m.grad)

0.0 0.0


In [16]:
m.grad = 1.0
print(n.grad, m.grad)

0.0 1.0


In [17]:
m.backward()

Gradients before backprop
Value n(data = 4.0) 0.0
Value m(data = 8.0) 1.0

Gradients after backprop
Value n(data = 4.0) 2.0
Value m(data = 8.0) 1.0
