In [1]:
import matplotlib.pyplot as plt
import numpy as np
import math
import torch

In [None]:
class Value():
    """A scalar node of a computation graph with reverse-mode automatic differentiation.

    Every arithmetic operation on ``Value`` objects returns a new ``Value`` that
    remembers its inputs (``_prev``), the operation that produced it (``_op``) and
    a closure (``_backward``) that pushes the node's gradient to its inputs.

    Attributes:
        data: the scalar stored in this node (int or float).
        grad: derivative of the final (output) node with respect to this node.
            Starts at 0.0 and is *accumulated* by the ``_backward`` closures.
        label: optional name, only used by ``__repr__``.
    """

    def __init__(self, data, _children=(), _op = '', label=''):
        """Create a node holding ``data``; ``_children`` and ``_op`` are set by the operators."""
        self.data = data
        self._prev = set(_children)     # input nodes; a set, so a node used twice appears once
        self._op = _op
        self.label = label
        self.grad = 0.0     # x.grad means derivative of loss fn (final node) wrt x
        self._backward = lambda: None   # default no-op, so leaf nodes can be called like any other node

    def __repr__(self):
        return f'Value {self.label}(data = {self.data})'

    def __add__(self, other):
        """Return ``self + other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # If x = y + z, then dx/dy = dx/dz = 1, so the gradient of the loss
            # wrt x is passed unchanged to both y and z.
            # `+=` because gradients accumulate, which matters when the same node
            # is used multiple times (e.g. n + n).
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad

        out._backward = _backward       # store the function itself (not its result) to call it later
        return out

    def __radd__(self, other):
        '''python redirects addition to radd when it sees 
        it can't handle addition in one order but can perform it in reverse order'''
        return self + other

    def __mul__(self, other):
        """Return ``self * other``; plain numbers are wrapped in a ``Value``."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # If x = y * z, then dx/dy = z and dx/dz = y, so each input receives
            # the gradient of x scaled by the *other* input's value.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward
        return out

    def __rmul__(self, other):
        '''python redirects multiplication to rmul when it sees 
        it can't handle multiplication in one order but can perform it in reverse order'''
        return self * other

    def exp(self): #e^self
        """Return ``e ** self``."""
        out = Value(math.exp(self.data), (self, ), _op = 'exp')

        def _backward():
            # d/dx e^x = e^x, which is exactly out.data
            self.grad += out.data * out.grad

        out._backward = _backward
        return out

    def __pow__(self, other): #self**other
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float)), 'Power operator only supports int or float'

        out = Value(self.data**other, (self, ), _op = f'**{other}')

        def _backward():
            # power rule: d/dx x^k = k * x^(k-1)
            self.grad += other * self.data**(other-1) * out.grad
        out._backward = _backward
        return out

    def __truediv__(self, other): #self/other
        """Return ``self / other``, expressed as ``self * other**-1``."""
        return self * other**-1

    def __rtruediv__(self, other): #other/self
        """Return ``other / self`` when ``other`` is a plain number."""
        return other * self**-1

    def __neg__(self):  #-self
        return self * -1 

    def __sub__(self, other):   #self - other
        """Return ``self - other``, expressed as ``self + (-other)``."""
        return self + (-other)

    def __rsub__(self, other):  #other - self
        """Return ``other - self`` when ``other`` is a plain number."""
        return (-self) + other

    def tanh(self):
        """Return ``tanh(self)``.

        Uses ``math.tanh`` rather than ``(e^{2x} - 1) / (e^{2x} + 1)``: the explicit
        formula raises ``OverflowError`` for large positive inputs because
        ``math.exp(2*x)`` overflows, while ``math.tanh`` saturates to 1.0.
        """
        t = math.tanh(self.data)
        out = Value(t, (self, ), 'tanh')

        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)^2
            self.grad += (1 - t**2)*out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Backpropagate from this node through every node it depends on.

        Sets ``self.grad`` to 1.0, orders the graph topologically and calls each
        node's ``_backward`` from the output back to the leaves. The gradients are
        printed before and after the pass.

        Gradients are accumulated with ``+=`` and are not reset here: if nodes of
        the graph already hold non-zero ``grad`` values (from a previous call or
        from manual ``_backward()`` calls), set them back to 0.0 first.
        """
        self.grad = 1.0

        # using topological sort to arrange nodes in topological order:
        # a node is appended only after all of its inputs have been appended
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        print('Gradients before backprop')
        for node in topo:
            print(node, node.grad)

        # reversed order: a node's gradient is complete before it is pushed to its inputs
        for node in reversed(topo):
            node._backward()

        print('\nGradients after backprop')
        for node in topo:
            print(node, node.grad)

## Using tanh function

### A workflow where a, b and c are three leaf nodes, d and e are intermediary results and f is the final output

In [None]:
# Leaf nodes of the expression graph.
a = Value(2, label='a')
b = Value(-3, label='b')
c = Value(10.0, label='c')

# Intermediate results: d = a * b and e = d + c.
d = a * b
d.label = 'd'
e = d + c
e.label = 'e'

# Output node: f = tanh(e).
f = e.tanh()
f.label = 'f'

print(d, e, f)

In [None]:
# Input nodes f was computed from; f = e.tanh(), so the set contains only e.
f._prev

In [None]:
# Name of the operation that produced f ('tanh').
f._op

### Manually calling the backward function to check if the derivatives are flowing correctly

In [None]:
# Seed the output node: f.grad is the derivative of f wrt itself, which is 1.
# It has to be set by hand here because f._backward() is called manually below.
f.grad = 1

In [None]:
# Push f.grad through tanh: e.grad += (1 - f.data**2) * f.grad
f._backward()

In [None]:
# On a fresh run this is 1 - tanh(4)**2, roughly 0.00134 (e.data is 4.0).
print(e.grad)

In [None]:
# e = d + c: addition passes e.grad unchanged to both d and c.
e._backward()

In [None]:
# Both values equal e.grad, since each is an input of the addition.
print(d.grad, c.grad)

In [None]:
# d = a * b: a.grad += b.data * d.grad and b.grad += a.data * d.grad
d._backward()

In [None]:
# a.grad is -3 * d.grad and b.grad is 2 * d.grad (b.data = -3, a.data = 2).
print(a.grad, b.grad)

In [None]:
# a is a leaf node, so its _backward is the default no-op lambda:
# calling it changes nothing and a.grad keeps the value set by d._backward().
a._backward()
print(a.grad)

### Automating backpropagation

#### Topological Sort
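A topological sort orders the nodes of the graph so that all the arrows (edges) point in a single direction (left to right), i.e. every node appears after all the nodes it depends on.
This ordering is required to automate backpropagation: a node's `_backward` function must be called only after the node's own gradient has been fully accumulated, so we walk the sorted list in reverse, from the output node back to the leaves.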

In [None]:
# The manual _backward() calls above already accumulated gradients into
# a, b, c, d and e. Every _backward uses `+=`, so running the automated pass
# on top of them would add to those values and print inflated gradients.
# Reset all nodes of the graph first so the result is the true gradient.
for node in (a, b, c, d, e, f):
    node.grad = 0.0

f.backward()

### Correcting backpropagation code

#### Before Correction

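If a node is used more than once, as in the example below, code that overwrites the gradient (only **`=`** is used, as in `self.grad = ...`) gives the wrong result, because each use of the node replaces the contribution of the previous one instead of adding to it. **`=`** should be replaced by **`+=`** so that the gradients accumulate. Note: the `Value` class defined above already uses `+=`, so on a fresh run the cells in this section print the corrected result (`n.grad = 2.0`); with plain `=` they would print `n.grad = 1.0`.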

In [None]:
# n is used twice as an input of the same addition.
n = Value(4.0, label='n')
m = n + n
m.label = 'm'
print(m, n)

In [None]:
# _prev is a set, so the two uses of n collapse into a single entry.
m._prev

In [None]:
# Both are 0.0: freshly created nodes start with grad = 0.0.
print(n.grad, m.grad)

In [None]:
# Seed the output node; n.grad stays 0.0 because nothing has been propagated yet.
m.grad = 1.0
print(n.grad, m.grad)

In [None]:
# m = n + n, so the correct result is n.grad = 2.0 (one contribution per use of n).
# An implementation that assigns with `=` instead of `+=` would report 1.0 here.
m.backward()

#### After Correction

In [None]:
# Rebuild the same graph from scratch so that no gradient is carried over.
n = Value(4.0, label='n')
m = n + n
m.label = 'm'
print(m, n)

In [None]:
# Still a single entry: n appears once in the set even though it is used twice.
m._prev

In [None]:
# Fresh nodes, so both gradients are 0.0.
print(n.grad, m.grad)

In [None]:
# Seed the output node by hand (m.backward() would also set it to 1.0).
m.grad = 1.0
print(n.grad, m.grad)

In [None]:
# With `+=` in Value.__add__, both uses of n contribute 1.0, giving n.grad = 2.0.
m.backward()

## Breaking tanh into components

In [None]:
# Same leaves and intermediate nodes as in the direct tanh example.
a = Value(2, label='a')
b = Value(-3, label='b')
c = Value(10.0, label='c')

d = a * b
d.label = 'd'
e = d + c
e.label = 'e'

# tanh(e) written out as (exp(2e) - 1) / (exp(2e) + 1), using only
# exp, subtraction, addition and division on Value nodes.
f = (2 * e).exp()
f.label = 'f'
g = (f - 1) / (f + 1)
g.label = 'g'

print(d, e, f, g)

In [None]:
# Seed dg/dg = 1 on the output node. g.backward() assigns this value itself,
# so this line is for illustration only.
g.grad = 1.0

In [None]:
# The gradients of a, b and c should match the ones obtained when tanh
# was applied directly as a single node.
g.backward()

## PyTorch

In [36]:
# Same expression as the Value example, built from double-precision PyTorch tensors.
# requires_grad=True asks autograd to track a, b and c so backward() fills their .grad.
a = torch.tensor([2.0], dtype=torch.float64, requires_grad=True)
b = torch.tensor([-3.0], dtype=torch.float64, requires_grad=True)
c = torch.tensor([10.0], dtype=torch.float64, requires_grad=True)

d = a * b
e = d + c
f = torch.tanh(e)

In [37]:
# Run PyTorch autograd from f; this fills a.grad, b.grad and c.grad, which are printed below.
f.backward()

In [46]:
# Report the gradient PyTorch computed for each leaf tensor, for comparison
# with the values produced by the Value class.
for leaf_name, leaf in (('a', a), ('b', b), ('c', c)):
    print(f'Gradient of {leaf_name}: {leaf.grad.item()}')

Gradient of a: -0.0040228520490775965
Gradient of b: 0.002681901366051731
Gradient of c: 0.0013409506830258655
