# Let's now look at an implementation of a neuron

In [8]:
import random
from graphviz import Digraph
import math

# Reusing the draw function from the previous notebook
def trace(root):
  """Collect every node and edge reachable from ``root`` through ``_prev``.

  Args:
    root: Node exposing a ``_prev`` iterable of the nodes it was computed
      from; every node reachable from it must expose ``_prev`` as well.

  Returns:
    A ``(nodes, edges)`` pair of sets. ``nodes`` holds every reachable node,
    ``root`` included. ``edges`` holds one ``(child, parent)`` tuple for each
    ``child`` found in ``parent._prev``.
  """
  nodes, edges = set(), set()
  # Explicit stack instead of recursion, so graphs deeper than the
  # interpreter's recursion limit can still be traversed.
  pending = [root]
  while pending:
    v = pending.pop()
    if v in nodes:
      continue  # already expanded via another path
    nodes.add(v)
    for child in v._prev:
      edges.add((child, v))
      pending.append(child)
  return nodes, edges

def draw_dot(root):
  """Render the expression graph that ends at ``root`` as a graphviz Digraph.

  Every node returned by ``trace(root)`` becomes a record-shaped box showing
  its label, data and grad. A node with a non-empty ``_op`` additionally gets
  a small operation node that feeds into it, and each edge is routed from the
  input node to the operation node of the node that consumes it.

  Returns:
    The populated ``Digraph`` (SVG format, laid out left to right).
  """
  graph = Digraph(format='svg', graph_attr={'rankdir':'LR'})
  all_nodes, all_edges = trace(root)

  for node in all_nodes:
    node_id = str(id(node))
    record = "{ %s | data %.4f | grad %.4f }" % (node.label, node.data, node.grad)
    graph.node(name=node_id, label=record, shape='record')

    if not node._op:
      continue  # leaves have no producing operation to draw

    # Operation node, named after the value it produces plus the op symbol.
    op_id = node_id + node._op
    graph.node(name=op_id, label=node._op)
    graph.edge(op_id, node_id)

  for source, target in all_edges:
    # Inputs point at the operation node of the value they feed into.
    graph.edge(str(id(source)), str(id(target)) + target._op)

  return graph


# Reusing the value class from the previous notebook
class Value:
    """A scalar in an expression graph that supports reverse-mode autodiff.

    Each arithmetic operation returns a new ``Value`` that remembers its
    operands (``_prev``), the operation that produced it (``_op``) and a
    closure (``_backward``) that pushes its gradient back to the operands.
    Plain Python numbers are accepted as the other operand of ``+`` and ``*``
    and are wrapped in a ``Value`` automatically.
    """

    def __init__(self, data, _children=(), _op='', label=''): # _op is empty for leaves
        self.data = data
        self.grad = 0.0  # accumulated by backward(); starts at zero
        self._backward = lambda: None  # leaves have nothing to propagate
        self._prev = set(_children)  # operands this value was computed from
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):
        """Return ``self + other``; ``other`` may be a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)  # same coercion as __mul__
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # The local derivative of addition is 1 for both operands, so the
            # incoming gradient is passed through unchanged (chain rule).
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out

    def __radd__(self, other):
        """Support ``number + Value`` (and therefore ``sum(values)``)."""
        return self.__add__(other)

    def __mul__(self, other):
        """Return ``self * other``; ``other`` may be a Value or a plain number."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # d(a*b)/da = b and d(a*b)/db = a, each scaled by the incoming gradient.
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __rmul__(self, other):
        """Support ``number * Value``."""
        return self.__mul__(other)

    def tanh(self):
        """Return ``tanh(self)`` as a new Value."""
        # math.tanh saturates to +/-1 for large |x|; the exp-based formula
        # (e^{2x} - 1) / (e^{2x} + 1) raises OverflowError for large positive x.
        t = math.tanh(self.data)
        out = Value(t, (self,), 'tanh')

        def _backward():
            # d tanh(x)/dx = 1 - tanh(x)^2, scaled by the incoming gradient.
            self.grad += (1 - t**2) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        """Run backpropagation from this node.

        Sets ``self.grad`` to 1.0 and calls ``_backward`` on every reachable
        node in reverse topological order. Gradients are accumulated with
        ``+=``, so they are not reset between calls.
        """
        # Iterative post-order DFS: visits nodes in the same order a recursive
        # walk would, without being bounded by the interpreter recursion limit
        # on long chains of operations.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All operands handled: the node may now be emitted.
                topo.append(node)
                stack.pop()

        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

## Again let's look at the diagram of a mathematical neuron object

![neuron](./neuron.jpeg)

### The task of a single neuron is to multiply and accumulate. In other words, it computes the dot product of two vectors, the input $x$ and the weights $w$, and adds a bias $b$. It then passes the result through an activation function $(\sigma)$ to introduce non-linearity.

In other words, if we consider a neuron to be a function $f(x)$, $$f(x) = \sigma(\langle w, x \rangle + b) = \sigma\left(\sum_{i=1}^{n} w_i \cdot x_i + b\right)$$


Let's now look at it in action

In [11]:
class Neuron:
  """A single tanh neuron: ``tanh(sum(w_i * x_i) + b)``.

  Weights and bias are ``Value`` objects initialised uniformly at random in
  [-1, 1], so the output is part of the autodiff expression graph.
  """

  def __init__ (self, nin): # nin is the number of inputs to the neuron
    self.w = [Value(random.uniform(-1,1)) for _ in range(nin)] # weights
    self.b = Value(random.uniform(-1,1)) # bias

  def __call__(self, x):
    """Evaluate the neuron on ``x``.

    Args:
      x: Iterable with exactly one entry per weight.

    Returns:
      The result of calling ``tanh()`` on the weighted sum plus bias.

    Raises:
      ValueError: If ``x`` does not have one entry per weight. ``zip`` would
        otherwise silently drop the extra inputs or weights.
    """
    xs = list(x)
    if len(xs) != len(self.w):
      raise ValueError(f"Neuron expects {len(self.w)} inputs, got {len(xs)}")
    # w*x + b; the bias is the start value, so the accumulation begins with b.
    act = sum((wi*xi for wi, xi in zip(self.w, xs)), self.b)
    return act.tanh()
  
# Quick check: build a 2-input neuron and evaluate it on a 2-element input.
# The weights and bias are drawn at random, so the displayed value changes
# every time the neuron is re-created.
x = [2.0, 5.0]
n = Neuron(2)
n(x)

Value(data=0.9867920106715578)

### Great. Now we want to build a neural network. 

A neural network is an example of an $n$-partite graph. Neurons are arranged in layers, and no two neurons in the same layer are connected to each other. Generally every neuron in one layer is connected to every neuron in the next layer (though in practice we sometimes delete edges between neurons).


![neural-network](./neural-network.png)

## Let's now introduce a Layer class which implements the collection-of-neurons abstraction

In [17]:
class Layer:
  """A group of independent neurons that all receive the same input."""

  def __init__(self, nin, nout):
    # nout neurons, each taking nin inputs
    units = []
    for _ in range(nout):
      units.append(Neuron(nin))
    self.neurons = units

  def __call__(self, x):
    """Feed ``x`` to every neuron.

    Returns the lone output itself when the layer has exactly one neuron,
    otherwise the list of all outputs in neuron order.
    """
    activations = []
    for neuron in self.neurons:
      activations.append(neuron(x))
    if len(activations) == 1:
      return activations[0]
    return activations
  
# Quick check: a layer of 3 neurons, each taking 2 inputs.
# With more than one neuron the call returns a list with one output per neuron.
x = [3.0, 5.0]
l = Layer(2, 3)
l(x)

[Value(data=-0.9989084311917241),
 Value(data=0.7335905736362098),
 Value(data=-0.987596485248243)]

### Finally, let's build a multi-layer perceptron using the classes defined above

In [14]:
class MLP:
  """Multi-layer perceptron: a chain of ``Layer`` objects applied in order."""

  def __init__(self, nin, nouts):
    """Build the layers.

    Args:
      nin: Number of inputs to the first layer.
      nouts: Iterable with the number of neurons in each successive layer.
    """
    sz = [nin] + list(nouts)  # list() so a tuple of sizes works as well as a list
    # Layer i maps sz[i] inputs to sz[i+1] outputs.
    self.layers = [Layer(n_in, n_out) for n_in, n_out in zip(sz, sz[1:])]

  def __call__(self, x):
    """Run ``x`` through every layer and return the last layer's result.

    A layer with a single neuron returns a bare value rather than a list, so
    intermediate results are wrapped back into a one-element list before
    being handed to the next layer. The final result is returned exactly as
    the last layer produced it.
    """
    for layer in self.layers:
      x = layer(self._as_sequence(x))
    return x

  @staticmethod
  def _as_sequence(x):
    """Return ``x`` unchanged if it is iterable, otherwise ``[x]``."""
    try:
      iter(x)
    except TypeError:
      return [x]
    return x


<div>
<img src="./neural_net2.jpeg" width="500"/>
</div>
<h3>Language models are statistical models which predict what text to generate based on a given seed text</h3>

In [18]:

## Implementing the above diagram in code
# The network is randomly initialised, so the displayed output differs between runs.
x = [3.0, 5.0, -2.0] # 3 input values, one per network input
n = MLP(3, [4, 4, 1]) # 3 inputs, two hidden layers with 4 neurons each, 1 output
n(x)


Value(data=0.906060695088756)

## The diagram of the expression graph of this whole MLP would be

In [20]:
# draw_dot(n(x)) # uncomment to see the diagram