# micrograd exercises

1. watch the [micrograd video](https://www.youtube.com/watch?v=VMj-3S1tku0) on YouTube
2. come back and complete these exercises to level up :)

## section 1: derivatives

In [1]:
# here is a mathematical expression that takes 3 inputs and produces one output
from math import sin, cos

def f(a, b, c):
  """Evaluate -a**3 + sin(3*b) - 1/c + b**2.5 - a**0.5 for the three scalar inputs."""
  # each term is named separately; they are combined left to right below
  cubic = -a**3
  wave = sin(3*b)
  reciprocal = 1.0/c
  power = b**2.5
  root = a**0.5
  return cubic + wave - reciprocal + power - root

# evaluate f at the sample point a=2, b=3, c=4 and show the result
print(f(2, 3, 4))

6.336362190988558


In [9]:
# write the function df that returns the analytical gradient of f
# i.e. use your skills from calculus to take the derivative, then implement the formula
# if you do not know calculus then feel free to ask wolframalpha, e.g.:
# https://www.wolframalpha.com/input?i=d%2Fda%28sin%283*a%29%29%29

def gradf(a, b, c):
  """Return the analytical gradient of f as the list [df/da, df/db, df/dc]."""
  # d/da of (-a**3 - a**0.5)
  dfda = -1 / (2 * (a ** 0.5)) - 3 * a**2
  # d/db of (sin(3*b) + b**2.5)
  dfdb = 2.5 * b**1.5 + 3 * cos(3 * b)
  # d/dc of (-1/c)
  dfdc = 1 / (c ** 2)
  return [dfda, dfdb, dfdc]

def df(a, b, c):
  """Return the analytical gradient [df/da, df/db, df/dc] of
  f(a, b, c) = -a**3 + sin(3*b) - 1/c + b**2.5 - a**0.5.

  Raises ZeroDivisionError when a == 0 or c == 0, where the corresponding
  partial derivative is undefined.
  """
  dfda = -3 * a**2 - 0.5 / (a ** 0.5)     # from -a**3 and -a**0.5
  dfdb = 3 * cos(3 * b) + 2.5 * b**1.5    # from sin(3*b) and b**2.5
  dfdc = 1 / (c ** 2)                     # from -1/c
  return [dfda, dfdb, dfdc]
  


# expected answer is the list of partial derivatives [df/da, df/db, df/dc] at (2, 3, 4)
ans = [-12.353553390593273, 10.25699027111255, 0.0625]
yours = gradf(2, 3, 4)
for dim in range(3):
  # a component passes when it is within 1e-5 of the reference value
  if abs(yours[dim] - ans[dim]) < 1e-5:
    ok = 'OK'
  else:
    ok = 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {yours[dim]}")


OK for dim 0: expected -12.353553390593273, yours returns -12.353553390593273
OK for dim 1: expected 10.25699027111255, yours returns 10.25699027111255
OK for dim 2: expected 0.0625, yours returns 0.0625


In [42]:
# now estimate the gradient numerically without any calculus, using
# the approximation we used in the video.
# you should not call the function df from the last cell

# -----------
a, b, c = 2, 3, 4
h = 0.000001  # forward step size

# forward difference: (f(x + h) - f(x)) / h, one input nudged at a time
f0 = f(a, b, c)  # unperturbed value shared by all three quotients
fa = (f(a + h, b, c) - f0) / h
fb = (f(a, b + h, c) - f0) / h
fc = (f(a, b, c + h) - f0) / h

numerical_grad = [fa, fb, fc]
# -----------

for dim in range(3):
  # compare each forward-difference estimate with the analytical reference
  within_tol = abs(numerical_grad[dim] - ans[dim]) < 1e-5
  ok = 'OK' if within_tol else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad[dim]}")


OK for dim 0: expected -12.353553390593273, yours returns -12.353559348809995
OK for dim 1: expected 10.25699027111255, yours returns 10.256991666679482
OK for dim 2: expected 0.0625, yours returns 0.062499984743169534


In [56]:
# there is an alternative formula that provides a much better numerical 
# approximation to the derivative of a function.
# learn about it here: https://en.wikipedia.org/wiki/Symmetric_derivative
# implement it. confirm that for the same step size h this version gives a
# better approximation.

# -----------
a, b, c = 2, 3, 4
h = 0.00001  # can use a 10x bigger step size and it will pass the test

# symmetric difference: (f(x + h) - f(x - h)) / (2 * h), one input at a time
span = 2 * h
fa = (f(a + h, b, c) - f(a - h, b, c)) / span
fb = (f(a, b + h, c) - f(a, b - h, c)) / span
fc = (f(a, b, c + h) - f(a, b, c - h)) / span

numerical_grad2 = [fa, fb, fc]
# -----------

for dim in range(3):
  # compare each symmetric-difference estimate with the analytical reference
  within_tol = abs(numerical_grad2[dim] - ans[dim]) < 1e-5
  ok = 'OK' if within_tol else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {numerical_grad2[dim]}")


OK for dim 0: expected -12.353553390593273, yours returns -12.353553390820336
OK for dim 1: expected 10.25699027111255, yours returns 10.256990271617639
OK for dim 2: expected 0.0625, yours returns 0.06250000001983835


## section 2: support for softmax

In [107]:
# Value class starter code, with many functions taken out
from math import exp, log


class Value:
    """Scalar that remembers how it was computed so gradients can be backpropagated.

    Attributes set by ``__init__``:
        data: the wrapped number.
        grad: accumulated gradient for this node; starts at 0.0.
        _backward: zero-argument closure that adds this node's gradient
            contribution to its operands; a no-op for leaf nodes.
        _prev: set of the operand ``Value`` objects this node was built from.
        _op: short string naming the operation that produced the node.
        label: optional display name.
    """

    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data
        self.grad = 0.0
        # leaves have nothing to propagate; operations replace this closure
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op
        self.label = label

    def __repr__(self):
        return f"Value(data={self.data})"

    def __add__(self, other):  # exactly as in the video
        """Return ``self + other``; a non-Value ``other`` is wrapped first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += 1.0 * out.grad
        out._backward = _backward

        return out

    def __radd__(self, other):
        """Return ``other + self`` for a non-Value left operand.

        ``sum()`` starts from the int 0, so ``0 + Value`` lands here; in that
        case ``self`` is returned unchanged instead of adding a graph node.
        """
        if other == 0:
            return self
        return self.__add__(other)

    def __mul__(self, other):
        """Return ``self * other``; a non-Value ``other`` is wrapped first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # product rule: each factor's gradient is scaled by the other factor
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __rmul__(self, other):
        """Return ``other * self`` for a non-Value left operand (e.g. ``2 * v``)."""
        return self * other

    def __truediv__(self, other):
        """Return ``self / other``; a non-Value ``other`` is wrapped first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data / other.data, (self, other), '/')

        def _backward():
            # d(x/y)/dx = 1/y and d(x/y)/dy = -x/y**2
            self.grad += 1.0 / other.data * out.grad
            other.grad += -self.data / (other.data ** 2) * out.grad
        out._backward = _backward

        return out

    def __rtruediv__(self, other):
        """Return ``other / self`` for a non-Value left operand (e.g. ``1 / v``)."""
        return Value(other) / self

    def __sub__(self, other):
        """Return ``self - other``; a non-Value ``other`` is wrapped first."""
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data - other.data, (self, other), '-')

        def _backward():
            self.grad += 1.0 * out.grad
            other.grad += -1.0 * out.grad
        out._backward = _backward

        return out

    def __rsub__(self, other):
        """Return ``other - self`` for a non-Value left operand (e.g. ``1 - v``)."""
        return Value(other) - self

    def __neg__(self):
        """Return ``-self`` as a new node."""
        out = Value(-self.data, (self,), 'neg')

        def _backward():
            self.grad += -1.0 * out.grad
        out._backward = _backward

        return out

    def exp(self):
        """Return ``e ** self`` as a new node."""
        out = Value(exp(self.data), (self,), 'exp')

        def _backward():
            # d(exp(x))/dx = exp(x), which is already stored in out.data
            self.grad += out.data * out.grad
        out._backward = _backward

        return out

    def log(self):
        """Return the natural logarithm of ``self`` as a new node.

        ``math.log`` raises ValueError when ``self.data`` is not positive.
        """
        out = Value(log(self.data), (self,), 'log')

        def _backward():
            self.grad += 1.0 / self.data * out.grad
        out._backward = _backward

        return out

    def backward(self):  # exactly as in video
        """Backpropagate from this node through every node it depends on.

        Sets ``self.grad`` to 1.0 and then runs each node's ``_backward`` in
        reverse topological order. Gradients are accumulated with ``+=`` and
        are not reset here, so running this again on a graph that shares nodes
        adds to their existing ``grad`` values.
        """
        topo = []
        visited = set()

        def build_topo(v):
            # depth-first post-order: operands are appended before the node using them
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()


In [104]:
# without referencing our code/video __too__ much, make this cell work
# you'll have to implement (in some cases re-implement) a number of functions
# of the Value object, similar to what we've seen in the video.
# instead of the squared error loss this implements the negative log likelihood
# loss, which is very often used in classification.

# this is the softmax function
# https://en.wikipedia.org/wiki/Softmax_function
def softmax(logits):
  """Exponentiate every logit and divide each result by the sum of all of them.

  Each element of ``logits`` must provide an ``exp()`` method, and the results
  must support ``sum()`` and ``/``.
  """
  exps = []
  for logit in logits:
    exps.append(logit.exp())
  total = sum(exps)  # normalising constant
  return [e / total for e in exps]

# this is the negative log likelihood loss function, pervasive in classification
logits = [Value(x) for x in (0.0, 3.0, -2.0, 1.0)]
probs = softmax(logits)
print(probs)
# dim 3 acts as the label for this input example
loss = -probs[3].log()
loss.backward()
print(loss.data)

# reference gradients of the loss with respect to each logit
ans = [0.041772570515350445, 0.8390245074625319, 0.005653302662216329, -0.8864503806400986]
for dim in range(4):
  within_tol = abs(logits[dim].grad - ans[dim]) < 1e-5
  ok = 'OK' if within_tol else 'WRONG!'
  print(f"{ok} for dim {dim}: expected {ans[dim]}, yours returns {logits[dim].grad}")


[Value(data=0.04177257051535045), Value(data=0.839024507462532), Value(data=0.005653302662216329), Value(data=0.11354961935990121)]
2.1755153626167147
OK for dim 0: expected 0.041772570515350445, yours returns 0.041772570515350445
OK for dim 1: expected 0.8390245074625319, yours returns 0.8390245074625319
OK for dim 2: expected 0.005653302662216329, yours returns 0.005653302662216329
OK for dim 3: expected -0.8864503806400986, yours returns -0.886450380640099


In [110]:
# First, we need the torch library. It's a tool that helps us do math (like addition, multiplication, etc.) on big lists of numbers (which we call "tensors") all at once.
import torch

# The same logits as a tensor; requires_grad=True asks torch to track the operations applied to them.
logits_torch = torch.tensor([0.0, 3.0, -2.0, 1.0], requires_grad=True)

# Softmax along the only dimension turns the logits into probabilities (values between 0 and 1 that add up to 1).
probs_torch = logits_torch.softmax(dim=0)

# The loss is the negative log of the probability at position 3 (counting starts from zero).
loss_torch = -torch.log(probs_torch[3])

# Compute the gradient of the loss with respect to the logits.
loss_torch.backward()

# Print the gradients; they should match the values computed with the Value class.
print(logits_torch.grad)


tensor([ 0.0418,  0.8390,  0.0057, -0.8865])
