In [None]:
# To launch this notebook

# python3 -m venv venv
# .\venv\Scripts\activate
# code .\gradients.ipynb

In [None]:
%pip install tensorflow
%pip install matplotlib
%pip install pydot

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

from tensorflow.python.framework import ops
import keras

<br><br><br><br><br><br><br>

It turns out that Deep Learning is about minimizing a loss function.

Say we have a function that we want to minimize

In [None]:
def test_function(x):
    return x**2 - 6 * x + 5

<br><br><br><br><br><br><br>
We'd like to be able to get quickly to the minimum at x=3

In [None]:
# Sample the curve at x = -2.0, -1.9, ..., 9.9 and draw it.
xs = [step / 10.0 for step in range(-20, 100)]
ys = [test_function(value) for value in xs]

plt.plot(xs, ys)
plt.show()

<br><br><br><br><br><br><br>
So we define a function that calculates the gradient and we can use that.

In [None]:
def grad(x):
    """Return ``2 * x - 6``, the hand-derived slope of ``x**2 - 6*x + 5``."""
    return 2 * x - 6


# Plain gradient descent: start at 0 and take 20 steps of size 0.1
# against the slope, remembering every position visited.
x = 0
path = [x]

for _ in range(20):
    x -= 0.1 * grad(x)
    path.append(x)

x

<br><br><br><br><br><br><br>
And plot the steps we took.

In [None]:
# Draw the curve, then overlay every visited position as a dot.
plt.plot(xs, ys)
plt.plot(path, [test_function(position) for position in path], "o")
plt.show()

<br><br><br><br><br><br><br>
Taking gradients is hard, particularly when we have to use something like the chain rule.

We can use TensorFlow's automatic differentiation (`tf.GradientTape`) to compute the gradient for us.

We'll start at x=0 and take small steps in the direction where the function decreases.

- This is obviously sensitive to how far we step, and we may step too far and never get closer.
- We may also just find a local minimum as we aren't looking at the global view of the surface.

In [None]:
# Gradient descent on test_function, with TensorFlow computing the slope.
alpha = 0.1  # step size

x = tf.Variable(0.0)

# Record the starting point as well, so this path has the same 21 points as
# the hand-written descent, which also begins its path at x = 0.
path = [x.numpy()]

for _ in range(20):
    # Run the forward calculation inside the tape so it can be differentiated.
    with tf.GradientTape() as tape:
        y = test_function(x)

    dy_dx = tape.gradient(y, x)
    x.assign_sub(alpha * dy_dx)  # x <- x - alpha * dy/dx

    path.append(x.numpy())

x.numpy()

<br><br><br><br><br><br><br>
And that took the same steps.

In [None]:
# Draw the curve, then overlay the positions reached by the TensorFlow descent.
plt.plot(xs, ys)
plt.plot(path, [test_function(position) for position in path], "o")
plt.show()

<br><br><br><br><br><br><br>
And it is worth saying, we can use this technique in higher dimensions (and later we are going to have a lot of dimensions).

In [None]:
def test_function2(x,y):
    return (x - 1) ** 2 + (y - 1) ** 2 

<br><br><br><br><br><br><br>
Which looks like

In [None]:
# Evaluate test_function2 on a square grid and draw it as a 3-D surface.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x = y = np.arange(-3.0, 4.0, 0.05)
X, Y = np.meshgrid(x, y)

# Flatten the grid, evaluate point by point, then restore the grid shape.
zs = np.array(test_function2(X.ravel(), Y.ravel()))
Z = np.reshape(zs, X.shape)

ax.plot_surface(X, Y, Z)

plt.show()

<br><br><br><br><br><br><br>
But it's just the same old code for minimizing.

In [None]:
# Gradient descent on test_function2 in two variables.
# `alpha` (the step size) is the value set in the earlier 1-D descent cell.
path = []

x = tf.Variable(-3.0)
y = tf.Variable(3.0)

for _ in range(100):
    with tf.GradientTape() as tape:
        z = test_function2(x, y)

    # Record the point BEFORE stepping: z was computed at this (x, y), so the
    # triple lies on the surface. Recording after the step would pair the new
    # position with the old height. Plain numbers are stored, not tensors.
    path.append((x.numpy(), y.numpy(), z.numpy()))

    dz_dx, dz_dy = tape.gradient(z, [x, y])
    x.assign_sub(alpha * dz_dx)
    y.assign_sub(alpha * dz_dy)

print(x.numpy(), y.numpy())
  

<br><br><br><br><br><br><br>
And these are the steps we took.

In [None]:
# Plot every recorded (x, y, z) point of the descent as its own marker.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Unpack each point straight into the call instead of into loop variables
# named x, y and z, which would overwrite the notebook-level variables of
# the same names used by the descent.
for point in path:
    ax.plot(*point, "o")

plt.show()

<br><br><br><br><br><br><br>
There's lots we missed out there.
- the choice of alpha and ideas like momentum
- where we start
- how we avoid just picking a local minimum 

<br><br><br><br><br><br><br>
We apply the same idea to train the neural network.

We're going to build an AND gate. 
- TRUE is 1
- FALSE is 0

And the rules
- TRUE AND TRUE = TRUE
- FALSE AND TRUE = FALSE
- TRUE AND FALSE = FALSE
- FALSE AND FALSE = FALSE

This is where we'll be using the chain rule as we have a neuron, an activation function and a loss function that we seek to minimize.

First, we'll build and train the neuron by hand.

In [None]:
# Parameters of a single two-input neuron, held as float64 variables.
weights = tf.Variable(np.array([0.5, 0.5]))
bias = tf.Variable(0.3, dtype="float64")


def neuron(input):
    """Return ``relu(sum(weights * input) + bias)`` for one input vector."""
    weighted_sum = tf.reduce_sum(tf.multiply(weights, input))
    return tf.keras.activations.relu(weighted_sum + bias)


# 100 passes over the four input pairs, one descent step per pair.
for _ in range(100):
    for datum in [[1., 1.], [0., 1.], [1., 0.], [0., 0.]]:
        x = tf.constant(datum, dtype="float64")

        with tf.GradientTape() as tape:
            a = neuron(x)
            # With 1/0 encoding, the product of the inputs is their AND.
            expected = datum[0] * datum[1]
            loss = (a - expected) ** 2

        grad = tape.gradient(loss, [weights, bias])

        # Step each parameter against its own gradient with step size 0.1.
        for variable, gradient in zip([weights, bias], grad):
            variable.assign_sub(gradient * 0.1)

# Show the trained neuron's output for every input pair.
for label, inputs in [
    ("True, True, ", [1., 1.]),
    ("False, True, ", [0., 1.]),
    ("True, False, ", [1., 0.]),
    ("False, False, ", [0., 0.]),
]:
    print(label, neuron(tf.constant(inputs, dtype="float64")).numpy())

In [None]:
# Inspect the parameter values left behind by the hand-written training loop.
print(weights)
print(bias)

<br><br><br><br><br><br><br>
Use a [layer](https://keras.io/api/layers/), in particular a [Dense layer](https://keras.io/api/layers/core_layers/dense/).

In [None]:
# A one-unit Dense layer with a ReLU activation; it owns its own variables.
layer = tf.keras.layers.Dense(1, activation='relu')

# 100 passes over the four input pairs, one descent step per pair.
for _ in range(100):
    for datum in [[1., 1.], [0., 1.], [1., 0.], [0., 0.]]:
        x = tf.constant([datum])  # wrapped in a list: a batch of one sample

        with tf.GradientTape() as tape:
            y = layer(x)
            # With 1/0 encoding, the product of the inputs is their AND.
            expected = datum[0] * datum[1]
            loss = (y - expected) ** 2

        grad = tape.gradient(loss, layer.trainable_variables)

        # Step each variable against its own gradient with step size 0.1.
        for variable, gradient in zip(layer.trainable_variables, grad):
            variable.assign_sub(gradient * 0.1)

# Show the trained layer's output for every input pair.
for label, inputs in [
    ("True, True, ", [1., 1.]),
    ("False, True, ", [0., 1.]),
    ("True, False, ", [1., 0.]),
    ("False, False, ", [0., 0.]),
]:
    print(label, layer(tf.constant([inputs])).numpy())

In [None]:
# Show the Dense layer's trainable variables after the training loop.
print(layer.trainable_variables)

<br><br><br><br><br><br><br>
And this is how you really do it, using a declarative model

In [None]:
# Sequential is documented as taking a *list* of layers. Passing a bare layer
# only works where Keras wraps it for you; versions that iterate over the
# argument fail on it, so always pass a list.
model = keras.Sequential([keras.layers.Dense(1, activation='relu')])

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])

# Truth table for AND: one row of inputs per row of expected output.
X = np.array([[1., 1.], [1., 0.], [0., 1.], [0., 0.]])
y = np.array([[1.], [0.], [0.], [0.]])

# All four samples fit in a single batch, so each epoch is one update.
model.fit(X, y, epochs=1000, batch_size=4)



In [None]:
# Ask the fitted model for its output on each of the four input pairs.
model.predict(np.array([[1.,1.], [1.,0.],[0.,1.],[0.,0.]]))

In [None]:
# Trainable variables of the model's first layer (the only one it was built with).
model.layers[0].trainable_variables

<br><br><br><br><br><br><br>
So when we are in the context of the GradientTape the system is watching the forward calculation, and then using the associated functions in the backwards propagation.

In [None]:
x = tf.Variable(100.)

def log1pexp(x):
  """Compute log(1 + exp(x)) directly from its definition."""
  return tf.math.log(1 + tf.exp(x))

# Let the tape differentiate the direct implementation at x = 100.
# NOTE(review): presumably this shows the automatically derived gradient
# breaking down for a large input (exp(100) looks too big for the default
# float type) -- confirm against the printed value.
with tf.GradientTape() as tape:
  y=log1pexp(x)
dy_dx = tape.gradient(y, x) 

print(dy_dx)


In [None]:
@tf.custom_gradient
def log1pexp(x):
  """Compute log(1 + exp(x)) and supply a hand-written gradient for it.

  Returns the forward value together with the `grad` function.
  """
  e = tf.exp(x)
  def grad(upstream):
    # d/dx log(1 + e^x) = e^x / (1 + e^x), written here as 1 - 1 / (1 + e^x),
    # then scaled by the gradient arriving from further up the chain.
    return upstream * (1 - 1 / (1 + e))
  return tf.math.log(1 + e), grad

# Same measurement as before, reusing the x defined in the preceding cell.
with tf.GradientTape() as tape:
  y=log1pexp(x)
dy_dx = tape.gradient(y, x) 

print(dy_dx)

<br><br><br><br><br><br><br>
To the sources....

[GradientTape is defined here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/backprop.py#L705).

The [custom_gradient decorator is defined here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/custom_gradient.py#L47) with [the action defined here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/custom_gradient.py#L292), branching on [eager](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/custom_gradient.py#L536) and [graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/custom_gradient.py#L402) modes.

In graph mode, we end up using [RegisterGradient](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/custom_gradient.py#L512)

We register [a gradient handler here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/math_grad.py#L1405) and do so for all of the builtin ops.





<br><br><br><br><br><br><br>
These are the handlers that are registered for the inbuilt ops.

In [None]:
# List the names that have a gradient handler registered.
# NOTE: this reads underscore-prefixed (private) attributes, so it may stop
# working in a different TensorFlow version.
ops._gradient_registry._registry.keys()

<br><br><br><br><br><br><br>
In eager mode, we [record the gradient function](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/eager/record.py#L81) into the objects registered with the C++ kernel code.


In [None]:
import traceback

# Print the stack as seen from the cell itself, as a baseline for comparison.
traceback.print_stack()

@tf.custom_gradient
def log1pexp(x):
  """Compute log(1 + exp(x)) with a hand-written gradient that prints its call stack."""
  e = tf.exp(x)
  def grad(upstream):
    # Print the call stack at the moment this gradient function is invoked.
    traceback.print_stack()
    return upstream * (1 - 1 / (1 + e))
  return tf.math.log(1 + e), grad

# Reuses the x defined in an earlier cell.
with tf.GradientTape() as tape:
  y=log1pexp(x)
dy_dx = tape.gradient(y, x) 

print(dy_dx)
