In [77]:
# Library imports
import numpy as np
from PIL import Image as im
import matplotlib.pyplot as plt

# Parameters
# Styling of the trajectory points drawn by the gradient-descent plots
POINT_COLOR = 'b'
POINT_SIZE = 10
POINT_ZORDER = 10  # NOTE(review): not referenced in the visible cells (the plots hard-code zorder=10)

# Styling shared by the plotted surfaces
SURFACE_ALPHA = .4
SURFACE_COLOR = 'r'

# Scratch PNG each frame is saved to before being read back as an image
TEMPFILE_NAME = "temp.png"

def save_to_gif(filename, images, duration=100):
    """Save a sequence of frames to ``filename`` as one multi-frame image.

    The first element of ``images`` performs the save (through its ``save``
    method); every remaining element is handed over as an appended frame.

    Parameters
    ----------
    filename : destination forwarded unchanged to ``save``.
    images : indexable sequence of frames; needs at least one element.
    duration : forwarded as the ``duration`` option (default 100).
    """
    first_frame = images[0]
    remaining_frames = images[1:]
    save_options = dict(
        save_all=True,
        append_images=remaining_frames,
        duration=duration,
        loop=0,
        optimize=False,
    )
    first_frame.save(filename, **save_options)

# Functions
def bowl(x, y):
    """Return the paraboloid x^2 + y^2 + 1 evaluated at (x, y)."""
    squared_radius = x ** 2 + y ** 2
    return squared_radius + 1

def bowl_gradient(xy):
    """Return the gradient (2x, 2y) of ``bowl`` at the point ``xy``.

    Only ``xy[0]`` and ``xy[1]`` are read; the result is a new numpy array.
    """
    x_coord, y_coord = xy[0], xy[1]
    d_dx = 2 * x_coord
    d_dy = 2 * y_coord
    return np.array([d_dx, d_dy])

def wiggle_bowl(x, y):
    """Return sin(x^2 + y^2) evaluated at (x, y)."""
    squared_radius = x ** 2 + y ** 2
    return np.sin(squared_radius)

def wiggle_bowl_gradient(xy):
    """Return the gradient of ``wiggle_bowl`` at ``xy``: 2 * xy * cos(|xy|^2).

    ``xy`` must support ``** 2`` and scalar multiplication element-wise
    (e.g. a numpy array).
    """
    squared_radius = sum(xy ** 2)
    chain_factor = np.cos(squared_radius)
    return 2 * xy * chain_factor

def super_wiggle(x, y):
    """Return sin(10x) * cos(10y) + x^2 + y^2 + 1 evaluated at (x, y)."""
    ripple = np.sin(10 * x) * np.cos(10 * y)
    return ripple + x ** 2 + y ** 2 + 1

def super_wiggle_gradient(xy):
    """Return the gradient of sin(10x) * cos(10y) + x^2 + y^2 + 1 at ``xy``.

    Components:
        d/dx =  10 * cos(10x) * cos(10y) + 2x
        d/dy = -10 * sin(10x) * sin(10y) + 2y

    The previous expression used a frequency of 5 instead of 10, had a
    misplaced ``* 5`` factor, and added one scalar to both components, so it
    did not match the surface being descended.

    Only ``xy[0]`` and ``xy[1]`` are read; the result is a new numpy array
    of two elements.
    """
    x, y = xy[0], xy[1]
    ripple_dx = 10 * np.cos(10 * x) * np.cos(10 * y)
    ripple_dy = -10 * np.sin(10 * x) * np.sin(10 * y)
    return np.array([ripple_dx + 2 * x, ripple_dy + 2 * y])

## Example Function

In [76]:
# Define the function
def f(x):
    """Return the polynomial x^5 - 40 x^3 + 5 evaluated at ``x``."""
    quintic_term = x ** 5
    cubic_term = 40 * x ** 3
    return quintic_term - cubic_term + 5

# Sample 100 evenly spaced inputs across [-7, 7]
x = np.linspace(-7, 7, 100)

# Plot the function curve in red
fig, ax = plt.subplots()
ax.plot(x, f(x), c='r')
# Write the figure to disk, then close it
fig.savefig("000.png")
plt.close()

## 3D Tangent

In [50]:
# Define the function
def f(x, y):
    """Return sin(x * y) evaluated at (x, y)."""
    product = x * y
    return np.sin(product)

# Define the partial derivatives of the function
def df_dx(x, y):
    """Return y * cos(x * y), the partial derivative of sin(x * y) in x."""
    cosine_term = np.cos(x * y)
    return y * cosine_term

def df_dy(x, y):
    """Return x * cos(x * y), the partial derivative of sin(x * y) in y."""
    cosine_term = np.cos(x * y)
    return x * cosine_term

# Define the gradient of the function
def gradient(x, y):
    """Return the array [df_dx(x, y), df_dy(x, y)]."""
    partials = [df_dx(x, y), df_dy(x, y)]
    return np.array(partials)

# Evaluate f on a 1000 x 1000 grid covering [-pi, pi] in both directions
x = np.linspace(-np.pi, np.pi, 1000)
y = np.linspace(-np.pi, np.pi, 1000)
x, y = np.meshgrid(x, y)

z = f(x, y)

# Define the point of interest
# NOTE(review): at (0, 0) both partial derivatives (y*cos(xy) and x*cos(xy)) evaluate to 0,
# so the three arrows drawn below have zero length.
x_point = 0
y_point = 0
z_point = f(x_point, y_point)

# Compute partial derivatives and gradient at the point of interest
partial_dx = df_dx(x_point, y_point)
partial_dy = df_dy(x_point, y_point)
grad = gradient(x_point, y_point)

# Plot the function surface
fig = plt.figure(figsize =(14, 9), constrained_layout=True, facecolor=None)
ax = plt.axes(projection ='3d')
ax.view_init(20, -20)
ax.plot_surface(x, y, z, color=SURFACE_COLOR, alpha=SURFACE_ALPHA)

# Plot the point of interest on the surface
ax.scatter(x_point, y_point, z_point, color='black', s=50)

# Mark the same (x, y) location at z = -1, below the surface
ax.scatter(x_point, y_point, -1, color='black', s=25)

# Plot the gradient as an arrow at z = -1 with no z component
ax.quiver(x_point, y_point, -1, grad[0], grad[1], 0, color='blue', label='$\\nabla f$')

# Plot the partial derivatives as arrows along the x and y directions
ax.quiver(x_point, y_point, -1, partial_dx, 0, 0, color='red', label='$\\frac{\\partial f}{\\partial x}$')
ax.quiver(x_point, y_point, -1, 0, partial_dy, 0, color='green', label='$\\frac{\\partial f}{\\partial y}$')

# Add labels and legend
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.legend(fontsize=40)
# Save with a transparent figure background, then close the figure
fig.patch.set_facecolor('none')
fig.savefig("001.png", transparent=True)
plt.close()

## Simple zoom out

In [82]:
# WARNING THIS TAKES A FEW MINUTES TO RUN

FRAMES = 350
zoom_out_speed = .005  # amount each window edge moves outward per frame
initial_xlim = [-.1, .1]
initial_ylim = [-.1, .1]
# Growth after FRAMES steps and the resulting outermost limits (used as fixed axis limits)
end_zoom = FRAMES * zoom_out_speed
final_xlim = [initial_xlim[0] - end_zoom, initial_xlim[1] + end_zoom]
final_ylim = [initial_ylim[0] - end_zoom, initial_ylim[1] + end_zoom]

my_func = wiggle_bowl

# Evaluate the surface once over the outermost domain to obtain its overall z range
x = np.linspace(final_xlim[0], final_xlim[1], 1000)
y = np.linspace(final_ylim[0], final_ylim[1], 1000)
X, Y = np.meshgrid(x, y)
Z = my_func(X, Y)

max_z = np.max(Z)
min_z = np.min(Z)

def snap_zoom_out(zoom_out_speed, initial_xlim, initial_ylim):
    """Render one zoom-out frame and return it as an image.

    The surface ``my_func`` is drawn over the initial window widened by
    ``zoom_out_speed`` on every side, together with a flat sheet at the
    module-level ``min_z``. Axis limits are fixed to the module-level
    ``final_xlim`` / ``final_ylim`` / ``[min_z, max_z]``. The figure is saved
    to ``TEMPFILE_NAME`` and read back.

    Parameters
    ----------
    zoom_out_speed : amount subtracted from each lower limit and added to
        each upper limit for this frame.
    initial_xlim, initial_ylim : two-element ``[low, high]`` windows.
    """
    figure = plt.figure(figsize=(14, 9), constrained_layout=True, facecolor=None)
    axes3d = plt.axes(projection='3d')
    axes3d.view_init(20, -20)

    # Sampling grid over the widened window
    x_low, x_high = initial_xlim[0] - zoom_out_speed, initial_xlim[1] + zoom_out_speed
    y_low, y_high = initial_ylim[0] - zoom_out_speed, initial_ylim[1] + zoom_out_speed
    grid_x, grid_y = np.meshgrid(
        np.linspace(x_low, x_high, 1000),
        np.linspace(y_low, y_high, 1000),
    )
    heights = my_func(grid_x, grid_y)

    # Surface plus a fainter flat sheet at the lowest z value
    axes3d.plot_surface(grid_x, grid_y, heights, color=SURFACE_COLOR, alpha=SURFACE_ALPHA)
    floor = np.zeros_like(grid_x) + min_z
    axes3d.plot_surface(grid_x, grid_y, floor, color=SURFACE_COLOR, alpha=SURFACE_ALPHA / 2)

    # Same axis limits on every frame
    axes3d.set_xlim([final_xlim[0], final_xlim[1]])
    axes3d.set_ylim([final_ylim[0], final_ylim[1]])
    axes3d.set_zlim([min_z, max_z])

    figure.savefig(TEMPFILE_NAME, bbox_inches='tight')
    plt.close()

    frame_pixels = np.asarray(im.open(TEMPFILE_NAME))
    return im.fromarray(frame_pixels)

# Render one frame per step; frame i widens the window by zoom_out_speed * i
images = []

for i in range(FRAMES):
    images.append(snap_zoom_out(zoom_out_speed * i, initial_xlim, initial_ylim))

save_to_gif("002.gif", images)

## Zoom out with window

In [79]:
FRAMES = 350
zoom_out_speed = .005  # NOTE(review): presumably the per-frame growth of the outer view — verify the frame loop applies it
initial_xlim = [-.1, .1]
initial_ylim = [-.1, .1]
my_func = wiggle_bowl

def snap_zoom(zoom_out_amount, initial_xlim, initial_ylim):
    """Render one frame showing a widened view with the initial window highlighted.

    Draws ``my_func`` over the initial window widened by ``zoom_out_amount``
    on every side, then draws the initial window itself in blue together with
    a blue flat sheet at z = -0.75. The figure is saved to ``TEMPFILE_NAME``
    and read back as an image.

    Parameters
    ----------
    zoom_out_amount : amount subtracted from each lower limit and added to
        each upper limit of the initial window.
    initial_xlim, initial_ylim : two-element ``[low, high]`` windows.
    """
    canvas = plt.figure(figsize=(14, 9), constrained_layout=True, facecolor=None)
    axes3d = plt.axes(projection='3d')
    axes3d.view_init(20, -20)

    floor_height = -.75

    # Widened view
    outer_x, outer_y = np.meshgrid(
        np.linspace(initial_xlim[0] - zoom_out_amount, initial_xlim[1] + zoom_out_amount, 100),
        np.linspace(initial_ylim[0] - zoom_out_amount, initial_ylim[1] + zoom_out_amount, 100),
    )
    outer_z = my_func(outer_x, outer_y)
    axes3d.plot_surface(outer_x, outer_y, outer_z, color=SURFACE_COLOR, alpha=SURFACE_ALPHA)

    # Initial window, drawn on the surface and on the flat sheet
    inner_x, inner_y = np.meshgrid(
        np.linspace(initial_xlim[0], initial_xlim[1], 100),
        np.linspace(initial_ylim[0], initial_ylim[1], 100),
    )
    inner_z = my_func(inner_x, inner_y)
    axes3d.plot_surface(inner_x, inner_y, inner_z, color='b', alpha=.8)
    axes3d.plot_surface(inner_x, inner_y, np.zeros_like(inner_x) + floor_height, color='b', alpha=.8)

    canvas.savefig(TEMPFILE_NAME, bbox_inches='tight')
    plt.close()

    frame_pixels = np.asarray(im.open(TEMPFILE_NAME))
    return im.fromarray(frame_pixels)

# Render FRAMES frames; frame i widens the view by zoom_out_speed * i beyond the
# initial window. The frame index has to be scaled by zoom_out_speed: passing the
# raw index would widen the view by a whole unit per frame.
images = [
    snap_zoom(zoom_out_speed * i, initial_xlim, initial_ylim)
    for i in range(FRAMES)
]

save_to_gif("003.gif", images)

## Expand Range

In [53]:
FRAMES = 70
expand_speed = .025  # amount each window edge moves outward per frame
initial_xlim = [-.1, .1]
initial_ylim = [-.1, .1]
# Growth after FRAMES steps and the resulting outermost limits (the "full view" drawn by snap_exp)
end_expand = FRAMES * expand_speed
final_xlim = [initial_xlim[0] - end_expand, initial_xlim[1] + end_expand]
final_ylim = [initial_ylim[0] - end_expand, initial_ylim[1] + end_expand]
my_func = wiggle_bowl

def snap_exp(expansion):
    """Render one frame of the expanding-window animation and return it as an image.

    The full surface (over the module-level ``final_xlim`` / ``final_ylim``)
    is drawn first; on top of it a blue window covering the module-level
    ``initial_xlim`` / ``initial_ylim`` widened by ``expansion`` on every
    side is drawn, both on the surface and on a flat sheet at z = -0.75.
    The figure is saved to ``TEMPFILE_NAME`` with ``transparent=True`` and
    read back.

    Parameters
    ----------
    expansion : amount subtracted from each lower limit and added to each
        upper limit of the initial window.
    """
    canvas = plt.figure(figsize=(14, 9), constrained_layout=True, facecolor='white')
    axes3d = plt.axes(projection='3d')
    axes3d.view_init(20, -20)

    floor_height = -.75

    # Full view
    full_x, full_y = np.meshgrid(
        np.linspace(final_xlim[0], final_xlim[1], 100),
        np.linspace(final_ylim[0], final_ylim[1], 100),
    )
    full_z = my_func(full_x, full_y)
    axes3d.plot_surface(full_x, full_y, full_z, color=SURFACE_COLOR, alpha=SURFACE_ALPHA)

    # Expanding window, drawn on the surface and on the flat sheet
    window_x, window_y = np.meshgrid(
        np.linspace(initial_xlim[0] - expansion, initial_xlim[1] + expansion, 100),
        np.linspace(initial_ylim[0] - expansion, initial_ylim[1] + expansion, 100),
    )
    window_z = my_func(window_x, window_y)
    axes3d.plot_surface(window_x, window_y, window_z, color='b', alpha=.7)
    axes3d.plot_surface(window_x, window_y, np.zeros_like(window_x) + floor_height, color='b', alpha=.7)

    canvas.savefig(TEMPFILE_NAME, bbox_inches='tight', transparent=True)
    plt.close()

    frame_pixels = np.asarray(im.open(TEMPFILE_NAME))
    return im.fromarray(frame_pixels)

# Render one frame per step; frame i expands the window by expand_speed * i
images = []

for i in range(FRAMES):
    images.append(snap_exp(expand_speed * i))

save_to_gif("004.gif", images)

## Random Window

In [141]:
FRAMES = 10  # NOTE(review): not referenced by this cell's code (its frame loop runs over slide_xs)
slide_speed = .025  # NOTE(review): likewise not referenced by this cell's code
final_xlim = [-1.85, 1.85]
final_ylim = [-1.85, 1.85]
my_func = wiggle_bowl

# Offsets added to the base window, one (x, y) pair per window position
slide_xs = [1.3, -1, .3, 1.1, -.7, -1.2, .5, -.85, 1, .9,  0, -0.4]
slide_ys = [1.3, 1, 1.3, -.7, -1.2, .5, -.85, 1, .9, 1,  0, 0.4]

def snap_rand(from_x, from_y, slide_xs, slide_ys):
    """Render one frame showing every window position given so far.

    Axis limits are fixed from the module-level ``final_xlim`` /
    ``final_ylim`` and from the z range of ``my_func`` over that domain.
    For each offset pair the base window ``from_x`` x ``from_y`` is shifted
    and its patch of surface drawn in the surface colour; the last patch is
    additionally drawn in blue, on the surface and on a flat sheet at the
    minimum z. With no offsets only the empty axes are rendered. The figure
    is saved to ``TEMPFILE_NAME`` and read back as an image.

    Parameters
    ----------
    from_x, from_y : two-element ``[low, high]`` base window.
    slide_xs, slide_ys : offsets; ``slide_ys`` is indexed in step with
        ``slide_xs``.
    """
    canvas = plt.figure(figsize=(14, 9), constrained_layout=True, facecolor=None)
    axes3d = plt.axes(projection='3d')
    axes3d.view_init(20, -20)

    # Full view: evaluated only to obtain the z range
    full_x, full_y = np.meshgrid(
        np.linspace(final_xlim[0], final_xlim[1], 100),
        np.linspace(final_ylim[0], final_ylim[1], 100),
    )
    full_z = my_func(full_x, full_y)
    min_z = np.min(full_z)
    max_z = np.max(full_z)

    axes3d.set_xlim([final_xlim[0], final_xlim[1]])
    axes3d.set_ylim([final_ylim[0], final_ylim[1]])
    axes3d.set_zlim([min_z, max_z])

    # One patch per window position; the names keep the most recent patch
    window_x = window_y = window_z = None
    for position, shift_x in enumerate(slide_xs):
        shift_y = slide_ys[position]
        window_x, window_y = np.meshgrid(
            np.linspace(from_x[0] + shift_x, from_x[1] + shift_x, 100),
            np.linspace(from_y[0] + shift_y, from_y[1] + shift_y, 100),
        )
        window_z = my_func(window_x, window_y)
        axes3d.plot_surface(window_x, window_y, window_z, color=SURFACE_COLOR, alpha=SURFACE_ALPHA)

    # Highlight the most recent window
    if len(slide_xs) > 0:
        axes3d.plot_surface(window_x, window_y, window_z, color='b', alpha=.9)
        axes3d.plot_surface(window_x, window_y, np.zeros_like(window_x) + min_z, color='b', alpha=.9)

    canvas.savefig(TEMPFILE_NAME, bbox_inches='tight')
    plt.close()

    frame_pixels = np.asarray(im.open(TEMPFILE_NAME))
    return im.fromarray(frame_pixels)

# Frame i shows the first i window positions, starting with an empty frame (i = 0)
images = []

for i in range(0, len(slide_xs)+1):
    images.append(snap_rand([-.5, .5], [-.5, .5], slide_xs[:i], slide_ys[:i]))

save_to_gif("005.gif", images, 500)

## Window Slide

In [142]:
FRAMES = 160
slide_speed = .0125  # read by snap(): the window shifts by slide * slide_speed
initial_xlim = [-.1, .1]
initial_ylim = [-.1, .1]
# Axis limits used by snap(): the initial window widened by FRAMES * slide_speed on each side
final_xlim = [initial_xlim[0] - FRAMES * slide_speed, initial_xlim[1] + FRAMES * slide_speed]
final_ylim = [initial_ylim[0] - FRAMES * slide_speed, initial_ylim[1] + FRAMES * slide_speed]
my_func = wiggle_bowl

def snap(slide, from_x, from_y):
    """Render one frame of a window sliding diagonally and return it as an image.

    The window ``from_x`` x ``from_y`` is shifted by ``slide * slide_speed``
    (``slide_speed`` is a module-level name) in both x and y. The region
    from the window's starting corner to its current far corner is drawn in
    the surface colour, restricted to a diagonal band around y = x; the
    current window is drawn in blue on the surface and on a flat sheet at
    the minimum z. Axis limits come from the module-level ``final_xlim`` /
    ``final_ylim`` and the z range of ``my_func`` over that domain. The
    figure is saved to ``TEMPFILE_NAME`` and read back.

    Parameters
    ----------
    slide : multiplier applied to ``slide_speed`` to get the shift.
    from_x, from_y : two-element ``[low, high]`` starting window.
    """
    canvas = plt.figure(figsize=(14, 9), constrained_layout=True, facecolor=None)
    axes3d = plt.axes(projection='3d')
    axes3d.view_init(20, -20)

    # Full view: evaluated only to obtain the z range (the surface is not drawn)
    full_x, full_y = np.meshgrid(
        np.linspace(final_xlim[0], final_xlim[1], 100),
        np.linspace(final_ylim[0], final_ylim[1], 100),
    )
    full_z = my_func(full_x, full_y)
    min_z = np.min(full_z)
    max_z = np.max(full_z)

    axes3d.set_xlim([final_xlim[0], final_xlim[1]])
    axes3d.set_ylim([final_ylim[0], final_ylim[1]])
    axes3d.set_zlim([min_z, max_z])

    offset = slide * slide_speed

    # Region swept so far: from the starting corner to the current far corner
    swept_x, swept_y = np.meshgrid(
        np.linspace(from_x[0], from_x[1] + offset, 1000),
        np.linspace(from_y[0], from_y[1] + offset, 1000),
    )

    # Hide everything outside a band around the diagonal y = x
    width = 2 ** (1/2)
    diagonal_gap = swept_y - swept_x
    outside_band = (diagonal_gap < -width / 2 - .27) | (diagonal_gap > width / 2 + .27)
    band_x = np.ma.masked_where(outside_band, swept_x)
    band_y = np.ma.masked_where(outside_band, swept_y)
    band_z = my_func(band_x, band_y)

    axes3d.plot_surface(band_x, band_y, band_z, color=SURFACE_COLOR, alpha=SURFACE_ALPHA)

    # Current window, drawn on the surface and on the flat sheet
    window_x, window_y = np.meshgrid(
        np.linspace(from_x[0] + offset, from_x[1] + offset, 100),
        np.linspace(from_y[0] + offset, from_y[1] + offset, 100),
    )
    window_z = my_func(window_x, window_y)
    axes3d.plot_surface(window_x, window_y, window_z, color='b', alpha=.9)
    axes3d.plot_surface(window_x, window_y, np.zeros_like(window_x) + min_z, color='b', alpha=.9)

    canvas.savefig(TEMPFILE_NAME, bbox_inches='tight')
    plt.close()

    frame_pixels = np.asarray(im.open(TEMPFILE_NAME))
    return im.fromarray(frame_pixels)

# One frame per step; snap() scales the frame index by slide_speed itself
images = []

for i in range(FRAMES):
    images.append(snap(i, [-1.5, -.5], [-1.5, -.5]))

save_to_gif("006.gif", images)

## Slide Strategy

In [139]:
# Rebinds the module-level names read by snap(). final_xlim / final_ylim are NOT set
# here, so snap() uses whatever an earlier cell last assigned to them.
FRAMES = 35
slide_speed = .05
my_func = bowl

images = []

# One frame per step; snap() scales the frame index by slide_speed itself
for i in range(FRAMES):
    images.append(snap(i, [-1.8, -1], [-1.8, -1]))

save_to_gif("007.gif", images, 250)

## Reduced-Size Window Slide

In [57]:
# Rebinds the module-level names read by snap(): step size, axis limits and surface
FRAMES = 35
slide_speed = .05
final_xlim = [-2, 2]
final_ylim = [-2, 2]
my_func = bowl

# snap() multiplies its first argument by the module-level slide_speed itself, so it
# is given the raw frame index; pre-multiplying here would apply slide_speed twice
# and the window would barely move across the 35 frames.
images = [
    snap(i, [-1.8, -1.6], [-1.8, -1.6])
    for i in range(FRAMES)
]

save_to_gif("008.gif", images, 250)

## Gradient Small Nudge

In [58]:
# Starting point and step size for the descent
init = np.array([-4, -4])
learning_rate = .1
# Grid over [-5, 5) x [-5, 5) in steps of 0.1 on which the surface is drawn
b0 = np.arange(-5, 5, 0.1)
b1 = np.arange(-5, 5, 0.1)
x, y = np.meshgrid(b0, b1)
# Module-level names read by snap / gradient_descent. After this line `gradient` no
# longer refers to the two-argument gradient(x, y) defined in the 3D Tangent cell.
loss, gradient = bowl, bowl_gradient

def snap(x, y, pts, losses, grad):
    """Render one gradient-descent frame and return it as an image.

    Draws the module-level ``loss`` surface over the grid ``x``, ``y``, the
    path of visited points at their loss values, a fainter marker for the
    latest point at z = -1, and a green arrow at z = -1 pointing along
    ``-grad``. The figure is saved to ``TEMPFILE_NAME`` with
    ``transparent=True`` and read back.

    Parameters
    ----------
    x, y : grid arrays passed to ``loss`` and ``plot_surface``.
    pts : sequence of visited points; each is indexed with ``[0]`` and ``[1]``.
    losses : z values plotted for ``pts``.
    grad : indexable whose first two entries give the arrow direction (negated).
    """
    # Creating figure
    canvas = plt.figure(figsize=(14, 9), constrained_layout=True, facecolor=None)
    axes3d = plt.axes(projection='3d')
    axes3d.view_init(20, -20)

    # Fully transparent axis panes
    clear_pane = (1.0, 1.0, 1.0, 0.0)
    for axis in (axes3d.xaxis, axes3d.yaxis, axes3d.zaxis):
        axis.set_pane_color(clear_pane)

    # Creating plot
    axes3d.plot_surface(x, y, loss(x, y), color=SURFACE_COLOR, alpha=SURFACE_ALPHA)
    path = np.array(pts)
    axes3d.plot(path[:, 0], path[:, 1], losses, 'o-', c=POINT_COLOR, markersize=POINT_SIZE, zorder=10)
    axes3d.plot(path[-1, 0], path[-1, 1], -1, 'o', c=POINT_COLOR, alpha=.5, markersize=7, zorder=10)

    # Arrow from the latest point (at z = -1) along the negative gradient
    latest = pts[-1]
    origin_x, origin_y, origin_z = [latest[0]], [latest[1]], [-1]
    arrow_u, arrow_v, arrow_w = [-grad[0]], [-grad[1]], [0]
    axes3d.quiver(origin_x, origin_y, origin_z, arrow_u, arrow_v, arrow_w, color='g')

    canvas.savefig(TEMPFILE_NAME, bbox_inches='tight', transparent=True)
    plt.close()

    frame_pixels = np.asarray(im.open(TEMPFILE_NAME))
    return im.fromarray(frame_pixels)

def gradient_descent(x, y, init, learning_rate, epochs):
    """Run plain gradient descent and return one rendered frame per step.

    Uses the module-level ``loss`` and ``gradient`` callables and renders
    each frame with ``snap``. A frame is captured *before* each update, so
    the returned list has ``epochs`` frames and the final position is not
    rendered.

    Parameters
    ----------
    x, y : grid arrays forwarded to ``snap`` for drawing the surface.
    init : starting point, indexable with ``[0]`` and ``[1]``; it is not
        modified (each update creates a new value).
    learning_rate : factor applied to the gradient in each update.
    epochs : number of steps (and frames).

    Returns
    -------
    list of the images returned by ``snap``.
    """
    point = init
    images, losses, pts = [], [loss(point[0], point[1])], [point]

    for _ in range(epochs):
        # Evaluate the gradient once per step; it feeds both the frame and the update
        grad = gradient(point)
        images.append(snap(x, y, pts, losses, grad))
        point = point - learning_rate * grad
        losses.append(loss(point[0], point[1]))
        pts.append(point)
    return images

# Seven descent steps, saved with duration=250
images = gradient_descent(x, y, init, learning_rate, 7)

save_to_gif("009.gif", images, 250)

## Input Space

In [59]:
# Same setup as a single descent frame at the starting point, drawn inline (not via snap)
init = np.array([-4, -4])
learning_rate = .1
b0 = np.arange(-5, 5, 0.1)
b1 = np.arange(-5, 5, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = bowl, bowl_gradient

# Loss, position and gradient at the starting point only
losses, pts, grad = [loss(init[0], init[1])], [init], gradient(init)

# Creating figure
fig = plt.figure(figsize =(14, 9), constrained_layout=True, facecolor=None)
ax = plt.axes(projection ='3d')
ax.view_init(20, -20)

# Fully transparent axis panes
ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))

# Creating plot
ax.plot_surface(x, y, loss(x, y), color=SURFACE_COLOR, alpha=SURFACE_ALPHA)
ax.plot(np.array(pts)[:,0], np.array(pts)[:,1], losses, 'o-', c=POINT_COLOR, markersize=POINT_SIZE, zorder=10)
ax.plot(np.array(pts)[-1,0], np.array(pts)[-1,1], -1, 'o', c=POINT_COLOR, alpha=.5, markersize=7, zorder=10)
# Vector origin location
X = [pts[-1][0]]
Y = [pts[-1][1]]
Z = [-1]

# Directional vectors
U = [-grad[0]]
V = [-grad[1]]
W = [0]

ax.quiver(X, Y, Z, U, V, W, color='g')

# Translucent green sheet at z = -1 spanning the same grid as the surface
ax.plot_surface(x, y, np.zeros_like(loss(x, y)) -1, color='g', alpha=.3)

fig.savefig("010_2.png", bbox_inches='tight', transparent=True)
plt.close()

## Tiny Nudge

In [60]:
# Same bowl and starting point as before, with a much smaller learning rate and 100 steps
init = np.array([-4, -4])
learning_rate = .001
b0 = np.arange(-5, 5, 0.1)
b1 = np.arange(-5, 5, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = bowl, bowl_gradient

images = gradient_descent(x, y, init, learning_rate, 100)

# No duration given, so save_to_gif's default (100) applies
save_to_gif("011.gif", images)

## Gradient Big Nudge

In [61]:
# Descent on the wiggle_bowl surface from (1.2, 1.2), drawn over [-1.5, 1.5) in steps of 0.1
init = np.array([1.2, 1.2])
learning_rate = .01
b0 = np.arange(-1.5, 1.5, 0.1)
b1 = np.arange(-1.5, 1.5, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = wiggle_bowl, wiggle_bowl_gradient

# 14 steps, saved with duration=250
images = gradient_descent(x, y, init, learning_rate, 14)

save_to_gif("012.gif", images, 250)

## Initial Condition

In [62]:
# Starting points, one per simultaneous descent run
inits = [
    np.array([-1, -1]),
    np.array([-.85, -.85]),
]

# Learning rates, matched to `inits` by position
lrs = [
    .025,
    .1,
]

# Grid over [-1.5, 1.5) in steps of 0.1 on which the surface is drawn
b0 = np.arange(-1.5, 1.5, 0.1)
b1 = np.arange(-1.5, 1.5, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = wiggle_bowl, wiggle_bowl_gradient

def snap_init(x, y, pts, losses, grads):
    """Render one frame with several descent runs drawn together.

    Draws the module-level ``loss`` surface over the grid ``x``, ``y``; then,
    for every run index ``0 .. len(pts) - 1``, the run's path at its loss
    values, a fainter marker for its latest point at z = -1, and a green
    arrow at z = -1 along the negative of that run's gradient. The figure is
    saved to ``TEMPFILE_NAME`` with ``transparent=True`` and read back as an
    image.

    Parameters
    ----------
    x, y : grid arrays passed to ``loss`` and ``plot_surface``.
    pts : per-run point histories, indexable by run index.
    losses : per-run loss histories, indexable by run index.
    grads : per-run current gradients, indexable by run index.
    """
    # Creating figure
    canvas = plt.figure(figsize=(14, 9), constrained_layout=True, facecolor=None)
    axes3d = plt.axes(projection='3d')
    axes3d.view_init(20, -20)

    # Fully transparent axis panes
    clear_pane = (1.0, 1.0, 1.0, 0.0)
    for axis in (axes3d.xaxis, axes3d.yaxis, axes3d.zaxis):
        axis.set_pane_color(clear_pane)

    # Creating plot
    axes3d.plot_surface(x, y, loss(x, y), color=SURFACE_COLOR, alpha=SURFACE_ALPHA)

    for run in range(len(pts)):
        path = np.array(pts[run])
        axes3d.plot(path[:, 0], path[:, 1], losses[run], 'o-', c=POINT_COLOR, markersize=POINT_SIZE, zorder=10)
        axes3d.plot(path[-1, 0], path[-1, 1], -1, 'o', c=POINT_COLOR, alpha=.5, markersize=7, zorder=10)

        # Arrow from the run's latest point (at z = -1) along its negative gradient
        latest = pts[run][-1]
        origin_x, origin_y, origin_z = [latest[0]], [latest[1]], [-1]
        arrow_u, arrow_v, arrow_w = [-grads[run][0]], [-grads[run][1]], [0]
        axes3d.quiver(origin_x, origin_y, origin_z, arrow_u, arrow_v, arrow_w, color='g')

    canvas.savefig(TEMPFILE_NAME, bbox_inches='tight', transparent=True)
    plt.close()

    frame_pixels = np.asarray(im.open(TEMPFILE_NAME))
    return im.fromarray(frame_pixels)

def gradient_descent_init(x, y, inits, lrs, epochs):
    """Run several gradient descents side by side and return one frame per step.

    Uses the module-level ``loss`` and ``gradient`` callables and renders
    each frame with ``snap_init``. A frame is captured *before* each round
    of updates, so the returned list has ``epochs`` frames.

    Parameters
    ----------
    x, y : grid arrays forwarded to ``snap_init`` for drawing the surface.
    inits : sequence of starting points, one per run. It is left untouched:
        the positions are tracked in a private copy, so calling the function
        again with the same list starts from the same points.
    lrs : learning rates, matched to ``inits`` by position.
    epochs : number of update rounds (and frames).

    Returns
    -------
    list of the images returned by ``snap_init``.
    """
    # Private copy of the current positions so the caller's list is not overwritten
    positions = list(inits)

    images = []
    losses = {}
    grads = []
    pts = {}
    for i, start in enumerate(positions):
        pts[i] = [start]
        losses[i] = [loss(start[0], start[1])]
        grads.append(gradient(start))

    for _ in range(epochs):
        images.append(snap_init(x, y, pts, losses, grads))
        for i in range(len(positions)):
            # Creates a new value each step, so the starting point objects are not modified
            positions[i] = positions[i] - lrs[i] * grads[i]
            pts[i].append(positions[i])
            grads[i] = gradient(positions[i])
            losses[i].append(loss(positions[i][0], positions[i][1]))
    return images

# Nine update rounds for all runs, saved with duration=250
images = gradient_descent_init(x, y, inits, lrs, 9)

save_to_gif("013.gif", images, 250)

## Wrong Way

In [63]:
# Bowl descent with learning rate 1.15. With gradient 2 * xy, each update multiplies
# the point by (1 - 2 * 1.15) = -1.3, so it flips sign and moves further out every step.
init = np.array([-.2, -.5])
learning_rate = 1.15
b0 = np.arange(-5, 5, 0.1)
b1 = np.arange(-5, 5, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = bowl, bowl_gradient

images = gradient_descent(x, y, init, learning_rate, 9)

save_to_gif("014.gif", images, 250)

## Overshoot

In [64]:
# wiggle_bowl descent from (-1.5, -1.5) with learning rate 0.3, drawn over [-2, 2) in steps of 0.1
init = np.array([-1.5, -1.5])
learning_rate = .3
b0 = np.arange(-2, 2, 0.1)
b1 = np.arange(-2, 2, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = wiggle_bowl, wiggle_bowl_gradient

# Nine steps, saved with duration=250
images = gradient_descent(x, y, init, learning_rate, 9)

save_to_gif("015.gif", images, 250)

## Good Overshoot

In [65]:
# super_wiggle descent from (-1, -1); the grid uses a finer step (0.01) than the other cells
init = np.array([-1, -1])
learning_rate = 0.02
b0 = np.arange(-2, 2, 0.01)
b1 = np.arange(-2, 2, 0.01)
x, y = np.meshgrid(b0, b1)
loss, gradient = super_wiggle, super_wiggle_gradient

# 15 steps, saved with duration=250
images = gradient_descent(x, y, init, learning_rate, 15)

save_to_gif("016.gif", images, 250)

## Loop

In [66]:
# wiggle_bowl descent from (-.5, -.5) with learning rate 1.394, drawn over [-1.5, 1.5) in steps of 0.1
init = np.array([-.5, -.5])
learning_rate = 1.394
b0 = np.arange(-1.5, 1.5, 0.1)
b1 = np.arange(-1.5, 1.5, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = wiggle_bowl, wiggle_bowl_gradient

# Nine steps, saved with duration=250
images = gradient_descent(x, y, init, learning_rate, 9)

# Saved as 017.gif: the "Search for Parameters" cell writes 018.gif, so using that
# name here would let the later cell overwrite this animation.
save_to_gif("017.gif", images, 250)

## Search for Parameters

In [67]:
# Starting points, one per simultaneous descent run
inits = [
    np.array([1.2, 1.2]),
    np.array([-.85, -.85]),
    np.array([-.9, -.9]),
    np.array([-.3, .3]),
    np.array([.3, -.85]),
    np.array([-.85, .85])
]

# Learning rates, matched to `inits` by position
lrs = [
    .01,
    .02,
    .05,
    .05,
    .05,
    .1,
]

# Grid over [-1.5, 1.5) in steps of 0.1 on which the surface is drawn
b0 = np.arange(-1.5, 1.5, 0.1)
b1 = np.arange(-1.5, 1.5, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = wiggle_bowl, wiggle_bowl_gradient

# Nine update rounds for all runs, saved with duration=250
images = gradient_descent_init(x, y, inits, lrs, 9)

save_to_gif("018.gif", images, 250)

## Sandbox

In [68]:
# Experiment: divide the gradient by its magnitude, so every step has length learning_rate
init = np.array([1.5, -1.5])
learning_rate = .5
# Grid over [-2, 2) in steps of 0.1 on which the surface is drawn
b0 = np.arange(-2, 2, 0.1)
b1 = np.arange(-2, 2, 0.1)
x, y = np.meshgrid(b0, b1)
loss, gradient = super_wiggle, super_wiggle_gradient

def gradient_descent_norm(x, y, init, learning_rate, epochs):
    """Run gradient descent with unit-length steps and return one frame per step.

    Each update moves ``learning_rate`` along the gradient divided by its
    Euclidean norm. Uses the module-level ``loss`` and ``gradient``
    callables and renders each frame with ``snap``. A frame is captured
    *before* each update, so the returned list has ``epochs`` frames.

    Parameters
    ----------
    x, y : grid arrays forwarded to ``snap`` for drawing the surface.
    init : starting point, indexable with ``[0]`` and ``[1]``; it is not
        modified (each update creates a new value).
    learning_rate : length of every step.
    epochs : number of steps (and frames).

    Returns
    -------
    list of the images returned by ``snap``.
    """
    point = init
    images, losses, pts = [], [loss(point[0], point[1])], [point]

    for _ in range(epochs):
        # Evaluate the gradient and its norm once per step instead of four times
        grad = gradient(point)
        magnitude = np.linalg.norm(grad)
        # A zero gradient has no direction: keep it as a zero step rather than
        # dividing by zero and filling the trajectory with NaN.
        direction = grad if magnitude == 0 else grad / magnitude
        images.append(snap(x, y, pts, losses, direction))
        point = point - learning_rate * direction
        losses.append(loss(point[0], point[1]))
        pts.append(point)
    return images

# 30 normalised steps; no duration given, so save_to_gif's default (100) applies
images = gradient_descent_norm(x, y, init, learning_rate, 30)

save_to_gif("sandbox.gif", images)