# Lecture 11 Supplementary Notebook

## DSC 40A, Spring 2024

The following cell sets up the necessary imports – don't worry too much about it.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns

# Render inline matplotlib figures as SVG.
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats("svg")

# Make pandas' .plot() produce plotly figures.
pd.options.plotting.backend = "plotly"

# DSC 40A preferred styles: 30px margins, gridlines on both axes, centered titles.
pio.templates["dsc40a"] = go.layout.Template(
    layout=dict(
        margin=dict(l=30, r=30, t=30, b=30),
        autosize=True,
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        title=dict(x=0.5, xanchor="center"),
    )
)
# Combine plotly's built-in "simple_white" template with the course template above.
pio.templates.default = "simple_white+dsc40a"

from IPython.display import HTML
from ipywidgets import interact, widgets, FloatSlider
# Use plotly's 'png' renderer for every figure shown in this notebook.
pio.renderers.default = 'png'

## Gradient descent

See the Lecture 10 notebook or [dsc40a.com/resources/lectures/lec10](https://dsc40a.com/resources/lectures/lec10) for animations.

$$f(t) = 5t^4 - t^3 - 5t^2 + 2t - 9$$

In [None]:
def f(t):
    """Evaluate the quartic f(t) = 5t^4 - t^3 - 5t^2 + 2t - 9."""
    quartic_term = 5 * t**4
    cubic_term = t**3
    quadratic_term = 5 * t**2
    linear_term = 2 * t
    return quartic_term - cubic_term - quadratic_term + linear_term - 9

def df(t):
    """Evaluate the cubic 20t^3 - 3t^2 - 10t + 2, the derivative of f."""
    cubic_term = 20 * t**3
    quadratic_term = 3 * t**2
    linear_term = 10 * t
    return cubic_term - quadratic_term - linear_term + 2

def create_tangent_line(t):
    """Build the tangent line to f at the point t.

    Returns a one-argument function of x that evaluates the line passing
    through (t, f(t)) with slope df(t).
    """
    m = df(t)
    # Choose the intercept so the line passes through (t, f(t)).
    b = f(t) - m * t

    def tangent(x):
        return b + m * x

    return tangent

### Gradient descent update rule

Let's start with an initial guess $t_0 = 0$ and a learning rate $\alpha = 0.01$.

$$t_{i + 1} = t_i - \alpha \frac{df}{dt}(t_i)$$

In [None]:
def minimize_f(df, t0, alpha, verbose=True, objective=None, max_iter=10000):
    """Run one-dimensional gradient descent and print the final value of t.

    Starting from t0, repeatedly applies the update t <- t - alpha * df(t)
    until the derivative at t is less than 0.0001 in magnitude, or until
    max_iter updates have been made.

    Parameters
    ----------
    df : callable
        Derivative of the function being minimized; called as df(t).
    t0 : number
        Initial guess.
    alpha : number
        Learning rate (step size).
    verbose : bool, default True
        If True, print t and the function value at t (each rounded to
        4 decimal places) before every update.
    objective : callable, optional
        Function whose value is printed next to t when verbose is True.
        If omitted, the notebook-level function f is used.
    max_iter : int, default 10000
        Maximum number of updates. Guards against learning rates for which
        the iterates never satisfy the stopping criterion.

    Returns
    -------
    None
        The result is reported by printing 'Best t: <value>'.
    """
    t = t0
    updates = 0
    # Evaluate the derivative once per iterate and reuse it for both the
    # stopping test and the update.
    slope = df(t)

    # Iterate until the derivative at t is less than 0.0001 in magnitude.
    while np.abs(slope) >= 0.0001:
        if updates >= max_iter:
            print('Stopped after', max_iter, 'updates without converging.')
            break
        if verbose:
            if objective is None:
                # Fall back to the notebook-level f; it is only looked up
                # when a value actually has to be printed.
                objective = f
            print(round(t, 4), round(objective(t), 4))
        t = t - alpha * slope
        slope = df(t)
        updates += 1
    print('Best t:', t)

In [None]:
# Gradient descent on f (through its derivative df), starting at t0 = 0 with learning rate 0.01.
minimize_f(df, t0=0, alpha=0.01)

What if we start with a different initial guess?

In [None]:
# Same learning rate (0.01), but starting from t0 = 1.1 instead of 0.
minimize_f(df, t0=1.1, alpha=0.01)

What if we use a different learning rate?

In [None]:
# The derivative is a required first argument of minimize_f; omitting it raises a TypeError.
# NOTE(review): with a step this large the iterates may overshoot the minimum and keep
# bouncing instead of meeting the stopping criterion, so this cell can run for a long time.
minimize_f(df, t0=0, alpha=0.1)

Some learning rates are so large that the values of $t$ explode towards infinity! Watch what happens when we use a learning rate of 1:

In [None]:
# Learning rate of 1, chosen to demonstrate the values of t exploding.
minimize_f(df, t0=0, alpha=1)

## Convexity

In [None]:
def convexity_visual(a, b, t):
    """Plot a cubic together with the chord between two of its points.

    Draws x^3 - 3x^2 + 4x - 1 on [-20, 20], the line segment joining
    (a, f(a)) and (b, f(b)), and two markers at the horizontal position
    (1-t) * a + t * b: one on the curve and one on the segment.

    Parameters
    ----------
    a, b : number
        Horizontal positions of the segment's endpoints.
    t : number
        Mixing weight; also shown in the figure title.

    Returns
    -------
    The plotly figure.
    """
    def f(x):
        return x**3 - 3*x**2 + 4*x - 1

    grid = np.linspace(-20, 20, 1000)
    # Horizontal position shared by both markers.
    mix = (1-t) * a + t * b

    # The curve itself.
    fig = px.line(x=grid, y=f(grid))
    fig.update_traces(line=dict(width=8))
    fig.update_layout(xaxis_title='$t$',
                      yaxis_title='$f(t)$',
                      width=800, height=600)

    # Segment between the two chosen points on the curve.
    fig.add_trace(go.Scatter(x=[a, b], y=[f(a), f(b)]))
    fig.update_traces(line=dict(width=8))

    # Marker on the curve.
    fig.add_trace(go.Scatter(x=[mix], y=[f(mix)], mode='markers'))
    fig.update_traces(marker=dict(size=25))

    # Marker on the segment.
    fig.add_trace(go.Scatter(x=[mix], y=[(1-t) * f(a) + t * f(b)], mode='markers'))
    fig.update_traces(marker=dict(size=25))

    fig.update_layout(showlegend=False, title=f't={t}')
    return fig

In [None]:
# Interactive controls: a ranges over [-20, 5] and b over [5, 20] in steps of 0.1;
# t ranges over [0, 1] in steps of 0.01 and starts at 0.5.
interact(convexity_visual, a=(-20, 5, 0.1), b=(5, 20, 0.1), t=FloatSlider(min=0, max=1, step=0.01, value=0.5))

## More examples

### Example: Huber loss

For the constant model, $H(x) = h$, the Huber loss for a single data point $y_i$ is:

$$L_\text{huber}(y_i, h) = \begin{cases} \frac{1}{2} (y_i - h)^2 & \text{if } | y_i - h| \leq \delta \\ \delta \cdot (| y_i - h | - \frac{1}{2} \delta) & \text{otherwise} \end{cases}$$

In [None]:
# Dataset with a single observation; R reads this notebook-level y.
y = np.array([72])

In [None]:
def R(h, delta):
    """Average Huber loss of the constant prediction h over the notebook-level y.

    Each observation contributes half its squared residual when the residual's
    magnitude is at most delta, and delta * (|residual| - delta / 2) otherwise.
    """
    total = 0
    for yi in y:
        residual = yi - h
        distance = np.abs(residual)
        if distance <= delta:
            # Quadratic piece.
            total += (1 / 2) * residual ** 2
        else:
            # Linear piece.
            total += delta * (distance - (1 / 2) * delta)
    return total / len(y)

In [None]:
# 10,000 evenly spaced candidate values of h, extending 10 units beyond the data on each side.
h = np.linspace(np.min(y) - 10, np.max(y) + 10, 10000)
# Value of R at each candidate h, with delta fixed at 1.
Rh = [R(hi, delta=1) for hi in h]

For a single point:

In [None]:
# Label the axes so the figure can be read on its own (the defaults are just "x" and "y").
(
    px.line(x=h, y=Rh)
    .update_traces(line_color='purple', line_width=4)
    .update_layout(xaxis_title='$h$', yaxis_title='$R(h)$')
)

In [None]:
def dR(h, delta):
    """Derivative with respect to h of the average Huber loss over the notebook-level y.

    Each observation contributes -(y_i - h) when |y_i - h| is at most delta,
    and -delta * sign(y_i - h) otherwise.
    """
    total = 0
    for yi in y:
        residual = yi - h
        if np.abs(residual) <= delta:
            # Quadratic piece.
            total += -residual
        else:
            # Linear piece: constant slope whose sign depends on which side of h the point lies.
            direction = 1 if residual > 0 else -1
            total += -delta * direction
    return total / len(y)

For a larger dataset:

In [None]:
# Five observations; rebinding the notebook-level y changes the data that R and dR read.
y = np.array([72, 90, 61, 85, 92])

In [None]:
# 10,000 evenly spaced candidate values of h, from 10 below the smallest observation
# to 15 above the largest.
h = np.linspace(np.min(y) - 10, np.max(y) + 15, 10000)
# Value of R at each candidate h, with delta fixed at 1.
Rh = [R(hi, delta=1) for hi in h]

In [None]:
# Label the axes so the figure can be read on its own (the defaults are just "x" and "y").
(
    px.line(x=h, y=Rh)
    .update_traces(line_color='purple', line_width=4)
    .update_layout(xaxis_title='$h$', yaxis_title='$R(h)$')
)

$$L_\text{huber}(y_i, h) = \begin{cases} \frac{1}{2} (y_i - h)^2 & \text{if } | y_i - h| \leq \delta \\ \delta \cdot (| y_i - h | - \frac{1}{2} \delta) & \text{otherwise} \end{cases}$$

$$\implies \frac{\partial L}{\partial h}(h) = \begin{cases} -(y_i - h)  & \text{if } | y_i - h| \leq \delta \\ -\delta \cdot \text{sign}(y_i - h) & \text{otherwise} \end{cases}$$

$$\implies \frac{dR_\text{huber}}{dh}(h) = \frac{1}{n} \sum_{i = 1}^n \begin{cases} -(y_i - h)  & \text{if } | y_i - h| \leq \delta \\ -\delta \cdot \text{sign}(y_i - h) & \text{otherwise} \end{cases}$$


Now, let's use gradient descent to minimize $R_\text{huber}(h)$. Let's use an initial guess of $h_0 = 25$ and a learning rate of $\alpha = 10$:

In [None]:
# dR requires delta, but minimize_f calls the derivative it is given with a single
# argument, so bind delta here. delta=1 matches the value used to plot R above.
minimize_f(lambda h: dR(h, delta=1), t0=25, alpha=10)