# Setup

In [1]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import random
import matplotlib.colors as mcolors
from ipywidgets import interact, widgets, interactive, fixed, interact_manual
import random
import matplotlib.colors as mcolors
import time
import colorama
import pickle

# Load the dataset (x, y) and the constants used by the cells below from a
# pickle file located next to the notebook.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell on a file that comes from a trusted source.
with open("Teach_linear_regression.pkl", "rb") as f:
    (
        x, y, COEF_A, COEF_B,
        COEF_A_MIN, COEF_A_MAX,
        COEF_B_MIN, COEF_B_MAX,
        WORST_A_PARAM, WORST_B_PARAM,
        MIN_COST, MAX_COST,
        MIN_VALUE, MAX_VALUE, NB_ELEMENTS,
    ) = pickle.load(f)

In [2]:
def plot_function_with_derivative(f, x_point=1, h=0.5):
    """Plot ``f`` and visualise its central-difference slope around ``x_point``.

    Draws, on a new pyplot figure, the curve ``f(x)`` for x in [-1, 3], the point
    ``(x_point, f(x_point))``, the two neighbours at ``x_point - h`` and
    ``x_point + h``, the secant through them, and the ``dx`` / ``dy`` legs of the
    slope triangle, together with text showing ``dy``, ``dx`` and ``dy / dx``.

    Args:
        f: Callable applied both to a NumPy array (for the curve) and to scalars
            (for the three highlighted points).
        x_point: x-coordinate at which the slope is illustrated.
        h: Half-width of the interval; must be non-zero because ``dx = 2 * h`` is
            used as a divisor.

    Returns:
        None. The figure is left as the current pyplot figure; ``plt.show()`` is
        not called here.
    """
    # Fixed viewing window shared by the sampled curve and the axis limits.
    x_min, x_max = -1, 3
    y_min, y_max = -2, 4
    # The dotted x-h / x+h guides start well below the window so they always
    # reach the bottom edge of the plot.
    guide_bottom = -10

    curve_x = np.linspace(x_min, x_max, 1000)
    curve_y = f(curve_x)

    plt.figure(figsize=(12, 6))
    plt.plot(curve_x, curve_y, "b-", label="f(x)")

    # The three sample points used by the central difference.
    x_plus_h = x_point + h
    x_minus_h = x_point - h
    y_point = f(x_point)
    y_plus_h = f(x_plus_h)
    y_minus_h = f(x_minus_h)

    plt.plot(
        x_point, y_point, "X", color="red", label=f"f({x_point:.2f})", markersize=10
    )
    plt.plot(x_plus_h, y_plus_h, "+", color="gray", alpha=0.5, markersize=8)
    plt.plot(x_minus_h, y_minus_h, "+", color="gray", alpha=0.5, markersize=8)

    # Secant through (x-h, f(x-h)) and (x+h, f(x+h)); its slope is dy / dx.
    plt.plot(
        [x_minus_h, x_plus_h], [y_minus_h, y_plus_h], "-", color="gray", linewidth=2
    )

    dy = y_plus_h - y_minus_h
    dx = x_plus_h - x_minus_h
    derivative = dy / dx

    # Numeric read-out in the top-left corner (mathtext, colours match the legs).
    plt.text(-0.85, 3.7, r"$\mathbf{dy} = " + f"{dy:.2f}$", color="orange", fontsize=12)
    plt.text(
        -0.85,
        3.3,
        # Single "=" after dx (the previous string rendered "dx = = 2 x h").
        r"$\mathbf{dx} = 2 \times h = " + f"{dx:.2f}$",
        color="purple",
        fontsize=12,
    )
    plt.text(
        -0.85,
        2.6,
        r"$\mathbf{"
        + r"\frac{dy}{dx} = "
        + r"\frac{"
        + f"{dy:.2f}"
        + "}{"
        + f"{dx:.2f}"
        + "}} = "
        + f"{derivative:.2f}$",
        color="gray",
        fontsize=14,
    )

    # Horizontal leg of the slope triangle, drawn at height f(x+h).
    plt.text(
        x_minus_h,
        y_plus_h + 0.1,
        r"$\mathbf{dx} = 2 h$",
        color="purple",
        fontsize=12,
    )
    plt.plot(
        [x_minus_h, x_plus_h],
        [y_plus_h, y_plus_h],
        "-",
        color="purple",
        linewidth=2,
        label="$dx = 2 h$",
    )
    # Vertical leg at x-h. dy is the plain difference of the two function
    # values; the ratio dy / dx (not dy itself) is the derivative estimate.
    plt.plot(
        [x_minus_h, x_minus_h],
        [y_minus_h, y_plus_h],
        "-",
        color="orange",
        linewidth=2,
        label=r"$dy = f(x+h) - f(x-h)$",
    )

    # Dotted guides dropping from each neighbour down to the x axis.
    plt.plot(
        [x_minus_h, x_minus_h],
        [guide_bottom, y_minus_h],
        ":",
        color="gray",
        linewidth=2,
        label="x-h",
    )
    plt.plot(
        [x_plus_h, x_plus_h],
        [guide_bottom, y_plus_h],
        ":",
        color="gray",
        linewidth=2,
        label="x+h",
    )
    plt.text(x_plus_h + 0.05, y_min + 0.5, r"x+h", color="gray", fontsize=12)
    plt.text(x_minus_h - 0.15, y_min + 0.5, r"x-h", color="gray", fontsize=12)

    plt.grid(True)
    plt.legend()
    plt.title(f"Function with Derivative Approximation (h={h:.2f})")
    plt.xlabel("x")
    plt.ylabel("f(x)")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)


def chaotic_function(x):
    """Return a deliberately wiggly test function evaluated at ``x``.

    The value is the sum of three terms: a fast oscillation
    ``sin(5 x^2) * cos(x / 2)``, a sine modulated by a Gaussian centred on 3,
    and a slow ``0.5 * tanh(x / 2)`` drift. Works element-wise on NumPy arrays
    as well as on scalars.
    """
    oscillation = np.sin(5 * x**2) * np.cos(x / 2)
    modulated_sine = np.exp(-((x - 3) ** 2) / 10) * np.sin(5 * x)
    drift = 0.5 * np.tanh(x / 2)
    return oscillation + modulated_sine + drift

# Teach

## Goal

Determine how to update theta so that we reduce the loss


This is : $ \frac{d \text{loss}}{d a} $ & $\frac{d \text{loss}}{d b} $


But what is a derivative?

In [3]:

@interact(
    x_point=widgets.FloatSlider(
        description="X:",
        min=0.0,
        max=2.0,
        step=0.01,
        value=1,
    ),
    h=widgets.FloatSlider(
        description="h:",
        min=0.01,
        max=0.5,
        step=0.01,
        value=0.15,
    ),
)
def interactive_derivative(x_point, h):
    """Redraw the derivative illustration of ``chaotic_function`` for the slider values.

    ``x_point`` is the location of the highlighted point and ``h`` the
    half-width of the central-difference interval; both come from the sliders
    declared in the ``@interact`` decorator.
    """
    plot_function_with_derivative(chaotic_function, x_point, h)
# TODO: add toggle for dy/dx plot ? Bi axis would be nice


interactive(children=(FloatSlider(value=1.0, description='X:', max=2.0, step=0.01), FloatSlider(value=0.15, de…

## Derivative intuition
  - We can use limits to compute an analytical derivative

### $\frac{df(x)}{dx} = \lim_{h \to 0} \frac{f(x+h) - f(x-h)}{2h}$


Let's demonstrate that $\frac{d}{dx}[ax] = a$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{f(x+h) - f(x-h)}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{a(x+h) - a(x-h)}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{ax + ah - (ax - ah)}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{ax + ah - ax + ah}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{2ah}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} a$

### $\frac{d}{dx}[ax] = a$




## So, which derivatives do we need ?

![](./data/derivative_draw_0_high_level.jpg)

Add, multiply, power, constant & chain rule

### Variables :
#### Constant: $\frac{d}{dx}[c] = 0$
#### Multiply: $\frac{d}{dx}[ax] = a$
#### Add: $\frac{d}{dx}[x+c] = 1$
#### Power: $\frac{d}{dx}[x^n] = n \cdot x^{n-1}$

### Individual Functions :
#### Add: $\frac{d}{dx}[f(x) + g(x)] = \frac{d}{dx}f(x) + \frac{d}{dx}g(x)$
#### Multiply: $\frac{d}{dx}[c \cdot f(x)] = c \cdot \frac{d}{dx}f(x)$

### Linking functions : 
### Chain rule: $\frac{d}{dx}[f(g(x))] = f'(g(x)) \cdot g'(x) = \frac{df(g(x))}{dg(x)} \cdot \frac{dg(x)}{dx}$


<details>
<summary>Click to expand cheat sheet</summary>
<!-- ![Derivatives](./data/derivatives_cheat_sheet.png) -->

<img src="./data/derivatives_cheat_sheet.png" alt="Derivatives cheat sheet">



</details>


## Let's apply it to our Linear Regression

![](./data/derivative_draw_0_high_level.jpg)

### GOAL: 

# $ \frac{d\text{loss}}{dA} $

# $ \frac{d\text{loss}}{dB} $


### Let's use the chain rule to break down the problem !

![](./data/derivative_draw_1.jpg)

What does it mean ?

For each step we can deduce :

# $\frac{d \text{output}}{d \text{input}} $

And we can carry it on :


$$
\frac{d\text{loss}}{da} = \frac{d\text{loss}}{d\Delta} \cdot \frac{d\Delta}{d\hat{y}} \cdot \frac{d\hat{y}}{du} \cdot \frac{du}{da}
$$

$$
\frac{d\text{loss}}{db} = \frac{d\text{loss}}{d\Delta} \cdot \frac{d\Delta}{d\hat{y}} \cdot \frac{d\hat{y}}{db}
$$



![](./data/derivative_draw_2.jpg)

And we can use them to carry on the $d \text{loss}$


Let's see

![](./data/derivative_draw_3.jpg)


### Let's fill the derivatives : 

![](./data/derivative_draw_4.jpg)

# Let's code it !

In [4]:
# ------------------------------ Base functions ------------------------------ #
def predict(x, thetas):
    """Evaluate the linear model ``a * x + b``.

    Args:
        x: Input value(s); anything supporting ``*`` and ``+`` with the
            parameters (scalar or NumPy array).
        thetas: Two-item iterable ``(a, b)`` holding slope and intercept.

    Returns:
        The prediction ``(a * x) + b``.
    """
    slope, intercept = thetas
    return (slope * x) + intercept


def get_cost(y, y_hat):
    """Return the mean squared error between predictions and targets.

    Args:
        y: Target values.
        y_hat: Predicted values; ``y_hat - y`` must yield an object with a
            ``.mean()`` method (e.g. a NumPy array).

    Returns:
        The mean of ``(y_hat - y) ** 2``.
    """
    residuals = y_hat - y
    squared_errors = residuals ** 2
    return squared_errors.mean()


In [5]:
# -------------------------------- Derivation -------------------------------- #
def get_d_cost_by_d_a(y, y_hat, a, x=None):
    """Per-sample derivative of the squared error with respect to the slope ``a``.

    Computes ``2 * (y_hat - y) * x``.

    Args:
        y: Target values.
        y_hat: Predicted values.
        a: Current slope. Not used in the formula; kept so the signature
            mirrors ``get_d_cost_by_d_b``.
        x: Inputs that produced ``y_hat``. When omitted, the module-level ``x``
            is used so existing three-argument calls keep working (a
            ``KeyError`` is raised if no such global exists). Pass it
            explicitly whenever the data is not the notebook-global dataset.

    Returns:
        The per-sample derivative (same shape as the broadcast of the inputs).
    """
    if x is None:
        # Backward compatibility: the previous version always read the
        # notebook-level ``x``, regardless of which data the caller worked on.
        x = globals()["x"]
    return 2 * (y_hat - y) * x


def get_d_cost_by_d_b(y, y_hat, b):
    """Per-sample derivative of the squared error with respect to the intercept ``b``.

    Computes ``2 * (y_hat - y)``. ``b`` is not used in the formula; it is kept
    so the signature mirrors ``get_d_cost_by_d_a``.
    """
    return 2 * (y_hat - y)


def get_parameter_derivative(x, y, a, b):
    """Return the mean-squared-error gradient ``(d cost / d a, d cost / d b)``.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.

    Returns:
        Tuple ``(d_cost_by_d_a, d_cost_by_d_b)``, each the mean of the
        per-sample derivatives.
    """
    y_hat = predict(x, (a, b))

    # Pass x explicitly so the gradient is computed on the same data as y_hat,
    # not on whatever ``x`` happens to be defined at notebook level.
    d_cost_by_d_a = get_d_cost_by_d_a(y, y_hat, a, x=x).mean()
    d_cost_by_d_b = get_d_cost_by_d_b(y, y_hat, b).mean()

    return d_cost_by_d_a, d_cost_by_d_b


In [12]:
# ----------------------------------- learn ---------------------------------- #
def update_parameters(x, y, a, b, learning_rate):
    """Perform one gradient-descent step on the line parameters.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.
        learning_rate: Step size multiplying each gradient component.

    Returns:
        Tuple ``(a, b)`` after moving against the gradient returned by
        ``get_parameter_derivative``.
    """
    grad_a, grad_b = get_parameter_derivative(x, y, a, b)
    next_a = a - (learning_rate * grad_a)
    next_b = b - (learning_rate * grad_b)
    return next_a, next_b

# Start from a = b = 0 and apply 1,000 gradient-descent updates with a fixed
# learning rate of 0.001.
a, b = 0, 0
for i in range(1_000):
    a, b = update_parameters(x, y, a, b, learning_rate=0.001)
# Bare last expression so the cell displays the fitted (a, b) pair.
a, b

(1.8476208143819894, 3.3264354715464286)

## Let's print some details

In [11]:
def learn_with_prints(x, y, a, b, learning_rate, nb_iterations, print_every=10):
    """Run gradient descent and print the loss before, during and after training.

    Args:
        x: Input values (``len(x)`` is used to turn the mean loss into a total).
        y: Target values.
        a: Initial slope.
        b: Initial intercept.
        learning_rate: Step size passed to ``update_parameters``.
        nb_iterations: Number of update steps to perform.
        print_every: Report the loss on every ``print_every``-th step
            (steps 0, print_every, 2 * print_every, ...). Defaults to 10.

    Returns:
        None. The learned parameters are only used for the printed losses.

    Raises:
        ValueError: If ``print_every`` is smaller than 1.
    """
    if print_every < 1:
        raise ValueError(f"print_every must be >= 1, got {print_every!r}")

    mean_loss = get_cost(y, predict(x, (a, b)))
    print(f"Initial    {mean_loss = :6.2f} | total loss {mean_loss * len(x):10.2f}")

    for i in range(nb_iterations):
        a, b = update_parameters(x, y, a, b, learning_rate)

        # Evaluate the cost only on the steps that are actually reported; the
        # final loss is recomputed after the loop anyway.
        if i % print_every == 0:
            mean_loss = get_cost(y, predict(x, (a, b)))
            print(f"  step {i:3d} {mean_loss = :6.2f} | total loss {mean_loss * len(x):10.2f}")
    mean_loss = get_cost(y, predict(x, (a, b)))
    print(f"Final      {mean_loss = :6.2f} | total loss {mean_loss * len(x):10.2f}")


# Train from a = b = 0 for 100 steps with learning rate 1e-2, printing the loss as it goes.
learn_with_prints(x, y, 0, 0, 1e-2, 100)

Initial    mean_loss =  21.94 | total loss    2193.53
  step   0 mean_loss =  20.85 | total loss    2085.18
  step  10 mean_loss =  12.59 | total loss    1259.50
  step  20 mean_loss =   7.66 | total loss     765.51
  step  30 mean_loss =   4.70 | total loss     469.96
  step  40 mean_loss =   2.93 | total loss     293.13
  step  50 mean_loss =   1.87 | total loss     187.33
  step  60 mean_loss =   1.24 | total loss     124.02
  step  70 mean_loss =   0.86 | total loss      86.13
  step  80 mean_loss =   0.63 | total loss      63.45
  step  90 mean_loss =   0.50 | total loss      49.88
Final      mean_loss =   0.42 | total loss      42.38


## How does it compare to the finite difference method?


In [7]:
# ----------------------------- Finite difference ---------------------------- #

def get_cost_for_parameter(x, y, a, b):
    """Return the cost of the line with slope ``a`` and intercept ``b`` on ``(x, y)``.

    Thin composition of ``predict`` and ``get_cost`` so the finite-difference
    helpers can probe the cost at perturbed parameter values.
    """
    thetas = (a, b)
    y_hat = predict(x, thetas)
    return get_cost(y, y_hat)


def get_parameter_finite_difference(x, y, a, b, epsilon=0.5):
    """Estimate the cost gradient with a one-sided (backward) finite difference.

    Each component is ``(cost(p) - cost(p - epsilon)) / epsilon`` where ``p`` is
    the parameter being perturbed and the other parameter is held fixed.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.
        epsilon: Size of the perturbation. Defaults to 0.5, the value that was
            previously hard-coded.

    Returns:
        Tuple ``(cost_a_finite_difference, cost_b_finite_difference)``.

    Raises:
        ValueError: If ``epsilon`` is zero (it is used as a divisor).
    """
    if epsilon == 0:
        raise ValueError("epsilon must be non-zero")

    cost_base = get_cost_for_parameter(x, y, a, b)
    cost_a_minus_epsilon = get_cost_for_parameter(x, y, a - epsilon, b)
    cost_b_minus_epsilon = get_cost_for_parameter(x, y, a, b - epsilon)

    cost_a_finite_difference = (cost_base - cost_a_minus_epsilon) / epsilon
    cost_b_finite_difference = (cost_base - cost_b_minus_epsilon) / epsilon

    return cost_a_finite_difference, cost_b_finite_difference


def get_parameter_finite_difference_precision(x, y, a, b, epsilon=0.5):
    """Estimate the cost gradient with a central finite difference.

    Each component is ``(cost(p + epsilon) - cost(p - epsilon)) / (2 * epsilon)``
    where ``p`` is the parameter being perturbed and the other parameter is
    held fixed.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.
        epsilon: Size of the perturbation on each side. Defaults to 0.5, the
            value that was previously hard-coded.

    Returns:
        Tuple ``(cost_a_finite_difference, cost_b_finite_difference)``.

    Raises:
        ValueError: If ``epsilon`` is zero (``2 * epsilon`` is used as a divisor).
    """
    if epsilon == 0:
        raise ValueError("epsilon must be non-zero")

    cost_a_minus_epsilon = get_cost_for_parameter(x, y, a - epsilon, b)
    cost_b_minus_epsilon = get_cost_for_parameter(x, y, a, b - epsilon)
    cost_a_plus_epsilon = get_cost_for_parameter(x, y, a + epsilon, b)
    cost_b_plus_epsilon = get_cost_for_parameter(x, y, a, b + epsilon)

    cost_a_finite_difference = (cost_a_plus_epsilon - cost_a_minus_epsilon) / (2 * epsilon)
    cost_b_finite_difference = (cost_b_plus_epsilon - cost_b_minus_epsilon) / (2 * epsilon)

    return cost_a_finite_difference, cost_b_finite_difference



# Let's race them !

In [8]:
# --------------------------------- Optimize --------------------------------- #


def optimize_parameter(x, y, a, b, learning_rate, nb_iterations, derivative_method):
    """Run gradient descent with the chosen gradient estimator and report timing.

    Prints the method name, the elapsed time, the final ``a`` and ``b`` and the
    final cost (coloured with colorama escape codes).

    Args:
        x: Input values.
        y: Target values.
        a: Initial slope.
        b: Initial intercept.
        learning_rate: Step size applied to each gradient component.
        nb_iterations: Number of update steps.
        derivative_method: One of ``"derivative"``, ``"finite_difference"`` or
            ``"finite_difference_precision"``.

    Returns:
        Tuple ``(a, b)`` after ``nb_iterations`` updates.

    Raises:
        ValueError: If ``derivative_method`` is not one of the supported names.
    """
    # Resolve the gradient function once, up front: an unknown name now fails
    # immediately with a clear message instead of surfacing as an unbound
    # local inside the loop (or being silently accepted with 0 iterations).
    if derivative_method == "derivative":
        compute_gradient = get_parameter_derivative
    elif derivative_method == "finite_difference":
        compute_gradient = get_parameter_finite_difference
    elif derivative_method == "finite_difference_precision":
        compute_gradient = get_parameter_finite_difference_precision
    else:
        raise ValueError(
            f"Unknown derivative_method {derivative_method!r}; expected "
            "'derivative', 'finite_difference' or 'finite_difference_precision'"
        )

    print(
        f"\nOptimizing parameter with {colorama.Fore.GREEN}{derivative_method}{colorama.Fore.RESET}"
    )
    # perf_counter is a monotonic, high-resolution clock intended for
    # measuring elapsed intervals.
    start_time = time.perf_counter()
    for _ in range(nb_iterations):
        d_cost_by_d_a, d_cost_by_d_b = compute_gradient(x, y, a, b)
        a = a - (learning_rate * d_cost_by_d_a)
        b = b - (learning_rate * d_cost_by_d_b)
    end_time = time.perf_counter()
    cost = get_cost(y, predict(x, (a, b)))
    print(
        f"Time taken: {colorama.Fore.YELLOW}{end_time - start_time} seconds{colorama.Fore.RESET} \n"
        f"\t{a = }\n"
        f"\t{b = }\n"
        f"\t{colorama.Fore.RED}{cost = }{colorama.Fore.RESET}\n"
    )
    return a, b


# Print the reference coefficients loaded from the pickle so the fitted values
# below can be compared against them.
print(f"IDEAL: {COEF_A = }")
print(f"IDEAL: {COEF_B = }")
# Both runs share the same step size and iteration budget.
learning_rate = 1e-4
nb_iterations = 100_000


# Run 1: gradient estimated with the one-sided finite difference.
a, b = optimize_parameter(
    x,
    y,
    WORST_A_PARAM,
    WORST_B_PARAM,
    learning_rate,
    nb_iterations,
    "finite_difference",
)
# Optional run (disabled): central finite difference.
# a, b = optimize_parameter(
#     x, y, WORST_A_PARAM, WORST_B_PARAM, learning_rate, nb_iterations, "finite_difference_precision"
# )
# Run 2: analytical gradient, started from the same initial parameters.
# Note that this overwrites the (a, b) returned by run 1.
a, b = optimize_parameter(
    x, y, WORST_A_PARAM, WORST_B_PARAM, learning_rate, nb_iterations, "derivative"
)

IDEAL: COEF_A = 2.814443737618271
IDEAL: COEF_B = 3.1947961308492467

Optimizing parameter with [32mfinite_difference[39m
Time taken: [33m2.3501152992248535 seconds[39m 
	a = 1.9148347546029918
	b = 3.8929045616776703
	[31mcost = 0.3640424435322637[39m


Optimizing parameter with [32mderivative[39m
Time taken: [33m1.5655572414398193 seconds[39m 
	a = 2.2477478704529146
	b = 3.4819184668732155
	[31mcost = 0.2870335946983811[39m

