# Setup

In [88]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import random
import matplotlib.colors as mcolors
from ipywidgets import interact, widgets, interactive, fixed, interact_manual
import random
import matplotlib.colors as mcolors
import time
import colorama
import pickle

# Restore the data arrays and constants used by the rest of this notebook from a
# pickle file (presumably written by the companion linear-regression notebook).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load a .pkl file that you produced yourself.
with open("Teach_linear_regression.pkl", "rb") as f:
    # The tuple order must match the order used when the file was written.
    (
        x, y, COEF_A, COEF_B,
        COEF_A_MIN, COEF_A_MAX,
        COEF_B_MIN, COEF_B_MAX,
        WORST_A_PARAM, WORST_B_PARAM,
        MIN_COST, MAX_COST,
        MIN_VALUE, MAX_VALUE, NB_ELEMENTS,
    ) = pickle.load(f)

In [89]:
def get_df_by_dx(f, x, h):
    """Approximate the derivative of ``f`` at ``x`` with a central difference.

    Args:
        f: Callable evaluated at ``x + h`` and ``x - h``.
        x: Point (or array of points) where the slope is estimated.
        h: Half-width of the interval around ``x``; must be non-zero.

    Returns:
        ``(f(x + h) - f(x - h)) / (2 * h)``.
    """
    rise = f(x + h) - f(x - h)
    run = 2 * h
    return rise / run

def plot_function_with_derivative(f, x_point=1, h=0.5, plot_triangle=False, plot_df_by_dx=False):
    """Plot ``f`` on [-1, 3] and illustrate its central-difference slope at ``x_point``.

    Args:
        f: Function to plot. It is called both with a NumPy array (the plotting
            grid) and with scalars (``x_point``, ``x_point - h``, ``x_point + h``).
        x_point: Abscissa at which the derivative approximation is illustrated.
        h: Half-width of the finite-difference interval.
        plot_triangle: When true, draw the secant line, the ``dx`` / ``dy`` legs
            of the slope triangle and text annotations with their values.
        plot_df_by_dx: When true, add a second y-axis showing the
            central-difference derivative over the whole grid (green where it is
            positive, red elsewhere).

    Returns:
        None. The figure is created through ``plt.subplots`` and left for the
        notebook to display.
    """
    # Fixed viewport: the text annotations below are positioned for these limits.
    MINIMUM_X, MAXIMUM_X = -1, 3
    MINIMUM_Y, MAXIMUM_Y = -2, 4

    # Generate points for plotting
    x = np.linspace(MINIMUM_X, MAXIMUM_X, 1000)
    y = f(x)

    # Create the plot; the twin axis only exists when the derivative is requested
    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax2 = ax1.twinx() if plot_df_by_dx else None

    if plot_df_by_dx:
        # Plot derivative on second axis
        df_by_dx = get_df_by_dx(f, x, h)
        # Split into strictly positive values and everything else (zero included)
        pos_mask = df_by_dx > 0
        neg_mask = ~pos_mask

        # Plot positive values in green
        ax2.scatter(x[pos_mask], df_by_dx[pos_mask], c='g', marker='.', label="f'(x) > 0", alpha=0.25)
        # Plot the remaining values in red (label matches the mask, which includes 0)
        ax2.scatter(x[neg_mask], df_by_dx[neg_mask], c='r', marker='.', label="f'(x) <= 0", alpha=0.25)

        ax2.set_ylabel(r"$\frac{df(x)}{dx}$", color="g")
        ax2.tick_params(axis='y', labelcolor="g")

    # Plot main function on first axis
    ax1.plot(x, y, "black", label="f(x)", alpha=0.5)

    # Calculate points for derivative visualization
    x_plus_h = x_point + h
    x_minus_h = x_point - h
    y_point = f(x_point)
    y_plus_h = f(x_plus_h)
    y_minus_h = f(x_minus_h)

    # Mark the evaluation point and drop a vertical guide down to the x-axis
    ax1.plot(
        x_point, y_point, "X", color="orange", label=f"f({x_point:.2f})", markersize=10
    )
    ax1.plot([x_point, x_point], [MINIMUM_Y, y_point], color="orange", alpha=0.5)

    if plot_triangle:
        ax1.plot(x_plus_h, y_plus_h, "+", color="gray", alpha=0.5, markersize=8)
        ax1.plot(x_minus_h, y_minus_h, "+", color="gray", alpha=0.5, markersize=8)

        # Plot secant line through the two points
        x_line = np.array([x_minus_h, x_plus_h])
        y_line = np.array([y_minus_h, y_plus_h])
        ax1.plot(x_line, y_line, "-", color="purple", linewidth=2)

        # Legs of the slope triangle: horizontal dx at the height of f(x+h),
        # vertical dy at the abscissa x-h.
        top_horizontal_line_x = np.array([x_minus_h, x_plus_h])
        top_horizontal_line_y = np.array([y_plus_h, y_plus_h])

        left_vertical_line_x = np.array([x_minus_h, x_minus_h])
        left_vertical_line_y = np.array([y_minus_h, y_plus_h])

        # Dotted guides from the bottom of the viewport up to f(x-h) and f(x+h)
        h_minus_gray_line_x = np.array([x_minus_h, x_minus_h])
        h_minus_gray_line_y = np.array([MINIMUM_Y, y_minus_h])

        h_plus_gray_line_x = np.array([x_plus_h, x_plus_h])
        h_plus_gray_line_y = np.array([MINIMUM_Y, y_plus_h])

        # Add text annotations for dy and dx with their values
        dy = y_plus_h - y_minus_h
        dx = x_plus_h - x_minus_h
        derivative = dy / dx

        # Using matplotlib's text with math mode and custom colors
        ax1.text(-0.85, 3.7, r"$\mathbf{dy} = " + f"{dy:.2f}$", color="red", fontsize=12)
        ax1.text(
            -0.85,
            3.3,
            r"$\mathbf{dx} = " + r" 2 \times h " + f" = {dx:.2f} " + "$",
            color="blue",
            fontsize=12,
        )

        # Add the ratio calculation with colored values and result
        ax1.text(
            -0.85,
            2.6,
            r"$\mathbf{"
            + r"\frac{dy}{dx} = "
            + r"\frac{"
            + f"{dy:.2f}"
            + "}{"
            + f"{dx:.2f}"
            + "}} = "
            + f"{derivative:.2f}$",
            color="purple",
            fontsize=14,
        )

        ax1.text(
            top_horizontal_line_x[0],
            top_horizontal_line_y[0] + 0.1,
            r"$\mathbf{dx} = 2 h$",
            color="blue",
            fontsize=12,
        )
        ax1.plot(
            top_horizontal_line_x,
            top_horizontal_line_y,
            "-",
            color="blue",
            linewidth=2,
            label="$dx = 2 h$",
        )
        ax1.plot(
            left_vertical_line_x,
            left_vertical_line_y,
            "-",
            color="red",
            linewidth=2,
            # dy is the rise only; dividing by 2h would give the slope dy/dx.
            label=r"$dy = f(x+h) - f(x-h)$",
        )
        ax1.plot(
            h_minus_gray_line_x,
            h_minus_gray_line_y,
            ":",
            color="gray",
            linewidth=2,
            label="x-h",
        )
        ax1.plot(
            h_plus_gray_line_x,
            h_plus_gray_line_y,
            ":",
            color="gray",
            linewidth=2,
            label="x+h",
        )

        ax1.text(x_plus_h + 0.05, MINIMUM_Y + 0.5, r"x+h", color="gray", fontsize=12)
        ax1.text(x_minus_h - 0.15, MINIMUM_Y + 0.5, r"x-h", color="gray", fontsize=12)

    ax1.grid(True)
    plt.title(f"Function with Derivative Approximation (h={h:.2f})")
    ax1.set_xlabel("x")
    ax1.set_ylabel("f(x)", color='b')
    ax1.tick_params(axis='y', labelcolor='b')

    ax1.set_xlim(MINIMUM_X, MAXIMUM_X)
    ax1.set_ylim(MINIMUM_Y, MAXIMUM_Y)


def chaotic_function(x):
    """Wiggly demo function: the sum of three terms evaluated at ``x``.

    Args:
        x: Scalar or NumPy array.

    Returns:
        ``sin(5x^2) * cos(x/2) + exp(-(x-3)^2 / 10) * sin(5x) + 0.5 * tanh(x/2)``.
    """
    fast_oscillation = np.sin(5 * x**2) * np.cos(x / 2)
    damped_wave = np.exp(-((x - 3) ** 2) / 10) * np.sin(5 * x)
    smooth_trend = 0.5 * np.tanh(x / 2)
    return fast_oscillation + damped_wave + smooth_trend

# Teach

## Goal

Determine how to update A & B so we reduce loss


This is : $ \frac{d \text{loss}}{d a} $ & $\frac{d \text{loss}}{d b} $


But what is a derivative?

In [90]:
@interact(
    x_point=widgets.FloatSlider(
        value=0.75,
        min=0.0,
        max=2.0,
        step=0.01,
        description="X:",
    ),
    h=widgets.FloatSlider(
        value=0.25,
        min=0.01,
        max=0.5,
        step=0.01,
        description="h:",
    ),
    plot_triangle=widgets.Checkbox(
        value=True,
        description="Plot triangle",
    ),
    plot_df_by_dx=widgets.Checkbox(
        value=False,
        description="Plot df/dx",
    ),
)
def interactive_derivative(x_point, h, plot_triangle, plot_df_by_dx):
    """Redraw the derivative demo each time a widget value changes.

    Args:
        x_point: Abscissa where the slope is illustrated (slider "X:").
        h: Half-width of the finite-difference interval (slider "h:").
        plot_triangle: Whether to draw the dx / dy slope triangle.
        plot_df_by_dx: Whether to overlay the derivative on a second axis.
    """
    # The function under study. To experiment, swap in any callable that accepts
    # both scalars and NumPy arrays, e.g. ``lambda x: x**2`` or the piecewise
    # ``lambda x: np.where(x < 1, 2 * x, x / 2)``.
    f = chaotic_function
    plot_function_with_derivative(f, x_point, h, plot_triangle, plot_df_by_dx)

# TODO: add the angle

interactive(children=(FloatSlider(value=0.75, description='X:', max=2.0, step=0.01), FloatSlider(value=0.25, d…

## Derivative intuition
  - We can use limits to compute an analytical derivative

### $\frac{df(x)}{dx} = \lim_{h \to 0} \frac{f(x+h) - f(x-h)}{2h}$


Let's demonstrate that $\frac{d}{dx}[ax] = a$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{f(x+h) - f(x-h)}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{a(x+h) - a(x-h)}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{ax + ah - (ax - ah)}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{ax + ah - ax + ah}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} \frac{2ah}{2h}$

### $\frac{d}{dx}[ax] = \lim_{h \to 0} a$

### $\frac{d}{dx}[ax] = a$




## So, which derivatives do we need ?

![](./data/derivative_draw_0_high_level.jpg)

Add, multiply, power, constant & chain rule.

## Variables derivatives formulas :

#### Constant: $\frac{d}{dx}[c] = 0$

$f(x)=-4$ 

---> $\frac{df(x)}{dx}=0$

#### Multiply: $\frac{d}{dx}[ax] = a$

$f(x)=2x$ 

---> $\frac{df(x)}{dx}=2$

#### Add: $\frac{d}{dx}[x+c] = 1$

$f(x)=x+5$ 

---> $\frac{df(x)}{dx}=1$

#### Power: $\frac{d}{dx}[x^n] = n \cdot x^{n-1}$

$f(x)=x^3$ 

---> $\frac{df(x)}{dx}=3x^2$

## Functions : how to solve ? Chain rule !

$ f(x) = 5 \cdot x^2 + 3 $

To differentiate it, let's break it down into 3 functions : 

$ f_1(x) = x^2 = y_1 $

$ f_2(y_1) = 5 \cdot y_1 = y_2 $

$ f_3(y_2) = y_2 + 3 = y_3$


These functions depend on each other :

$ f(x) = f_3(f_2(f_1(x))) = y$

The result of one function, is used by the next

$ x \rightarrow [f_1(x) = y_1] \rightarrow [f_2(y_1) = y_2] \rightarrow [f_3(y_2) = y_3] \rightarrow y $

And so is the derivative : 

## $ \frac{df(x)}{dx} = \frac{df_3(f_2(f_1(x)))}{dx} = \frac{df_1(x)}{dx} \cdot \frac{df_2(y_1)}{dy_1} \cdot \frac{df_3(y_2)}{dy_2} $

Every time, we look at how the output changes when the input changes : 

# $ \frac{d_{out}}{d_{in}} $

And this change is carried over : 

$ x \leftarrow \Delta y_1 \leftarrow \Delta y_2 \leftarrow \Delta y_3 \leftarrow \Delta y $


### Chain rule: $\frac{d}{dx}[f(g(x))] = \frac{df(g(x))}{dg(x)} \cdot \frac{dg(x)}{dx}$

So let's solve our example : 

## $ f_1(x) = x^2 $

Rule: $\frac{d}{dx}[x^n] = n \cdot x^{n-1}$

Derivative: $\frac{df_1(x)}{dx} = 2x $


## $ f_2(y_1) = 5 \cdot y_1 $

Rule: $\frac{d}{dx}[ax] = a$

Derivative: $\frac{df_2(y_1)}{dy_1} = 5 $


## $ f_3(y_2) = y_2 + 3 $

Rule: $\frac{d}{dx}[x+c] = 1$

Derivative: $\frac{df_3(y_2)}{dy_2} = 1 $


## $\frac{df_3(f_2(f_1(x)))}{dx}$

Rule: $\frac{d}{dx}[f(g(x))] = \frac{df(g(x))}{dg(x)} \cdot \frac{dg(x)}{dx}$

Derivative: 

$ \frac{df_3(f_2(f_1(x)))}{dx} = \frac{df_1(x)}{dx} \cdot \frac{df_2(y_1)}{dy_1} \cdot \frac{df_3(y_2)}{dy_2} $

$ \frac{df(x)}{dx} = 2x \cdot 5 \cdot 1 $

$ \frac{df(x)}{dx} = 10 x $



<details>
<summary>Click to expand cheat sheet</summary>
<!-- ![Derivatives](./data/derivatives_cheat_sheet.png) -->

<img src="./data/derivatives_cheat_sheet.png" alt="Derivatives cheat sheet">



</details>


## Let's apply it to our Linear Regression

![](./data/derivative_draw_0_high_level.jpg)

### GOAL: 

# $ \frac{d\text{loss}}{dA} $ AND $ \frac{d\text{loss}}{dB} $


### Let's use the chain rule to break down the problem !

![](./data/derivative_draw_1.jpg)

What does it mean ?

For each step we can deduce :

# $\frac{d \text{output}}{d \text{input}} $

And we can carry it on :


$$
\frac{d\text{loss}}{dA} = \frac{d\text{loss}}{d\Delta} \cdot \frac{d\Delta}{d\hat{y}} \cdot \frac{d\hat{y}}{du} \cdot \frac{du}{dA}
$$

$$
\frac{d\text{loss}}{dB} = \frac{d\text{loss}}{d\Delta} \cdot \frac{d\Delta}{d\hat{y}} \cdot \frac{d\hat{y}}{dB}
$$



![](./data/derivative_draw_2.jpg)

And we can use them to carry on the $d \text{loss}$


Let's see

![](./data/derivative_draw_3.jpg)

Todo: Add black line for simplification


### Let's fill the derivatives : 

Constant: $\frac{d}{dx}[c] = 0$

Multiply: $\frac{d}{dx}[ax] = a$

Add: $\frac{d}{dx}[x+c] = 1$

Power: $\frac{d}{dx}[x^n] = n \cdot x^{n-1}$

![](./data/derivative_draw_4.jpg)

# Let's code it !

In [91]:
# ------------------------------ Base functions ------------------------------ #
def predict(x, thetas):
    """Evaluate the linear model ``a * x + b``.

    Args:
        x: Input value(s).
        thetas: Pair ``(a, b)`` holding the slope and the intercept.

    Returns:
        The prediction ``(a * x) + b``.
    """
    slope, intercept = thetas
    return (slope * x) + intercept


def get_cost(y, y_hat):
    """Mean squared error between predictions and targets.

    Args:
        y: Target values (an object supporting arithmetic and ``.mean()``,
            e.g. a NumPy array).
        y_hat: Predicted values, same shape as ``y``.

    Returns:
        The mean of ``(y_hat - y) ** 2``.
    """
    residuals = y_hat - y
    squared_residuals = residuals ** 2
    return squared_residuals.mean()


In [92]:
# -------------------------------- Derivation -------------------------------- #
def get_d_cost_by_d_a(y, y_hat, a, x_values=None):
    """Per-sample derivative of the squared error with respect to the slope ``a``.

    Args:
        y: Target values.
        y_hat: Predictions for the same samples.
        a: Current slope. Unused by the formula; kept for the existing signature.
        x_values: Inputs that produced ``y_hat``. When omitted, the
            notebook-level ``x`` is used so that older three-argument calls keep
            working.

    Returns:
        ``2 * (y_hat - y) * x_values`` (one value per sample, not yet averaged).
    """
    if x_values is None:
        # Legacy behaviour: fall back to the global data array.
        x_values = x
    return 2 * (y_hat - y) * x_values


def get_d_cost_by_d_b(y, y_hat, b):
    """Per-sample derivative of the squared error with respect to the intercept ``b``.

    Args:
        y: Target values.
        y_hat: Predictions for the same samples.
        b: Current intercept. Unused by the formula; kept for the existing signature.

    Returns:
        ``2 * (y_hat - y)`` (one value per sample, not yet averaged).
    """
    return 2 * (y_hat - y)


def get_parameter_derivative(x, y, a, b):
    """Analytical gradient of the mean squared error for the model ``a * x + b``.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.

    Returns:
        Tuple ``(d_cost_by_d_a, d_cost_by_d_b)`` averaged over the samples.
    """
    y_hat = predict(x, (a, b))

    # Pass ``x`` explicitly so the gradient is computed on the data given to this
    # function rather than on whatever ``x`` happens to exist at notebook level.
    d_cost_by_d_a = get_d_cost_by_d_a(y, y_hat, a, x).mean()
    d_cost_by_d_b = get_d_cost_by_d_b(y, y_hat, b).mean()

    return d_cost_by_d_a, d_cost_by_d_b


In [93]:
# ----------------------------------- learn ---------------------------------- #
def update_parameters(x, y, a, b, learning_rate):
    """Perform one gradient-descent step on the slope and the intercept.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.
        learning_rate: Factor applied to each gradient before subtracting it.

    Returns:
        The updated pair ``(a, b)``.
    """
    gradient_a, gradient_b = get_parameter_derivative(x, y, a, b)

    new_a = a - (learning_rate * gradient_a)
    new_b = b - (learning_rate * gradient_b)
    return new_a, new_b

# Start from a = b = 0 and apply 1,000 gradient-descent steps with a small learning rate.
a, b = 0, 0
for i in range(1_000):
    a, b = update_parameters(x, y, a, b, learning_rate=0.001)
# Bare expression: the notebook shows the learned (a, b) as the cell output.
a, b

(1.8367774745854646, 3.0880909646325705)

## Let's print some details

In [94]:
def learn_with_prints(x, y, a, b, learning_rate, nb_iterations, print_every=10):
    """Run gradient descent and print the loss at regular intervals.

    Args:
        x: Input values.
        y: Target values.
        a: Initial slope.
        b: Initial intercept.
        learning_rate: Step size passed to ``update_parameters``.
        nb_iterations: Number of gradient-descent steps to run.
        print_every: Print the loss on every step whose index is a multiple of
            this value (must be a positive integer). Defaults to 10.

    Returns:
        None. Progress is reported through ``print`` only.
    """
    mean_loss = get_cost(y, predict(x, (a, b)))
    print(f"Initial    {mean_loss = :6.2f} | total loss {mean_loss * len(x):10.2f}")

    for i in range(nb_iterations):
        a, b = update_parameters(x, y, a, b, learning_rate)

        # Only evaluate the loss on the steps that are actually reported.
        if i % print_every == 0:
            mean_loss = get_cost(y, predict(x, (a, b)))
            print(f"  step {i:3d} {mean_loss = :6.2f} | total loss {mean_loss * len(x):10.2f}")
    mean_loss = get_cost(y, predict(x, (a, b)))
    print(f"Final      {mean_loss = :6.2f} | total loss {mean_loss * len(x):10.2f}")


learn_with_prints(x, y, 0, 0, 1e-2, 200)

Initial    mean_loss =  19.70 | total loss    1969.87
  step   0 mean_loss =  18.73 | total loss    1873.41
  step  10 mean_loss =  11.38 | total loss    1138.23
  step  20 mean_loss =   6.98 | total loss     698.29
  step  30 mean_loss =   4.35 | total loss     434.99
  step  40 mean_loss =   2.77 | total loss     277.36
  step  50 mean_loss =   1.83 | total loss     182.96
  step  60 mean_loss =   1.26 | total loss     126.39
  step  70 mean_loss =   0.92 | total loss      92.44
  step  80 mean_loss =   0.72 | total loss      72.05
  step  90 mean_loss =   0.60 | total loss      59.75
  step 100 mean_loss =   0.52 | total loss      52.31
  step 110 mean_loss =   0.48 | total loss      47.78
  step 120 mean_loss =   0.45 | total loss      44.98
  step 130 mean_loss =   0.43 | total loss      43.22
  step 140 mean_loss =   0.42 | total loss      42.10
  step 150 mean_loss =   0.41 | total loss      41.35
  step 160 mean_loss =   0.41 | total loss      40.82
  step 170 mean_loss =   0.4

## How does it compare to the finite differentiation method ?


In [95]:
# ----------------------------- Finite difference ---------------------------- #

def get_cost_for_parameter(x, y, a, b):
    """Cost of the linear model with slope ``a`` and intercept ``b`` on ``(x, y)``.

    Returns:
        ``get_cost`` applied to ``y`` and the predictions of ``predict``.
    """
    thetas = (a, b)
    y_hat = predict(x, thetas)
    return get_cost(y, y_hat)


def get_parameter_finite_difference(x, y, a, b, epsilon=0.5):
    """Estimate the cost gradient with a one-sided (backward) finite difference.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.
        epsilon: Non-zero step subtracted from each parameter in turn.
            Defaults to 0.5, the value this notebook has always used.

    Returns:
        Tuple ``(d_cost_by_d_a, d_cost_by_d_b)`` where each entry is
        ``(cost(p) - cost(p - epsilon)) / epsilon`` for the matching parameter.
    """
    cost_base = get_cost_for_parameter(x, y, a, b)
    cost_a_minus_epsilon = get_cost_for_parameter(x, y, a - epsilon, b)
    cost_b_minus_epsilon = get_cost_for_parameter(x, y, a, b - epsilon)

    cost_a_finite_difference = (cost_base - cost_a_minus_epsilon) / epsilon
    cost_b_finite_difference = (cost_base - cost_b_minus_epsilon) / epsilon

    return cost_a_finite_difference, cost_b_finite_difference


def get_parameter_finite_difference_precision(x, y, a, b, epsilon=0.5):
    """Estimate the cost gradient with a central (two-sided) finite difference.

    Args:
        x: Input values.
        y: Target values.
        a: Current slope.
        b: Current intercept.
        epsilon: Non-zero half-width of the interval around each parameter.
            Defaults to 0.5, the value this notebook has always used.

    Returns:
        Tuple ``(d_cost_by_d_a, d_cost_by_d_b)`` where each entry is
        ``(cost(p + epsilon) - cost(p - epsilon)) / (2 * epsilon)``.
    """
    cost_a_minus_epsilon = get_cost_for_parameter(x, y, a - epsilon, b)
    cost_b_minus_epsilon = get_cost_for_parameter(x, y, a, b - epsilon)
    cost_a_plus_epsilon = get_cost_for_parameter(x, y, a + epsilon, b)
    cost_b_plus_epsilon = get_cost_for_parameter(x, y, a, b + epsilon)

    cost_a_finite_difference = (cost_a_plus_epsilon - cost_a_minus_epsilon) / (2 * epsilon)
    cost_b_finite_difference = (cost_b_plus_epsilon - cost_b_minus_epsilon) / (2 * epsilon)

    return cost_a_finite_difference, cost_b_finite_difference



# Let's race them !

In [96]:
# --------------------------------- Optimize --------------------------------- #


def optimize_parameter(x, y, a, b, learning_rate, nb_iterations, derivative_method):
    """Run gradient descent with the chosen gradient estimator and report timing.

    Args:
        x: Input values.
        y: Target values.
        a: Initial slope.
        b: Initial intercept.
        learning_rate: Factor applied to each gradient before subtracting it.
        nb_iterations: Number of update steps.
        derivative_method: One of ``"derivative"``, ``"finite_difference"`` or
            ``"finite_difference_precision"``.

    Returns:
        The optimized pair ``(a, b)``.

    Raises:
        ValueError: If ``derivative_method`` is not one of the supported names.
    """
    gradient_functions = {
        "derivative": get_parameter_derivative,
        "finite_difference": get_parameter_finite_difference,
        "finite_difference_precision": get_parameter_finite_difference_precision,
    }
    # Fail early with a clear message instead of hitting unbound gradient
    # variables inside the loop.
    if derivative_method not in gradient_functions:
        raise ValueError(
            f"Unknown derivative_method {derivative_method!r}; "
            f"expected one of {sorted(gradient_functions)}"
        )
    # Resolve the estimator once so the timed loop does no string comparisons.
    get_gradient = gradient_functions[derivative_method]

    print(
        f"\nOptimizing parameter with {colorama.Fore.GREEN}{derivative_method}{colorama.Fore.RESET}"
    )
    # perf_counter is a monotonic, high-resolution clock meant for measuring durations.
    start_time = time.perf_counter()
    for _ in range(nb_iterations):
        d_cost_by_d_a, d_cost_by_d_b = get_gradient(x, y, a, b)
        a = a - (learning_rate * d_cost_by_d_a)
        b = b - (learning_rate * d_cost_by_d_b)
    end_time = time.perf_counter()
    cost = get_cost(y, predict(x, (a, b)))
    print(
        f"Time taken: {colorama.Fore.YELLOW}{end_time - start_time} seconds{colorama.Fore.RESET} \n"
        f"\t{a = }\n"
        f"\t{b = }\n"
        f"\t{colorama.Fore.RED}{cost = }{colorama.Fore.RESET}\n"
    )
    return a, b


# Reference coefficients loaded with the data. Because of the noise, the best fit
# is not necessarily equal to them, hence the prints are disabled.
# print(f"IDEAL: {COEF_A = }")
# print(f"IDEAL: {COEF_B = }")
# Shared settings so that every gradient estimator runs under the same conditions.
learning_rate = 1e-4
nb_iterations = 100_000


# Each run starts from the same WORST_A_PARAM / WORST_B_PARAM starting point.
# Note that `a, b` is overwritten by each run; only the last result is kept.
a, b = optimize_parameter(
    x,
    y,
    WORST_A_PARAM,
    WORST_B_PARAM,
    learning_rate,
    nb_iterations,
    "finite_difference",
)
# Optional third contender (central difference), currently disabled:
# a, b = optimize_parameter(
#     x, y, WORST_A_PARAM, WORST_B_PARAM, learning_rate, nb_iterations, "finite_difference_precision"
# )
a, b = optimize_parameter(
    x, y, WORST_A_PARAM, WORST_B_PARAM, learning_rate, nb_iterations, "derivative"
)


Optimizing parameter with [32mfinite_difference[39m
Time taken: [33m2.256598949432373 seconds[39m 
	a = 2.338760207696468
	b = 3.400629049669035
	[31mcost = 0.4308957239289341[39m


Optimizing parameter with [32mderivative[39m
Time taken: [33m1.4625835418701172 seconds[39m 
	a = 2.6716733235463868
	b = 2.9896429548645806
	[31mcost = 0.33984662235534174[39m

