In [None]:
import torch

In [None]:
# Seed PyTorch's random number generator so that random values are reproducible across runs.
torch.manual_seed(42)

$$x_i \in \mathcal{X} \subset \mathbb{R} \\ y_i \in \mathcal{Y} \subset \mathbb{R}$$

In [None]:
# Only one data instance: make x = 10 and y = 2.
# Later cells read these values through the names `x` and `y`.
...

$$  f: \mathcal{X} \rightarrow \mathcal{Y} \\ \hat{y}_i = f(x; (m, c)) = m\times x + c$$

In [None]:
# Simplest possible model -- Linear

# 1. Define the parameters m and c.
#    Later cells read `m.grad` and `c.grad`, so both must be tensors that track gradients.
...

In [None]:
# 2. Define the model
def linear(x: torch.Tensor) -> torch.Tensor:
    """Linear model: should return the prediction ``m * x + c`` for the input ``x``.

    Exercise stub -- replace the ``...`` below with the implementation.
    ``m`` and ``c`` are the parameters defined in the previous cell.
    """
    ...

$$ L_{mse} = \frac{1}{2n} \sum_{i=1}^{n} (\hat{y}_i - y_i)^2 $$

In [None]:
# Define the loss function
def mse(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
    """Squared-error loss between a prediction and its target.

    Exercise stub -- replace the ``...`` below with the implementation.
    For the single-sample case the notebook derives L = 1/2 * (y_pred - y_true)^2.

    NOTE(review): a later cell remarks that calling this on several samples
    "does not give us one number", so presumably no reduction over samples
    happens here -- confirm when implementing.
    """
    ...

$$\theta_i^{T+1} = \theta_i^{T} - \mathbf{\gamma} \frac{\partial\ L(\ldots; \Theta)}{\partial\ \theta_i }  $$

In [None]:
# Set a learning rate parameter (the step size gamma in the update rule above)
lr = ...

In [None]:
# Compute the prediction, based on input and the model (current parameters)
ypred = ...

# Show the prediction next to the true target for comparison
print(ypred, y)

In [None]:
# Compute the loss based on the prediction and the true value.
# Keep the result in a variable: the following cell runs the backward call.

...

In [None]:
# Compute and print the gradient (backward call).
# The next cell inspects the result through `m.grad` and `c.grad`.
...

In [None]:
# See the gradient values. What patterns do we see?
# (A bare tuple as the last expression makes the notebook display all six values.)
m, m.grad, c, c.grad, x, y

### Deriving the Gradient manually

With only one input sample, $(x, y) = (10, 2)$, the loss and its gradient can be written as follows:

$$ L_{mse} = \frac{1}{2} (\hat{y} - y)^2  = \frac{1}{2} e^2 $$

where 

$$ e = (\hat{y} - y) = m \times x + c - y $$

**Let us compute the gradients of this term $e$ w.r.t. the parameters**

$$ \frac{\partial e}{\partial m} = x \\ \ \\
\frac{\partial e}{\partial c} = 1
$$

Then, recall that

$$
    L = \frac{1}{2} e^2
$$

Therefore,

$$
\frac{\partial L}{\partial e} = e
$$

Putting it all together, using the chain rule of derivatives, we get

$$
\frac{\partial L}{\partial m} = \frac{\partial L}{\partial e} \times \frac{\partial e}{\partial m} = e x = (m\times x + c - y)x
$$

and

$$
\frac{\partial L}{\partial c} = \frac{\partial L}{\partial e} \times \frac{\partial e}{\partial c} = e = (m\times x + c - y)
$$


### But does this actually hold?

In [None]:
# Implement both these formulas

### Gradient Accumulates in the grad attribute

In [None]:
# Simulate the loss backwards again and see what happens to gradients in the parameters

In [None]:
# Current gradient is the same as 'doing it twice':
# gradients accumulate in .grad, so halving the stored value recovers the single-pass gradient.
m.grad/2, c.grad/2

In [None]:
# The accumulated gradients as stored, for comparison with the halved values above
m.grad, c.grad

# Running the entire thing

Hint: press enter to continue. Input 'q' to stop.
Play around with lr to see different things ;)

In [None]:
values_of_loss = []
values_of_m = []
values_of_c = []
values_of_ypred = []

for i in range(2000):
    # Calcualte model predictions
    y_pred = ...
    
    
    # Compare the prediction with our goal
    loss = ...
    print(f"Loss: {loss}\nTrue: {y}\nPred: {y_pred.item()}")
    
    # Reset the gradients before computing new ones
    if m.grad:
        # PS: tensor.grad.zero_() is an inplace operation
        # PPS: You will never do this again in your life.
        m...
        c...
        
    # Compute new gradients: BACKPROPAGATE
    ...
    
    print(f"Parameters before update:\n\tm: {m.item()}\tgrad: {m.grad.item()}\n\tc: {c.item()}\tgrad: {c.grad.item()}")
    with torch.no_grad():
        # Do the actual update
        updated_m = ...
        updated_c = ...
        m.copy_(updated_m)
        c.copy_(updated_c)
        
    print(f"Parametrs after update:\n\tm: {m.item()}\tgrad: {m.grad.item()  if c.grad else None}\n\tc: {c.item()}\tgrad: {c.grad.item() if c.grad else None}")

    # Bookkeeping 
    values_of_ypred.append(y_pred.item())
    values_of_m.append(m.item())
    values_of_c.append(c.item())
    values_of_loss.append(loss.item())

    print('------', i, '------')
    cmd = input().strip()
    if cmd in ['q', 'exit', 'break']:
        break
    
    if loss.item() == 0:
        print('Model fully converged. Stopping.')
        break

In [None]:


from matplotlib import pyplot as plt
def plot_trace(list_of_vals, title: str=None):
    """Plot a sequence of recorded values as a line chart on a new 10x6 figure.

    Args:
        list_of_vals: values to plot, in order; the x-axis is the position in the sequence.
        title: optional figure title. When omitted, no title is set.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(list_of_vals)
    # Only set a title when one was given, instead of passing None through to matplotlib.
    if title is not None:
        plt.title(title)
    
# One figure per trace recorded by the training loop above, in a fixed order.
for trace, label in (
    (values_of_loss, "Loss"),
    (values_of_ypred, "y pred"),
    (values_of_m, "param: m"),
    (values_of_c, "param: c"),
):
    plot_trace(trace, label)

# Change: Have multiple examples in training data

In [None]:
# Re-seed so that the random initialisation of m and c in the next cell is reproducible.
torch.manual_seed(42)

In [None]:
# Multiple examples in the training data (every target is exactly 2 * input).
x = torch.tensor([1., 2., 3., 4., 5.])
y = torch.tensor([2., 4., 6., 8., 10.])

# Randomly initialised one-element parameters that track gradients.
# The size passed to torch.randn must be an int; a float such as `1.` raises a TypeError.
m = torch.randn(1, requires_grad=True)
c = torch.randn(1, requires_grad=True)



In [None]:
# Predictions for all training samples at once (x now holds several values)
y_pred = ...

# This does not give us 'one number'. We need to reduce it to one number
mse(y_pred, y)

In [None]:
def mse_avg(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
    """Loss over several samples, reduced to a single number.

    Exercise stub -- replace the ``...`` below with the implementation.
    The formula given earlier in the notebook is
    L = 1/(2n) * sum_i (y_pred_i - y_true_i)^2.
    """
    ...

In [None]:
# Let's try this again -- this time the loss should come out as a single number
loss = ...

In [None]:
# Set a learning rate (0.01 ;) )
lr = ...

In [None]:
# Backprop time
...

# Inspect the parameters together with their .grad attributes
m, m.grad, c, c.grad

In [None]:
# LET'S DO THIS IN A LOOP
# Traces recorded once per iteration, for plotting afterwards.
values_of_loss = []
values_of_m = []
values_of_c = []
# No prediction trace here: y_pred now holds one value per sample, not a single number.

for i in range(2000):
    # Calculate model predictions with `linear`, the model function defined earlier in this notebook
    y_pred = linear(x)

    # Compare the prediction with our goal
    loss = mse_avg(y_pred, y)
    print(f"Loss: {loss}\nTrue: {y}\nPred: {y_pred}")

    # Reset the gradients before computing new ones.
    # `.grad` is None until the first backward call, so test for None explicitly
    # rather than relying on the truth value of a tensor.
    if m.grad is not None:
        m.grad.zero_()
        c.grad.zero_()

    # Compute new gradients: BACKPROPAGATE
    loss.backward()

    # Report after backward so the freshly computed gradients are shown, not the zeroed ones.
    print(f"Parameters before update:\n\tm: {m.item()}\tgrad: {m.grad.item()}\n\tc: {c.item()}\tgrad: {c.grad.item()}")

    with torch.no_grad():
        # Each parameter steps along its OWN gradient. Both new values are computed
        # before either is written back, so neither update sees the other's result.
        updated_m = m - (lr * m.grad)
        updated_c = c - (lr * c.grad)
        m.copy_(updated_m)
        c.copy_(updated_c)

    # Each gradient is guarded by its own None check, so a gradient of 0 is still printed as 0.
    print(f"Parametrs after update:\n\tm: {m.item()}\tgrad: {m.grad.item() if m.grad is not None else None}\n\tc: {c.item()}\tgrad: {c.grad.item() if c.grad is not None else None}")

    # Bookkeeping
    values_of_m.append(m.item())
    values_of_c.append(c.item())
    values_of_loss.append(loss.item())

    # Interactive stepping: Enter continues, 'q' / 'exit' / 'break' stops the loop.
    print('------', i, '------')
    cmd = input().strip()
    if cmd in ['q', 'exit', 'break']:
        break

    # CHANGE: we are less strict here
    if loss.item() < 0.001:
        print('Model fully converged. Stopping.')
        break

In [None]:
from matplotlib import pyplot as plt
def plot_trace(list_of_vals, title: str=None):
    """Plot a sequence of recorded values as a line chart on a new 10x6 figure.

    Args:
        list_of_vals: values to plot, in order; the x-axis is the position in the sequence.
        title: optional figure title. When omitted, no title is set.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(list_of_vals)
    # Only set a title when one was given, instead of passing None through to matplotlib.
    if title is not None:
        plt.title(title)
    
# Plot only the traces the multi-sample loop actually records.
# `values_of_ypred` is not filled by that loop (each step predicts several values),
# so plotting it here would show leftover data from the single-sample run.
plot_trace(values_of_loss, "Loss")
plot_trace(values_of_m, "param: m")
plot_trace(values_of_c, "param: c")