# Warm Restart



# Topics

- Basics of deep learning -- tensors, and gradients
- Loss
- Backpropagation
- Training loop
- Fitting a curve
- Multi dimensional inputs
- Limitations of polynomial models


In [None]:
import torch
import numpy as np

In [None]:
## Make a tensor with a value of '3.0'. What is its datatype?
a = torch.tensor(3)

a.dtype

torch.int64

In [24]:
## Tensor arithmetic
a = torch.tensor(4.0)
b = torch.tensor(2.5, dtype=torch.float64)

## Adding two tensors (there are many ways)
a+b, torch.add(a, b), a.__add__(b), torch.add(a, b) == a+b

## Multiplying two tensors
a/b, a//b, a+b, a-b, a*b

(tensor(1.6000, dtype=torch.float64),
 tensor(1., dtype=torch.float64),
 tensor(6.5000, dtype=torch.float64),
 tensor(1.5000, dtype=torch.float64),
 tensor(10., dtype=torch.float64))

## Automatic Differentiation

In [None]:
## What is the requires grad flag

a = torch.tensor(4.0, requires_grad=True)
b = torch.tensor(2.5)

a, b

(tensor(4., requires_grad=True), tensor(2.5000))

In [27]:
## Revisiting tensor operations
a/b, a//b, a+b, a-b, a*b

(tensor(1.6000, grad_fn=<DivBackward0>),
 tensor(1., grad_fn=<NotImplemented>),
 tensor(6.5000, grad_fn=<AddBackward0>),
 tensor(1.5000, grad_fn=<SubBackward0>),
 tensor(10., grad_fn=<MulBackward0>))

In [32]:
c = a+b
c.grad_fn

<AddBackward0 at 0x124894700>

## Can we see ze graph?

```bash
$ brew install graphviz
$ pip install torchviz
```


But you can just see it on my screen for now ^^

In [37]:
import torch
from torchviz import make_dot

a = torch.tensor(4.0, requires_grad=True)
b = torch.tensor(2.5) # note to self: add and remove reqGrad here
c = a / b
d = c + b

# Generate the graph
graph = make_dot(d, params={"a": a, "b": b})
graph.render(view=True)

'Digraph.gv.pdf'

## Multidimensionality

In [None]:
## Making a two dimensional tensor.
a1d =  torch.tensor([2,4])
a1d = torch.tensor(np.array([2, 5]))
a2d = torch.tensor([
    [2,4,2.],[3,5,1]
])

## What is its shape?
a2d

tensor([[2, 4, 1],
        [3, 5, 1]])

In [55]:
## Multi-dimensional tensor addition and multiplication

## 2x2 multiplications
a = torch.tensor([[1,2], [2,-1]])
b = torch.tensor([[1,1], [1,1]])

a*b

tensor([[ 1,  2],
        [ 2, -1]])

In [56]:
## Was this an expected result?

a@b

tensor([[3, 3],
        [1, 1]])

In [None]:
## Alternatives 

torch.mm(a,b), torch.matmul(a,b)

(tensor([[3, 3],
         [1, 1]]),
 tensor([[3, 3],
         [1, 1]]))

#### Tensor broadcasting. 

is a rabbit hole we are not going to go into for now. Happy to answer questions if you have some though.

## Naked tensors, linear classificaiton

In [62]:
torch.manual_seed(42)

<torch._C.Generator at 0x1167ae6d0>

$$x_i \in \mathcal{X} \subset \mathcal{R} \\ y_i \in \mathcal{Y} \subset \mathcal{R}$$

In [107]:
# Only one data instance (make an x = 10 and y=2)
x = torch.tensor(10.)
y = torch.tensor(2.)



$$  f: \mathcal{X} \rightarrow \mathcal{Y} \\ \hat{y}_i = f(x; (m, c)) = m\times x + c$$

In [87]:
# Simplest possible model -- Linear

# 1. Define the parameters m and c
m = torch.randn(1, requires_grad=True)
c = torch.randn(1, requires_grad=True)

print(m, c)

tensor([0.4617], requires_grad=True) tensor([0.2674], requires_grad=True)


In [88]:
# 2. Define the model
def linear(x: torch.Tensor) -> torch.Tensor:
    return (m*x) + c

$$ L_{mse} = \frac{1}{2n} \sum_{i=1}^{n} (\hat{y_i} - y_i)^2 $$

In [89]:
# Define the loss function
def mse(y_pred: torch.Tensor, y_true: torch.Tensor) -> torch.Tensor:
    return 0.5*((y_pred-y_true)**2)

$$\theta_i^{T+1} = \theta_i^{T} - \mathbf{\gamma} \frac{\partial\ L(\ldots; \Theta)}{\partial\ \theta_i }  $$

In [90]:
# Set a learning rate parameter
lr = 0.01

In [91]:
# Compute the prediction, based on input and the model (current parameters)
ypred = linear(x)

print(ypred, y)

tensor([4.8839], grad_fn=<AddBackward0>) tensor(2)


In [92]:
graph = make_dot(ypred, params={"m": m, "c": c})
graph.render(view=True)

'Digraph.gv.pdf'

In [93]:
# Compute the loss based on the prediction and the true value

loss = mse(ypred, y)
loss

tensor([4.1585], grad_fn=<MulBackward0>)

In [97]:
# Compute and print the gradient (backward call)
loss.backward()

In [78]:
# See the gradient values. What patterns do we see?
m, m.grad, c, c.grad, x, y

(tensor([-1.1229], requires_grad=True),
 tensor([-134.1489]),
 tensor([-0.1863], requires_grad=True),
 tensor([-13.4149]),
 tensor(10),
 tensor(2))

In [86]:
graph = make_dot(loss, params={"m": m, "c": c})
graph.render(view=True)

'Digraph.gv.pdf'

#### All together now

In [80]:
values_of_loss = []
values_of_m = []
values_of_c = []
values_of_ypred = []

for i in range(2000):
    # Calcualte model predictions
    y_pred = linear(x)
    
    
    # Compare the prediction with our goal
    loss = mse(y_pred, y)
    print(f"Loss: {loss}\nTrue: {y}\nPred: {y_pred.item()}")
    
    # Reset the gradients before computing new ones
    if m.grad:
        # PS: tensor.grad.zero_() is an inplace operation
        # PPS: You will never do this again in your life.
        m.grad.zero_()
        c.grad.zero_()
        
    # Compute new gradients: BACKPROPAGATE
    loss.backward()
    
    print(f"Parameters before update:\n\tm: {m.item()}\tgrad: {m.grad.item()}\n\tc: {c.item()}\tgrad: {c.grad.item()}")
    with torch.no_grad():
        # Do the actual update
        updated_m = m - (lr*m.grad)
        updated_c = c - (lr*c.grad)
        m.copy_(updated_m)
        c.copy_(updated_c)
        
    print(f"Parametrs after update:\n\tm: {m.item()}\tgrad: {m.grad.item()  if c.grad else None}\n\tc: {c.item()}\tgrad: {c.grad.item() if c.grad else None}")

    # Bookkeeping 
    values_of_ypred.append(y_pred.item())
    values_of_m.append(m.item())
    values_of_c.append(c.item())
    values_of_loss.append(loss.item())

    print('------', i, '------')
    cmd = input().strip()
    if cmd in ['q', 'exit', 'break']:
        break
    
    if loss.item() == 0:
        print('Model fully converged. Stopping.')
        break

Loss: tensor([89.9797], grad_fn=<MulBackward0>)
True: 2
Pred: -11.414891242980957
Parameters before update:
	m: -1.1228563785552979	grad: -134.14891052246094
	c: -0.18632829189300537	grad: -13.414891242980957
Parametrs after update:
	m: 0.21863269805908203	grad: -134.14891052246094
	c: -0.052179381251335144	grad: -13.414891242980957
------ 0 ------
Loss: tensor([0.0090], grad_fn=<MulBackward0>)
True: 2
Pred: 2.1341476440429688
Parameters before update:
	m: 0.21863269805908203	grad: 1.3414764404296875
	c: -0.052179381251335144	grad: 0.13414764404296875
Parametrs after update:
	m: 0.20521792769432068	grad: 1.3414764404296875
	c: -0.05352085828781128	grad: 0.13414764404296875
------ 1 ------
Loss: tensor([8.9992e-07], grad_fn=<MulBackward0>)
True: 2
Pred: 1.9986584186553955
Parameters before update:
	m: 0.20521792769432068	grad: -0.013415813446044922
	c: -0.05352085828781128	grad: -0.0013415813446044922
Parametrs after update:
	m: 0.2053520828485489	grad: -0.013415813446044922
	c: -0.0535

## Adding Structure



In [108]:
# Wrapping what we did in a class

class LinearRegressor(torch.nn.Module):

    def __init__(self):
        super().__init__()
        self.m = torch.nn.Parameter(torch.tensor(1.))
        self.c = torch.nn.Parameter(torch.tensor(1.))

    def forward(self, x):
        return (self.m*x) + c

In [131]:
## Model class
class LinearRegressor(torch.nn.Module):

    def __init__(self: "LinearRegressor"):
        super().__init__()
        self.layer_1 = torch.nn.Linear(1, 1)
    
    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        return self.layer_1(inputs)
    
model = LinearRegressor()

In [132]:
# Loss should never be defined by us
mse = torch.nn.MSELoss()

In [133]:
## Optimizer
opt = torch.optim.SGD(model.parameters())

opt.param_groups

[{'params': [Parameter containing:
   tensor([[0.7826]], requires_grad=True),
   Parameter containing:
   tensor([-0.7105], requires_grad=True)],
  'lr': 0.001,
  'momentum': 0,
  'dampening': 0,
  'weight_decay': 0,
  'nesterov': False,
  'maximize': False,
  'foreach': None,
  'differentiable': False,
  'fused': None}]

In [134]:
## Playing around with optimizer and parameters
x = torch.randn(1,1)
y = torch.tensor([1.])

## lets emulate a 'batch'
ypred = model(x)
loss = mse(ypred, y)
loss.backward()
print(list(model.parameters()))

opt.step()

print(list(model.parameters()))

[Parameter containing:
tensor([[0.7826]], requires_grad=True), Parameter containing:
tensor([-0.7105], requires_grad=True)]
[Parameter containing:
tensor([[0.7842]], requires_grad=True), Parameter containing:
tensor([-0.7081], requires_grad=True)]


## Training Loop

In [None]:
## Lets sketch out a nice loop

# That's all for now

Next time, we're gonna try some real world problems, and focus on data handling.